<a href="https://colab.research.google.com/github/ddkryptonite/-semiconductor-market-data-aws-pipeline/blob/main/DataScrappingPythonPowerBI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd
import os
import boto3
import io

In [None]:
# AWS S3 Setup
# Destination bucket. Set the S3_BUCKET environment variable to override the
# placeholder below without editing this file; an unset or empty variable
# falls back to the placeholder.
S3_BUCKET = os.environ.get("S3_BUCKET") or "your-s3-bucket-name"
# Object key under which the scraped table is stored as CSV.
S3_FILE_NAME = "semiconductor_scraped_data.csv"

In [None]:
# Initialize S3 client
# Created once at module level; upload_to_s3 calls put_object on this client.
s3 = boto3.client("s3")

In [None]:
#pip install boto3

In [None]:


def scrape_data():
    """Scrape the first ``wikitable`` on Wikipedia's "Semiconductor industry" page.

    The first row of the table supplies the column names and every following
    row becomes one record.  The ``Ref.`` column (if present) is dropped,
    duplicate rows are removed, and the ``Year`` column is converted with
    ``pd.to_datetime``.

    Returns:
        pandas.DataFrame: the cleaned table, or ``None`` when the page could
        not be fetched (non-200 response) or holds no usable ``wikitable``.

    Raises:
        requests.RequestException: on connection errors or when the request
            exceeds the timeout.
        ValueError: if a data row's cell count does not match the header.
        KeyError: if the table has no ``Year`` column.
    """
    url = "https://en.wikipedia.org/wiki/Semiconductor_industry#:~:text=The%20global%20semiconductor%20industry%20is,significant%20presence%20in%20the%20field.&text=Unique%20features%20of%20the%20industry,cyclical%20pattern%20with%20high%20volatility."
    headers = {"User-Agent": "Mozilla/5.0"}

    # A timeout keeps the caller from hanging forever on a stalled connection.
    response = requests.get(url, headers=headers, timeout=30)
    if response.status_code != 200:
        print(f"Failed to retrieve data. Status code: {response.status_code}")
        # Never parse an error page; callers treat None as "no data".
        return None
    print("Data retrieved successfully!")

    soup = BeautifulSoup(response.text, "html.parser")

    table = soup.find("table", class_="wikitable")
    rows = table.find_all("tr") if table is not None else []
    if not rows:
        print("No table with class 'wikitable' found on the page.")
        return None

    header_row, body_rows = rows[0], rows[1:]
    complete_table_titles = [
        title.text.strip() for title in header_row.find_all("th")
    ]

    # Body rows may mix <th> and <td> cells, so collect both kinds.
    data = [
        [cell.text.strip() for cell in row.find_all(["th", "td"])]
        for row in body_rows
    ]

    df = pd.DataFrame(data, columns=complete_table_titles)

    # The "Ref." column is not wanted in the output; tolerate its absence.
    df = df.drop(columns="Ref.", errors="ignore")
    df = df.drop_duplicates()  # Remove duplicate rows
    df["Year"] = pd.to_datetime(df["Year"])  # Convert to Date

    return df




def upload_to_s3(df):
    """Serialize *df* to CSV (index omitted) and store it as an S3 object.

    The object is written to ``S3_BUCKET`` under the key ``S3_FILE_NAME``
    through the module-level ``s3`` client; the destination is then printed.
    """
    # With no target given, to_csv returns the CSV text directly.
    csv_text = df.to_csv(index=False)
    destination = {"Bucket": S3_BUCKET, "Key": S3_FILE_NAME}

    s3.put_object(Body=csv_text, **destination)

    print(f"Data uploaded to s3://{S3_BUCKET}/{S3_FILE_NAME}")

def lambda_handler(event, context):
    """Run the scrape-and-upload pipeline; uses the (event, context) handler signature.

    Args:
        event: Invocation payload; not used.
        context: Runtime context; not used.

    Returns:
        dict: ``{"statusCode": 200, ...}`` after the data was uploaded, or
        ``{"statusCode": 500, ...}`` when ``scrape_data`` returned ``None``
        and nothing was uploaded.
    """
    df = scrape_data()
    if df is None:
        # Do not report success when there was nothing to upload.
        return {"statusCode": 500, "body": "ETL Pipeline Failed: no data scraped"}
    upload_to_s3(df)
    return {"statusCode": 200, "body": "ETL Pipeline Completed"}
