In [14]:
from bs4 import BeautifulSoup
from tabulate import tabulate

def clean_records(records):
    """Return cleaned copies of scraped rows that carry a usable price.

    Rows whose 'price/value' entry is missing or empty are dropped. For the
    remaining rows, 'price/value' is converted to ``float`` (after removing
    '$' and ',' characters) and a non-empty 'numberOfOffers' is converted to
    ``int`` (after removing ',' thousands separators). The input dictionaries
    are not modified.

    Args:
        records: Iterable of dicts mapping column names to cell text.

    Returns:
        A new list of cleaned dicts, in the same order as the input.

    Raises:
        ValueError: If a non-empty price or offer count is not numeric.
    """
    cleaned = []
    for record in records:
        price_text = record.get('price/value')
        if not price_text:
            # No price: the row is not usable for the analysis.
            continue
        item = dict(record)
        item['price/value'] = float(price_text.replace('$', '').replace(',', ''))
        offers_text = item.get('numberOfOffers')
        if offers_text:
            # Strip thousands separators so values such as "1,234" parse.
            item['numberOfOffers'] = int(offers_text.replace(',', ''))
        cleaned.append(item)
    return cleaned


def dedupe_by_asin(records):
    """Return *records* without duplicates, keeping the first row per 'asin'.

    Row order is preserved. Rows that have no 'asin' value cannot be matched
    by product, so they are only dropped when they are exact duplicates of an
    earlier row.

    Args:
        records: Iterable of dicts with hashable values.

    Returns:
        A new list holding the first occurrence of each product.
    """
    seen = set()
    unique = []
    for record in records:
        key = record.get('asin') or tuple(record.items())
        if key in seen:
            continue
        seen.add(key)
        unique.append(record)
    return unique


# The guard still runs when this is executed as a notebook cell or a script
# (where __name__ is "__main__"), and the names assigned below remain
# module-level globals.
if __name__ == "__main__":
    # Load the HTML file
    with open('C://Users//deekc//OneDrive//Documents//Deeksha_BV//DSC540//Project//dataset_amazon-bestsellers_scraped.html', 'r', encoding='utf-8') as file:
        html_content = file.read()

    # Parse the HTML and locate the first table element
    soup = BeautifulSoup(html_content, 'html.parser')
    table = soup.find('table')

    if table is not None:
        rows = table.find_all('tr')

        # Step 1: Extract column headers. Prefer <thead>; fall back to the
        # first row when the table has no <thead> instead of failing on None.
        header_scope = table.find('thead')
        if header_scope is None and rows:
            header_scope = rows[0]
        headers = header_scope.find_all('th') if header_scope is not None else []
        column_names = [header.text.strip() for header in headers]

        # Step 2: Build one dict per data row, skipping the header row at
        # index 0. zip() stops at the shorter of names/cells.
        data = []
        for row in rows[1:]:
            row_data = [cell.text.strip() for cell in row.find_all('td')]
            data.append(dict(zip(column_names, row_data)))

        # Step 3: Drop rows without a price and convert numeric columns
        data = clean_records(data)

        # Step 4: Remove duplicate rows based on the 'asin' column
        unique_data = dedupe_by_asin(data)

        # Display the transformed data in a simple row and column format
        print(tabulate(unique_data, headers='keys', tablefmt='plain'))
    else:
        print("Table not found in the HTML.")
    
    
#### Ethical Implications ####
"""
In data wrangling for the Amazon bestsellers dataset, several transformations were applied
to clean and prepare the data for analysis. These included removing leading or trailing 
whitespace from column names, converting the 'numberOfOffers' column to integer type, 
removing rows with missing or empty 'price/value' column, converting the 'price/value' 
column to float type, and removing duplicate rows based on the 'asin' column. While there 
may not be specific legal or regulatory guidelines for this dataset, ethical considerations 
are paramount. Risks could arise from inaccuracies introduced during transformations, 
potentially impacting decision-making processes based on the data. Assumptions might 
have been made regarding the consistency and completeness of the original data. 
The data was sourced from a web scraping process, which raises ethical considerations 
regarding data ownership and usage rights. To mitigate ethical implications, 
it's essential to ensure transparency in data collection methods, obtain data from reputable sources,
and adhere to relevant privacy and data protection regulations. Additionally, implementing robust data
validation and verification processes can enhance the credibility and reliability of the data.
"""


asin        categoryFullName             categoryName    categoryUrl                                                             name                                                                                                                                                                                                      numberOfOffers      position  price    price/currency      price/value    reviewsCount    stars  thumbnailUrl                                                                           url
B06X16Z7DZ  Best Sellers in Electronics  Electronics     https://www.amazon.com/Best-Sellers-Electronics/zgbs/electronics/?pg=2  Apple EarPods Headphones with 3.5mm Plug, Wired Ear Buds with Built-in Remote to Control Music, Phone Calls, and Volume                                                                                                             67           $                         18.79          284883      4.6  https://images-na.ssl-images-amazon.com/images/I/410e

"\nIn data wrangling for the Amazon bestsellers dataset, several transformations were applied\nto clean and prepare the data for analysis. These included removing leading or trailing \nwhitespaces from column names, converting the 'numberOfOffers' column to integer type, \nremoving rows with missing or empty 'price/value' column, converting the 'price/value' \ncolumn to float type, and removing duplicate rows based on the 'asin' column. While there \nmay not be specific legal or regulatory guidelines for this dataset, ethical considerations \nare paramount. Risks could arise from inaccuracies introduced during transformations, \npotentially impacting decision-making processes based on the data. Assumptions might \nhave been made regarding the consistency and completeness of the original data. \nThe data was sourced from a web scraping process, which raises ethical considerations \nregarding data ownership and usage rights. To mitigate ethical implications, \nit's essential to ensure tr