In [1]:
import pandas as pd
import numpy as np
import pickle
import json
import re
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from tqdm.notebook import tqdm

# Creating PrettyPrinter Instance
import pprint
pp = pprint.PrettyPrinter(indent=2)

## Helper Functions

In [108]:
# Function to write data to a JSON file
def write_to_json(data, filename):
    """
    Write data to a JSON file.

    Args:
        data: The data (dictionary, list, etc.) to be written to the JSON file.
        filename (str): The name of the JSON file to write.

    Returns:
        bool: True if the data was successfully written to the file, False otherwise.
    """
    try:
        with open(filename, 'w') as json_file:
            json.dump(data, json_file, indent=4)
        return True
    except Exception as e:
        print(f"Error writing to JSON file: {e}")
        return False

# Function to load data from a JSON file
def load_from_json(filename):
    """
    Load data from a JSON file.

    Args:
        filename (str): The name of the JSON file to read data from.

    Returns:
        dict or list: The loaded data from the JSON file, or an empty dictionary/list if the file doesn't exist.
    """
    try:
        with open(filename, 'r') as json_file:
            data = json.load(json_file)
        return data
    except FileNotFoundError:
        print(f"JSON file '{filename}' not found. Returning an empty dictionary.")
        return {}
    except Exception as e:
        print(f"Error loading data from JSON file: {e}")
        return {}

In [2]:
# Function to load a pickle file and extract titles
def load_pickle(file_path):
    """
    Load data from a pickle file.

    :param file_path: The path to the output pickle file.
    """
    with open(file_path, 'rb') as file:
        data = pickle.load(file)
        
    return data

def write_pickle(data, file_path):
    """
    Write data to a pickle file.

    :param data: The data to be written to the file.
    :param file_path: The path to the output pickle file.
    """
    try:
        with open(file_path, 'wb') as file:
            pickle.dump(data, file)
        #print(f'Data has been written to {file_path}')
    except Exception as e:
        print(f'Error writing to {file_path}: {e}')

In [102]:
# Function to extract raw data from Amazon Website
def product_information(soup):
    title_element = None
    rating_element = None
    total_rating_element = None
    price_element = None
    description_element_list = []
    product_features = []
    description_element = None

    try:
        title_element = soup.find('span', id='productTitle').text.strip()
    except AttributeError:
        title_element = 'NaN'

    try:
        rating_element = soup.find('span', id='acrPopover').get('title')
    except AttributeError:
        rating_element = 'NaN'

    try:
        total_rating_element = soup.find('span', id="acrCustomerReviewText").text
    except AttributeError:
        total_rating_element = 'NaN'

    try:
        price_element = soup.find('span', class_="aok-offscreen").text.strip()
    except AttributeError:
        price_element = 'NaN'

    try:
        description_element = soup.find('table', class_="a-normal a-spacing-micro").find_all('tr')
        for i in description_element:
            description_element_list.append(i.find('td', class_='a-span3').text.strip() + ': ' + i.find('td', class_='a-span9').text.strip())
    except AttributeError:
        description_element_list = 'NaN'

    try:
        product_feat = soup.find_all('li', class_="a-spacing-mini")
        for i in product_feat:
            product_features.append(i.text.strip())
    except AttributeError:
        product_features = 'NaN'

    try:
        description_element = soup.find('div', id='productDescription').text.strip()
    except AttributeError:
        description_element = 'NaN'

    return title_element, rating_element, total_rating_element, price_element, description_element_list, product_features, description_element


In [4]:
# Function to extract reviews from Amazon Website
def review_information(soup):
    review_elements = soup.select("div.review")
    if not review_elements:
        print('No Reviews Available')
    else:
        scraped_reviews = []
        for review in review_elements:
            r_author_element = review.select_one("span.a-profile-name")
            r_author = r_author_element.text if r_author_element else None
            r_rating_element = review.select_one("i.review-rating")
            r_rating = r_rating_element.text.replace("out of 5 stars", "") if r_rating_element else None
            r_title_element = review.select_one("a.review-title")
            r_title_span_element = r_title_element.select_one("span:not([class])") if r_title_element else None
            r_title = r_title_span_element.text if r_title_span_element else None
            r_content_element = review.select_one("span.review-text")
            r_content = r_content_element.text if r_content_element else None
            r_date_element = review.select_one("span.review-date")
            r_date = r_date_element.text if r_date_element else None
            r_verified_element = review.select_one("span.a-size-mini")
            r_verified = r_verified_element.text if r_verified_element else None
            print(r_author,r_rating,r_title,r_content,r_date,r_verified)
            print('\n')

## Scraping Information from Electronics Categories

In [138]:
# electronics url 
driver = webdriver.Chrome()
url = 'https://www.amazon.sg/Buy-Electronics-Online/b/?ie=UTF8&node=6314449051&ref_=nav_cs_electronics'
amazon_base_url = 'https://www.amazon.sg'
driver.get(url)
time.sleep(5)
page_source = driver.page_source
soup = BeautifulSoup(page_source, 'html.parser')

In [139]:
# getting the links to all subcategories
subcategory_url = {}
subcategory_container = soup.find_all('a', class_ = "a-link-normal octopus-pc-category-card-v2-category-link")
for element in subcategory_container:
    subcategory_url[element['title']] = amazon_base_url + element['href']
    print(element['title'] + ": " + amazon_base_url + element['href'])

Computers, Components & Accessories: https://www.amazon.sg/b/?_encoding=UTF8&node=6436071051&bbn=6314449051&ref_=Oct_d_odnav_d_6314449051_0&pd_rd_w=yzJDF&content-id=amzn1.sym.80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_p=80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_r=WKMMJ2W6KH3QDG0E41RM&pd_rd_wg=a0E3s&pd_rd_r=a259d34e-26db-4642-b408-699746ef3128
Mobile Phones & Communication: https://www.amazon.sg/b/?_encoding=UTF8&node=6436074051&bbn=6314449051&ref_=Oct_d_odnav_d_6314449051_1&pd_rd_w=yzJDF&content-id=amzn1.sym.80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_p=80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_r=WKMMJ2W6KH3QDG0E41RM&pd_rd_wg=a0E3s&pd_rd_r=a259d34e-26db-4642-b408-699746ef3128
Home Cinema, TV & Video: https://www.amazon.sg/b/?_encoding=UTF8&node=6436073051&bbn=6314449051&ref_=Oct_d_odnav_d_6314449051_2&pd_rd_w=yzJDF&content-id=amzn1.sym.80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_p=80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_r=WKMMJ2W6KH3QDG0E41RM&pd_rd_wg=a0E3s&pd_rd_r=a259d34e-26db-464

In [151]:
category_list = []
for title in subcategory_url:
    category_list.append(title)

In [152]:
category_list

['Computers, Components & Accessories',
 'Mobile Phones & Communication',
 'Home Cinema, TV & Video',
 'Headphones, Earbuds & Accessories',
 'Car & Vehicle Electronics',
 'Tablets',
 'Hi-Fi & Home Audio',
 'Camera & Photo',
 'Wearable Technology',
 'Portable Sound & Vision',
 'Sat Nav, GPS, Navigation & Accessories',
 'Household Batteries & Chargers',
 'Power Accessories',
 'Telephones, VoIP & Accessories',
 'Radio Communication']

In [153]:
# save subcatergory names
write_pickle(category_list, 'subcategory_names.pkl')

## Extract Subcategories Information

### Mobile Phones & Communication

In [116]:
# code to get urls
driver = webdriver.Chrome()
temp_url = subcategory_url['Mobile Phones & Communication']
driver.get(temp_url)
time.sleep(5)
page_source = driver.page_source
soup = BeautifulSoup(page_source, 'html.parser')
mobile_url_dict = {}
for sub_element in soup.find_all('a', class_ = "a-link-normal octopus-pc-category-card-v2-category-link"):
    mobile_url_dict[sub_element['title']] = amazon_base_url + sub_element['href']
    print(sub_element['title'] + ": " + amazon_base_url + sub_element['href'])   
    
# quit the driver
driver.quit()

Mobile Phones & Smartphones: https://www.amazon.sg/b/?_encoding=UTF8&node=6436149051&bbn=6314449051&ref_=Oct_d_odnav_d_6436074051_0&pd_rd_w=aR6yd&content-id=amzn1.sym.80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_p=80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_r=16687MPAEE33AVQFGYYE&pd_rd_wg=s4fus&pd_rd_r=d197d04d-526f-4c27-ac3a-5fe2e33c3967
Smartwatches: https://www.amazon.sg/b/?_encoding=UTF8&node=6436180051&bbn=6314449051&ref_=Oct_d_odnav_d_6436074051_1&pd_rd_w=aR6yd&content-id=amzn1.sym.80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_p=80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_r=16687MPAEE33AVQFGYYE&pd_rd_wg=s4fus&pd_rd_r=d197d04d-526f-4c27-ac3a-5fe2e33c3967
Accessories: https://www.amazon.sg/b/?_encoding=UTF8&node=6436145051&bbn=6314449051&ref_=Oct_d_odnav_d_6436074051_2&pd_rd_w=aR6yd&content-id=amzn1.sym.80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_p=80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_r=16687MPAEE33AVQFGYYE&pd_rd_wg=s4fus&pd_rd_r=d197d04d-526f-4c27-ac3a-5fe2e33c3967


In [106]:
# code for Results Page 
driver = webdriver.Chrome()
mobile_dict = {}
for value in tqdm(mobile_url_dict.values(), desc = 'Subcategory Progress'): 
    sub_url = value
    driver.get(sub_url)
    time.sleep(3)
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'html.parser')
    
    div_tag = soup.find('div', class_='fst-h1-st')

    # extract the title of the Sub-Sub-Category
    if div_tag:
        title = div_tag.find('h1').get_text()

    # getting Pagination Link to obtain page where all results are accessible
    pagination_link_element = soup.find('a', id = 'apb-desktop-browse-search-see-all')

    if pagination_link_element:
        result_url = amazon_base_url + pagination_link_element.get('href')
    else:
        continue

    driver.get(result_url)
    time.sleep(3)
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'html.parser')
    
    # checking whether pagination button is available
    if (soup.find('a', class_ = 's-pagination-button') is None):
        product_containers = soup.find_all('div', class_='s-result-item')
        for product_tag in product_containers:
            try:
                # extract relevant information based on the structure of the HTML
                product_url = product_tag.find('a', class_='a-link-normal').get('href')
                driver.get(amazon_base_url + product_url)
                time.sleep(3)
                page_source = driver.page_source
                new_soup = BeautifulSoup(page_source, 'html.parser')
                title_element, rating_element, total_rating_element, price_element, description_element_list, product_features, description_element = product_information(new_soup)

                products.append(
                    {'Name': title_element, 'Ratings': rating_element, 'Total Number of Ratings': total_rating_element,
                    'Price': price_element, 'Information': description_element_list, 'Features': product_features,
                    'Description': description_element})
            except AttributeError as e:
                continue
    else:
        pagination_link = soup.find('a', class_ = 's-pagination-button').get('href')
        # code to extract all information about products from the first 5 pages (our threshold for this project)
        products = []
        for iter in tqdm(range(1, 6), desc="Page Progress"):  
            url = amazon_base_url + pagination_link[:-1] + str(iter)
            driver.get(url)
            time.sleep(3)
            page_source = driver.page_source
            soup = BeautifulSoup(page_source, 'html.parser')
            product_containers = soup.find_all('div', class_='s-result-item')
            for product_tag in tqdm(product_containers, desc="Product Progress"):
                try:
                    # extract relevant information based on the structure of the HTML
                    product_url = product_tag.find('a', class_='a-link-normal').get('href')
                    link_response = requests.get(amazon_base_url + product_url)
                    time.sleep(3)
                    new_soup = BeautifulSoup(link_response.text, 'html.parser')
                    title_element, rating_element, total_rating_element, price_element, description_element_list, product_features, description_element = product_information(new_soup)

                    products.append(
                        {'Name': title_element, 'Ratings': rating_element, 'Total Number of Ratings': total_rating_element,
                         'Price': price_element, 'Information': description_element_list, 'Features': product_features,
                         'Description': description_element, 'URL': amazon_base_url + product_url})
                except AttributeError as e:
                    continue
                    
    # add the results to the category_dict
    mobile_dict[title] = products

# quit the driver
driver.quit()

Subcategory Progress:   0%|          | 0/3 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/5 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/5 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/5 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

In [107]:
mobile_dict

{'Mobile Phones & Smartphones': [{'Name': 'DORLAND Unicorn9 4G Rugged Phone, Unlocked Mobile Phone 5000mAh Explosion-Proof Smartphone 8GB+128GB IP68 6.3HD+Mobiles, 4G Dual SIM, 48MP Camera NFC OTG, GPS Android 12.0 Zone1/2',
   'Ratings': 'NaN',
   'Total Number of Ratings': 'NaN',
   'Price': 'S$1,233.00',
   'Information': ['Brand: DORLAND',
    'Model name: Unicorn 9',
    'Wireless service provider: Unlocked for All Carriers',
    'Operating system: Android 12.0',
    'Cellular technology: 4G',
    'Memory storage capacity: 128 GB',
    'Connectivity technology: Bluetooth, Wi-Fi, USB, NFC',
    'Colour: Black',
    'Screen size: 6.3 Inches',
    'Wireless network technology: CDMA'],
   'Features': ['Explosion-proof,water-proof, dust-proof,shockproof,anti-rolling',
    '6.3" FHD 1080*2340 FHD+',
    '5000 mAh explosion-proof large battery',
    '(256+8)GB / (128+8)GB & Android 12.0 system',
    'Custom buttons and multi-function toolbox'],
   'Description': 'DORLAND 4G Ex Explosion-

In [109]:
mobile_dict.keys()

dict_keys(['Mobile Phones & Smartphones', 'Smartwatches', 'Mobile Phone Accessories'])

In [105]:
mobile_dict_copy

{'Mobile Phones & Smartphones': [{'Name': 'DORLAND Unicorn9 4G Rugged Phone, Unlocked Mobile Phone 5000mAh Explosion-Proof Smartphone 8GB+128GB IP68 6.3HD+Mobiles, 4G Dual SIM, 48MP Camera NFC OTG, GPS Android 12.0 Zone1/2',
   'Ratings': 'Rating not found',
   'Total Number of Ratings': 'Total Number of Ratings not found',
   'Price': 'S$1,233.00',
   'Information': ['Brand: DORLAND',
    'Model name: Unicorn 9',
    'Wireless service provider: Unlocked for All Carriers',
    'Operating system: Android 12.0',
    'Cellular technology: 4G',
    'Memory storage capacity: 128 GB',
    'Connectivity technology: Bluetooth, Wi-Fi, USB, NFC',
    'Colour: Black',
    'Screen size: 6.3 Inches',
    'Wireless network technology: CDMA'],
   'Features': ['Explosion-proof,water-proof, dust-proof,shockproof,anti-rolling',
    '6.3" FHD 1080*2340 FHD+',
    '5000 mAh explosion-proof large battery',
    '(256+8)GB / (128+8)GB & Android 12.0 system',
    'Custom buttons and multi-function toolbox'],


In [110]:
# save information to json file
write_to_json(mobile_dict, 'electronics_set/Mobile Phones & Communications.json')

True

In [111]:
# example usage to load json file
load_from_json('electronics_set/Mobile Phones & Communications.json')

{'Mobile Phones & Smartphones': [{'Name': 'DORLAND Unicorn9 4G Rugged Phone, Unlocked Mobile Phone 5000mAh Explosion-Proof Smartphone 8GB+128GB IP68 6.3HD+Mobiles, 4G Dual SIM, 48MP Camera NFC OTG, GPS Android 12.0 Zone1/2',
   'Ratings': 'NaN',
   'Total Number of Ratings': 'NaN',
   'Price': 'S$1,233.00',
   'Information': ['Brand: DORLAND',
    'Model name: Unicorn 9',
    'Wireless service provider: Unlocked for All Carriers',
    'Operating system: Android 12.0',
    'Cellular technology: 4G',
    'Memory storage capacity: 128 GB',
    'Connectivity technology: Bluetooth, Wi-Fi, USB, NFC',
    'Colour: Black',
    'Screen size: 6.3 Inches',
    'Wireless network technology: CDMA'],
   'Features': ['Explosion-proof,water-proof, dust-proof,shockproof,anti-rolling',
    '6.3" FHD 1080*2340 FHD+',
    '5000 mAh explosion-proof large battery',
    '(256+8)GB / (128+8)GB & Android 12.0 system',
    'Custom buttons and multi-function toolbox'],
   'Description': 'DORLAND 4G Ex Explosion-

### Computer and Accessories

In [117]:
# code to get urls 
driver = webdriver.Chrome()
temp_url = subcategory_url['Computers, Components & Accessories']
driver.get(temp_url)
time.sleep(5)
page_source = driver.page_source
soup = BeautifulSoup(page_source, 'html.parser')
computers_url_dict = {}
for sub_element in soup.find_all('a', class_ = "a-link-normal octopus-pc-category-card-v2-category-link"):
    computers_url_dict[sub_element['title']] = amazon_base_url + sub_element['href']
    print(sub_element['title'] + ": " + amazon_base_url + sub_element['href'])   
    
# quit the driver
driver.quit()

Laptops: https://www.amazon.sg/b/?_encoding=UTF8&node=6436117051&bbn=6314449051&ref_=Oct_d_odnav_d_6436071051_0&pd_rd_w=RY7kD&content-id=amzn1.sym.80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_p=80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_r=J31ED5DBQYWMJPN5H5P8&pd_rd_wg=9tgh7&pd_rd_r=4bc87205-7e53-4315-9d5f-ee13641bb999
Monitors: https://www.amazon.sg/b/?_encoding=UTF8&node=6436118051&bbn=6314449051&ref_=Oct_d_odnav_d_6436071051_1&pd_rd_w=RY7kD&content-id=amzn1.sym.80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_p=80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_r=J31ED5DBQYWMJPN5H5P8&pd_rd_wg=9tgh7&pd_rd_r=4bc87205-7e53-4315-9d5f-ee13641bb999
Desktops: https://www.amazon.sg/b/?_encoding=UTF8&node=6436116051&bbn=6314449051&ref_=Oct_d_odnav_d_6436071051_2&pd_rd_w=RY7kD&content-id=amzn1.sym.80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_p=80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_r=J31ED5DBQYWMJPN5H5P8&pd_rd_wg=9tgh7&pd_rd_r=4bc87205-7e53-4315-9d5f-ee13641bb999
Tablets: https://www.amazon.sg/b/?_encoding=

In [118]:
# code for Results Page 
driver = webdriver.Chrome()
computers_dict = {}
for value in tqdm(computers_url_dict.values(), desc = 'Subcategory Progress'): 
    sub_url = value
    driver.get(sub_url)
    time.sleep(3)
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'html.parser')
    
    div_tag = soup.find('div', class_='fst-h1-st')

    # extract the title of the Sub-Sub-Category
    if div_tag:
        title = div_tag.find('h1').get_text()

    # getting Pagination Link to obtain page where all results are accessible
    pagination_link_element = soup.find('a', id = 'apb-desktop-browse-search-see-all')

    if pagination_link_element:
        result_url = amazon_base_url + pagination_link_element.get('href')
    else:
        continue

    driver.get(result_url)
    time.sleep(3)
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'html.parser')
    
    # checking whether pagination button is available
    if (soup.find('a', class_ = 's-pagination-button') is None):
        product_containers = soup.find_all('div', class_='s-result-item')
        for product_tag in product_containers:
            try:
                # extract relevant information based on the structure of the HTML
                product_url = product_tag.find('a', class_='a-link-normal').get('href')
                driver.get(amazon_base_url + product_url)
                time.sleep(3)
                page_source = driver.page_source
                new_soup = BeautifulSoup(page_source, 'html.parser')
                title_element, rating_element, total_rating_element, price_element, description_element_list, product_features, description_element = product_information(new_soup)

                products.append(
                    {'Name': title_element, 'Ratings': rating_element, 'Total Number of Ratings': total_rating_element,
                    'Price': price_element, 'Information': description_element_list, 'Features': product_features,
                    'Description': description_element})
            except AttributeError as e:
                continue
    else:
        pagination_link = soup.find('a', class_ = 's-pagination-button').get('href')
        # code to extract all information about products from the first 5 pages (our threshold for this project)
        products = []
        for iter in tqdm(range(1, 6), desc="Page Progress"):  
            url = amazon_base_url + pagination_link[:-1] + str(iter)
            driver.get(url)
            time.sleep(3)
            page_source = driver.page_source
            soup = BeautifulSoup(page_source, 'html.parser')
            product_containers = soup.find_all('div', class_='s-result-item')
            for product_tag in tqdm(product_containers, desc="Product Progress"):
                try:
                    # extract relevant information based on the structure of the HTML
                    product_url = product_tag.find('a', class_='a-link-normal').get('href')
                    link_response = requests.get(amazon_base_url + product_url)
                    time.sleep(3)
                    new_soup = BeautifulSoup(link_response.text, 'html.parser')
                    title_element, rating_element, total_rating_element, price_element, description_element_list, product_features, description_element = product_information(new_soup)

                    products.append(
                        {'Name': title_element, 'Ratings': rating_element, 'Total Number of Ratings': total_rating_element,
                         'Price': price_element, 'Information': description_element_list, 'Features': product_features,
                         'Description': description_element, 'URL': amazon_base_url + product_url})
                except AttributeError as e:
                    continue
                    
    # add the results to the category_dict
    computers_dict[title] = products

# quit the driver
driver.quit()

Subcategory Progress:   0%|          | 0/11 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/5 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/5 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/5 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/5 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/5 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/5 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/30 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/30 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/30 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/30 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/30 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/5 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/5 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/5 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/5 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/30 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/30 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/30 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/30 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/30 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/5 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/29 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/29 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/29 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/29 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/29 [00:00<?, ?it/s]

In [119]:
# access the dictionary information
computers_dict

{'Laptops': [{'Name': 'NaN',
   'Ratings': 'NaN',
   'Total Number of Ratings': 'NaN',
   'Price': 'NaN',
   'Information': 'NaN',
   'Features': [],
   'Description': 'NaN',
   'URL': 'https://www.amazon.sg/sspa/click?ie=UTF8&spc=MTo3NTMzNTQwMjA1MDEyNjg4OjE3MDA0NzA0ODc6c3BfYXRmX2Jyb3dzZToxMzMyODkxNDk1OTg6OjA6Og&url=%2FDORLAND-Explosion-proof-Laptop-NB09S%2Fdp%2FB0C6LYSJ9Q%2Fref%3Dsr_1_25_sspa%3Fqid%3D1700470487%26s%3Delectronics%26sr%3D1-25-spons%26sp_csd%3Dd2lkZ2V0TmFtZT1zcF9hdGZfYnJvd3Nl%26psc%3D1'},
  {'Name': 'DORLAND EX NB07S PC, 11th Gen Intel Core i7-1165G7, 13.3" FHD Rugged Notebook, Industrial Explosion-proof Laptop, 16G RAM, 512G SSD, Wifi, Bluetooth, HDMI, Windows 11 (Home)',
   'Ratings': 'NaN',
   'Total Number of Ratings': 'NaN',
   'Price': 'S$5,978.00',
   'Information': ['Brand: DORLAND',
    'Model name: EX NB07S',
    'Screen size: 13.3 Inches',
    'Colour: Home',
    'Hard disk size: 256 GB',
    'CPU model: Core i7',
    'Installed RAM memory size: 16 GB',
    'O

In [131]:
temp_copy = computers_dict

In [132]:
temp_copy.keys()

dict_keys(['Laptops', 'Monitors', 'Desktop Computers', 'Tablets', 'Computer Data Storage', 'Computer Components', 'Computer Accessories', 'Printers & Accessories', 'Single-Board Computers & Accessories', 'Networking Devices', 'Scanners & Accessories'])

In [120]:
# save information to json file
write_to_json(computers_dict, 'electronics_set/Computers, Components & Accessories.json')

True

In [121]:
load_from_json('electronics_set/Computers, Components & Accessories.json')

{'Laptops': [{'Name': 'NaN',
   'Ratings': 'NaN',
   'Total Number of Ratings': 'NaN',
   'Price': 'NaN',
   'Information': 'NaN',
   'Features': [],
   'Description': 'NaN',
   'URL': 'https://www.amazon.sg/sspa/click?ie=UTF8&spc=MTo3NTMzNTQwMjA1MDEyNjg4OjE3MDA0NzA0ODc6c3BfYXRmX2Jyb3dzZToxMzMyODkxNDk1OTg6OjA6Og&url=%2FDORLAND-Explosion-proof-Laptop-NB09S%2Fdp%2FB0C6LYSJ9Q%2Fref%3Dsr_1_25_sspa%3Fqid%3D1700470487%26s%3Delectronics%26sr%3D1-25-spons%26sp_csd%3Dd2lkZ2V0TmFtZT1zcF9hdGZfYnJvd3Nl%26psc%3D1'},
  {'Name': 'DORLAND EX NB07S PC, 11th Gen Intel Core i7-1165G7, 13.3" FHD Rugged Notebook, Industrial Explosion-proof Laptop, 16G RAM, 512G SSD, Wifi, Bluetooth, HDMI, Windows 11 (Home)',
   'Ratings': 'NaN',
   'Total Number of Ratings': 'NaN',
   'Price': 'S$5,978.00',
   'Information': ['Brand: DORLAND',
    'Model name: EX NB07S',
    'Screen size: 13.3 Inches',
    'Colour: Home',
    'Hard disk size: 256 GB',
    'CPU model: Core i7',
    'Installed RAM memory size: 16 GB',
    'O

### Home Cinema, TV & Video

In [140]:
# code to get urls 
driver = webdriver.Chrome()
temp_url = subcategory_url['Home Cinema, TV & Video']
driver.get(temp_url)
time.sleep(5)
page_source = driver.page_source
soup = BeautifulSoup(page_source, 'html.parser')
home_url_dict = {}
for sub_element in soup.find_all('a', class_ = "a-link-normal octopus-pc-category-card-v2-category-link"):
    home_url_dict[sub_element['title']] = amazon_base_url + sub_element['href']
    print(sub_element['title'] + ": " + amazon_base_url + sub_element['href'])   
    
# quit the driver
driver.quit()

Projectors: https://www.amazon.sg/b/?_encoding=UTF8&node=6436138051&bbn=6314449051&ref_=Oct_d_odnav_d_6436073051_0&pd_rd_w=KFU86&content-id=amzn1.sym.80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_p=80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_r=D9B0EF4YP63YF9VPK0KK&pd_rd_wg=T1ZLM&pd_rd_r=b27399b9-fcf9-42ce-9ade-ca27806722cc
TVs: https://www.amazon.sg/b/?_encoding=UTF8&node=6436141051&bbn=6314449051&ref_=Oct_d_odnav_d_6436073051_1&pd_rd_w=KFU86&content-id=amzn1.sym.80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_p=80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_r=D9B0EF4YP63YF9VPK0KK&pd_rd_wg=T1ZLM&pd_rd_r=b27399b9-fcf9-42ce-9ade-ca27806722cc
Media Streaming Devices: https://www.amazon.sg/b/?_encoding=UTF8&node=6436127051&bbn=6314449051&ref_=Oct_d_odnav_d_6436073051_2&pd_rd_w=KFU86&content-id=amzn1.sym.80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_p=80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_r=D9B0EF4YP63YF9VPK0KK&pd_rd_wg=T1ZLM&pd_rd_r=b27399b9-fcf9-42ce-9ade-ca27806722cc
Accessories: https://www.amazon

In [141]:
# code for Results Page 
driver = webdriver.Chrome()
home_dict = {}
for value in tqdm(home_url_dict.values(), desc = 'Subcategory Progress'): 
    sub_url = value
    driver.get(sub_url)
    time.sleep(3)
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'html.parser')
    
    div_tag = soup.find('div', class_='fst-h1-st')

    # Within the <div> tag, find the <h1> tag and extract the title of the Sub-Category
    if div_tag:
        title = div_tag.find('h1').get_text()

    # Getting Pagination Link to obtain page where all results are accessible
    pagination_link_element = soup.find('a', id = 'apb-desktop-browse-search-see-all')

    if pagination_link_element:
        result_url = amazon_base_url + pagination_link_element.get('href')
    else:
        continue

    driver.get(result_url)
    time.sleep(3)
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'html.parser')
    
    # Checking whether pagination button is available
    if (soup.find('a', class_ = 's-pagination-button') is None):
        product_containers = soup.find_all('div', class_='s-result-item')
        for product_tag in product_containers:
            try:
                # extract relevant information based on the structure of the HTML
                product_url = product_tag.find('a', class_='a-link-normal').get('href')
                driver.get(amazon_base_url + product_url)
                time.sleep(3)
                page_source = driver.page_source
                new_soup = BeautifulSoup(page_source, 'html.parser')
                title_element, rating_element, total_rating_element, price_element, description_element_list, product_features, description_element = product_information(new_soup)

                products.append(
                    {'Name': title_element, 'Ratings': rating_element, 'Total Number of Ratings': total_rating_element,
                    'Price': price_element, 'Information': description_element_list, 'Features': product_features,
                    'Description': description_element})
            except AttributeError as e:
                continue
    else:
        pagination_link = soup.find('a', class_ = 's-pagination-button').get('href')
        # code to extract all information about products from the first 3 pages (our threshold for this project)
        products = []
        for iter in tqdm(range(1, 4), desc="Page Progress"):  
            url = amazon_base_url + pagination_link[:-1] + str(iter)
            driver.get(url)
            time.sleep(3)
            page_source = driver.page_source
            soup = BeautifulSoup(page_source, 'html.parser')
            product_containers = soup.find_all('div', class_='s-result-item')
            for product_tag in tqdm(product_containers, desc="Product Progress"):
                try:
                    # extract relevant information based on the structure of the HTML
                    product_url = product_tag.find('a', class_='a-link-normal').get('href')
                    link_response = requests.get(amazon_base_url + product_url)
                    time.sleep(3)
                    new_soup = BeautifulSoup(link_response.text, 'html.parser')
                    title_element, rating_element, total_rating_element, price_element, description_element_list, product_features, description_element = product_information(new_soup)

                    products.append(
                        {'Name': title_element, 'Ratings': rating_element, 'Total Number of Ratings': total_rating_element,
                         'Price': price_element, 'Information': description_element_list, 'Features': product_features,
                         'Description': description_element, 'URL': amazon_base_url + product_url})
                except AttributeError as e:
                    continue
                    
    # add the results to the category_dict
    home_dict[title] = products

# quit the driver
driver.quit()

Subcategory Progress:   0%|          | 0/9 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/3 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/3 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/29 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/29 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/29 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/3 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/3 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/3 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/3 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/3 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/3 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

In [145]:
# save information to json file
write_to_json(home_dict, 'electronics_set/Home Cinema, TV & Video.json')

True

### Headphones, Earbuds & Accessories

In [122]:
# code to get urls 
driver = webdriver.Chrome()
temp_url = subcategory_url['Headphones, Earbuds & Accessories']
driver.get(temp_url)
time.sleep(5)
page_source = driver.page_source
soup = BeautifulSoup(page_source, 'html.parser')
headphones_url_dict = {}
for sub_element in soup.find_all('a', class_ = "a-link-normal octopus-pc-category-card-v2-category-link"):
    headphones_url_dict[sub_element['title']] = amazon_base_url + sub_element['href']
    print(sub_element['title'] + ": " + amazon_base_url + sub_element['href'])   
    
# quit the driver
driver.quit()

Headphones & Earphones: https://www.amazon.sg/b/?_encoding=UTF8&node=6436190051&bbn=6314449051&ref_=Oct_d_odnav_d_6436081051_0&pd_rd_w=Jyfpx&content-id=amzn1.sym.80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_p=80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_r=C4QZ27SAA1B114A5XWSM&pd_rd_wg=edANQ&pd_rd_r=875c3ce7-0a7e-413f-8210-a9c96577d319
Cases: https://www.amazon.sg/b/?_encoding=UTF8&node=6436186051&bbn=6314449051&ref_=Oct_d_odnav_d_6436081051_1&pd_rd_w=Jyfpx&content-id=amzn1.sym.80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_p=80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_r=C4QZ27SAA1B114A5XWSM&pd_rd_wg=edANQ&pd_rd_r=875c3ce7-0a7e-413f-8210-a9c96577d319
Earpads: https://www.amazon.sg/b/?_encoding=UTF8&node=6436187051&bbn=6314449051&ref_=Oct_d_odnav_d_6436081051_2&pd_rd_w=Jyfpx&content-id=amzn1.sym.80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_p=80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_r=C4QZ27SAA1B114A5XWSM&pd_rd_wg=edANQ&pd_rd_r=875c3ce7-0a7e-413f-8210-a9c96577d319
Replacement Cables: https://www.a

In [128]:
# code for Results Page 
driver = webdriver.Chrome()
headphones_dict = {}
for value in tqdm(headphones_url_dict.values(), desc = 'Subcategory Progress'): 
    sub_url = value
    driver.get(sub_url)
    time.sleep(3)
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'html.parser')
    
    div_tag = soup.find('div', class_='fst-h1-st')

    # Within the <div> tag, find the <h1> tag and extract the title of the Sub-Category
    if div_tag:
        title = div_tag.find('h1').get_text()

    # Getting Pagination Link to obtain page where all results are accessible
    pagination_link_element = soup.find('a', id = 'apb-desktop-browse-search-see-all')

    if pagination_link_element:
        result_url = amazon_base_url + pagination_link_element.get('href')
    else:
        continue

    driver.get(result_url)
    time.sleep(3)
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'html.parser')
    
    # Checking whether pagination button is available
    if (soup.find('a', class_ = 's-pagination-button') is None):
        product_containers = soup.find_all('div', class_='s-result-item')
        for product_tag in product_containers:
            try:
                # extract relevant information based on the structure of the HTML
                product_url = product_tag.find('a', class_='a-link-normal').get('href')
                driver.get(amazon_base_url + product_url)
                time.sleep(3)
                page_source = driver.page_source
                new_soup = BeautifulSoup(page_source, 'html.parser')
                title_element, rating_element, total_rating_element, price_element, description_element_list, product_features, description_element = product_information(new_soup)

                products.append(
                    {'Name': title_element, 'Ratings': rating_element, 'Total Number of Ratings': total_rating_element,
                    'Price': price_element, 'Information': description_element_list, 'Features': product_features,
                    'Description': description_element})
            except AttributeError as e:
                continue
    else:
        pagination_link = soup.find('a', class_ = 's-pagination-button').get('href')
        # code to extract all information about products from the first 3 pages (our threshold for this project)
        products = []
        for iter in tqdm(range(1, 4), desc="Page Progress"):  
            url = amazon_base_url + pagination_link[:-1] + str(iter)
            driver.get(url)
            time.sleep(3)
            page_source = driver.page_source
            soup = BeautifulSoup(page_source, 'html.parser')
            product_containers = soup.find_all('div', class_='s-result-item')
            for product_tag in tqdm(product_containers, desc="Product Progress"):
                try:
                    # extract relevant information based on the structure of the HTML
                    product_url = product_tag.find('a', class_='a-link-normal').get('href')
                    link_response = requests.get(amazon_base_url + product_url)
                    time.sleep(3)
                    new_soup = BeautifulSoup(link_response.text, 'html.parser')
                    title_element, rating_element, total_rating_element, price_element, description_element_list, product_features, description_element = product_information(new_soup)

                    products.append(
                        {'Name': title_element, 'Ratings': rating_element, 'Total Number of Ratings': total_rating_element,
                         'Price': price_element, 'Information': description_element_list, 'Features': product_features,
                         'Description': description_element, 'URL': amazon_base_url + product_url})
                except AttributeError as e:
                    continue
                    
    # add the results to the category_dict
    headphones_dict[title] = products

# quit the driver
driver.quit()

Subcategory Progress:   0%|          | 0/10 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/3 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/3 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/3 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/3 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/3 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/3 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/31 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/31 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/31 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/3 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/3 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/3 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/3 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/27 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/27 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/27 [00:00<?, ?it/s]

In [129]:
headphones_dict

{'Headphones & Earphones': [{'Name': 'NaN',
   'Ratings': 'NaN',
   'Total Number of Ratings': 'NaN',
   'Price': 'NaN',
   'Information': 'NaN',
   'Features': [],
   'Description': 'NaN',
   'URL': 'https://www.amazon.sg/sspa/click?ie=UTF8&spc=MToxNzA2NTczOTQxMjE0Njg4OjE3MDA0Nzk5MzY6c3BfYXRmX2Jyb3dzZToxMjQxNDAwOTM0OTg6OjA6Og&url=%2FCancelling-Headphones-Headphone-Reduction-Office-Red%2Fdp%2FB0B6BLN5YY%2Fref%3Dsr_1_25_sspa%3Fqid%3D1700479936%26s%3Delectronics%26sr%3D1-25-spons%26sp_csd%3Dd2lkZ2V0TmFtZT1zcF9hdGZfYnJvd3Nl%26psc%3D1'},
  {'Name': 'pollini Bluetooth Headphones Wireless, 40H Playtime Foldable Over Ear Headphones with Microphone, Deep Bass Stereo Headset with Soft Memory-Protein Earmuffs for iPhone/Android Cell Phone/PC (White)',
   'Ratings': '4.3 out of 5 stars',
   'Total Number of Ratings': '11,031 ratings',
   'Price': 'S$35.19 with 38 percent savings',
   'Information': ['Brand: pollini',
    'Model name: TP 19',
    'Colour: White',
    'Form factor: Over Ear',
    '

In [130]:
# save information to json file
write_to_json(headphones_dict, 'electronics_set/Headphones, Earbuds & Accessories.json')

True

### Portable Sound & Vision

In [143]:
# code to get urls 
driver = webdriver.Chrome()
temp_url = subcategory_url['Portable Sound & Vision']
driver.get(temp_url)
time.sleep(5)
page_source = driver.page_source
soup = BeautifulSoup(page_source, 'html.parser')
sound_url_dict = {}
for sub_element in soup.find_all('a', class_ = "a-link-normal octopus-pc-category-card-v2-category-link"):
    sound_url_dict[sub_element['title']] = amazon_base_url + sub_element['href']
    print(sub_element['title'] + ": " + amazon_base_url + sub_element['href'])   
    
# quit the driver
driver.quit()

Shortwave Receivers: https://www.amazon.sg/b/?_encoding=UTF8&node=6436291051&bbn=6314449051&ref_=Oct_d_odnav_d_6436075051_0&pd_rd_w=N7vLX&content-id=amzn1.sym.80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_p=80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_r=HX45Z6CFGC086BMQGPVR&pd_rd_wg=Ce2eT&pd_rd_r=3d6b51ef-6ad9-4054-9f8d-27762ef1d282
MP3 & Digital Media Players: https://www.amazon.sg/b/?_encoding=UTF8&node=6436152051&bbn=6314449051&ref_=Oct_d_odnav_d_6436075051_1&pd_rd_w=N7vLX&content-id=amzn1.sym.80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_p=80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_r=HX45Z6CFGC086BMQGPVR&pd_rd_wg=Ce2eT&pd_rd_r=3d6b51ef-6ad9-4054-9f8d-27762ef1d282
Accessories: https://www.amazon.sg/b/?_encoding=UTF8&node=6436150051&bbn=6314449051&ref_=Oct_d_odnav_d_6436075051_2&pd_rd_w=N7vLX&content-id=amzn1.sym.80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_p=80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_r=HX45Z6CFGC086BMQGPVR&pd_rd_wg=Ce2eT&pd_rd_r=3d6b51ef-6ad9-4054-9f8d-27762ef1d282
Portable D

In [146]:
# code for Results Page 
driver = webdriver.Chrome()
sound_dict = {}
for value in tqdm(sound_url_dict.values(), desc = 'Subcategory Progress'): 
    sub_url = value
    driver.get(sub_url)
    time.sleep(3)
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'html.parser')
    
    div_tag = soup.find('div', class_='fst-h1-st')

    # Within the <div> tag, find the <h1> tag and extract the title of the Sub-Category
    if div_tag:
        title = div_tag.find('h1').get_text()

    # Getting Pagination Link to obtain page where all results are accessible
    pagination_link_element = soup.find('a', id = 'apb-desktop-browse-search-see-all')

    if pagination_link_element:
        result_url = amazon_base_url + pagination_link_element.get('href')
    else:
        continue

    driver.get(result_url)
    time.sleep(3)
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'html.parser')
    
    # Checking whether pagination button is available
    if (soup.find('a', class_ = 's-pagination-button') is None):
        product_containers = soup.find_all('div', class_='s-result-item')
        for product_tag in product_containers:
            try:
                # extract relevant information based on the structure of the HTML
                product_url = product_tag.find('a', class_='a-link-normal').get('href')
                driver.get(amazon_base_url + product_url)
                time.sleep(3)
                page_source = driver.page_source
                new_soup = BeautifulSoup(page_source, 'html.parser')
                title_element, rating_element, total_rating_element, price_element, description_element_list, product_features, description_element = product_information(new_soup)

                products.append(
                    {'Name': title_element, 'Ratings': rating_element, 'Total Number of Ratings': total_rating_element,
                    'Price': price_element, 'Information': description_element_list, 'Features': product_features,
                    'Description': description_element})
            except AttributeError as e:
                continue
    else:
        pagination_link = soup.find('a', class_ = 's-pagination-button').get('href')
        # code to extract all information about products from the first 3 pages (our threshold for this project)
        products = []
        for iter in tqdm(range(1, 4), desc="Page Progress"):  
            url = amazon_base_url + pagination_link[:-1] + str(iter)
            driver.get(url)
            time.sleep(3)
            page_source = driver.page_source
            soup = BeautifulSoup(page_source, 'html.parser')
            product_containers = soup.find_all('div', class_='s-result-item')
            for product_tag in tqdm(product_containers, desc="Product Progress"):
                try:
                    # extract relevant information based on the structure of the HTML
                    product_url = product_tag.find('a', class_='a-link-normal').get('href')
                    link_response = requests.get(amazon_base_url + product_url)
                    time.sleep(3)
                    new_soup = BeautifulSoup(link_response.text, 'html.parser')
                    title_element, rating_element, total_rating_element, price_element, description_element_list, product_features, description_element = product_information(new_soup)

                    products.append(
                        {'Name': title_element, 'Ratings': rating_element, 'Total Number of Ratings': total_rating_element,
                         'Price': price_element, 'Information': description_element_list, 'Features': product_features,
                         'Description': description_element, 'URL': amazon_base_url + product_url})
                except AttributeError as e:
                    continue
                    
    # add the results to the category_dict
    sound_dict[title] = products

# quit the driver
driver.quit()

Subcategory Progress:   0%|          | 0/5 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/3 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/30 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/30 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/30 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/3 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/3 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/3 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/30 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/30 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/30 [00:00<?, ?it/s]

In [147]:
# save information to json file
write_to_json(sound_dict, 'electronics_set/Portable Sound & Vision.json')

True

### Household Batteries & Chargers

In [148]:
# code to get urls 
driver = webdriver.Chrome()
temp_url = subcategory_url['Household Batteries & Chargers']
driver.get(temp_url)
time.sleep(5)
page_source = driver.page_source
soup = BeautifulSoup(page_source, 'html.parser')
household_url_dict = {}
for sub_element in soup.find_all('a', class_ = "a-link-normal octopus-pc-category-card-v2-category-link"):
    household_url_dict[sub_element['title']] = amazon_base_url + sub_element['href']
    print(sub_element['title'] + ": " + amazon_base_url + sub_element['href'])   
    
# quit the driver
driver.quit()

Disposable Batteries: https://www.amazon.sg/b/?_encoding=UTF8&node=6436086051&bbn=6314449051&ref_=Oct_d_odnav_d_6436067051_0&pd_rd_w=a62y4&content-id=amzn1.sym.80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_p=80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_r=ZDWD6WQE3KY8RP4W648A&pd_rd_wg=j0Ys3&pd_rd_r=0dbb9d90-e341-4e63-804f-f0e9c59e95a4
Rechargeable Batteries: https://www.amazon.sg/b/?_encoding=UTF8&node=6436087051&bbn=6314449051&ref_=Oct_d_odnav_d_6436067051_1&pd_rd_w=a62y4&content-id=amzn1.sym.80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_p=80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_r=ZDWD6WQE3KY8RP4W648A&pd_rd_wg=j0Ys3&pd_rd_r=0dbb9d90-e341-4e63-804f-f0e9c59e95a4
Battery Chargers: https://www.amazon.sg/b/?_encoding=UTF8&node=6436083051&bbn=6314449051&ref_=Oct_d_odnav_d_6436067051_2&pd_rd_w=a62y4&content-id=amzn1.sym.80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_p=80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_r=ZDWD6WQE3KY8RP4W648A&pd_rd_wg=j0Ys3&pd_rd_r=0dbb9d90-e341-4e63-804f-f0e9c59e95a4
Battery S

In [149]:
# code for Results Page 
driver = webdriver.Chrome()
household_dict = {}
for value in tqdm(household_url_dict.values(), desc = 'Subcategory Progress'): 
    sub_url = value
    driver.get(sub_url)
    time.sleep(3)
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'html.parser')
    
    div_tag = soup.find('div', class_='fst-h1-st')

    # Within the <div> tag, find the <h1> tag and extract the title of the Sub-Category
    if div_tag:
        title = div_tag.find('h1').get_text()

    # Getting Pagination Link to obtain page where all results are accessible
    pagination_link_element = soup.find('a', id = 'apb-desktop-browse-search-see-all')

    if pagination_link_element:
        result_url = amazon_base_url + pagination_link_element.get('href')
    else:
        continue

    driver.get(result_url)
    time.sleep(3)
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'html.parser')
    
    # Checking whether pagination button is available
    if (soup.find('a', class_ = 's-pagination-button') is None):
        product_containers = soup.find_all('div', class_='s-result-item')
        for product_tag in product_containers:
            try:
                # extract relevant information based on the structure of the HTML
                product_url = product_tag.find('a', class_='a-link-normal').get('href')
                driver.get(amazon_base_url + product_url)
                time.sleep(3)
                page_source = driver.page_source
                new_soup = BeautifulSoup(page_source, 'html.parser')
                title_element, rating_element, total_rating_element, price_element, description_element_list, product_features, description_element = product_information(new_soup)

                products.append(
                    {'Name': title_element, 'Ratings': rating_element, 'Total Number of Ratings': total_rating_element,
                    'Price': price_element, 'Information': description_element_list, 'Features': product_features,
                    'Description': description_element})
            except AttributeError as e:
                continue
    else:
        pagination_link = soup.find('a', class_ = 's-pagination-button').get('href')
        # code to extract all information about products from the first 3 pages (our threshold for this project)
        products = []
        for iter in tqdm(range(1, 5), desc="Page Progress"):  
            url = amazon_base_url + pagination_link[:-1] + str(iter)
            driver.get(url)
            time.sleep(3)
            page_source = driver.page_source
            soup = BeautifulSoup(page_source, 'html.parser')
            product_containers = soup.find_all('div', class_='s-result-item')
            for product_tag in tqdm(product_containers, desc="Product Progress"):
                try:
                    # extract relevant information based on the structure of the HTML
                    product_url = product_tag.find('a', class_='a-link-normal').get('href')
                    link_response = requests.get(amazon_base_url + product_url)
                    time.sleep(3)
                    new_soup = BeautifulSoup(link_response.text, 'html.parser')
                    title_element, rating_element, total_rating_element, price_element, description_element_list, product_features, description_element = product_information(new_soup)

                    products.append(
                        {'Name': title_element, 'Ratings': rating_element, 'Total Number of Ratings': total_rating_element,
                         'Price': price_element, 'Information': description_element_list, 'Features': product_features,
                         'Description': description_element, 'URL': amazon_base_url + product_url})
                except AttributeError as e:
                    continue
                    
    # add the results to the category_dict
    household_dict[title] = products

# quit the driver
driver.quit()

Subcategory Progress:   0%|          | 0/5 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/4 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/4 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/4 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/29 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/29 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/29 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/29 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/4 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/33 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/33 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/33 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/33 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/4 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

In [150]:
# save information to json file
write_to_json(household_dict, 'electronics_set/Household Batteries & Chargers.json')

True