In [1]:
import pandas as pd
import numpy as np
import pickle
import json
import re
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from tqdm.notebook import tqdm

# Creating PrettyPrinter Instance
import pprint
pp = pprint.PrettyPrinter(indent=2)

## Helper Functions

In [2]:
# Function to write data to a JSON file
def write_to_json(data, filename):
    """
    Serialize ``data`` as indented JSON into ``filename``.

    Args:
        data: Any JSON-serializable object (dictionary, list, etc.).
        filename (str): Path of the JSON file to create or overwrite.

    Returns:
        bool: True when the file was written, False when any exception occurred
        (the error is printed instead of being raised).
    """
    succeeded = False
    try:
        with open(filename, 'w') as out_handle:
            json.dump(data, out_handle, indent=4)
    except Exception as err:
        print(f"Error writing to JSON file: {err}")
    else:
        succeeded = True
    return succeeded

# Function to load data from a JSON file
def load_from_json(filename):
    """
    Read ``filename`` and return the decoded JSON content.

    Args:
        filename (str): Path of the JSON file to read.

    Returns:
        dict or list: The decoded data. An empty dictionary is returned when the
        file is missing or cannot be read/parsed (a message is printed in both cases).
    """
    try:
        with open(filename, 'r') as in_handle:
            return json.load(in_handle)
    except FileNotFoundError:
        print(f"JSON file '{filename}' not found. Returning an empty dictionary.")
    except Exception as err:
        print(f"Error loading data from JSON file: {err}")
    # Reached only when one of the handlers above ran
    return {}

In [3]:
# Function to load a pickle file
def load_pickle(file_path):
    """
    Read and return the object stored in a pickle file.

    :param file_path: The path to the pickle file to read.
    :return: The unpickled object.
    """
    # NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
    with open(file_path, 'rb') as source:
        return pickle.load(source)

def write_pickle(data, file_path):
    """
    Pickle ``data`` into ``file_path``.

    Failures are reported with a printed message instead of being raised.

    :param data: The object to serialize.
    :param file_path: The path of the pickle file to create or overwrite.
    """
    try:
        with open(file_path, 'wb') as sink:
            pickle.dump(data, sink)
    except Exception as err:
        print(f'Error writing to {file_path}: {err}')
    return None

In [4]:
# Function to extract raw data from Amazon Website
def product_information(soup):
    """
    Pull the raw product fields out of a parsed product page.

    Every field is looked up independently; when a lookup hits a missing element
    (AttributeError) that field becomes the string 'NaN'.

    :param soup: Parsed page exposing ``find`` / ``find_all``.
    :return: Tuple of (title, rating, total ratings, price, "label: value" rows of
        the overview table, feature bullet texts, description text).
    """
    def _attempt(getter):
        # Run one lookup and map "element not found" onto the 'NaN' placeholder
        try:
            return getter()
        except AttributeError:
            return 'NaN'

    def _overview_rows():
        # One "label: value" string per table row; a single malformed row raises
        # AttributeError and therefore turns the whole field into 'NaN'
        rows = soup.find('table', class_="a-normal a-spacing-micro").find_all('tr')
        return [
            row.find('td', class_='a-span3').text.strip() + ': ' + row.find('td', class_='a-span9').text.strip()
            for row in rows
        ]

    title_element = _attempt(lambda: soup.find('span', id='productTitle').text.strip())
    rating_element = _attempt(lambda: soup.find('span', id='acrPopover').get('title'))
    total_rating_element = _attempt(lambda: soup.find('span', id="acrCustomerReviewText").text)
    price_element = _attempt(lambda: soup.find('span', class_="aok-offscreen").text.strip())
    description_element_list = _attempt(_overview_rows)
    product_features = _attempt(
        lambda: [bullet.text.strip() for bullet in soup.find_all('li', class_="a-spacing-mini")]
    )
    description_element = _attempt(lambda: soup.find('div', id='productDescription').text.strip())

    return (
        title_element,
        rating_element,
        total_rating_element,
        price_element,
        description_element_list,
        product_features,
        description_element,
    )


In [5]:
# Function to extract reviews from Amazon Website
def review_information(soup):
    """
    Extract every customer review found on a parsed Amazon page.

    Each review is printed (as before) and additionally collected, so callers can
    store the reviews instead of only seeing them on screen.

    :param soup: Parsed page exposing ``select``; each review element must expose
        ``select_one``.
    :return: List with one dict per review, using the keys 'Author', 'Rating',
        'Title', 'Content', 'Date' and 'Verified'. A value is None when the
        corresponding element is missing. The list is empty when the page has no
        reviews.
    """
    scraped_reviews = []
    review_elements = soup.select("div.review")
    if not review_elements:
        print('No Reviews Available')
        return scraped_reviews

    def _text_of(element):
        # Text of an optional element; None when the element was not found
        return element.text if element else None

    for review in review_elements:
        r_author = _text_of(review.select_one("span.a-profile-name"))

        r_rating_element = review.select_one("i.review-rating")
        r_rating = r_rating_element.text.replace("out of 5 stars", "") if r_rating_element else None

        # The title text is read from the class-less <span> inside the title link
        r_title_element = review.select_one("a.review-title")
        r_title_span_element = r_title_element.select_one("span:not([class])") if r_title_element else None
        r_title = _text_of(r_title_span_element)

        r_content = _text_of(review.select_one("span.review-text"))
        r_date = _text_of(review.select_one("span.review-date"))
        r_verified = _text_of(review.select_one("span.a-size-mini"))

        print(r_author,r_rating,r_title,r_content,r_date,r_verified)
        print('\n')

        scraped_reviews.append(
            {'Author': r_author, 'Rating': r_rating, 'Title': r_title,
             'Content': r_content, 'Date': r_date, 'Verified': r_verified})

    return scraped_reviews

## Scraping Information from Electronics Categories

In [8]:
# electronics url
url = 'https://www.amazon.sg/Buy-Electronics-Online/b/?ie=UTF8&node=6314449051&ref_=nav_cs_electronics'
amazon_base_url = 'https://www.amazon.sg'

# Load the Electronics landing page in Chrome and keep only its HTML
driver = webdriver.Chrome()
try:
    driver.get(url)
    time.sleep(5)  # fixed wait before reading the page source
    page_source = driver.page_source
finally:
    # The cells below start their own browser, so this one is closed here
    # (even on failure) instead of being left running.
    driver.quit()

soup = BeautifulSoup(page_source, 'html.parser')

In [9]:
# getting the links to all subcategories
# Maps the title of each category card to its absolute URL and echoes every pair
subcategory_url = {}
subcategory_container = soup.find_all('a', class_ = "a-link-normal octopus-pc-category-card-v2-category-link")
for element in subcategory_container:
    category_link = amazon_base_url + element['href']
    category_name = element['title']
    subcategory_url[category_name] = category_link
    print(category_name + ": " + category_link)

Computers, Components & Accessories: https://www.amazon.sg/b/?_encoding=UTF8&node=6436071051&bbn=6314449051&ref_=Oct_d_odnav_d_6314449051_0&pd_rd_w=mbVUC&content-id=amzn1.sym.80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_p=80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_r=8K3PSJWPXD47525QNH98&pd_rd_wg=7qtkZ&pd_rd_r=a31e7f05-ef1c-4e04-9214-75ae54b5406c
Mobile Phones & Communication: https://www.amazon.sg/b/?_encoding=UTF8&node=6436074051&bbn=6314449051&ref_=Oct_d_odnav_d_6314449051_1&pd_rd_w=mbVUC&content-id=amzn1.sym.80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_p=80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_r=8K3PSJWPXD47525QNH98&pd_rd_wg=7qtkZ&pd_rd_r=a31e7f05-ef1c-4e04-9214-75ae54b5406c
Home Cinema, TV & Video: https://www.amazon.sg/b/?_encoding=UTF8&node=6436073051&bbn=6314449051&ref_=Oct_d_odnav_d_6314449051_2&pd_rd_w=mbVUC&content-id=amzn1.sym.80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_p=80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_r=8K3PSJWPXD47525QNH98&pd_rd_wg=7qtkZ&pd_rd_r=a31e7f05-ef1c-4e0

## Extract Subcategories Information

### Car & Vehicle Electronics

In [23]:
# code to get urls
# Look the category up before starting Chrome so that a missing key cannot
# leave a browser process running.
temp_url = subcategory_url['Car & Vehicle Electronics']

driver = webdriver.Chrome()
try:
    driver.get(temp_url)
    time.sleep(5)
    page_source = driver.page_source
finally:
    # quit the driver, also when loading the page failed
    driver.quit()

# Map every sub-subcategory title to its absolute URL
soup = BeautifulSoup(page_source, 'html.parser')
car_url_dict = {}
for sub_element in soup.find_all('a', class_ = "a-link-normal octopus-pc-category-card-v2-category-link"):
    car_url_dict[sub_element['title']] = amazon_base_url + sub_element['href']
    print(sub_element['title'] + ": " + amazon_base_url + sub_element['href'])

Accessories: https://www.amazon.sg/b/?_encoding=UTF8&node=6436107051&bbn=6314449051&ref_=Oct_d_odnav_d_6436070051_0&pd_rd_w=UtqlE&content-id=amzn1.sym.80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_p=80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_r=K6SZSN1SSMZBDWVNYTA9&pd_rd_wg=IoeKY&pd_rd_r=8f302d09-9135-4c3f-822d-6676cdbb9fcd
Car Electronics: https://www.amazon.sg/b/?_encoding=UTF8&node=6436109051&bbn=6314449051&ref_=Oct_d_odnav_d_6436070051_1&pd_rd_w=UtqlE&content-id=amzn1.sym.80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_p=80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_r=K6SZSN1SSMZBDWVNYTA9&pd_rd_wg=IoeKY&pd_rd_r=8f302d09-9135-4c3f-822d-6676cdbb9fcd
Motorcycle Electronics: https://www.amazon.sg/b/?_encoding=UTF8&node=6436112051&bbn=6314449051&ref_=Oct_d_odnav_d_6436070051_2&pd_rd_w=UtqlE&content-id=amzn1.sym.80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_p=80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_r=K6SZSN1SSMZBDWVNYTA9&pd_rd_wg=IoeKY&pd_rd_r=8f302d09-9135-4c3f-822d-6676cdbb9fcd


In [24]:
# code for Results Page
# For every sub-subcategory: open its landing page, follow the "see all results"
# link and collect the details of each product listed on the result pages.
driver = webdriver.Chrome()
car_dict = {}
# Keys of a product record, in the order product_information() returns the values
product_fields = ('Name', 'Ratings', 'Total Number of Ratings', 'Price',
                  'Information', 'Features', 'Description')
try:
    for sub_name, sub_url in tqdm(car_url_dict.items(), desc = 'Subcategory Progress'):
        driver.get(sub_url)
        time.sleep(3)
        page_source = driver.page_source
        soup = BeautifulSoup(page_source, 'html.parser')

        # extract the title of the Sub-Sub-Category; when the heading is missing fall
        # back to the name taken from the category link instead of reusing the title
        # left over from the previous iteration
        div_tag = soup.find('div', class_='fst-h1-st')
        heading = div_tag.find('h1') if div_tag else None
        title = heading.get_text() if heading else sub_name

        # getting Pagination Link to obtain page where all results are accessible
        pagination_link_element = soup.find('a', id = 'apb-desktop-browse-search-see-all')
        if not pagination_link_element:
            continue
        result_url = amazon_base_url + pagination_link_element.get('href')

        driver.get(result_url)
        time.sleep(3)
        page_source = driver.page_source
        soup = BeautifulSoup(page_source, 'html.parser')

        # every subcategory starts with its own empty list, whichever branch runs below
        products = []

        # checking whether pagination button is available
        pagination_button = soup.find('a', class_ = 's-pagination-button')
        if pagination_button is None:
            # single page of results: open every product in the browser
            product_containers = soup.find_all('div', class_='s-result-item')
            for product_tag in product_containers:
                # extract relevant information based on the structure of the HTML
                link_tag = product_tag.find('a', class_='a-link-normal')
                product_url = link_tag.get('href') if link_tag else None
                if not product_url:
                    # result containers without a product link are skipped
                    continue
                driver.get(amazon_base_url + product_url)
                time.sleep(3)
                page_source = driver.page_source
                new_soup = BeautifulSoup(page_source, 'html.parser')

                record = dict(zip(product_fields, product_information(new_soup)))
                record['URL'] = amazon_base_url + product_url
                products.append(record)
        else:
            pagination_link = pagination_button.get('href')
            # code to extract all information about products from the first 5 pages (our threshold for this project)
            for page_number in tqdm(range(1, 6), desc="Page Progress"):
                # The page number is written into the last character of the href and,
                # when present, into a `page=<n>` query parameter.
                # NOTE(review): presumably Amazon result links carry the page in `page=`;
                # verify against a real href.
                page_link = re.sub(r'([?&]page=)\d+',
                                   lambda match: match.group(1) + str(page_number),
                                   pagination_link)
                page_url = amazon_base_url + page_link[:-1] + str(page_number)
                driver.get(page_url)
                time.sleep(3)
                page_source = driver.page_source
                soup = BeautifulSoup(page_source, 'html.parser')
                product_containers = soup.find_all('div', class_='s-result-item')
                for product_tag in tqdm(product_containers, desc="Product Progress"):
                    # extract relevant information based on the structure of the HTML
                    link_tag = product_tag.find('a', class_='a-link-normal')
                    product_url = link_tag.get('href') if link_tag else None
                    if not product_url:
                        continue
                    # NOTE(review): product pages are fetched with plain `requests` here
                    # but with the browser in the single-page branch; confirm both
                    # return the same markup.
                    try:
                        link_response = requests.get(amazon_base_url + product_url, timeout=30)
                    except requests.RequestException as e:
                        # one unreachable product must not abort the whole crawl
                        print(f"Skipping {amazon_base_url + product_url}: {e}")
                        continue
                    time.sleep(3)
                    new_soup = BeautifulSoup(link_response.text, 'html.parser')

                    record = dict(zip(product_fields, product_information(new_soup)))
                    record['URL'] = amazon_base_url + product_url
                    products.append(record)

        # add the results to the category_dict
        car_dict[title] = products
finally:
    # quit the driver, also when the crawl was interrupted by an error
    driver.quit()

Subcategory Progress:   0%|          | 0/3 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/5 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/30 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/30 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/30 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/30 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/30 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/5 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/5 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/30 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/30 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/30 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/30 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/30 [00:00<?, ?it/s]

In [26]:
# save information to json file
car_json_path = 'electronics_set/Car & Vehicle Electronics.json'
write_to_json(car_dict, car_json_path)

True

### Tablets

In [17]:
# Look the category up before starting Chrome so that a missing key cannot
# leave a browser process running.
temp_url = subcategory_url['Tablets']

driver = webdriver.Chrome()
# Start from an empty list so no products from an earlier cell can leak in
products = []
# Keys of a product record, in the order product_information() returns the values
product_fields = ('Name', 'Ratings', 'Total Number of Ratings', 'Price',
                  'Information', 'Features', 'Description')
try:
    driver.get(temp_url)
    time.sleep(5)
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'html.parser')

    # there are no further subcategories for tablets, hence proceed to scrape products
    # Getting Pagination Link to obtain page where all results are accessible
    pagination_link_element = soup.find('a', id = 'apb-desktop-browse-search-see-all')

    if not pagination_link_element:
        # Without the link there is no result page to open; do not fall back to a
        # result_url left over from another cell.
        print("No 'see all results' link found for Tablets; nothing scraped.")
    else:
        result_url = amazon_base_url + pagination_link_element.get('href')

        driver.get(result_url)
        time.sleep(3)
        page_source = driver.page_source
        soup = BeautifulSoup(page_source, 'html.parser')

        # Checking whether pagination button is available
        pagination_button = soup.find('a', class_ = 's-pagination-button')
        if pagination_button is None:
            # single page of results: open every product in the browser
            product_containers = soup.find_all('div', class_='s-result-item')
            for product_tag in product_containers:
                # extract relevant information based on the structure of the HTML
                link_tag = product_tag.find('a', class_='a-link-normal')
                product_url = link_tag.get('href') if link_tag else None
                if not product_url:
                    # result containers without a product link are skipped
                    continue
                driver.get(amazon_base_url + product_url)
                time.sleep(3)
                page_source = driver.page_source
                new_soup = BeautifulSoup(page_source, 'html.parser')

                record = dict(zip(product_fields, product_information(new_soup)))
                record['URL'] = amazon_base_url + product_url
                products.append(record)
        else:
            pagination_link = pagination_button.get('href')
            # code to extract all information about products from the first 5 pages (our threshold for this project)
            for page_number in tqdm(range(1, 6), desc="Page Progress"):
                # The page number is written into the last character of the href and,
                # when present, into a `page=<n>` query parameter.
                # NOTE(review): presumably Amazon result links carry the page in `page=`;
                # verify against a real href.
                page_link = re.sub(r'([?&]page=)\d+',
                                   lambda match: match.group(1) + str(page_number),
                                   pagination_link)
                page_url = amazon_base_url + page_link[:-1] + str(page_number)
                driver.get(page_url)
                time.sleep(3)
                page_source = driver.page_source
                soup = BeautifulSoup(page_source, 'html.parser')
                product_containers = soup.find_all('div', class_='s-result-item')
                for product_tag in tqdm(product_containers, desc="Product Progress"):
                    # extract relevant information based on the structure of the HTML
                    link_tag = product_tag.find('a', class_='a-link-normal')
                    product_url = link_tag.get('href') if link_tag else None
                    if not product_url:
                        continue
                    # NOTE(review): product pages are fetched with plain `requests` here
                    # but with the browser in the single-page branch; confirm both
                    # return the same markup.
                    try:
                        link_response = requests.get(amazon_base_url + product_url, timeout=30)
                    except requests.RequestException as e:
                        # one unreachable product must not abort the whole crawl
                        print(f"Skipping {amazon_base_url + product_url}: {e}")
                        continue
                    time.sleep(3)
                    new_soup = BeautifulSoup(link_response.text, 'html.parser')

                    record = dict(zip(product_fields, product_information(new_soup)))
                    record['URL'] = amazon_base_url + product_url
                    products.append(record)
finally:
    # quit the driver, also when the crawl was interrupted by an error
    driver.quit()

# add the results to the category_dict
# (a flat list of tablet products; the name `headphones_dict` is kept because the
# save cell for Tablets.json reads it)
headphones_dict = products

Page Progress:   0%|          | 0/5 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

In [19]:
# save information to json file
tablets_json_path = 'electronics_set/Tablets.json'
write_to_json(headphones_dict, tablets_json_path)

True

### Hi-Fi & Home Audio

In [20]:
# code to get urls
# Look the category up before starting Chrome so that a missing key cannot
# leave a browser process running.
temp_url = subcategory_url['Hi-Fi & Home Audio']

driver = webdriver.Chrome()
try:
    driver.get(temp_url)
    time.sleep(5)
    page_source = driver.page_source
finally:
    # quit the driver, also when loading the page failed
    driver.quit()

# Map every sub-subcategory title to its absolute URL
soup = BeautifulSoup(page_source, 'html.parser')
home_url_dict = {}
for sub_element in soup.find_all('a', class_ = "a-link-normal octopus-pc-category-card-v2-category-link"):
    home_url_dict[sub_element['title']] = amazon_base_url + sub_element['href']
    print(sub_element['title'] + ": " + amazon_base_url + sub_element['href'])

Speakers: https://www.amazon.sg/b/?_encoding=UTF8&node=6436131051&bbn=6314449051&ref_=Oct_d_odnav_d_6436072051_0&pd_rd_w=4wFy1&content-id=amzn1.sym.80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_p=80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_r=HW85J0DHKKVQACSG2M5A&pd_rd_wg=fhWoJ&pd_rd_r=2b31d3ff-e2d4-4077-b3f0-b92744817174
Media Streaming Devices: https://www.amazon.sg/b/?_encoding=UTF8&node=6436127051&bbn=6314449051&ref_=Oct_d_odnav_d_6436072051_1&pd_rd_w=4wFy1&content-id=amzn1.sym.80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_p=80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_r=HW85J0DHKKVQACSG2M5A&pd_rd_wg=fhWoJ&pd_rd_r=2b31d3ff-e2d4-4077-b3f0-b92744817174
Receivers & Separates: https://www.amazon.sg/b/?_encoding=UTF8&node=6436129051&bbn=6314449051&ref_=Oct_d_odnav_d_6436072051_2&pd_rd_w=4wFy1&content-id=amzn1.sym.80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_p=80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_r=HW85J0DHKKVQACSG2M5A&pd_rd_wg=fhWoJ&pd_rd_r=2b31d3ff-e2d4-4077-b3f0-b92744817174
Accessories: ht

In [21]:
# code for Results Page
# For every sub-subcategory: open its landing page, follow the "see all results"
# link and collect the details of each product listed on the result pages.
driver = webdriver.Chrome()
home_dict = {}
# Keys of a product record, in the order product_information() returns the values
product_fields = ('Name', 'Ratings', 'Total Number of Ratings', 'Price',
                  'Information', 'Features', 'Description')
try:
    for sub_name, sub_url in tqdm(home_url_dict.items(), desc = 'Subcategory Progress'):
        driver.get(sub_url)
        time.sleep(3)
        page_source = driver.page_source
        soup = BeautifulSoup(page_source, 'html.parser')

        # extract the title of the Sub-Sub-Category; when the heading is missing fall
        # back to the name taken from the category link instead of reusing the title
        # left over from the previous iteration
        div_tag = soup.find('div', class_='fst-h1-st')
        heading = div_tag.find('h1') if div_tag else None
        title = heading.get_text() if heading else sub_name

        # getting Pagination Link to obtain page where all results are accessible
        pagination_link_element = soup.find('a', id = 'apb-desktop-browse-search-see-all')
        if not pagination_link_element:
            continue
        result_url = amazon_base_url + pagination_link_element.get('href')

        driver.get(result_url)
        time.sleep(3)
        page_source = driver.page_source
        soup = BeautifulSoup(page_source, 'html.parser')

        # every subcategory starts with its own empty list, whichever branch runs below
        products = []

        # checking whether pagination button is available
        pagination_button = soup.find('a', class_ = 's-pagination-button')
        if pagination_button is None:
            # single page of results: open every product in the browser
            product_containers = soup.find_all('div', class_='s-result-item')
            for product_tag in product_containers:
                # extract relevant information based on the structure of the HTML
                link_tag = product_tag.find('a', class_='a-link-normal')
                product_url = link_tag.get('href') if link_tag else None
                if not product_url:
                    # result containers without a product link are skipped
                    continue
                driver.get(amazon_base_url + product_url)
                time.sleep(3)
                page_source = driver.page_source
                new_soup = BeautifulSoup(page_source, 'html.parser')

                record = dict(zip(product_fields, product_information(new_soup)))
                record['URL'] = amazon_base_url + product_url
                products.append(record)
        else:
            pagination_link = pagination_button.get('href')
            # code to extract all information about products from the first 4 pages (range(1, 5))
            for page_number in tqdm(range(1, 5), desc="Page Progress"):
                # The page number is written into the last character of the href and,
                # when present, into a `page=<n>` query parameter.
                # NOTE(review): presumably Amazon result links carry the page in `page=`;
                # verify against a real href.
                page_link = re.sub(r'([?&]page=)\d+',
                                   lambda match: match.group(1) + str(page_number),
                                   pagination_link)
                page_url = amazon_base_url + page_link[:-1] + str(page_number)
                driver.get(page_url)
                time.sleep(3)
                page_source = driver.page_source
                soup = BeautifulSoup(page_source, 'html.parser')
                product_containers = soup.find_all('div', class_='s-result-item')
                for product_tag in tqdm(product_containers, desc="Product Progress"):
                    # extract relevant information based on the structure of the HTML
                    link_tag = product_tag.find('a', class_='a-link-normal')
                    product_url = link_tag.get('href') if link_tag else None
                    if not product_url:
                        continue
                    # NOTE(review): product pages are fetched with plain `requests` here
                    # but with the browser in the single-page branch; confirm both
                    # return the same markup.
                    try:
                        link_response = requests.get(amazon_base_url + product_url, timeout=30)
                    except requests.RequestException as e:
                        # one unreachable product must not abort the whole crawl
                        print(f"Skipping {amazon_base_url + product_url}: {e}")
                        continue
                    time.sleep(3)
                    new_soup = BeautifulSoup(link_response.text, 'html.parser')

                    record = dict(zip(product_fields, product_information(new_soup)))
                    record['URL'] = amazon_base_url + product_url
                    products.append(record)

        # add the results to the category_dict
        home_dict[title] = products
finally:
    # quit the driver, also when the crawl was interrupted by an error
    driver.quit()

Subcategory Progress:   0%|          | 0/8 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/4 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/34 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/34 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/34 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/34 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/4 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/4 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/4 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/29 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/29 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/29 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/29 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/4 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/4 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/4 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/4 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

In [27]:
# save information to json file
home_json_path = 'electronics_set/Hi-Fi & Home Audio.json'
write_to_json(home_dict, home_json_path)

True

### Camera & Photo

In [28]:
# code to get urls
driver = webdriver.Chrome()
temp_url = subcategory_url['Camera & Photo']
driver.get(temp_url)
time.sleep(5)
page_source = driver.page_source
soup = BeautifulSoup(page_source, 'html.parser')
camera_url_dict = {}
for sub_element in soup.find_all('a', class_ = "a-link-normal octopus-pc-category-card-v2-category-link"):
    camera_url_dict[sub_element['title']] = amazon_base_url + sub_element['href']
    print(sub_element['title'] + ": " + amazon_base_url + sub_element['href'])   
    
# quit the driver
driver.quit()

Video Projectors: https://www.amazon.sg/b/?_encoding=UTF8&node=6436138051&bbn=6314449051&ref_=Oct_d_odnav_d_6436069051_0&pd_rd_w=I9jmH&content-id=amzn1.sym.80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_p=80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_r=C1B1XVKY0KX4SV9W8KRQ&pd_rd_wg=U51wn&pd_rd_r=dafd1816-7c27-4a4e-a5b8-95dea71b2ccc
Lenses: https://www.amazon.sg/b/?_encoding=UTF8&node=6436101051&bbn=6314449051&ref_=Oct_d_odnav_d_6436069051_1&pd_rd_w=I9jmH&content-id=amzn1.sym.80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_p=80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_r=C1B1XVKY0KX4SV9W8KRQ&pd_rd_wg=U51wn&pd_rd_r=dafd1816-7c27-4a4e-a5b8-95dea71b2ccc
Surveillance Cameras: https://www.amazon.sg/b/?_encoding=UTF8&node=6436103051&bbn=6314449051&ref_=Oct_d_odnav_d_6436069051_2&pd_rd_w=I9jmH&content-id=amzn1.sym.80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_p=80833b11-ffca-4e6a-8a04-ea9e915d944b&pf_rd_r=C1B1XVKY0KX4SV9W8KRQ&pd_rd_wg=U51wn&pd_rd_r=dafd1816-7c27-4a4e-a5b8-95dea71b2ccc
Digital Picture Frames: h

In [29]:
# code for Results Page
# For every sub-subcategory: open its landing page, follow the "see all results"
# link and collect the details of each product listed on the result pages.
driver = webdriver.Chrome()
camera_dict = {}
# Keys of a product record, in the order product_information() returns the values
product_fields = ('Name', 'Ratings', 'Total Number of Ratings', 'Price',
                  'Information', 'Features', 'Description')
try:
    for sub_name, sub_url in tqdm(camera_url_dict.items(), desc = 'Subcategory Progress'):
        driver.get(sub_url)
        time.sleep(3)
        page_source = driver.page_source
        soup = BeautifulSoup(page_source, 'html.parser')

        # extract the title of the Sub-Sub-Category; when the heading is missing fall
        # back to the name taken from the category link instead of reusing the title
        # left over from the previous iteration
        div_tag = soup.find('div', class_='fst-h1-st')
        heading = div_tag.find('h1') if div_tag else None
        title = heading.get_text() if heading else sub_name

        # getting Pagination Link to obtain page where all results are accessible
        pagination_link_element = soup.find('a', id = 'apb-desktop-browse-search-see-all')
        if not pagination_link_element:
            continue
        result_url = amazon_base_url + pagination_link_element.get('href')

        driver.get(result_url)
        time.sleep(3)
        page_source = driver.page_source
        soup = BeautifulSoup(page_source, 'html.parser')

        # every subcategory starts with its own empty list, whichever branch runs below
        products = []

        # checking whether pagination button is available
        pagination_button = soup.find('a', class_ = 's-pagination-button')
        if pagination_button is None:
            # single page of results: open every product in the browser
            product_containers = soup.find_all('div', class_='s-result-item')
            for product_tag in product_containers:
                # extract relevant information based on the structure of the HTML
                link_tag = product_tag.find('a', class_='a-link-normal')
                product_url = link_tag.get('href') if link_tag else None
                if not product_url:
                    # result containers without a product link are skipped
                    continue
                driver.get(amazon_base_url + product_url)
                time.sleep(3)
                page_source = driver.page_source
                new_soup = BeautifulSoup(page_source, 'html.parser')

                record = dict(zip(product_fields, product_information(new_soup)))
                record['URL'] = amazon_base_url + product_url
                products.append(record)
        else:
            pagination_link = pagination_button.get('href')
            # code to extract all information about products from the first 3 pages (range(1, 4))
            for page_number in tqdm(range(1, 4), desc="Page Progress"):
                # The page number is written into the last character of the href and,
                # when present, into a `page=<n>` query parameter.
                # NOTE(review): presumably Amazon result links carry the page in `page=`;
                # verify against a real href.
                page_link = re.sub(r'([?&]page=)\d+',
                                   lambda match: match.group(1) + str(page_number),
                                   pagination_link)
                page_url = amazon_base_url + page_link[:-1] + str(page_number)
                driver.get(page_url)
                time.sleep(3)
                page_source = driver.page_source
                soup = BeautifulSoup(page_source, 'html.parser')
                product_containers = soup.find_all('div', class_='s-result-item')
                for product_tag in tqdm(product_containers, desc="Product Progress"):
                    # extract relevant information based on the structure of the HTML
                    link_tag = product_tag.find('a', class_='a-link-normal')
                    product_url = link_tag.get('href') if link_tag else None
                    if not product_url:
                        continue
                    # NOTE(review): product pages are fetched with plain `requests` here
                    # but with the browser in the single-page branch; confirm both
                    # return the same markup.
                    try:
                        link_response = requests.get(amazon_base_url + product_url, timeout=30)
                    except requests.RequestException as e:
                        # one unreachable product must not abort the whole crawl
                        print(f"Skipping {amazon_base_url + product_url}: {e}")
                        continue
                    time.sleep(3)
                    new_soup = BeautifulSoup(link_response.text, 'html.parser')

                    record = dict(zip(product_fields, product_information(new_soup)))
                    record['URL'] = amazon_base_url + product_url
                    products.append(record)

        # add the results to the category_dict
        camera_dict[title] = products
finally:
    # quit the driver, also when the crawl was interrupted by an error
    driver.quit()

Subcategory Progress:   0%|          | 0/13 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/3 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/3 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/28 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/3 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/3 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/3 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/3 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/3 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/3 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/37 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/3 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/32 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/32 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/32 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/3 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/29 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/29 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/29 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/3 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/29 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/29 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/29 [00:00<?, ?it/s]

Page Progress:   0%|          | 0/3 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/30 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/30 [00:00<?, ?it/s]

Product Progress:   0%|          | 0/30 [00:00<?, ?it/s]

In [30]:
# save information to json file
camera_json_path = 'electronics_set/Camera & Photo.json'
write_to_json(camera_dict, camera_json_path)

True