# Contents  
1. [Extraction](#1)     
2. [Exploratory Data Analysis (EDA)](#2) 
3. [Transformation & Analysis](#3) 
    1. [Age Category Column (Optional)](#3.1) 
    2. [Risk Group Category Column (Optional)](#3.2) 
    3. [Multi Risk Factors Category Column (Optional)](#3.3)
4. [Data Visualization](#4)
5. [Prediction](#5)


In [None]:
!pip3 install selenium

<a id="1"></a>
# 1. Extraction

In [28]:
import csv
import time

from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Scrape the first product of the first category listed on the SinaLite
# "all products" page and write its review summary and detail-table values
# to a CSV file.

# Output file and column layout, defined once so the row dictionaries and the
# CSV header cannot drift apart.
OUTPUT_CSV = 'sinalite_products_reviews.csv'
# Property titles of the product details table that are copied into the CSV.
DETAIL_COLUMNS = ['Paper Type', 'Coating', 'Color', 'Quantities', 'Sizes', 'Finishing', 'File Type']
FIELDNAMES = ['Category Name', 'Product Name', 'Number of Reviews', 'Average Rating'] + DETAIL_COLUMNS

# Start Chrome. No driver path is passed to Service(), so locating the driver
# binary is left to Selenium.
service = Service()
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(service=service, options=options)

# Initialize a list to store data (one dict per scraped product)
data = []

try:
    # Open the SinaLite website
    driver.get('https://sinalite.com/en_ca/all-products.html')

    # Get all product containers
    product_containers = driver.find_elements(By.CLASS_NAME, 'product-container')
    if not product_containers:
        # Fail with a clear message instead of an opaque IndexError below.
        raise RuntimeError('No product containers found on the all-products page')

    # Process only the first product container
    container = product_containers[0]

    # Get the category name
    category_name = container.find_element(By.CLASS_NAME, 'category-name').text

    # Get all product names and their links under the category. Both lists are
    # read before navigating away, because element references do not survive
    # loading another page.
    products = container.find_elements(By.CLASS_NAME, 'category-product-name')
    product_links = [product.find_element(By.TAG_NAME, 'a').get_attribute('href') for product in products]
    product_names = [product.text for product in products]

    # Iterate over each product link
    for product_name, product_link in zip(product_names, product_links):
        # Go to the product page
        driver.get(product_link)
        time.sleep(2)  # Wait for the page to load

        # Get the number of reviews: first whitespace-separated token of the
        # reviews link text. A missing link or empty text is best-effort.
        try:
            reviews_element = driver.find_element(By.XPATH, '//a[contains(@aria-label, "reviews")]')
            num_reviews = reviews_element.text.split()[0]
        except (WebDriverException, IndexError):
            num_reviews = 'No reviews'

        # Get the average rating
        try:
            # Click on the 'Reviews' tab to switch to the reviews section
            reviews_tab = driver.find_element(By.XPATH, "//a[@data-stab-name='reviews']")
            reviews_tab.click()

            avg_rating_element = driver.find_element(By.CLASS_NAME, 'avg-score')
            print(avg_rating_element)
            avg_rating = avg_rating_element.text
            print(avg_rating_element.text)
        except WebDriverException:
            # Any Selenium failure (tab missing, click rejected, score absent)
            # is treated as "no rating" rather than aborting the run.
            avg_rating = 'No rating'
            print(avg_rating)

        # Get all rows in the details table and map property title -> value
        table_rows = driver.find_elements(By.XPATH, '//div[@id="details"]/table/tbody/tr')
        table_data = {}
        for row in table_rows:
            cells = row.find_elements(By.TAG_NAME, 'td')
            # Rows with fewer than two cells carry no title/value pair.
            if len(cells) >= 2:
                property_title = cells[0].text
                property_value = cells[1].text
                table_data[property_title] = property_value

        # Append category name, product name, number of reviews, average
        # rating, and table data to the data list. Detail columns that the
        # page does not provide are stored as empty strings.
        record = {
            'Category Name': category_name,
            'Product Name': product_name,
            'Number of Reviews': num_reviews,
            'Average Rating': avg_rating,
        }
        for column in DETAIL_COLUMNS:
            record[column] = table_data.get(column, '')
        data.append(record)

        # NOTE(review): this stops after the first product, so only one row is
        # ever written. Remove the break to scrape every product in the
        # category.
        break
finally:
    # Quit the WebDriver even when scraping fails, so no browser is left running.
    driver.quit()

# Save the data to a CSV file. The encoding is explicit so the output does not
# depend on the platform default.
with open(OUTPUT_CSV, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=FIELDNAMES)
    writer.writeheader()
    writer.writerows(data)


<selenium.webdriver.remote.webelement.WebElement (session="b5606848b3c81dde9739273804065015", element="f.260B0936AED597C994C9C5DB644C2755.d.370EC370AF1EEC8D4A7956DFE2567B3B.e.114")>
4.8
