## Web scraping endclothing.com

### Step 1. Store the root URL of endclothing.com.

In [None]:
# Root URL of the clothing listing page; the filter-scraping cell below opens it with driver.get()
root_url = "https://www.endclothing.com/ca/clothing"

### Step 2. Get the department and color categories from the site

In [2]:
# Scrape the department and color filter labels and store them in two lists
from selenium import webdriver
from selenium.webdriver.common.by import By

# Collect the text of every department and colour filter item on the listing page.
depts_list = []
colors_list = []

driver = webdriver.ChromiumEdge()       # Edge is the browser used for scraping
try:
    driver.get(root_url)

    # XPath selects the department filter items
    depts = driver.find_elements(By.XPATH, '//div[@data-test-id="department_FilterItem"]')
    for dept_item in depts:
        # Scroll the item into view first so that its text is rendered before it is read
        driver.execute_script("arguments[0].scrollIntoView();", dept_item)
        depts_list.append(dept_item.text)

    # XPath selects the colour filter items
    colors = driver.find_elements(By.XPATH, '//div[@data-test-id="colour_FilterItem"]')
    for color_item in colors:
        driver.execute_script("arguments[0].scrollIntoView();", color_item)
        colors_list.append(color_item.text)
finally:
    # Always end the WebDriver session, even when loading or scraping the page raised;
    # otherwise the browser and its driver process are left running.
    driver.quit()

print(F"Departments: {depts_list}")
print(F"No. of department: {len(depts_list)}")
print(F"Colors: {colors_list}")
print(F"No. of Colors: {len(colors_list)}")

Departments: ['Blazers', 'Bomber Jackets', 'Boxer Shorts', 'Cardigans', 'Cargo Pants', 'Casual Jackets', 'Casual Trousers', 'Chinos', 'Coach Jackets', 'Cycling Shorts', 'Denim Jackets', 'Dressing Gowns', 'Dungarees', 'Fleeces', 'Gilets', 'Hoodies', 'Jackets & Coats', 'Jeans', 'Knitted Vests', 'Knitwear', 'Leather Jackets', 'Long Coats', 'Long Sleeve Lounge Tops', 'Long Sleeve Polos', 'Long Sleeve Tees', 'Long Sleeve Tops', 'Lounge Pants', 'Lounge Shorts', 'Lounge Tops', 'Loungewear', 'Parkas', 'Polos', 'Puffer Jackets', 'Shirt Jackets', 'Shirts', 'Short Sleeve Shirts', 'Short Sleeve Sweats', 'Short Sleeve T-Shirts', 'Shorts', 'Suits', 'Sweat Pants', 'Sweaters', 'Sweats', 'Swimwear', 'T-Shirts', 'Tailored Trousers', 'Technical & Rain Jackets', 'Tops', 'Track Pants', 'Track Tops', 'Trench & Long Coats', 'Trousers', 'Underwear', 'Vests', 'Waistcoats']
No. of department: 55
Colors: ['Black', 'Blue', 'Brown', 'Burgundy', 'Gold', 'Green', 'Grey', 'Multi', 'N/A', 'Navy', 'Neutrals', 'Orange',

### Step 3. Build a class to organize the product data

In [4]:
# Build a class "ProductCard" that stores the information of one product
class ProductCard:
    """Plain record for one scraped product.

    The five values are kept in underscore-prefixed attributes and are read and
    written through the matching ``get_*`` / ``set_*`` methods.
    """

    def __init__(self, name, color, color_cat, price, dept) -> None:
        """Store the product name, colour text, colour category, price and department."""
        self._name = name
        self._color = color
        self._color_cat = color_cat
        self._price = price
        self._dept = dept

    def __repr__(self) -> str:
        """Return a readable form so printed lists of cards show their contents
        instead of bare object addresses."""
        return (
            f"ProductCard(name={self._name!r}, color={self._color!r}, "
            f"color_cat={self._color_cat!r}, price={self._price!r}, dept={self._dept!r})"
        )

    # Getter functions
    def get_name(self) -> str:
        """Return the product name."""
        return self._name
    def get_color(self) -> str:
        """Return the colour text of the product."""
        return self._color
    def get_color_cat(self) -> str:
        """Return the colour category of the product."""
        return self._color_cat
    def get_price(self) -> int:
        """Return the product price."""
        return self._price
    def get_dept(self) -> str:
        """Return the department of the product."""
        return self._dept

    # Setter functions
    def set_name(self, name) -> None:
        """Replace the product name."""
        self._name = name
    def set_color(self, color) -> None:
        """Replace the colour text."""
        self._color = color
    def set_color_cat(self, color_cat) -> None:
        """Replace the colour category."""
        self._color_cat = color_cat
    def set_price(self, price) -> None:
        """Replace the price."""
        self._price = price
    def set_dept(self, dept) -> None:
        """Replace the department."""
        self._dept = dept


### Step 4. Build a dictionary to store the web pages that failed to scrape

In [6]:
from collections import defaultdict 

# Column-oriented log of the department/colour pairs that could not be scraped.
# single_web_scrap appends to the "dept_name", "color_name" and "error_message" lists;
# defaultdict(list) creates each list on first use.
failed_page = defaultdict(list)

### Step 5. Implement a function to scrape a single department/color listing

In [7]:
# urllib.parse encodes and decodes the department and color names used in the URLs
import urllib

# Scraper for one colour/department pair, plus the private helpers it relies on
import threading
import urllib.parse

# The three failed_page lists are read back row by row (DataFrame export and the retry
# loop), so the three appends for one failure must not interleave with another thread's.
_failed_page_lock = threading.Lock()

# How many times one page is re-scraped when some of its cards come back with empty text
_MAX_PAGE_RETRIES = 5


def _record_failed_page(color_name, dept_name, error):
    """Append one failed colour/department pair to failed_page as a single aligned row."""
    with _failed_page_lock:
        failed_page["dept_name"].append(urllib.parse.unquote(dept_name))
        # unquote_plus is the inverse of the quote_plus the callers apply to the colour
        failed_page["color_name"].append(urllib.parse.unquote_plus(color_name))
        failed_page["error_message"].append(error)


def single_web_scrap(color_name, dept_name):
    """Scrape every product card listed for one colour/department filter pair.

    Args:
        color_name: URL-encoded colour filter value (callers encode it with quote_plus).
        dept_name: URL-encoded department filter value (callers encode it with quote).

    Returns:
        A list of ProductCard objects, one per card whose name, colour and price text
        were all non-empty. The list is empty when the pair has no products or when
        scraping failed; a failure is printed and recorded in failed_page so that the
        pair can be retried later.
    """
    print(f"Working on pairs:{dept_name}:{color_name}")
    product_per_page = 120      # Each page of endclothing.com shows 120 product cards
    product_index = 1           # Running number of the cards read so far (used in the log only)
    listing_url = F"https://www.endclothing.com/ca/clothing?colour={color_name}&department={dept_name}"
    color_cat = urllib.parse.unquote_plus(color_name)
    dept = urllib.parse.unquote(dept_name)
    product_card_list = []      # ProductCard objects collected over all pages

    driver = webdriver.ChromiumEdge()
    try:
        # The first request only reads the product counter, which fixes the number of pages
        driver.get(listing_url)
        counter_text = driver.find_element(By.XPATH, '//div[@data-test-id="Product__Counter"]').text
        # The count is the first token of the counter text; thousands separators are dropped
        product_count = int(counter_text.split()[0].replace(',', ''))
        # Ceiling division: an exact multiple of 120 needs no extra page, zero products need none
        page_count = (product_count + product_per_page - 1) // product_per_page
        print(page_count)

        page_number = 1
        retries = 0
        while page_number <= page_count:
            print(f"Scrapping on page {page_number}")
            driver.get(F"{listing_url}&page={page_number}")

            # Scroll every product card into view so that its text gets rendered
            product_cards = driver.find_elements(By.XPATH, '//a[@data-test-id="ProductCard__ProductCardSC"]')
            for card_element in product_cards:
                driver.execute_script("arguments[0].scrollIntoView();", card_element)

            page_cards = []
            page_incomplete = False
            for card_element in product_cards:
                name = card_element.find_element(By.XPATH, './/span[@data-test-id="ProductCard__PlpName"]').text
                color = card_element.find_element(By.XPATH, './/span[@data-test-id="ProductCard__ProductColor"]').text
                price = card_element.find_element(By.XPATH, './/span[@data-test-id="ProductCard__ProductFinalPrice"]').text
                print(f"{product_index}, name: {name}, color: {color}, price: {price}")
                product_index += 1

                # A card with any empty field marks the whole page for another attempt
                if name == '' or color == '' or price == '':
                    page_incomplete = True
                    continue

                # Drop the first three characters (presumably the currency prefix; confirm
                # against the site) and the thousands separators before converting
                price = int(price[3:].replace(',', ''))
                product_card = ProductCard(name, color, color_cat, price, dept)
                print(F"{product_card.get_name()}, {product_card.get_color()}, {product_card.get_price()}, {product_card.get_dept()}")
                page_cards.append(product_card)

            if page_incomplete:
                # Retry the same page, but only a bounded number of times so that a card
                # whose text never renders cannot keep this worker looping forever
                retries += 1
                if retries > _MAX_PAGE_RETRIES:
                    raise RuntimeError(
                        f"page {page_number} still has product cards with empty text "
                        f"after {_MAX_PAGE_RETRIES} retries"
                    )
                continue

            retries = 0
            print(page_cards)
            product_card_list.extend(page_cards)
            page_number += 1
    except Exception as e:
        # Any failure drops this pair's partial result and records the pair for a later retry
        print(e)
        _record_failed_page(color_name, dept_name, e)
        return []
    finally:
        # quit() ends the whole WebDriver session on every exit path
        driver.quit()

    print(F"product_card_list: {product_card_list}")
    print(F"No. of product card: {len(product_card_list)}")
    if product_card_list:       # A pair without products yields an empty list
        first_card = product_card_list[0]
        print(F"{first_card.get_name()}, {first_card.get_color()}, {first_card.get_price()}, {first_card.get_dept()}")
    return product_card_list


### Step 6. Use a thread pool to scrape each department and color combination

In [None]:
# Create a thread pool that scrapes several department/color pairs in parallel
import concurrent.futures
import os
import urllib

# Submit one scraping task per department/colour pair and gather the results.
future_to_url = []
pair_by_future = {}     # future -> (dept, color), so a crashed worker can be traced back to its pair
crashed_pairs = []      # (dept, color, exception) for workers that raised instead of returning
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    for dept in depts_list:
        for color in colors_list:
            print(f"dept:{dept}, color:{color}")
            future = executor.submit(single_web_scrap, urllib.parse.quote_plus(color), urllib.parse.quote(dept))
            future_to_url.append(future)
            pair_by_future[future] = (dept, color)

    data = []
    for future in concurrent.futures.as_completed(future_to_url):
        try:
            data.extend(future.result())
        except Exception as exc:
            # The worker raised: remember its pair so that it is not silently lost
            print(exc)
            crashed_pairs.append((*pair_by_future[future], exc))

# Every worker has finished once the executor block exits, so failed_page can be
# extended here without interleaving with the appends made inside single_web_scrap.
for crashed_dept, crashed_color, crashed_exc in crashed_pairs:
    failed_page["dept_name"].append(crashed_dept)
    failed_page["color_name"].append(crashed_color)
    failed_page["error_message"].append(crashed_exc)


### Step 7. Check the data after all web scraping has finished and store the pages that failed to scrape in "failed_page_list.csv".

In [None]:
import pandas as pd

# Preview at most the first 10 products; slicing avoids an IndexError when fewer were scraped
for product_card in data[:10]:
    print(product_card.get_name(), product_card.get_price(),product_card.get_color(), product_card.get_color_cat(), product_card.get_dept())

print(len(data))
# failed_page maps a column name to a list, so the number of failed pages is the
# length of one column; len(failed_page) would only count the columns
print(len(failed_page.get("dept_name", [])))
failed_page_df = pd.DataFrame.from_dict(failed_page)
print(failed_page_df)
failed_page_df.to_csv("failed_page_list.csv")

### Step 8. Extract the departments and colors from the failed-page list and scrape them again.

In [None]:
# Both names refer to the live lists stored in failed_page, not to copies
missing_color = failed_page["color_name"]
missing_dept = failed_page["dept_name"]
for label_list in (missing_color, missing_dept):
    print(len(label_list),label_list)

# The number of retries is fixed before the loop starts, so a pair that fails again
# (and is appended to these same lists by single_web_scrap) is not retried once more here
retry_total = len(missing_color)
for i in range(retry_total):
    encoded_color = urllib.parse.quote_plus(missing_color[i])
    encoded_dept = urllib.parse.quote(missing_dept[i])
    data.extend(single_web_scrap(encoded_color, encoded_dept))

The total number of products scraped from endclothing.com

In [37]:
# Number of ProductCard objects collected by the thread-pool run and the retry pass
print(len(data))

32542


### Step 9. Store the scraped information in product_card.csv for further analysis

In [38]:
import csv

# Header row of the CSV file
title = ["name", "color", "color category", "price", "department"]

# Write product_card.csv, the file that is read back with pandas afterwards.
# UTF-8 is requested explicitly: it is what pandas.read_csv assumes by default, and the
# platform's default encoding may not be able to represent every product name.
with open('product_card.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(title)      # writerow writes a single row
    # One row per scraped product, in the same column order as the header
    writer.writerows(
        [card.get_name(), card.get_color(), card.get_color_cat(), card.get_price(), card.get_dept()]
        for card in data
    )

The output below shows the structure of product_card.csv

In [40]:
import pandas as pd
# Load the exported products back into a DataFrame
end_clothing_product_df = pd.read_csv("product_card.csv")

# DataFrame.info() prints its summary itself and returns None, so wrapping it in
# print() would only add a stray "None" line after the summary
end_clothing_product_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32542 entries, 0 to 32541
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   name            32542 non-null  object
 1   color           32542 non-null  object
 2   color category  32542 non-null  object
 3   price           32542 non-null  int64 
 4   department      32542 non-null  object
dtypes: int64(1), object(4)
memory usage: 1.2+ MB
None
