In [145]:
import requests
import time
from bs4 import BeautifulSoup 
from collections import defaultdict
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.expected_conditions import presence_of_element_located
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import csv

In [148]:
options = Options()
options.add_argument('--disable-gpu')
DRIVER_PATH = 'chromedriver'
driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)

# Step 1: Getting a list of item page URLs from the product list page.

In [155]:
# First we need to determine how many pages are there in total.
start_url = "https://www.sephora.com/shop/clean-skin-care?pageSize=300&currentPage=1"

# This function will scrape the page at starting_url
# and return an integer representing the last pagination number.
def find_last_page_number(starting_url):
    # request the html using the url, using selenium to take care of the javascript rendering stuff
    driver.get(starting_url)
    # scroll to bottom
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    soup = BeautifulSoup(driver.page_source, "html.parser")
    last_page_button = soup.find_all("button", class_="css-1lk9n5p eanm77i0")[-1]
    return int(last_page_button.text)

# This function will scrape and return a list of all products' urls on page_url.
def get_product_urls(page_url):
    # request the product list page using page_url
    driver.get(page_url)
    total_height = int(driver.execute_script("return document.body.scrollHeight"))
    # using selenium, we slow-scroll to the bottom to lazy-load all the products
    for i in range(1, total_height, 5):
        driver.execute_script("window.scrollTo(0, {});".format(i))
    # gotta do this twice to account for the last few products
    new_total_height = int(driver.execute_script("return document.body.scrollHeight"))
    for i in range(total_height, new_total_height, 5):
        driver.execute_script("window.scrollTo(0, {});".format(i))
    # once all products are loaded, we can easily parse the URLs
    soup = BeautifulSoup(driver.page_source, "html.parser")
    group_elements = soup.find_all("div", class_="css-dkxsdo")
    result = []
    # for each group, iterate over all 12 of its child elements (individual products)
    for g in group_elements:
        child_products = g.findChildren("div", class_="css-12egk0t", recursive=False)
        for c in child_products:
            product = c.findChildren("a", recursive=False)
            result.append("https://www.sephora.com" + product[0]["href"])
    return result

# This function combines find_last_page_number and get_product_urls to return a list of 
# all products across all the pages.
def get_all_product_urls(start_url):
    last_page_number = find_last_page_number(start_url)
    last_page_number = 5
    result = []
    for i in range(1, last_page_number + 1):
        # build out the URL of the current page by changing the currentPage=X part of the URL
        current_url = start_url[:-1] + str(i)
        current_product_urls = get_product_urls(current_url)
        result = result + current_product_urls
    return result

# Step 2: Scrape each product page individually to retrieve the desired features.

In [156]:
# This function will scrape the page at starting_url
# and return an integer representing the last pagination number for the reviews.
def find_last_review_page_number(starting_url):
    # request the html using the url, using selenium to take care of the javascript rendering stuff
    driver.get(starting_url)
    # slowly scroll to the bottom so that the pagination bar fully loads
    total_height = int(driver.execute_script("return document.body.scrollHeight"))
    # using selenium, we slow-scroll to the bottom to lazy-load all the products
    for i in range(1, total_height, 5):
        driver.execute_script("window.scrollTo(0, {});".format(i))
    soup = BeautifulSoup(driver.page_source, "html.parser")
    last_page_button = soup.find_all("button", class_="css-exi524")[-1]
    return int(last_page_button.text)

# this function requires a beautiful soup document of the sephora product page
# the function returns (product brand,name)
def scrape_product_name(doc):
    product_container = doc.find_all('h1', class_='css-11zrkxf e65zztl0')[0]
    product_brand = product_container.findChild("a").text 
    product_name = product_container.findChild("span").text
    return (product_brand, product_name)


# scrape review section
# this function requires beautiful soup version of the product page
# the function returns [beautifulsoup elements] (each element represent a review)
# NOTE: this only finds the review section for 1 single page of reviews
def scrape_review_section(doc):
    return doc.find_all('div', class_="css-13o7eu2 eanm77i0")


# function input: single beautifulsoup element representing single review container
# function output: extract (product_brand, product_name, skin tone, skin type, star)
# if skin tone and skin type were empty, the function will return empty string 
def extract_review_properties(product_brand, product_name, review):
    star = int(review.find('div', class_='css-4qxrld')['aria-label'][0])
    skin_attributes = review.find('div', class_="css-z04cd8 eanm77i0").findChild("span")
    if skin_attributes is None:
        return (product_brand, product_name, "", "", star)
    # split this long string of combined skin attributes into a list of attributes i.e. [eye color, hair color, skin tone, skin type]
    skin_attributes_list = skin_attributes.split(", ")
    # extract the elements which contain the word "skin" -- ideally we only want the skin type and skin tone attributes
    filtered_skin_attributes_list = [i for i in skin_attributes_list if "skin" in i]
    skin_tone = ""
    skin_type = ""
    if len(filtered_skin_attributes_list) == 0: # we do nothing here, leave skin type and skin tone as empty string
        pass
    if len(filtered_skin_attributes_list) == 1: # user only inputed skin type OR skin tone
        if "skin tone" in filtered_skin_attributes_list[0]:
            skin_tone = filtered_skin_attributes_list[0]
        else:
            skin_type = filtered_skin_attributes_list[0]
        
    elif len(filtered_skin_attributes_list) == 2: # user gave both skin type AND skin tone
        # sephora inputs skin tone first before skin type
        skin_tone = filtered_skin_attributes_list[0]
        skin_type = filtered_skin_attributes_list[1]
    return (product_brand, product_name, skin_tone, skin_type, star)


# function purpose: extract the properties from all the reviews on a single page of reviews
# input: the beautifulsoup version of the product page
# output: the properties from all the reviews [(skin_tone, skin_type, star)]
# remove review that doesn't contain skin type or skin tone
# NOTE: this only finds the review section for 1 single page of reviews
def extract_reviews_properties(doc):
    reviews = scrape_review_section(doc)
    product_brand, product_name = scrape_product_name(doc)
    reviews_properties = []
    for r in reviews:
        review_properties = extract_review_properties(product_brand, product_name, r)
        if review_properties[0] != '' and review_properties[1] != '':
            reviews_properties.append(review_properties)        
    return reviews_properties

# this function will scrape all the reviews across all the paginated review pages
# input: url of the starting page
# output: properties from all the reviews across all paginations [(product_brand, product_name, skin_tone, skin_type, star)]
def extract_paginated_reviews_properties(start_url):
    result = []
    total_pages = find_last_review_page_number(start_url)
    for i in range(30):
        # scrape that page
        doc = BeautifulSoup(driver.page_source, "html.parser")
        result = result + extract_reviews_properties(doc)
        # then press next page only if we are not already on the last page
        if i != total_pages - 1:
            driver.find_element_by_class_name("css-2anst8").click()
    return result      

# this function scrapes all the products and all their reviews, saves the output to a csv
# input: url of the product list page, csv file name
# output: None, but will save a CSV 
def pipeline(product_list_url, csv_file_name):
    products_list = get_all_product_urls(product_list_url)
    with open(csv_file_name,'a') as out:
        csv_out = csv.writer(out)
        csv_out.writerow(['product_brand','product_name', "skin_tone", "skin_type", "ratings"])
        for p in products_list:
            reviews = extract_paginated_reviews_properties(p)
            for r in reviews:
                csv_out.writerow(r)
    out.close()
        

In [157]:
pipeline("https://www.sephora.com/shop/clean-skin-care?currentPage=1", "./test.csv")

TypeError: 'NoneType' object is not callable