# Sephora Skincare Products Scraper

This is a skincare product scraper that uses bs4 and regex to find save product information into json.

In [1]:
# importing required packages
from bs4 import BeautifulSoup
from selenium import webdriver
import time

import requests
import pandas as pd
import json
import re 

In [2]:
def get_html(url):
    sephora_page = requests.get(url)
    return BeautifulSoup(sephora_page.content, 'lxml')

In [3]:
def find_num_pages(sephora_html):
    num_products = sephora_html.find('span', {'data-at': 'number_of_products'}).get_text()
    num_products = int(re.findall('^\D*(\d+)', num_products)[0])
    return round(num_products / 60)

In [4]:
def find_product_urls(sephora_html): 
    script_string = sephora_html.find_all('script', {'type' : 'application/ld+json'})[1].get_text()
    return re.findall('(?<="url":").*?(?=\")', script_string) 

In [5]:
def find_float(s):
    result = ''
    for i in s.split():
        try:
            #trying to convert i to float
            result = float(i)
            #break the loop if i is the first string that's successfully converted
            break
        except:
            continue
    return str(result)

In [12]:
def parse_product(product_url):
    product_page = requests.get(product_url)
    product_html = BeautifulSoup(product_page.content, 'html.parser')

    skin_types = ['Normal', 'Oily', 'Combination', 'Dry', 'Sensitive']
    product_details = product_html.find('div', class_='css-pz80c5').get_text()

    product_brand = product_html.find('span', class_='css-euydo4').get_text()
    product_name = product_html.find('span', class_='css-0').get_text()
    product_price = product_html.find('div', class_='css-14hdny6').get_text()
    product_reviews = product_html.find('span', class_='css-2rg6q7').get_text()
    product_likes = product_html.find('span', {'data-at' : 'product_love_count'}).get_text()
    
    product_image = "https://www.sephora.com/" + product_html.find('image')['xlink:href']
    
    product_script = product_html.find_all('script')[5].contents[0]
    categories = re.findall('(?<="displayName":").*?(?=\")', product_script)
    product_category = categories[1]
    product_sub_category = categories[0]
    
    size_and_id = product_html.find('div', class_='css-1qf1va5').get_text()
    product_id = re.search('(?<=ITEM ).*', size_and_id)[0]
    
    product_rating = ''
    rating_text = product_html.find('div', class_='css-r17a09')['aria-label']
    rating_regex = re.search('^\D*(\d+)', rating_text)
    if rating_regex:
        product_rating = rating_regex[0]
    
    rating_script = re.search('(?<="productRating":).*?(?=\,)', product_script)
    if rating_script:
        product_rating = rating_script[0]
        
    product_skin_types = []
    for st in skin_types:
        if st in product_details[0:300]:
            product_skin_types.append(st)
            
    product_ingredients = ''
    if (len(product_html.find_all('div', class_='css-pz80c5')) > 2):
        product_ingredients = product_html.find_all('div', class_='css-pz80c5')[2].get_text()
        product_ingredients = product_ingredients.strip().replace(u'\xa0', u'') \
            .replace(u'\u2028', u'').replace(u'\r',u'').replace(u'\t',u'')
    return {
        'id' : product_id,
        'brand' : product_brand,
        'name' : product_name,
        'price' : product_price,
        'likes' : product_likes,
        'reviews': product_reviews,
        'rating' : product_rating,
        'product_url': product_url,
        'image_url' : product_image,
        'category' : product_category,
        'sub_category' : product_sub_category,
        'skin_types' : product_skin_types,
        'ingredients' : product_ingredients
    }

In [13]:
# testing parse function
url = 'https://www.sephora.com/shop/moisturizing-cream-oils-mists'
sephora_html = get_html(url)
url1 = find_product_urls(sephora_html)[1]
parse_product(url1)

{'id': '2025633',
 'brand': 'Drunk Elephant',
 'name': 'Protini™ Polypeptide Moisturizer',
 'price': '$68.00',
 'likes': '147788',
 'reviews': '2K reviews',
 'rating': '4.1596',
 'product_url': 'https://www.sephora.com/product/protini-tm-polypeptide-cream-P427421',
 'image_url': 'https://www.sephora.com//productimages/sku/s2025633-main-Lhero.jpg',
 'category': 'Moisturizers',
 'sub_category': 'Moisturizers',
 'skin_types': ['Normal', 'Oily', 'Combination', 'Dry'],
 'ingredients': '-9 Signal Peptide Complex-Pygmy Waterlily Stem Cell Extract-Soybean Folic Acid Ferment Extract   Water, Dicaprylyl Carbonate, Glycerin, Cetearyl Alcohol, Cetearyl Olivate, Sorbitan Olivate, Sclerocarya Birrea Seed Oil, Bacillus/Soybean/ Folic Acid Ferment Extract, Nymphaea Alba Root Extract, sh-Oligopeptide-1, sh-Oligopeptide-2, sh-Polypeptide-1, sh-Polypeptide-9, sh-Polypeptide-11, Copper Palmitoyl Heptapeptide-14, Heptapeptide-15 Palmitate, Palmitoyl Tetrapeptide-7, Palmitoyl Tripeptide-1, Alanine, Arginine

In [14]:
# categories to scrape on sephora

start_url = ['https://www.sephora.com/shop/moisturizing-cream-oils-mists',
            'https://www.sephora.com/shop/cleanser',
            'https://www.sephora.com/shop/facial-treatments',
            'https://www.sephora.com/shop/eye-treatment-dark-circle-treatment',
            'https://www.sephora.com/shop/face-mask',
            'https://www.sephora.com/shop/sunscreen-sun-protection',
            'https://www.sephora.com/shop/self-tanning-products',
            'https://www.sephora.com/shop/lip-treatments']

product_categories = [
    "Moisturizers",
    "Cleansers",
    "Facial Treatments",
    "Eye Treatments",
    "Face Masks",
    "Sunscreens",
    "Self Tanners",
    "Lip Treatments"
]

In [15]:
# get the number of pages
num_pages = []
for url in start_url:
    sephora_html = get_html(url)
    num_pages.append(find_num_pages(sephora_html))

In [16]:
# check num pages
num_pages

[14, 9, 10, 4, 6, 3, 1, 3]

In [17]:
all_products = {}
for i in range(len(product_categories)):
    print("scraping " + product_categories[i])
    url = start_url[i]
    product_list = []
    for j in range(num_pages[i]):
        curr_url = url + '?currentPage=' + str(j)
        page_html = get_html(curr_url)
        product_urls = find_product_urls(page_html)
        for k in range(len(product_urls)):
            pu = product_urls[k]
            product_list.append(parse_product(pu))
        print("page " + str(j + 1) + " parse completed!")
    all_products[product_categories[i]] = product_list

scraping Moisturizers
page 1 parse completed!
page 2 parse completed!
page 3 parse completed!
page 4 parse completed!
page 5 parse completed!
page 6 parse completed!
page 7 parse completed!
page 8 parse completed!
page 9 parse completed!
page 10 parse completed!
page 11 parse completed!
page 12 parse completed!
page 13 parse completed!
page 14 parse completed!
scraping Cleansers
page 1 parse completed!
page 2 parse completed!
page 3 parse completed!
page 4 parse completed!
page 5 parse completed!
page 6 parse completed!
page 7 parse completed!
page 8 parse completed!
page 9 parse completed!
scraping Facial Treatments
page 1 parse completed!
page 2 parse completed!
page 3 parse completed!
page 4 parse completed!
page 5 parse completed!
page 6 parse completed!
page 7 parse completed!
page 8 parse completed!
page 9 parse completed!
page 10 parse completed!
scraping Eye Treatments
page 1 parse completed!
page 2 parse completed!
page 3 parse completed!
page 4 parse completed!
scraping Face 

In [21]:
skincare = {'skincare_products': all_products}

In [22]:
# save products to json
product_file = open('products.json', 'w')  

with open('products.json', 'w') as json_file:  
    json.dump(skincare, json_file)