# Data gathering

There are three datasets needed for this analysis. 
Firstly, the brand list is a catalog of all brands offered by Sephora. 
Next, we use the brand list to collect a detailed product list for each brand; every product is identified by a product_id such as 'P07102'. 
Lastly, the product_id is used to gather the review data.

Since there is already existing data parsed by Raghad Alharbi, we will skip Step 1 and Step 2.
- Sephora result data from Kaggle: https://www.kaggle.com/raghadalharbi/all-products-available-on-sephora-website

Tutorial for data gathering https://github.com/Shirleyiscool/Scraping-Sephora
- Step1 Brand list: we will use requests to the website https://www.sephora.com/brands-list and parse a list of brands
- Step2 Product list: for each brand list, we use requests to the individual website "https://www.sephora.com"+brand.a.attrs['href']+"/all?pageSize=300" and parse a list of products.
- Step3 Review list: for each product, we use an API call to bazaarvoice and get all reviews under each product.


## Additional filters
Due to the large number of products and reviews, we limited our scope according to the goal: look for products that bring happiness to people during the pandemic. Thus the filters below are applied.

- Keep only product with review count > 50
- Keep only product with review star >= 4
- Keep only the most recent 3100 reviews. (Some products have too many reviews to download in full.)

In [None]:
# Not all packages are imported at the top since the data had to be parsed over multiple days.
# Thus each individual section has its own imports.

import pandas as pd
pd.options.display.max_rows = 999

## Step1: Brand list

In [None]:
import requests
from bs4 import BeautifulSoup

In [None]:
# Get the response of the "brands-list" page from Sephora
band_lst_link = "https://www.sephora.com/brands-list"
# Bound the wait with the same 15s timeout the other requests in this notebook
# use, and fail loudly on an HTTP error instead of parsing an error page.
response = requests.get(band_lst_link, timeout=15)
response.raise_for_status()

# Use BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

In [None]:
# Build one "all products" URL per brand entry found in the brand list box
main_box = soup.find_all(attrs={"data-comp": "BrandsList StyledComponent BaseComponent "})[0]
brand_link_lst = [
    "https://www.sephora.com" + brand.a.attrs['href'] + "/all?pageSize=300"
    for brand in main_box.find_all('li')
]

# Persist the brand links, one URL per line
with open('data/brand_link.txt', 'w') as f:
    f.writelines(f"{item}\n" for item in brand_link_lst)

# Report how many brand links were collected
print(f'Got All Brand Links! There are {len(brand_link_lst)} brands in total.')

## Step2: Product list

In this step, we use the brand list to look for all products inside the brand. 

- In Step2.1, we gather all products from the brand page.
- In Step2.2, we gather information regarding each product from the product page.

### Step2.1: Get all products from brand list

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

In [None]:
# Limits for a small test run of Step 2.1: test_brand is used as the progress
# total (num_lines) and test_product caps how many brand links the loop processes.
test_brand = 1
test_product = 10

In [15]:
# Timestamp taken before scraping starts; time.ctime() yields a formatted string
start = time.ctime()

In [None]:
def scape_product(link, proxy=None):
    """
    Scrape all the product links from a given brand link.

    Args:
        link: URL of a brand page.
        proxy: optional "host:port" proxy used for both http and https;
            None sends the request without a proxy.

    Returns:
        A list of absolute product URLs (empty when the page has no product
        grid), or None when the request itself failed so the caller can
        retry with another proxy.
    """
    try:
        response = requests.get(link, proxies={
                                "http": proxy, "https": proxy}, timeout=15)
    except requests.exceptions.RequestException:
        # Only network-level failures mean "try another proxy". A bare except
        # would also swallow KeyboardInterrupt and make the caller's retry
        # loop impossible to stop.
        parts = link.split("/")
        # Fall back to the whole link so the error report cannot itself raise
        label = parts[4] if len(parts) > 4 else link
        print(f'\r Unsuccessfully get data for {label}', end="")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # There might be no products for that brand
    product_grids = soup.find_all(attrs={"data-comp": "ProductGrid "})
    if not product_grids:
        return []

    product_link_lst = []
    for product in product_grids[0].find_all(
            'a', attrs={"data-comp": "ProductItem "}):
        # use function split to remove text like "grid p12345"
        href_parts = product.attrs.get('href', '').split()
        if href_parts:  # skip anchors that carry no usable href
            product_link_lst.append("https://www.sephora.com" + href_parts[0])
    return product_link_lst


# Read brand links file
product_link_dic = {'brand': [], 'product_links': []}
# num_lines = sum(1 for line in open("data/brand_link.txt", "r"))
# NOTE(review): the progress total comes from test_brand while the loop below
# is capped by test_product -- confirm which limit is intended.
num_lines = test_brand

# Scrape all the product links from all the brands links.
# This will take some time!
ct = 1

# Get proxies from http://www.freeproxylists.net/zh/?c=US&pr=HTTPS&u=80&s=ts
px = ['143.198.222.22:8080', '143.198.206.183:8080', '157.230.208.88:8080']
px_idx = 0

# Use a context manager so the file handle is closed when the loop ends
with open("data/brand_link.txt", "r") as brand_file:
    for brand_link in brand_file:
        # Stop reading as soon as the limit is reached instead of iterating
        # over the rest of the file doing nothing
        if ct > test_product:
            break

        # Strip only the line terminator; slicing off the last character
        # would corrupt a final line that has no trailing newline
        brand_url = brand_link.rstrip('\n')
        if not brand_url:
            continue  # ignore blank lines
        brand_name = brand_url.split('/')[4]
        product_link_list = scape_product(brand_url, proxy=px[px_idx])

        # If one proxy does not work, use another (wraps around the list,
        # whatever its length; retries until a request succeeds)
        while product_link_list is None:
            px_idx = (px_idx + 1) % len(px)
            product_link_list = scape_product(brand_url, proxy=px[px_idx])

        print(f'\r === {ct} / {num_lines} ===  {brand_name} === {px[px_idx]}',
              end="")
        product_link_dic['brand'] += [brand_name] * len(product_link_list)
        product_link_dic['product_links'] += product_link_list
        ct += 1

# Write the result into csv file
product_link_df = pd.DataFrame(product_link_dic)
product_link_df.to_csv('data/product_links.csv', index=False)

# Indicate scraping completion
print(f'Got All product Links! There are {len(product_link_df)} products in '
      f'total.')

### Step2.2 Get details on product website
For example, likes, review total, price, etc.

In [None]:
import re
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [None]:
def _extract_or_none(extract):
    """Return ``extract()``, or None when the page element it reads is missing."""
    try:
        return extract()
    except (AttributeError, IndexError, TypeError):
        # AttributeError: soup.find() returned None; IndexError: find_all()
        # was empty; TypeError: an anchor had no plain string to join.
        return None


def get_data(product_link, px_list=None):
    """Get product information from a product page.

    Args:
        product_link: URL of the product page; must contain a product id
            matching ``P[0-9]{3,6}``.
        px_list: optional list of "host:port" proxies. On a failed request
            the next proxy is tried, wrapping around, until one succeeds.
            None or an empty list sends a single direct request.

    Returns:
        A dict with the keys 'pd_id', 'size_and_item', 'category', 'price',
        'love_count' and 'reviews_count'. Any field that cannot be found on
        the page is None.

    Raises:
        IndexError: if ``product_link`` contains no product id.
        requests.exceptions.RequestException: if the request fails and there
            is no proxy to rotate to.
    """
    # Fail before any network traffic if the link carries no product id.
    # NOTE(review): Step 3 extracts ids with 'P[0-9]{4,7}' -- confirm which
    # digit range is the right one.
    pd_id = re.findall(R'P[0-9]{3,6}', product_link)[0]

    px_idx = 0
    proxy = px_list[px_idx] if px_list else None

    while True:
        try:
            response = requests.get(product_link, proxies={
                "http": proxy, "https": proxy}, timeout=15)
            break
        except requests.exceptions.RequestException:
            if not px_list:
                # Nothing to rotate to: surface the real network error rather
                # than failing on len(None)
                raise
            px_idx = (px_idx + 1) % len(px_list)
            proxy = px_list[px_idx]

    soup = BeautifulSoup(response.text, 'html.parser')

    # Get Category: breadcrumb link texts joined into one string
    category = _extract_or_none(lambda: ', '.join(
        cat.string for cat in soup.find_all(
            attrs={'data-comp': 'ProductBreadCrumbs BreadCrumbs '}
        )[0].find_all('a')))

    # Size and Content
    size_and_item = _extract_or_none(lambda: soup.find(
        attrs={"data-at": "sku_size_label"}).get_text())

    # Get Price
    price = _extract_or_none(lambda: soup.find_all(
        attrs={'data-comp': 'Price '})[0].get_text())

    # Get love counts (located through a CSS class name)
    love_count = _extract_or_none(lambda: soup.find(
        'span', attrs={"class": "css-jk94q9"}).get_text())

    # review nums
    reviews_count = _extract_or_none(lambda: soup.find(
        'span', attrs={'data-at': 'number_of_reviews'}).get_text())

    # Key order is kept so DataFrames built from these dicts keep their
    # column order
    return {
        'pd_id': pd_id,
        'size_and_item': size_and_item,
        'category': category,
        'price': price,
        'love_count': love_count,
        'reviews_count': reviews_count,
    }


# Proxy pool handed to get_data for every product page
px_list_ = [
    '167.99.218.191:8080', '144.217.254.175:3128', '165.225.77.47:9443',
    '54.37.137.211:3128', '165.22.91.197:8080', '165.225.77.47:8800',
    '165.225.77.47:9400', '165.225.77.47:80', '165.225.77.47:443',
    '143.198.222.22:8080', '143.198.206.183:8080', '157.230.208.88:8080',
]

# Product links collected in Step 2.1
pd_links_df = pd.read_csv('data/product_links.csv')
product_links = pd_links_df['product_links']

result = []
for i, link in enumerate(product_links[:]):
    product_info = get_data(link, px_list_)
    result.append(product_info)

    # The full table is rebuilt and rewritten after every product
    pd_df = pd.DataFrame(result)
    pd_df.to_csv('data/pd_info.csv', index=False)

    progress_line = f'{i + 1:04d} / {len(product_links)} || {link}'
    print(progress_line)

In [None]:
# Timestamp taken after scraping ends; time.ctime() yields a formatted string
end = time.ctime()

In [None]:
# start and end are time.ctime() strings, which cannot be subtracted. Parse
# them back to epoch seconds first (time.strptime's default format is the one
# ctime produces), then report the elapsed time.
duration_sec = (time.mktime(time.strptime(end))
                - time.mktime(time.strptime(start)))
print('Duration: ', f'{duration_sec:.0f} seconds')

## Step3: Reviews

In [None]:
import re
import time
import pandas as pd
import requests
import json
import pickle

In [None]:
# Load the product info data frame written in Step 2.2.
# The file is produced with DataFrame.to_csv, so it has to be read with
# read_csv; read_parquet cannot parse a CSV file.
# In the project, we also limited the products using the filters mentioned above.
pd_links_df = pd.read_csv('./data/pd_info.csv')

In [None]:
# Load product link data frame.
# Here we used full data from Kaggle.
# In the project, we also limited the products using the filters mentioned above.
# The dataset is a CSV file, so read it with read_csv (read_parquet cannot
# parse a CSV file).
pd_links_df = pd.read_csv('./data/sephora_website_dataset.csv')

In [None]:
# Expose the 'URL' column under the name 'product_links', which the product-id
# extraction further down reads.
pd_links_df['product_links'] = pd_links_df['URL']

In [None]:
def remove_products_already_downloaded(pd_links_df):
    """Drop the products whose reviews were downloaded in an earlier batch.

    Product reviews are too large (300 products for 600-800MB), thus we
    download in multiple batches.

    Args:
        pd_links_df: DataFrame with a 'URL' column. A 'key' column holding
            the first ``P[0-9]{4,7}`` match of each URL is added to it in
            place.

    Returns:
        A copy of the rows whose key is not listed in
        'data/product_keys.pkl'. Rows whose URL contains no product id get a
        missing key and are kept.
    """
    # NOTE: pickle.load can execute arbitrary code; only load a file that
    # this notebook wrote itself.
    with open('data/product_keys.pkl', 'rb') as file:
        pids_cleaned = pickle.load(file)

    # .str.get(0) yields a missing value instead of raising IndexError when a
    # URL has no product id
    pd_links_df['key'] = (
        pd_links_df['URL'].str.findall('P[0-9]{4,7}').str.get(0))
    return pd_links_df[~pd_links_df['key'].isin(pids_cleaned)].copy()

In [None]:
# Skip if this is the first time
# pd_links_df = remove_products_already_downloaded(pd_links_df)

In [None]:
pd_links_df.shape

In [None]:
result = {}

In [None]:
# Derive each product's id from its link and store it in a new column
pd_links_df['pd_id'] = pd_links_df['product_links'].map(
    lambda link: re.findall('P[0-9]{4,7}', link)[0])


def scrape_reviews(p_id, proxy=None, max_pages=31):
    """Download the most recent reviews of one product from bazaarvoice.

    Reviews are requested newest first in pages of 100.

    Args:
        p_id: product id used in the ``ProductId`` filter.
        proxy: optional "host:port" proxy used for both http and https;
            None sends the requests without a proxy.
        max_pages: maximum number of requests to make. The default of 31
            keeps the previous cap of 31 pages (3100 reviews).

    Returns:
        ``(product, reviews)``: the 'Products' include of the first response
        (``[]`` when unavailable) and the list of review results collected.
        ``(None, None)`` when a request could not be sent, so the caller can
        retry with another proxy.
    """
    url = 'https://api.bazaarvoice.com/data/reviews.json'
    params = {
        'Filter': f'ProductId:{p_id}',
        'Sort': 'SubmissionTime:desc',
        'Limit': 100,
        'Offset': 0,
        'Include': 'Products,Comments',
        'Stats': 'Reviews',
        # NOTE(review): API key committed in source; consider loading it from
        # configuration instead.
        'passkey': 'caQ0pQXZTqFVYA1yYnnJ9emgUiW59DXA85Kxry8Ma02HE',
        'apiversion': 5.4,
        'Locale': 'en_US',
    }

    # Bound up front so an interrupt or an early break cannot leave the
    # return value undefined
    product = []
    reviews = []

    for page in range(max_pages):
        params['Offset'] = len(reviews)

        # Make the same request that Javascript makes
        try:
            r = requests.get(url, params=params, proxies={
                "http": proxy, "https": proxy}, timeout=15)
        except KeyboardInterrupt:
            # Keep whatever was collected so far
            break
        except requests.exceptions.RequestException:
            print(f'{proxy} Cannot connect!')
            return None, None

        # Stop on an HTTP error before trying to parse the body
        if r.status_code != 200:
            break
        try:
            payload = r.json()  # parse once per page
        except ValueError:
            # Body was not JSON
            break

        if page == 0:
            try:
                product = payload['Includes']['Products']
            except KeyError:
                product = []

        # Stop when all reviews are collected. A response without
        # 'TotalResults' is treated as having none.
        if len(reviews) >= payload.get('TotalResults', 0):
            break

        page_results = payload.get('Results', [])
        if not page_results:
            # An empty page would leave the offset unchanged and repeat the
            # same request until max_pages is reached
            break

        # add the list of results to current results
        reviews.extend(page_results)

        # Give a pause, so we don't get blocked
        time.sleep(0.2)

    # Show how many reviews we scraped
    print(f'{p_id}: {len(reviews)} reviews')
    time.sleep(0.5)
    return product, reviews


# Scrape Product and Review Data
# result already imported from file
# result = {}

# dict.fromkeys drops the repeated entries while keeping the order, so a dead
# proxy is not retried twice per rotation.
proxies = list(dict.fromkeys([
    '20.194.17.90:3128',
    '69.167.174.17:80',
    '129.226.52.93:443',
    '164.90.222.95:80',
    '143.55.38.198:8080',
    '130.61.236.104:80',
    '34.126.79.176:80',
    '132.145.18.53:80',
    '68.183.221.156:37486',
    '143.198.196.205:80',
    '140.227.63.136:58888',
    '167.71.230.124:8080',
    '148.66.131.212:80',
    '173.249.38.220:8118',
    '85.84.14.9:80',
    '129.21.105.164:8080',
    '190.9.55.12:8080',
    '209.127.191.180:9279',
    '208.74.51.100:80',
    '159.65.174.145:3128',
    '190.9.55.12:8080',
    '45.95.96.187:8746',
    '45.95.96.237:8796',
    '45.94.47.66:8110',
    '45.94.47.108:8152',
    '45.95.99.226:7786',
    '183.88.226.50:8080',
    '52.151.15.4:80',
    '51.81.82.175:80',
    '129.21.158.30:8080',
    '185.198.190.237:12444',
    '149.125.70.236',
    '167.99.118.98',
    '92.204.129.161:80',
    '52.168.34.113:80',
    '74.205.128.201:80',
    '209.97.150.167',
    '191.96.42.80:8080',
    '198.199.86.11:3128',
    '198.199.86.11:3128',
]))
px_id = 0
loop = 0

for pid in pd_links_df['pd_id']:
    loop_ = loop % 1000
    # Iterations 1-899 of every block of 1000 go through the proxy pool; the
    # others first try my own server (a direct connection).
    use_proxy = 1 <= loop_ < 900
    route = 'direct'
    if use_proxy:
        product_data, reviews_data = None, None
    else:
        product_data, reviews_data = scrape_reviews(pid)

    # scrape_reviews returns (None, None) when it cannot connect. Rotate
    # through the proxies until one works. This also covers a failed direct
    # attempt, so a product is never stored as [None, None] and later
    # mistaken for an already-downloaded one.
    while product_data is None:
        route = proxies[px_id]
        product_data, reviews_data = scrape_reviews(pid, proxy=route)
        if product_data is None:
            px_id = (px_id + 1) % len(proxies)
    loop += 1

    # Log the route actually used for this product
    print(f'{route} || {loop:04d}/{len(pd_links_df)}')
    result[pid] = [product_data, reviews_data]

In [None]:
len(result)

In [None]:
# Serialise everything scraped so far to a JSON file
with open("data/scraper_result.json", "w") as json_out:
    json.dump(result, fp=json_out)

In [None]:
# Store the ids of the products scraped so far, i.e. the dict keys. They are
# what remove_products_already_downloaded compares against; dumping the values
# would pickle the full review payloads instead of the ids.
with open("data/product_keys.pkl", "wb") as file:
    pickle.dump(list(result), file)

In [None]:
# Create a backup file
!cp ./data/scraper_result.json ./data/scraper_result.bak