<img src="../images/cs_logo_pink.png" style="float: left; margin: 36px 20px 0 0; height: 60px">

# Capstone Project - Cos Skin <br><i style = "font-size:16px">Your skin but better</i>

## Notebook 3: Data Collection Part 3 of 3
Notebook 1: Introduction & Data Collection Part 1 of 3<br>
Notebook 2: Data Collection Part 2 of 3<br>
<b>Notebook 3: Data Collection Part 3 of 3<br></b>
Notebook 4: EDA & Data Cleaning<br>
Notebook 5: Preprocessing<br>
Notebook 6: Modelling<br>
Notebook 7: Streamlit 

In this notebook, I scrape user reviews for each product. Popular products can have over 10,000 reviews, while new products often have none. I therefore collect at most 100 of the most recent reviews per product, which is enough for a simple sentiment analysis to determine the impact of user reviews on recommendations.

I used the Sephora reviews API to retrieve the reviews for each product. A regular expression extracts each product's slug from its product URL, and that slug is used to build the API query URL.

In [1]:
import re
import requests
import pandas as pd
from tqdm import tqdm

In [2]:
# import datasets: one product CSV per category. The functions below read the
# 'pdt_url' and 'unique_id' columns from each of these tables.
cleanser = pd.read_csv('../data/cleanser_pdts.csv')
toner = pd.read_csv('../data/toner_pdts.csv')
day_moisturizer = pd.read_csv('../data/day_moisturizer_pdts.csv')
night_cream = pd.read_csv('../data/night_cream_pdts.csv')
sunscreen = pd.read_csv('../data/sunscreen_pdts.csv')

In [3]:
# helper: turn one product page url into its reviews api url
def _review_api_url(pdt_url):
    """Return the reviews-API URL for a single product page URL.

    The product slug is the text that follows ``/products/``. It ends at the
    next ``/`` when there is one; otherwise it runs to the end of the URL.

    Raises:
        ValueError: if ``pdt_url`` contains no ``/products/<slug>`` segment.
        TypeError: if ``pdt_url`` is not a string (e.g. a missing value).
    """
    # Try the "slug followed by more path" form first, then fall back to
    # "slug is the rest of the URL". Match objects are always truthy, so
    # ``or`` only reaches the second search when the first found nothing.
    match = (
        re.search(r"(?<=/products/).+?(?=/)", pdt_url)
        or re.search(r"(?<=/products/).+", pdt_url)
    )
    if match is None:
        raise ValueError(
            f"no '/products/<slug>' segment found in product URL: {pdt_url!r}"
        )
    return f'https://www.sephora.sg/api/v2.4/products/{match[0]}/reviews'


# creating individual url per product for api queries
def api_query_url(df):
    """Add a ``query_url`` column holding each product's reviews-API URL.

    Args:
        df: DataFrame with a ``pdt_url`` column of product page URLs.

    Returns:
        The same ``df`` object, modified in place with a new (or overwritten)
        ``query_url`` column aligned row-for-row with ``pdt_url``.

    Raises:
        ValueError: if a URL has no ``/products/<slug>`` segment.
        TypeError: if a ``pdt_url`` value is not a string.
    """
    df['query_url'] = [_review_api_url(pdt_url) for pdt_url in df['pdt_url']]
    return df

# function to scrape product reviews
# def reviews_scraper(df, cat_reviews_df):
#     query_url = df['query_url']
#     unique_id = df['unique_id']
#     review_pdt = []
#     reviewer_country = []
#     review_title = []
#     review_text = []
    
#     for i in tqdm(range(len(query_url)), desc = 'progress'):
#         request_url = query_url[i]
#         review_pdt_id = unique_id[i]
        
#         querystring = {"page[number]": "1","page[size]":"10","sort":"recent"}

#         headers = {
#             "authority": "www.sephora.sg",
#             "accept": "*/*",
#             "accept-language": "en-SG",
#         }

#         response = requests.request("GET", request_url, headers=headers, params=querystring)
#         data = response.json()
        
#         for p in range(1, data['meta']['total-pages']+1):
            
#             querystring_reviews = {"page[number]":f"{p}","page[size]":"10","sort":"recent"}

#             headers = {
#                 "authority": "www.sephora.sg",
#                 "accept": "*/*",
#                 "accept-language": "en-SG",
#             }

#             response = requests.request("GET", request_url, headers=headers, params=querystring_reviews)
#             data = response.json()
            
#             for r in range(len(data['data'])):
#                 country = data['data'][r]['attributes']['country-name']
#                 title = data['data'][r]['attributes']['title']
#                 text = data['data'][r]['attributes']['text']

#                 review_pdt.append(review_pdt_id)
#                 reviewer_country.append(country)
#                 review_title.append(title)
#                 review_text.append(text)
                
#     cat_reviews_df['unique_id'] = review_pdt
#     cat_reviews_df['review_country'] = reviewer_country
#     cat_reviews_df['review_title'] = review_title
#     cat_reviews_df['review_text'] = review_text
    
#     return cat_reviews_df

# helper: download one page of reviews for a product
def _fetch_review_page(session, request_url, page, page_size, timeout):
    """Return the decoded JSON body of one page of a product's reviews.

    Reviews are requested sorted by ``recent``. An HTTP error status raises
    ``requests.HTTPError`` instead of failing later on a malformed payload.
    """
    params = {
        "page[number]": str(page),
        "page[size]": str(page_size),
        "sort": "recent",
    }
    response = session.get(request_url, params=params, timeout=timeout)
    response.raise_for_status()
    return response.json()


# function to scrape product reviews, limited to 100 reviews per product
def reviews_scraper_lim(df, cat_reviews_df, max_pages=10, page_size=10, timeout=30):
    """Scrape the most recent reviews of every product in ``df``.

    At most ``max_pages * page_size`` reviews are collected per product
    (100 with the defaults).

    Args:
        df: DataFrame with ``query_url`` (reviews-API URL) and ``unique_id``
            columns, one row per product.
        cat_reviews_df: DataFrame that receives the results. It is modified
            in place: the columns ``unique_id``, ``review_country``,
            ``review_title`` and ``review_text`` are assigned, one row per
            review.
        max_pages: maximum number of result pages to read per product.
        page_size: number of reviews requested per page.
        timeout: seconds to wait for each HTTP response before giving up.

    Returns:
        ``cat_reviews_df``, the same object that was passed in.

    Raises:
        requests.HTTPError: if the API answers with an error status.
        requests.Timeout: if a response takes longer than ``timeout``.
        KeyError: if a payload lacks ``meta.total-pages``, ``data`` or one of
            the expected review attributes.
    """
    headers = {
        "authority": "www.sephora.sg",
        "accept": "*/*",
        "accept-language": "en-SG",
    }

    review_pdt = []
    reviewer_country = []
    review_title = []
    review_text = []

    # One session for the whole run so connections are reused across the
    # many requests made per category.
    with requests.Session() as session:
        session.headers.update(headers)

        # Pair the columns by position so the function also works when df
        # does not have a default 0..n-1 index (e.g. after filtering).
        products = zip(df['query_url'], df['unique_id'])

        for request_url, review_pdt_id in tqdm(products, total=len(df), desc='progress'):
            # Page 1 tells us how many pages exist and already contains the
            # first batch of reviews, so it is not requested a second time.
            payload = _fetch_review_page(session, request_url, 1, page_size, timeout)
            num_pages = min(payload['meta']['total-pages'], max_pages)

            for page in range(1, num_pages + 1):
                if page > 1:
                    payload = _fetch_review_page(
                        session, request_url, page, page_size, timeout
                    )

                for review in payload['data']:
                    attributes = review['attributes']
                    review_pdt.append(review_pdt_id)
                    reviewer_country.append(attributes['country-name'])
                    review_title.append(attributes['title'])
                    review_text.append(attributes['text'])

    cat_reviews_df['unique_id'] = review_pdt
    cat_reviews_df['review_country'] = reviewer_country
    cat_reviews_df['review_title'] = review_title
    cat_reviews_df['review_text'] = review_text

    return cat_reviews_df

In [4]:
# add a 'query_url' column (the reviews API endpoint of each product) to
# every category table
cleanser = api_query_url(cleanser)
toner = api_query_url(toner)
day_moisturizer = api_query_url(day_moisturizer)
night_cream = api_query_url(night_cream)
sunscreen = api_query_url(sunscreen)

In [5]:
# scrape the cleanser reviews into a fresh DataFrame
# (at most 10 pages of 10 reviews per product)
cleanser_reviews = pd.DataFrame()
cleanser_pdt_reviews = reviews_scraper_lim(cleanser, cleanser_reviews)

progress: 100%|█████████████████████████████| 275/275 [3:56:56<00:00, 51.70s/it]


In [6]:
# save the scraped cleanser reviews to CSV without the row index
cleanser_pdt_reviews.to_csv('../data/cleanser_pdt_reviews.csv', index = False )

In [7]:
# scrape the toner reviews into a fresh DataFrame
# (at most 10 pages of 10 reviews per product)
toner_reviews = pd.DataFrame()
toner_pdt_reviews = reviews_scraper_lim(toner, toner_reviews)

progress: 100%|█████████████████████████████| 160/160 [1:41:33<00:00, 38.09s/it]


In [8]:
# save the scraped toner reviews to CSV without the row index
toner_pdt_reviews.to_csv('../data/toner_pdt_reviews.csv', index = False )

In [13]:
# scrape the day moisturizer reviews into a fresh DataFrame
# (at most 10 pages of 10 reviews per product)
day_moisturizer_reviews = pd.DataFrame()
day_moisturizer_pdt_reviews = reviews_scraper_lim(day_moisturizer, day_moisturizer_reviews)

progress: 100%|███████████████████████████████| 381/381 [40:49<00:00,  6.43s/it]


In [14]:
# save the scraped day moisturizer reviews to CSV without the row index
day_moisturizer_pdt_reviews.to_csv('../data/day_moisturizer_pdt_reviews.csv', index = False )

In [16]:
# scrape the night cream reviews into a fresh DataFrame
# (at most 10 pages of 10 reviews per product)
night_cream_reviews = pd.DataFrame()
night_cream_pdt_reviews = reviews_scraper_lim(night_cream, night_cream_reviews)

progress: 100%|███████████████████████████████| 197/197 [17:33<00:00,  5.35s/it]


In [17]:
# save the scraped night cream reviews to CSV without the row index
night_cream_pdt_reviews.to_csv('../data/night_cream_pdt_reviews.csv', index = False )

In [18]:
# scrape the sunscreen reviews into a fresh DataFrame
# (at most 10 pages of 10 reviews per product)
sunscreen_reviews = pd.DataFrame()
sunscreen_pdt_reviews = reviews_scraper_lim(sunscreen, sunscreen_reviews)

progress: 100%|█████████████████████████████████| 80/80 [07:04<00:00,  5.31s/it]


In [19]:
# save the scraped sunscreen reviews to CSV without the row index
sunscreen_pdt_reviews.to_csv('../data/sunscreen_pdt_reviews.csv', index = False )