In [1]:
# Import dependencies
import pandas as pd
import numpy as np
import zipfile
import glob
import shutil

In [2]:
# Locations of the zipped review archive and of the folder it is unpacked into
zip_path = 'Resources/reviews_all.zip'
unzipped = 'Resources/unzip-reviews'

# Unpack every member of the archive into the working folder
with zipfile.ZipFile(zip_path) as archive:
    archive.extractall(path=unzipped)

In [3]:
# Names of the review CSVs expected in the unzipped folder, listed in ascending
# order of the numeric range that appears in each file name
review_files = [
    'reviews_0-250.csv',
    'reviews_250-500.csv',
    'reviews_500-750.csv',
    'reviews_750-1250.csv',
    'reviews_1250-end.csv',
]

# Read each file exactly once, in the order listed above, so the row order of the
# combined data is the same on every machine. A second pass over the folder with
# glob.glob would parse every CSV again and return paths in arbitrary,
# filesystem-dependent order.
# `df` is a list holding one DataFrame per CSV file.
df = [
    pd.read_csv(f'{unzipped}/{review_file}', low_memory=False)
    for review_file in review_files
]

In [4]:
# Stack the per-file frames into a single DataFrame with a fresh 0..n-1 index
merged_df = pd.concat(df, ignore_index=True)

# Delete the unzipped folder to avoid pushing it to GitHub.
# Only a missing folder is tolerated, so re-running this cell after the folder
# has already been removed does not fail; any other error still surfaces.
try:
    shutil.rmtree(unzipped)
except FileNotFoundError:
    pass

merged_df.head()

Unnamed: 0.1,Unnamed: 0,author_id,rating,is_recommended,helpfulness,total_feedback_count,total_neg_feedback_count,total_pos_feedback_count,submission_time,review_text,review_title,skin_tone,eye_color,skin_type,hair_color,product_id,product_name,brand_name,price_usd
0,0,1741593524,5,1.0,1.0,2,0,2,2023-02-01,I use this with the Nudestix “Citrus Clean Bal...,Taught me how to double cleanse!,,brown,dry,black,P504322,Gentle Hydra-Gel Face Cleanser,NUDESTIX,19.0
1,1,31423088263,1,0.0,,0,0,0,2023-03-21,I bought this lip mask after reading the revie...,Disappointed,,,,,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0
2,2,5061282401,5,1.0,,0,0,0,2023-03-21,My review title says it all! I get so excited ...,New Favorite Routine,light,brown,dry,blonde,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0
3,3,6083038851,5,1.0,,0,0,0,2023-03-20,I’ve always loved this formula for a long time...,Can't go wrong with any of them,,brown,combination,black,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0
4,4,47056667835,5,1.0,,0,0,0,2023-03-20,"If you have dry cracked lips, this is a must h...",A must have !!!,light,hazel,combination,,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0


In [5]:
# Load the product catalogue. Its product_id column is the key used to attach
# each product's secondary category (and its ingredients, if we decide to use
# them) to the reviews.
product_info_path = 'Resources/product_info.csv'
product_info_df = pd.read_csv(product_info_path, low_memory=False)

In [6]:
# Attach each product's secondary category and ingredient list to its reviews.
# how='left' keeps every review, even when its product_id has no match in
# product_info_df. validate='many_to_one' raises pandas.errors.MergeError if a
# product_id appears more than once in the product table, which would otherwise
# silently duplicate review rows.
complete_df = pd.merge(
    merged_df,
    product_info_df[['product_id', 'secondary_category', 'ingredients']],
    on='product_id',
    how='left',
    validate='many_to_one',
)

In [7]:
# Number of reviews in each secondary category, most frequent first
complete_df['secondary_category'].value_counts()

secondary_category
Moisturizers              297399
Treatments                222042
Cleansers                 200604
Mini Size                  85498
Eye Care                   74999
Masks                      70531
Lip Balms & Treatments     61688
Sunscreen                  41139
Value & Gift Sets          12099
Self Tanners               11942
Wellness                   10530
High Tech Tools             5925
Shop by Concern               15
Name: count, dtype: int64

In [8]:
# Columns kept for the analysis, in display order:
# product details first, then rating/feedback metrics, then the review itself
columns = [
    'product_id',
    'product_name',
    'brand_name',
    'price_usd',
    'secondary_category',
    'ingredients',
    'rating',
    'is_recommended',
    'helpfulness',
    'total_feedback_count',
    'total_neg_feedback_count',
    'total_pos_feedback_count',
    'review_text',
    'review_title',
    'submission_time',
]

# Drop every other column and preview the result
complete_df = complete_df[columns]
complete_df.head()

Unnamed: 0,product_id,product_name,brand_name,price_usd,secondary_category,ingredients,rating,is_recommended,helpfulness,total_feedback_count,total_neg_feedback_count,total_pos_feedback_count,review_text,review_title,submission_time
0,P504322,Gentle Hydra-Gel Face Cleanser,NUDESTIX,19.0,Cleansers,"['Water (Aqua), Dipropylene Glycol, Peg-6 Capr...",5,1.0,1.0,2,0,2,I use this with the Nudestix “Citrus Clean Bal...,Taught me how to double cleanse!,2023-02-01
1,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0,Lip Balms & Treatments,"['Diisostearyl Malate, Hydrogenated Polyisobut...",1,0.0,,0,0,0,I bought this lip mask after reading the revie...,Disappointed,2023-03-21
2,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0,Lip Balms & Treatments,"['Diisostearyl Malate, Hydrogenated Polyisobut...",5,1.0,,0,0,0,My review title says it all! I get so excited ...,New Favorite Routine,2023-03-21
3,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0,Lip Balms & Treatments,"['Diisostearyl Malate, Hydrogenated Polyisobut...",5,1.0,,0,0,0,I’ve always loved this formula for a long time...,Can't go wrong with any of them,2023-03-20
4,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0,Lip Balms & Treatments,"['Diisostearyl Malate, Hydrogenated Polyisobut...",5,1.0,,0,0,0,"If you have dry cracked lips, this is a must h...",A must have !!!,2023-03-20


In [9]:
# Keep only the reviews for the brands under study (exact name match)
target_brands = ['Tatcha', 'The Ordinary', 'Dior', 'Estée Lauder', 'LANEIGE']

is_target_brand = complete_df['brand_name'].isin(target_brands)
target_brands_df = complete_df[is_target_brand]

# Review count for each retained brand
count_by_target_brand = target_brands_df['brand_name'].value_counts()
count_by_target_brand

brand_name
Tatcha          46770
The Ordinary    35934
LANEIGE         27519
Estée Lauder    17341
Dior             6150
Name: count, dtype: int64

In [10]:
# Show the brand-filtered reviews; the row labels are the original complete_df
# index values, because the filter above does not reset the index
target_brands_df

Unnamed: 0,product_id,product_name,brand_name,price_usd,secondary_category,ingredients,rating,is_recommended,helpfulness,total_feedback_count,total_neg_feedback_count,total_pos_feedback_count,review_text,review_title,submission_time
1,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0,Lip Balms & Treatments,"['Diisostearyl Malate, Hydrogenated Polyisobut...",1,0.0,,0,0,0,I bought this lip mask after reading the revie...,Disappointed,2023-03-21
2,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0,Lip Balms & Treatments,"['Diisostearyl Malate, Hydrogenated Polyisobut...",5,1.0,,0,0,0,My review title says it all! I get so excited ...,New Favorite Routine,2023-03-21
3,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0,Lip Balms & Treatments,"['Diisostearyl Malate, Hydrogenated Polyisobut...",5,1.0,,0,0,0,I’ve always loved this formula for a long time...,Can't go wrong with any of them,2023-03-20
4,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0,Lip Balms & Treatments,"['Diisostearyl Malate, Hydrogenated Polyisobut...",5,1.0,,0,0,0,"If you have dry cracked lips, this is a must h...",A must have !!!,2023-03-20
5,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0,Lip Balms & Treatments,"['Diisostearyl Malate, Hydrogenated Polyisobut...",4,1.0,1.000000,1,0,1,The scent isn’t my favourite but it works grea...,Great!,2023-03-19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1093706,P442833,EUK 134 0.1%,The Ordinary,11.1,Treatments,"['Propanediol, Ethylbisiminomethylguaiacol man...",2,1.0,0.959596,99,4,95,I love TO but sadly this product did not work ...,Not great for sensitive skin :/,2019-04-26
1093707,P442833,EUK 134 0.1%,The Ordinary,11.1,Treatments,"['Propanediol, Ethylbisiminomethylguaiacol man...",3,1.0,0.666667,24,8,16,I used this product after washing my face at n...,,2019-04-21
1093708,P442833,EUK 134 0.1%,The Ordinary,11.1,Treatments,"['Propanediol, Ethylbisiminomethylguaiacol man...",5,1.0,0.917160,169,14,155,I’m over 35 with dry and sensitive skin. EUK ...,Photoaging Shield and Antioxidant Buddy,2019-04-11
1093709,P442833,EUK 134 0.1%,The Ordinary,11.1,Treatments,"['Propanediol, Ethylbisiminomethylguaiacol man...",5,1.0,0.868421,38,5,33,I loved this product instantly. It has a warmi...,Great for redness!,2019-04-06
