In [6]:
# Import dependencies
import pandas as pd
import numpy as np
import zipfile
import glob
import shutil

In [7]:
# Where the zipped reviews live and the folder they are extracted into
zip_path = 'Resources/reviews_all.zip'
unzipped = 'Resources/unzip-reviews'

# Open the archive read-only (the default mode) and extract every member
# into the target folder; the archive is closed when the block exits
with zipfile.ZipFile(zip_path) as zip_ref:
    zip_ref.extractall(path=unzipped)

In [8]:
# Names of the review CSV files expected inside the unzipped folder
review_files = ['reviews_0-250.csv','reviews_250-500.csv','reviews_500-750.csv','reviews_750-1250.csv','reviews_1250-end.csv']

# Read each CSV exactly once, in the order listed above, so the concatenated
# row order is deterministic. `df` is a list of DataFrames (one per file).
# NOTE: a second, glob-based read of the same folder previously overwrote this
# list; it parsed every file twice and returned them in arbitrary order.
df = [pd.read_csv(f'{unzipped}/{review_file}', low_memory=False) for review_file in review_files]

In [9]:
# Stack the per-file frames into one frame, discarding each file's own row
# index in favour of a single new one
merged_df = pd.concat(objs=df, ignore_index=True)

# Remove the extracted folder so the unzipped CSVs are not pushed to GitHub
shutil.rmtree(path=unzipped)

merged_df.head()

Unnamed: 0.1,Unnamed: 0,author_id,rating,is_recommended,helpfulness,total_feedback_count,total_neg_feedback_count,total_pos_feedback_count,submission_time,review_text,review_title,skin_tone,eye_color,skin_type,hair_color,product_id,product_name,brand_name,price_usd
0,0,1741593524,5,1.0,1.0,2,0,2,2023-02-01,I use this with the Nudestix “Citrus Clean Bal...,Taught me how to double cleanse!,,brown,dry,black,P504322,Gentle Hydra-Gel Face Cleanser,NUDESTIX,19.0
1,1,31423088263,1,0.0,,0,0,0,2023-03-21,I bought this lip mask after reading the revie...,Disappointed,,,,,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0
2,2,5061282401,5,1.0,,0,0,0,2023-03-21,My review title says it all! I get so excited ...,New Favorite Routine,light,brown,dry,blonde,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0
3,3,6083038851,5,1.0,,0,0,0,2023-03-20,I’ve always loved this formula for a long time...,Can't go wrong with any of them,,brown,combination,black,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0
4,4,47056667835,5,1.0,,0,0,0,2023-03-20,"If you have dry cracked lips, this is a must h...",A must have !!!,light,hazel,combination,,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0


In [10]:
# Attach each product's primary category (and its ingredients, in case we
# decide to use them) to every review by joining on product_id.
product_info_df = pd.read_csv('Resources/product_info.csv', low_memory=False)
product_cols = ['primary_category', 'ingredients']
merged_df = pd.merge(
    # Drop these columns if a previous run of this cell already added them;
    # otherwise re-running would create *_x / *_y duplicates instead.
    merged_df.drop(columns=product_cols, errors='ignore'),
    product_info_df[['product_id'] + product_cols],
    on='product_id',
    # Left join: keep every review, even if its product is missing from
    # product_info (those rows get NaN for the added columns).
    how='left',
    # Raise instead of silently duplicating review rows if product_info
    # ever contains the same product_id more than once.
    validate='many_to_one',
)
merged_df.head()

Unnamed: 0.1,Unnamed: 0,author_id,rating,is_recommended,helpfulness,total_feedback_count,total_neg_feedback_count,total_pos_feedback_count,submission_time,review_text,...,skin_tone,eye_color,skin_type,hair_color,product_id,product_name,brand_name,price_usd,primary_category,ingredients
0,0,1741593524,5,1.0,1.0,2,0,2,2023-02-01,I use this with the Nudestix “Citrus Clean Bal...,...,,brown,dry,black,P504322,Gentle Hydra-Gel Face Cleanser,NUDESTIX,19.0,Skincare,"['Water (Aqua), Dipropylene Glycol, Peg-6 Capr..."
1,1,31423088263,1,0.0,,0,0,0,2023-03-21,I bought this lip mask after reading the revie...,...,,,,,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0,Skincare,"['Diisostearyl Malate, Hydrogenated Polyisobut..."
2,2,5061282401,5,1.0,,0,0,0,2023-03-21,My review title says it all! I get so excited ...,...,light,brown,dry,blonde,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0,Skincare,"['Diisostearyl Malate, Hydrogenated Polyisobut..."
3,3,6083038851,5,1.0,,0,0,0,2023-03-20,I’ve always loved this formula for a long time...,...,,brown,combination,black,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0,Skincare,"['Diisostearyl Malate, Hydrogenated Polyisobut..."
4,4,47056667835,5,1.0,,0,0,0,2023-03-20,"If you have dry cracked lips, this is a must h...",...,light,hazel,combination,,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0,Skincare,"['Diisostearyl Malate, Hydrogenated Polyisobut..."


In [11]:
# Inspect the dtype pandas inferred for each column of the merged frame
merged_df.dtypes

Unnamed: 0                    int64
author_id                    object
rating                        int64
is_recommended              float64
helpfulness                 float64
total_feedback_count          int64
total_neg_feedback_count      int64
total_pos_feedback_count      int64
submission_time              object
review_text                  object
review_title                 object
skin_tone                    object
eye_color                    object
skin_type                    object
hair_color                   object
product_id                   object
product_name                 object
brand_name                   object
price_usd                   float64
primary_category             object
ingredients                  object
dtype: object

In [12]:
# Keep only the product details, rating/feedback metrics and review text,
# in this column order
columns = [
    'product_id', 'product_name', 'brand_name', 'price_usd',
    'primary_category', 'ingredients',
    'rating', 'is_recommended', 'helpfulness',
    'total_feedback_count', 'total_neg_feedback_count', 'total_pos_feedback_count',
    'review_text', 'review_title',
]
merged_df = merged_df.loc[:, columns]
merged_df.head()

Unnamed: 0,product_id,product_name,brand_name,price_usd,primary_category,ingredients,rating,is_recommended,helpfulness,total_feedback_count,total_neg_feedback_count,total_pos_feedback_count,review_text,review_title
0,P504322,Gentle Hydra-Gel Face Cleanser,NUDESTIX,19.0,Skincare,"['Water (Aqua), Dipropylene Glycol, Peg-6 Capr...",5,1.0,1.0,2,0,2,I use this with the Nudestix “Citrus Clean Bal...,Taught me how to double cleanse!
1,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0,Skincare,"['Diisostearyl Malate, Hydrogenated Polyisobut...",1,0.0,,0,0,0,I bought this lip mask after reading the revie...,Disappointed
2,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0,Skincare,"['Diisostearyl Malate, Hydrogenated Polyisobut...",5,1.0,,0,0,0,My review title says it all! I get so excited ...,New Favorite Routine
3,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0,Skincare,"['Diisostearyl Malate, Hydrogenated Polyisobut...",5,1.0,,0,0,0,I’ve always loved this formula for a long time...,Can't go wrong with any of them
4,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0,Skincare,"['Diisostearyl Malate, Hydrogenated Polyisobut...",5,1.0,,0,0,0,"If you have dry cracked lips, this is a must h...",A must have !!!


In [36]:
# TODO: Next class, discuss which columns to remove to make the data more manageable