In [30]:
# Import dependencies
import pandas as pd
import numpy as np
import zipfile
import glob
import shutil

In [31]:
# Locations of the zipped review data and of the folder it is unpacked into.
# Both stay plain strings because later cells interpolate `unzipped` into file paths.
zip_path = 'Resources/reviews_all.zip'
unzipped = 'Resources/unzip-reviews'

# Unpack every member of the archive into the working folder;
# the context manager closes the archive once extraction finishes.
with zipfile.ZipFile(zip_path) as archive:
    archive.extractall(path=unzipped)

In [32]:
# Review shards, listed in the order they should be stacked
review_files = ['reviews_0-250.csv','reviews_250-500.csv','reviews_500-750.csv','reviews_750-1250.csv','reviews_1250-end.csv']

# Read each shard exactly once from the unzipped folder.
# The explicit list (rather than a glob over the folder) keeps the row order
# deterministic across machines and fails loudly if an expected shard is missing.
# low_memory=False makes pandas parse each file in one pass instead of in chunks.
df = [pd.read_csv(f'{unzipped}/{review_file}', low_memory=False) for review_file in review_files]

In [33]:
# Stack the per-file frames into one reviews table with a fresh 0..n-1 index.
# The result is bound to `combined_reviews_df`, which is the name the merge cell
# further down reads; the previous display call referenced an undefined name.
combined_reviews_df = pd.concat(df, ignore_index=True)

# Delete the unzipped folder to avoid pushing it to GitHub
# (the data is already in memory, so the extracted files are no longer needed)
shutil.rmtree(unzipped)

combined_reviews_df.head()

Unnamed: 0.1,Unnamed: 0,author_id,rating,is_recommended,helpfulness,total_feedback_count,total_neg_feedback_count,total_pos_feedback_count,submission_time,review_text,review_title,skin_tone,eye_color,skin_type,hair_color,product_id,product_name,brand_name,price_usd
0,0,1741593524,5,1.0,1.0,2,0,2,2023-02-01,"I use this with the Nudestix “Citrus Clean Balm & Make-Up Melt“ to double cleanse and it has completely changed my skin (for the better). The make-up melt is oil based and removes all of your makeup super easily. I follow-up with this water based cleanser, and I also use this just by itself when I’m not wearing make-up. It leaves the skin gently cleansed, but without stripping the skin. 10/10 ...",Taught me how to double cleanse!,,brown,dry,black,P504322,Gentle Hydra-Gel Face Cleanser,NUDESTIX,19.0
1,1,31423088263,1,0.0,,0,0,0,2023-03-21,"I bought this lip mask after reading the reviews and the hype. Unfortunately, it did not meet my expectations as vaseline petroleum jelly works way better for me.",Disappointed,,,,,P420652,Lip Sleeping Mask Intense Hydration with Vitamin C,LANEIGE,24.0
2,2,5061282401,5,1.0,,0,0,0,2023-03-21,My review title says it all! I get so excited to get into bed and apply this lip mask. I do see a difference because I suffer from dry cracked lips. I drink a lot of water and apply lip balm daily but nothing helped until I started using this. untiluntistafted usinf this.,New Favorite Routine,light,brown,dry,blonde,P420652,Lip Sleeping Mask Intense Hydration with Vitamin C,LANEIGE,24.0
3,3,6083038851,5,1.0,,0,0,0,2023-03-20,I’ve always loved this formula for a long time. I honestly don’t even use it for night time. I use it as an everyday lip balm. I love the texture. Gummy Bear is my second most favourite scent. Grapefruit is the best in my opinion.,Can't go wrong with any of them,,brown,combination,black,P420652,Lip Sleeping Mask Intense Hydration with Vitamin C,LANEIGE,24.0
4,4,47056667835,5,1.0,,0,0,0,2023-03-20,"If you have dry cracked lips, this is a must have. After a few weeks of use I have learned I will always have by my bedside. I thought it was a little expensive but a little goes a long way. It is worth the price.",A must have !!!,light,hazel,combination,,P420652,Lip Sleeping Mask Intense Hydration with Vitamin C,LANEIGE,24.0


In [34]:
# Attach each product's primary category to every review, along with its
# ingredients in case we decide to use them. The join is a left merge on
# product_id, so every review row is kept.
product_info_df = pd.read_csv('Resources/product_info.csv', low_memory=False)
merged_df = combined_reviews_df.merge(
    product_info_df[['product_id', 'primary_category', 'ingredients']],
    how='left',
    on='product_id',
)
merged_df.head()

Unnamed: 0.1,Unnamed: 0,author_id,rating,is_recommended,helpfulness,total_feedback_count,total_neg_feedback_count,total_pos_feedback_count,submission_time,review_text,...,skin_tone,eye_color,skin_type,hair_color,product_id,product_name,brand_name,price_usd,primary_category,ingredients
0,0,1741593524,5,1.0,1.0,2,0,2,2023-02-01,"I use this with the Nudestix “Citrus Clean Balm & Make-Up Melt“ to double cleanse and it has completely changed my skin (for the better). The make-up melt is oil based and removes all of your makeup super easily. I follow-up with this water based cleanser, and I also use this just by itself when I’m not wearing make-up. It leaves the skin gently cleansed, but without stripping the skin. 10/10 ...",...,,brown,dry,black,P504322,Gentle Hydra-Gel Face Cleanser,NUDESTIX,19.0,Skincare,"['Water (Aqua), Dipropylene Glycol, Peg-6 Caprylic/Capric Glycerides, Glycerin, 1,2-Hexanediol, Polyglyceryl-4 Caprate, Butylene Glycol, Carbomer, Propanediol, Tromethamine, Peg-60 Hydrogenated Castor Oil, Ethylhexylglycerin, Citrus Aurantium Bergamia (Bergamot) Fruit Oil, Tricholoma Matsutake Extract, Trisodium Ethylenediamine Disuccinate, Hydrolyzed Cicer Seed Extract, Rhododendron Chrysanth..."
1,1,31423088263,1,0.0,,0,0,0,2023-03-21,"I bought this lip mask after reading the reviews and the hype. Unfortunately, it did not meet my expectations as vaseline petroleum jelly works way better for me.",...,,,,,P420652,Lip Sleeping Mask Intense Hydration with Vitamin C,LANEIGE,24.0,Skincare,"['Diisostearyl Malate, Hydrogenated Polyisobutene, Phyto- Steryl/Isostearyl/Cetyl/Stearyl/Behenyl Dimer Dilinoleate, Hydrogenated Poly(C6-14 Olefin), Polybutene, Microcrystalline Wax / Cera Microcristallina / Cire Microcri Stalline, Butyrospermum Parkii (Shea) Butter, Synthetic Wax, Ethylene/Propylene/Styrene Copolymer, Sucrose Tetrastearate Triacetate, Mica, Euphorbia Cerifera (Candelilla) Wa..."
2,2,5061282401,5,1.0,,0,0,0,2023-03-21,My review title says it all! I get so excited to get into bed and apply this lip mask. I do see a difference because I suffer from dry cracked lips. I drink a lot of water and apply lip balm daily but nothing helped until I started using this. untiluntistafted usinf this.,...,light,brown,dry,blonde,P420652,Lip Sleeping Mask Intense Hydration with Vitamin C,LANEIGE,24.0,Skincare,"['Diisostearyl Malate, Hydrogenated Polyisobutene, Phyto- Steryl/Isostearyl/Cetyl/Stearyl/Behenyl Dimer Dilinoleate, Hydrogenated Poly(C6-14 Olefin), Polybutene, Microcrystalline Wax / Cera Microcristallina / Cire Microcri Stalline, Butyrospermum Parkii (Shea) Butter, Synthetic Wax, Ethylene/Propylene/Styrene Copolymer, Sucrose Tetrastearate Triacetate, Mica, Euphorbia Cerifera (Candelilla) Wa..."
3,3,6083038851,5,1.0,,0,0,0,2023-03-20,I’ve always loved this formula for a long time. I honestly don’t even use it for night time. I use it as an everyday lip balm. I love the texture. Gummy Bear is my second most favourite scent. Grapefruit is the best in my opinion.,...,,brown,combination,black,P420652,Lip Sleeping Mask Intense Hydration with Vitamin C,LANEIGE,24.0,Skincare,"['Diisostearyl Malate, Hydrogenated Polyisobutene, Phyto- Steryl/Isostearyl/Cetyl/Stearyl/Behenyl Dimer Dilinoleate, Hydrogenated Poly(C6-14 Olefin), Polybutene, Microcrystalline Wax / Cera Microcristallina / Cire Microcri Stalline, Butyrospermum Parkii (Shea) Butter, Synthetic Wax, Ethylene/Propylene/Styrene Copolymer, Sucrose Tetrastearate Triacetate, Mica, Euphorbia Cerifera (Candelilla) Wa..."
4,4,47056667835,5,1.0,,0,0,0,2023-03-20,"If you have dry cracked lips, this is a must have. After a few weeks of use I have learned I will always have by my bedside. I thought it was a little expensive but a little goes a long way. It is worth the price.",...,light,hazel,combination,,P420652,Lip Sleeping Mask Intense Hydration with Vitamin C,LANEIGE,24.0,Skincare,"['Diisostearyl Malate, Hydrogenated Polyisobutene, Phyto- Steryl/Isostearyl/Cetyl/Stearyl/Behenyl Dimer Dilinoleate, Hydrogenated Poly(C6-14 Olefin), Polybutene, Microcrystalline Wax / Cera Microcristallina / Cire Microcri Stalline, Butyrospermum Parkii (Shea) Butter, Synthetic Wax, Ethylene/Propylene/Styrene Copolymer, Sucrose Tetrastearate Triacetate, Mica, Euphorbia Cerifera (Candelilla) Wa..."


In [35]:
# Show the dtype pandas assigned to each column of the merged frame
merged_df.dtypes

Unnamed: 0                    int64
author_id                    object
rating                        int64
is_recommended              float64
helpfulness                 float64
total_feedback_count          int64
total_neg_feedback_count      int64
total_pos_feedback_count      int64
submission_time              object
review_text                  object
review_title                 object
skin_tone                    object
eye_color                    object
skin_type                    object
hair_color                   object
product_id                   object
product_name                 object
brand_name                   object
price_usd                   float64
primary_category             object
ingredients                  object
dtype: object

In [37]:
# Narrow the merged frame to the columns listed below, in this order.
columns = [
    # product attributes
    'product_id',
    'product_name',
    'brand_name',
    'price_usd',
    'primary_category',
    'ingredients',
    # review scores and feedback counts
    'rating',
    'is_recommended',
    'helpfulness',
    'total_feedback_count',
    'total_neg_feedback_count',
    'total_pos_feedback_count',
    # review text
    'review_text',
    'review_title',
]
merged_df = merged_df[columns]
merged_df.head()

Unnamed: 0,product_id,product_name,brand_name,price_usd,primary_category,ingredients,rating,is_recommended,helpfulness,total_feedback_count,total_neg_feedback_count,total_pos_feedback_count,review_text,review_title
0,P504322,Gentle Hydra-Gel Face Cleanser,NUDESTIX,19.0,Skincare,"['Water (Aqua), Dipropylene Glycol, Peg-6 Caprylic/Capric Glycerides, Glycerin, 1,2-Hexanediol, Polyglyceryl-4 Caprate, Butylene Glycol, Carbomer, Propanediol, Tromethamine, Peg-60 Hydrogenated Castor Oil, Ethylhexylglycerin, Citrus Aurantium Bergamia (Bergamot) Fruit Oil, Tricholoma Matsutake Extract, Trisodium Ethylenediamine Disuccinate, Hydrolyzed Cicer Seed Extract, Rhododendron Chrysanth...",5,1.0,1.0,2,0,2,"I use this with the Nudestix “Citrus Clean Balm & Make-Up Melt“ to double cleanse and it has completely changed my skin (for the better). The make-up melt is oil based and removes all of your makeup super easily. I follow-up with this water based cleanser, and I also use this just by itself when I’m not wearing make-up. It leaves the skin gently cleansed, but without stripping the skin. 10/10 ...",Taught me how to double cleanse!
1,P420652,Lip Sleeping Mask Intense Hydration with Vitamin C,LANEIGE,24.0,Skincare,"['Diisostearyl Malate, Hydrogenated Polyisobutene, Phyto- Steryl/Isostearyl/Cetyl/Stearyl/Behenyl Dimer Dilinoleate, Hydrogenated Poly(C6-14 Olefin), Polybutene, Microcrystalline Wax / Cera Microcristallina / Cire Microcri Stalline, Butyrospermum Parkii (Shea) Butter, Synthetic Wax, Ethylene/Propylene/Styrene Copolymer, Sucrose Tetrastearate Triacetate, Mica, Euphorbia Cerifera (Candelilla) Wa...",1,0.0,,0,0,0,"I bought this lip mask after reading the reviews and the hype. Unfortunately, it did not meet my expectations as vaseline petroleum jelly works way better for me.",Disappointed
2,P420652,Lip Sleeping Mask Intense Hydration with Vitamin C,LANEIGE,24.0,Skincare,"['Diisostearyl Malate, Hydrogenated Polyisobutene, Phyto- Steryl/Isostearyl/Cetyl/Stearyl/Behenyl Dimer Dilinoleate, Hydrogenated Poly(C6-14 Olefin), Polybutene, Microcrystalline Wax / Cera Microcristallina / Cire Microcri Stalline, Butyrospermum Parkii (Shea) Butter, Synthetic Wax, Ethylene/Propylene/Styrene Copolymer, Sucrose Tetrastearate Triacetate, Mica, Euphorbia Cerifera (Candelilla) Wa...",5,1.0,,0,0,0,My review title says it all! I get so excited to get into bed and apply this lip mask. I do see a difference because I suffer from dry cracked lips. I drink a lot of water and apply lip balm daily but nothing helped until I started using this. untiluntistafted usinf this.,New Favorite Routine
3,P420652,Lip Sleeping Mask Intense Hydration with Vitamin C,LANEIGE,24.0,Skincare,"['Diisostearyl Malate, Hydrogenated Polyisobutene, Phyto- Steryl/Isostearyl/Cetyl/Stearyl/Behenyl Dimer Dilinoleate, Hydrogenated Poly(C6-14 Olefin), Polybutene, Microcrystalline Wax / Cera Microcristallina / Cire Microcri Stalline, Butyrospermum Parkii (Shea) Butter, Synthetic Wax, Ethylene/Propylene/Styrene Copolymer, Sucrose Tetrastearate Triacetate, Mica, Euphorbia Cerifera (Candelilla) Wa...",5,1.0,,0,0,0,I’ve always loved this formula for a long time. I honestly don’t even use it for night time. I use it as an everyday lip balm. I love the texture. Gummy Bear is my second most favourite scent. Grapefruit is the best in my opinion.,Can't go wrong with any of them
4,P420652,Lip Sleeping Mask Intense Hydration with Vitamin C,LANEIGE,24.0,Skincare,"['Diisostearyl Malate, Hydrogenated Polyisobutene, Phyto- Steryl/Isostearyl/Cetyl/Stearyl/Behenyl Dimer Dilinoleate, Hydrogenated Poly(C6-14 Olefin), Polybutene, Microcrystalline Wax / Cera Microcristallina / Cire Microcri Stalline, Butyrospermum Parkii (Shea) Butter, Synthetic Wax, Ethylene/Propylene/Styrene Copolymer, Sucrose Tetrastearate Triacetate, Mica, Euphorbia Cerifera (Candelilla) Wa...",5,1.0,,0,0,0,"If you have dry cracked lips, this is a must have. After a few weeks of use I have learned I will always have by my bedside. I thought it was a little expensive but a little goes a long way. It is worth the price.",A must have !!!


In [36]:
# TODO (next class): discuss which columns to remove to make the data more manageable