# **1.0 Initial Setup**

In [1]:
# Import the necessary libraries
import nltk
import numpy as np
import pandas as pd
import re

from datetime import datetime
from joblib import Parallel, delayed
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download the NLTK resources used below, then build a set of English stopwords.
# "punkt_tab" is the tokenizer data that word_tokenize loads on newer NLTK releases;
# "punkt" is kept so that older releases keep working as before.
for nltk_resource in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(nltk_resource)
stop_words = set(stopwords.words("english"))

# Configurations
pd.set_option("display.max_colwidth", 100)  # show up to 100 characters per cell when displaying frames
start_time = datetime.now()                 # read again in the last cell to report total run time


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ethan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ethan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# **2.0 Load Data**

In [2]:
# Read the calibrated skincare reviews CSV into a DataFrame
skincare_reviews = pd.read_csv(
    "processed_data/skincare_reviews_calibrated.csv",
    encoding="utf-8",
    low_memory=False,
)

# Keep only calibrated_author_rating; the two other rating columns are not needed
skincare_reviews = skincare_reviews.drop(columns=["author_rating", "adjusted_author_rating"])


In [3]:
# Preview the first 5 rows, transposed so each column name becomes a row
skincare_reviews.head().T


Unnamed: 0,0,1,2,3,4
product_id,P503052,P423688,P504045,P479125,P440650
product_name,Mini Flatter Me Digestive Enzyme Supplement,Daily Microfoliant Exfoliator,Daily Dose Bioretinol + Mineral SPF 40 with Bakuchiol,Mini Cloud Dew Oil-Free Gel Cream,Mini Retinol Youth Renewal Serum
brand_name,HUM Nutrition,Dermalogica,Supergoop!,Summer Fridays,Murad
average_rating,4.2643,4.696,4.7357,4.4613,4.6089
ingredients,"['Protease 3.0, Protease 4.5, Protease 6.0, Peptidase, Bromelain, Neutral Bacterial Protease, Pa...","['Microcrystalline Cellulose, Magnesium Oxide, Sodium Cocoyl Isethionate, Colloidal Oatmeal, Dis...","['Aqua/Water/Eau, Dicaprylyl Ether, Caprylic/Capric Triglyceride, Butyloctyl Salicylate, Polygly...","['Water/Aqua/Eau, Propanediol, Dicaprylyl Carbonate, Glycerin, Pentylene Glycol, Sodium Hyaluron...","['Water/Aqua/Eau, Cyclopentasiloxane, Isopropyl Palmitate, C14-22 Alcohols, Polymethylsilsesquio..."
price_usd,15.0,65.0,46.0,17.0,32.0
limited_edition,na,na,na,na,na
highlights,"['Vegan', 'Clean at Sephora', 'Without Phthalates', 'Without Parabens', 'Gluten Free']","['Vegan', 'Good for: Dullness/Uneven Texture', 'allure 2019 Best of Beauty Award Winner', 'Salic...","['Good for: Dullness/Uneven Texture', 'Good for: Loss of firmness', 'Clean at Sephora', 'Retinol...","['Best for Normal Skin', 'Plumping', 'Hydrating', 'Clean at Sephora', 'Good for: Dryness']","['Good for: Dark Circles', 'Good for: Anti-Aging', 'Retinol', 'Without Parabens', 'Gluten Free',..."
secondary_category,Wellness,Cleansers,Treatments,Moisturizers,Mini Size
tertiary_category,Beauty Supplements,Exfoliators,Face Serums,Moisturizers,na


**Note:** Data information is displayed for a quick overview.

In [4]:
# Count how many rows carry each distinct value of the limited_edition column
skincare_reviews["limited_edition"].value_counts()


limited_edition
na                 50707
limited-edition      353
Name: count, dtype: int64

In [5]:
# Count rows per distinct ingredients value and show only the first five entries of that tally
skincare_reviews["ingredients"].value_counts().head()


ingredients
['Microcrystalline Cellulose, Magnesium Oxide, Sodium Cocoyl Isethionate, Colloidal Oatmeal, Disodium Lauryl Sulfosuccinate, Sodium Lauroyl Glutamate, Oryza Sativa (Rice) Bran Extract, Oryza Sativa (Rice) Starch, Hydrogenated Coconut Acid, Allantoin, Papain, Salicylic Acid, Ginkgo Biloba Leaf Extract, Camellia Sinensis Leaf Extract, Glycyrrhiza Glabra (Licorice) Root Extract, PCA, Populus Tremuloides Bark Extract, Cyclodextrin, Sodium Isethionate, Lauryl Methacrylate/Glycol Dimethacrylate Copolymer, Maltodextrin, Melaleuca, Alternifolia (Tea Tree) Leaf Oil, Citrus Paradisi (Grapefruit) Peel Oil, Sodium Dehydroacetate, Hydrolyzed Corn Starch Hydroxyethyl Ether, Water/Aqua/Eau, Limonene, Citric Acid.']                                                                                                                                                                                                                                                                                       

In [6]:
# Count how many rows carry each distinct value of the highlights column
skincare_reviews["highlights"].value_counts()


highlights
['Good for: Dullness/Uneven Texture', 'Good for: Pores']                                                                                                                 1526
['Hyaluronic Acid', 'Clean at Sephora', 'Good for: Anti-Aging', 'Good for: Dryness', 'Niacinamide']                                                                      1110
['Good for: Dullness/Uneven Texture', 'Vitamin C', 'Hydrating', 'Good for: Anti-Aging', 'Best for Dry, Combo, Normal Skin', 'Cruelty-Free']                              1003
na                                                                                                                                                                        970
['Good for: Dark Circles', 'Good for: Anti-Aging', 'Retinol', 'Without Parabens', 'Gluten Free', 'Cruelty-Free']                                                          876
                                                                                                                       

In [7]:
# Count how many rows fall under each secondary_category value
skincare_reviews["secondary_category"].value_counts()


secondary_category
Treatments                14030
Moisturizers              13387
Cleansers                  5774
Eye Care                   4519
Sunscreen                  3338
Mini Size                  2472
Masks                      2085
Lip Balms & Treatments     1495
Value & Gift Sets          1097
Wellness                   1057
Self Tanners                970
High Tech Tools             836
Name: count, dtype: int64

In [8]:
# Count how many rows fall under each tertiary_category value
skincare_reviews["tertiary_category"].value_counts()


tertiary_category
Face Serums                  12215
Moisturizers                  9867
na                            5593
Eye Creams & Treatments       4410
Face Sunscreen                3166
Face Wash & Cleansers         3106
Face Masks                    2001
Face Oils                     1842
Exfoliators                   1423
Mists & Essences              1046
Facial Peels                  1034
Toners                         984
Beauty Supplements             935
Blemish & Acne Treatments      781
Anti-Aging                     606
For Body                       524
Decollete & Neck Creams        303
Night Creams                   298
Makeup Removers                230
Hair Removal                   123
Facial Rollers                 116
Eye Masks                      109
Facial Cleansing Brushes       103
Sheet Masks                     84
For Face                        75
Body Sunscreen                  40
BB & CC Creams                  31
Holistic Wellness                6
Fa

## **3.0 Text Preprocessing**

Before performing text vectorization (word embedding), I need to make sure that all text within the specified columns is in lowercase format.

The specific columns that need to be converted to lowercase format:
1. `ingredients`
2. `highlights`
3. `secondary_category`
4. `tertiary_category`

The following column is already in lowercase format, so it does not need to be converted:
1. `limited_edition`

### **3.1 Lowercasing**

In [None]:
# Lowercase every string value in the listed columns; non-string values are left untouched
columns_to_lowercase = ["ingredients", "highlights", "secondary_category", "tertiary_category"]


def _lowercase_if_string(value):
    """Return ``value.lower()`` for strings and ``value`` itself for anything else."""
    if isinstance(value, str):
        return value.lower()
    return value


skincare_reviews[columns_to_lowercase] = skincare_reviews[columns_to_lowercase].map(_lowercase_if_string)

# Preview the converted columns to confirm the text is lowercase
skincare_reviews[columns_to_lowercase].head()


Unnamed: 0,ingredients,highlights,secondary_category,tertiary_category
0,"['protease 3.0, protease 4.5, protease 6.0, peptidase, bromelain, neutral bacterial protease, pa...","['vegan', 'clean at sephora', 'without phthalates', 'without parabens', 'gluten free']",wellness,beauty supplements
1,"['microcrystalline cellulose, magnesium oxide, sodium cocoyl isethionate, colloidal oatmeal, dis...","['vegan', 'good for: dullness/uneven texture', 'allure 2019 best of beauty award winner', 'salic...",cleansers,exfoliators
2,"['aqua/water/eau, dicaprylyl ether, caprylic/capric triglyceride, butyloctyl salicylate, polygly...","['good for: dullness/uneven texture', 'good for: loss of firmness', 'clean at sephora', 'retinol...",treatments,face serums
3,"['water/aqua/eau, propanediol, dicaprylyl carbonate, glycerin, pentylene glycol, sodium hyaluron...","['best for normal skin', 'plumping', 'hydrating', 'clean at sephora', 'good for: dryness']",moisturizers,moisturizers
4,"['water/aqua/eau, cyclopentasiloxane, isopropyl palmitate, c14-22 alcohols, polymethylsilsesquio...","['good for: dark circles', 'good for: anti-aging', 'retinol', 'without parabens', 'gluten free',...",mini size,na


### **3.2 Remaining Text Preprocessing**

1. Remove special characters
2. Perform tokenization
3. Remove stopwords
4. Remove extra spaces

In [None]:
# Function to preprocess text data after converting to lowercase
def preprocess_text(text):
    """Clean one text value for vectorization (lowercasing is done in an earlier step).

    Every run of non-word characters is replaced by a single space, the result is
    tokenized with ``word_tokenize``, tokens present in ``stop_words`` are discarded,
    and the remaining tokens are joined back together with single spaces.
    """
    normalized = re.sub(r"\W+", " ", text)  # punctuation/symbol runs -> one space
    kept_tokens = []
    for token in word_tokenize(normalized):
        # Skip stopwords and tokens that are empty or whitespace-only
        if token in stop_words or not token.strip():
            continue
        kept_tokens.append(token)

    # Joining on a single space also removes any extra spacing between tokens
    return " ".join(kept_tokens)

# Parallelize for faster processing
def apply_parallel(df, column, func):
    """Apply ``func`` to every value of ``df[column]`` through joblib.

    The column is cast to ``str`` first, one delayed call is created per value,
    and the calls are executed with ``Parallel(n_jobs=-1)``. Returns whatever
    ``Parallel`` returns for those calls.
    """
    values = df[column].astype(str)
    tasks = (delayed(func)(value) for value in values)
    return Parallel(n_jobs=-1)(tasks)


In [None]:
# Clean each text column with preprocess_text (run in parallel) and write the result back
text_columns = ["ingredients", "highlights", "secondary_category", "tertiary_category", "limited_edition"]
for column in text_columns:
    skincare_reviews[column] = apply_parallel(skincare_reviews, column, preprocess_text)

# Preview the first 5 rows of the cleaned columns
skincare_reviews[text_columns].head()


Unnamed: 0,ingredients,highlights,secondary_category,tertiary_category,limited_edition
0,protease 3 0 protease 4 5 protease 6 0 peptidase bromelain neutral bacterial protease papain bre...,vegan clean sephora without phthalates without parabens gluten free,wellness,beauty supplements,na
1,microcrystalline cellulose magnesium oxide sodium cocoyl isethionate colloidal oatmeal disodium ...,vegan good dullness uneven texture allure 2019 best beauty award winner salicylic acid best oily...,cleansers,exfoliators,na
2,aqua water eau dicaprylyl ether caprylic capric triglyceride butyloctyl salicylate polyglyceryl ...,good dullness uneven texture good loss firmness clean sephora retinol good anti aging reef safe spf,treatments,face serums,na
3,water aqua eau propanediol dicaprylyl carbonate glycerin pentylene glycol sodium hyaluronate hyd...,best normal skin plumping hydrating clean sephora good dryness,moisturizers,moisturizers,na
4,water aqua eau cyclopentasiloxane isopropyl palmitate c14 22 alcohols polymethylsilsesquioxane d...,good dark circles good anti aging retinol without parabens gluten free cruelty free,mini size,na,na


### **3.3 Multi-Criteria Features Concatenation**

In [None]:
# Join the five processed text columns into one space-separated string per row
feature_columns = [
    "ingredients",
    "highlights",
    "secondary_category",
    "tertiary_category",
    "limited_edition",
]
skincare_reviews["multicriteria_combined_features"] = skincare_reviews[feature_columns].agg(" ".join, axis=1)

# Preview the combined column to verify the result
skincare_reviews[["multicriteria_combined_features"]].head()


Unnamed: 0,multicriteria_combined_features
0,protease 3 0 protease 4 5 protease 6 0 peptidase bromelain neutral bacterial protease papain bre...
1,microcrystalline cellulose magnesium oxide sodium cocoyl isethionate colloidal oatmeal disodium ...
2,aqua water eau dicaprylyl ether caprylic capric triglyceride butyloctyl salicylate polyglyceryl ...
3,water aqua eau propanediol dicaprylyl carbonate glycerin pentylene glycol sodium hyaluronate hyd...
4,water aqua eau cyclopentasiloxane isopropyl palmitate c14 22 alcohols polymethylsilsesquioxane d...


In [None]:
# Remove the individual text columns now that they live in multicriteria_combined_features
skincare_reviews = skincare_reviews.drop(
    columns=["ingredients", "highlights", "secondary_category", "tertiary_category", "limited_edition"]
)


# **4.0 Text Vectorization**

**Note:** Check if there are any missing values in the dataset.

In [9]:
# Count the missing values in each column of skincare_reviews
skincare_reviews.isna().sum()


product_id                  0
product_name                0
brand_name                  0
average_rating              0
ingredients                 0
price_usd                   0
limited_edition             0
highlights                  0
secondary_category          0
tertiary_category           0
author_id                   0
calibrated_author_rating    0
dtype: int64

## **4.1 Modern Word Embeddings**

### **4.1.1 Global Vectors for Word Representation (GloVe)**

In [15]:
# Function to extract vocabulary from the multicriteria_combined_features column
def extract_vocabulary(column):
    """Return the set of distinct whitespace-separated words found across ``column``.

    ``column`` is any iterable of strings; each string is split on whitespace.
    """
    return {word for document in column for word in document.split()}

# Collect every distinct word that appears in the combined feature text
vocabulary = extract_vocabulary(skincare_reviews["multicriteria_combined_features"])

# Show five words as a sanity check (a set has no defined order, so this sample can vary between runs)
list(vocabulary)[:5]


['pavonica', 'methicone', 'konjac', 'diastase', 'eop']

In [16]:
# Path to the pre-trained GloVe vectors file (840B tokens, 300-dimensional vectors)
glove_path = "models/glove.840B.300d.txt"

# Function to load GloVe model and extract embeddings for words in our vocabulary
def load_glove_model(glove_file_path, vocab):
    """Load GloVe vectors from a text file, keeping only the words found in ``vocab``.

    Each line is expected to hold a word followed by its whitespace-separated
    vector components. Lines that are blank or that contain a word with no
    components are skipped instead of raising ``IndexError``.

    Args:
        glove_file_path: Path to the GloVe text file; it is opened as UTF-8.
        vocab: Container of words to keep; membership is tested with ``in``.

    Returns:
        dict mapping each retained word to a float32 NumPy array.
    """
    glove_embeddings = {}
    with open(glove_file_path, "r", encoding="utf-8") as file:
        for line in file:
            # Split only once: the word, then the rest of the line as one string
            values = line.split(maxsplit=1)
            if len(values) < 2:
                continue  # blank line, or a word without any vector components
            word, raw_vector = values
            # Parse the (long) vector string only for words we actually need
            if word in vocab:
                glove_embeddings[word] = np.asarray(raw_vector.split(), dtype="float32")

    return glove_embeddings

# Load GloVe vectors, restricted to the words that occur in our vocabulary
glove_embeddings = load_glove_model(glove_path, vocabulary)

# Report how many vocabulary words have a GloVe vector
print(f"Loaded GloVe embeddings for {len(glove_embeddings)} words.")


Loaded GloVe embeddings for 2739 words.


In [17]:
# Function to vectorize using the GloVe embeddings
def vectorize_text_glove(text, embeddings, dim=300):
    """Represent ``text`` as the element-wise mean of its words' embedding vectors.

    ``text`` is split on whitespace. Words that are missing from ``embeddings``
    (or mapped to ``None``) are ignored. If no word has a vector, a zero vector
    of length ``dim`` is returned.
    """
    found_vectors = []
    for token in text.split():
        vector = embeddings.get(token)
        if vector is not None:
            found_vectors.append(vector)

    if not found_vectors:
        return np.zeros(dim)
    return np.mean(found_vectors, axis=0)

# Turn each row's combined feature text into the mean of its GloVe word vectors
skincare_reviews["vectorized_features_glove"] = skincare_reviews["multicriteria_combined_features"].apply(
    vectorize_text_glove, args=(glove_embeddings,)
)

# Preview the first rows to verify the vectorization
skincare_reviews["vectorized_features_glove"].head()


0    [-0.1909807, 0.21728708, 0.063824415, -0.09881957, -0.1294178, 0.29103935, -0.16182004, 0.041187...
1    [-0.2007592, 0.18091148, 0.016441643, -0.12368103, -0.18935207, 0.16839498, -0.18683685, 0.11725...
2    [-0.19948438, 0.109405495, -0.016978933, -0.05321018, -0.23000163, 0.08242865, -0.09035488, 0.12...
3    [-0.16096152, 0.1353347, 0.0015462068, -0.03779596, -0.19740348, 0.16779856, -0.28631714, 0.2497...
4    [-0.22409232, 0.1372577, -0.024697045, -0.04310343, -0.15350178, 0.27084374, -0.07375035, 0.1545...
Name: vectorized_features_glove, dtype: object

### **4.1.2 Word2Vec**

In [18]:
# Function to load Word2Vec model from a binary file (as gensim is not installed due to library conflicts in my environment)
def load_word2vec_model(file_name, vocab_size=3000000, vector_size=300, vocab=None):
    """Load word vectors from a word2vec binary file without gensim.

    The file is read as one text header line ``"<vocab_count> <layer_size>"``
    followed by one record per word: the word's bytes, a single space, then
    ``layer_size`` float32 values. Newline bytes met while reading a word are
    skipped.

    Args:
        file_name: Path to the binary file.
        vocab_size: Upper bound on the number of records to read. Fewer are read
            when the header declares fewer, so the loop never runs past the
            end of the file.
        vector_size: Kept for backward compatibility. The dimension declared in
            the file header is what determines how many bytes each vector
            occupies.
        vocab: Optional container of words to keep (tested with ``in``). When
            ``None`` (the default) every word that is read is kept.

    Returns:
        dict mapping each retained word to a float32 NumPy array.

    Raises:
        EOFError: If the file ends before the expected number of records has
            been read.
    """
    word_vectors = {}
    with open(file_name, "rb") as f:
        # Header: number of words and vector dimension, as ASCII integers
        header = f.readline()
        vocab_count, layer_size = map(int, header.split())
        binary_len = np.dtype("float32").itemsize * layer_size

        for _ in range(min(vocab_size, vocab_count)):
            # Read the word one byte at a time up to the separating space
            word_bytes = []
            while True:
                ch = f.read(1)
                if not ch:
                    # read() returns b"" at end of file; without this check the loop never ends
                    raise EOFError(f"Unexpected end of file while reading a word in {file_name}")
                if ch == b" ":
                    break
                if ch != b"\n":
                    word_bytes.append(ch)
            # NOTE(review): latin-1 maps every byte to a character, so decoding cannot fail;
            # confirm whether the words in the file are actually UTF-8 encoded.
            word = b"".join(word_bytes).decode("latin-1")

            # The vector bytes must be consumed even for words that are not kept
            raw_vector = f.read(binary_len)
            if len(raw_vector) != binary_len:
                raise EOFError(f"Unexpected end of file while reading the vector for {word!r} in {file_name}")

            if vocab is None or word in vocab:
                word_vectors[word] = np.frombuffer(raw_vector, dtype="float32")

    return word_vectors

# Path to the pre-trained Word2Vec binary (Google News, 300-dimensional embeddings)
word2vec_path = "models/GoogleNews-vectors-negative300.bin"

# Load the Word2Vec embeddings into a word -> vector dictionary
word_vectors = load_word2vec_model(word2vec_path)

# Report how many words were loaded
print(f"Loaded Word2Vec embeddings for {len(word_vectors)} words.")


Loaded Word2Vec embeddings for 3000000 words.


In [19]:
# Function to vectorize using the Word2Vec embeddings
def vectorize_text_word2vec(text, word_vectors, dim=300):
    """Represent ``text`` as the element-wise mean of its words' Word2Vec vectors.

    ``text`` is split on whitespace and only words present in ``word_vectors``
    contribute. If none are present, a zero vector of length ``dim`` is returned.
    """
    matched_vectors = [word_vectors[token] for token in text.split() if token in word_vectors]
    return np.mean(matched_vectors, axis=0) if matched_vectors else np.zeros(dim)

# Turn each row's combined feature text into the mean of its Word2Vec word vectors
skincare_reviews["vectorized_features_w2v"] = skincare_reviews["multicriteria_combined_features"].apply(
    vectorize_text_word2vec, args=(word_vectors,)
)

# Preview the first rows to verify the vectorization
skincare_reviews["vectorized_features_w2v"].head()


0    [-0.025286864, 0.04773051, 0.00082256, 0.05296351, -0.11717987, 0.0034599304, 0.041460164, -0.15...
1    [-0.05279678, 0.100673035, 0.004677933, 0.080512166, -0.08585795, 0.025877278, 0.008998057, -0.1...
2    [-0.018925477, 0.11361898, 0.0039180755, 0.039836273, -0.077773795, 0.008910879, -0.0038208007, ...
3    [-0.0947309, 0.082422346, 0.046553873, 0.033076715, -0.15074053, -0.0037631989, -0.019455997, -0...
4    [-0.075490214, 0.063881546, 0.043978788, 0.03898683, -0.12316739, 0.017027408, -0.009806263, -0....
Name: vectorized_features_w2v, dtype: object

In [20]:
# Preview the first 5 rows, transposed so each column name becomes a row
skincare_reviews.head().T


Unnamed: 0,0,1,2,3,4
product_id,P503052,P423688,P504045,P479125,P440650
product_name,Mini Flatter Me Digestive Enzyme Supplement,Daily Microfoliant Exfoliator,Daily Dose Bioretinol + Mineral SPF 40 with Bakuchiol,Mini Cloud Dew Oil-Free Gel Cream,Mini Retinol Youth Renewal Serum
brand_name,HUM Nutrition,Dermalogica,Supergoop!,Summer Fridays,Murad
average_rating,4.2643,4.696,4.7357,4.4613,4.6089
price_usd,15.0,65.0,46.0,17.0,32.0
author_id,27517179462,11109420515,22705593305,20430950197,5967382859
calibrated_author_rating,5.0,5.0,5.0,5.0,5.0
multicriteria_combined_features,protease 3 0 protease 4 5 protease 6 0 peptidase bromelain neutral bacterial protease papain bre...,microcrystalline cellulose magnesium oxide sodium cocoyl isethionate colloidal oatmeal disodium ...,aqua water eau dicaprylyl ether caprylic capric triglyceride butyloctyl salicylate polyglyceryl ...,water aqua eau propanediol dicaprylyl carbonate glycerin pentylene glycol sodium hyaluronate hyd...,water aqua eau cyclopentasiloxane isopropyl palmitate c14 22 alcohols polymethylsilsesquioxane d...
vectorized_features_glove,"[-0.1909807, 0.21728708, 0.063824415, -0.09881957, -0.1294178, 0.29103935, -0.16182004, 0.041187...","[-0.2007592, 0.18091148, 0.016441643, -0.12368103, -0.18935207, 0.16839498, -0.18683685, 0.11725...","[-0.19948438, 0.109405495, -0.016978933, -0.05321018, -0.23000163, 0.08242865, -0.09035488, 0.12...","[-0.16096152, 0.1353347, 0.0015462068, -0.03779596, -0.19740348, 0.16779856, -0.28631714, 0.2497...","[-0.22409232, 0.1372577, -0.024697045, -0.04310343, -0.15350178, 0.27084374, -0.07375035, 0.1545..."
vectorized_features_w2v,"[-0.025286864, 0.04773051, 0.00082256, 0.05296351, -0.11717987, 0.0034599304, 0.041460164, -0.15...","[-0.05279678, 0.100673035, 0.004677933, 0.080512166, -0.08585795, 0.025877278, 0.008998057, -0.1...","[-0.018925477, 0.11361898, 0.0039180755, 0.039836273, -0.077773795, 0.008910879, -0.0038208007, ...","[-0.0947309, 0.082422346, 0.046553873, 0.033076715, -0.15074053, -0.0037631989, -0.019455997, -0...","[-0.075490214, 0.063881546, 0.043978788, 0.03898683, -0.12316739, 0.017027408, -0.009806263, -0...."


## **4.2 Advanced Contextual Embeddings**

### **4.2.1 Bidirectional Encoder Representations from Transformers (BERT)**

# **5.0 Export Processed Data**

## **5.1 Export for Content-Based Filtering**

In [21]:
# Drop the column that is no longer needed (multicriteria_combined_features has been vectorized)
skincare_reviews = skincare_reviews.drop(columns=["multicriteria_combined_features"])


In [22]:
# Write the vectorized skincare_reviews frame to a Feather file for content-based filtering
skincare_reviews.to_feather("processed_data/skincare_reviews_cbf.feather")


## **5.2 Export for Collaborative Filtering**

In [23]:
# Drop the vectorized columns (only the ratings are needed for collaborative filtering)
skincare_reviews = skincare_reviews.drop(columns=["vectorized_features_glove", "vectorized_features_w2v"])


In [24]:
# Write skincare_reviews, now without the vectorized columns, to a Feather file for collaborative filtering
skincare_reviews.to_feather("processed_data/skincare_reviews_cf.feather")


<blockquote style="background-color: yellow; color: black;"><strong>End of Part 3</strong></blockquote>


In [25]:
# Measure the time elapsed since start_time was recorded in the setup cell
end_time = datetime.now()
execution_time = end_time - start_time
# Break the elapsed seconds down into whole hours, then minutes and seconds of the remainder
hours, remainder = divmod(execution_time.total_seconds(), 3600)
minutes, seconds = divmod(remainder, 60)

# Print the start and end timestamps and the elapsed time (fractions of a second are truncated)
print(f"Start Time      : {start_time.strftime('%Y-%m-%d %H:%M:%S')}")
print(f"End Time        : {end_time.strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Execution Time  : {int(hours)}h {int(minutes)}m {int(seconds)}s")


Start Time      : 2024-04-19 06:35:41
End Time        : 2024-04-19 06:38:48
Execution Time  : 0h 3m 6s
