In [1]:
# Import necessary libraries.
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import nltk
# Fetch the NLTK stop-word corpus (a no-op when it is already present locally).
nltk.download('stopwords')
# NOTE(review): this repeats the `stopwords` import made a few lines above.
from nltk.corpus import stopwords
# English stop words as a set.
# NOTE(review): `stopworda` looks like a typo for `stopwords_en`/`stop_words`, and it is not
# referenced by the cells visible here (the vectorizers pass stop_words='english') — confirm
# against the rest of the notebook before renaming or removing it.
stopworda = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\easonlai\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load the source dataset.
# Forward slashes keep this relative path valid on Windows, macOS and Linux, and avoid
# the invalid '\d' escape sequence that the backslash form puts in a normal string literal.
df = pd.read_csv('./data/data_v3b.csv')

In [3]:
# Preview the first rows of the dataset (column headers plus sample records).
df.head()

Unnamed: 0,Product_ID,Product_Category,Brand_Name,Product_Name,Product_Details,Origin,Ingredients,key_phrase_extract
0,100001,SHAMPOO,DOVE,DOVE BOTANIC BREAKAGE PROTECT SHAMPOO,Japan Origin. Combat hair breakage with Dove J...,Japan,Water Sodium Laureth Sulfate Cocamidopropyl ...,Dove Japan Hair Breakage Protection range Com...
1,100002,SHAMPOO,DOVE,DOVE BOTANIC SPLIT ENDS PROTECT SHAMPOO,Japan Origin. Looking for smooth and straight ...,Japan,Water Sodium Laureth Sulfate Cocamidopropyl ...,Split Ends Protection range Dove Japan Straig...
2,100003,SHAMPOO,DOVE,DOVE LIGHT MOIST PURE SHAMPOO,China Origin. Dove Light Moisture & Purifying ...,China,Ingredients: Water Sodium Laureth Sulfate So...,natural sourced lotus extract Dove Light Mois...
3,100004,SHAMPOO,DOVE,DOVE JAPAN AIRY MOISTURE SHAMPOO,Japan Origin. Featuring Oxygen 1 that attracts...,Japan,Water Sodium Laureth Sulfate Cocamidopropyl ...,oxygen-combined smooth moisture ingredient Ad...
4,100005,SHAMPOO,DOVE,DOVE JAPAN RICH MOISTURE SHAMPOO,Japan Origin. Dove’s unique advanced moisture ...,Japan,Water Sodium Laureth Sulfate Cocamidopropyl ...,unique advanced moisture oil formula moisture...


In [9]:
# Show the dataset information (index, columns, non-null counts, dtypes, memory usage).
# NOTE(review): the saved output for this cell lists Product_Name as the index and an extra
# 'index' column, so it was captured after the reset_index/set_index cell had already run.
# On a fresh top-to-bottom run the output at this position will differ.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 21 entries, DOVE BOTANIC BREAKAGE PROTECT SHAMPOO to Essential Purify Anti Dandruff Shampoo
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   index               21 non-null     int64 
 1   Product_ID          21 non-null     int64 
 2   Product_Category    21 non-null     object
 3   Brand_Name          21 non-null     object
 4   Product_Details     21 non-null     object
 5   Origin              21 non-null     object
 6   Ingredients         21 non-null     object
 7   key_phrase_extract  21 non-null     object
dtypes: int64(2), object(6)
memory usage: 1.5+ KB


**Compute product similarity from the wording of Product Details, using the key phrases extracted by Azure Text Analytics.**

In [5]:
# Create a TF-IDF Vectorizer and Cosine Similarity.
# Re-index the frame by Product_Name so that row positions map directly to product names.
# reset_index keeps the previous index as a regular column.
df.reset_index(inplace=True)
df.set_index('Product_Name', inplace=True)
# Word-level uni-, bi- and tri-grams with English stop words removed.
# min_df=1 keeps every term, exactly as the former min_df=0 did, but is also accepted by
# newer scikit-learn releases, which reject an integer min_df below 1.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=1, stop_words='english')
# One TF-IDF row per product, built from the extracted key phrases.
tfidf_matrix = tf.fit_transform(df['key_phrase_extract'])
# Pairwise cosine similarity between all products (square matrix, same order as df rows).
cos_sim = cosine_similarity(tfidf_matrix)
cos_sim

array([[1.        , 0.17392681, 0.04192626, 0.04461385, 0.04707357,
        0.00391087, 0.03647558, 0.03647558, 0.02051899, 0.03647558,
        0.01646681, 0.01652632, 0.00522318, 0.00510027, 0.00438579,
        0.00673983, 0.09526899, 0.03790213, 0.02364243, 0.03525902,
        0.00899201],
       [0.17392681, 1.        , 0.03605101, 0.06388497, 0.05737213,
        0.00336283, 0.03136415, 0.03136415, 0.0176436 , 0.03136415,
        0.01415927, 0.01421044, 0.00449124, 0.00438555, 0.0037712 ,
        0.00579536, 0.05055484, 0.06460558, 0.02032935, 0.0414159 ,
        0.00773193],
       [0.04192626, 0.03605101, 1.        , 0.02196479, 0.02870175,
        0.0173243 , 0.03130338, 0.03130338, 0.05758003, 0.03130338,
        0.02110649, 0.02118276, 0.03107176, 0.03034058, 0.02609029,
        0.04009404, 0.01035577, 0.01999262, 0.02087944, 0.02643918,
        0.02169777],
       [0.04461385, 0.06388497, 0.02196479, 1.        , 0.41887124,
        0.02042699, 0.02723023, 0.02723023, 0.0179562

In [6]:
# Create list of Product_Name.
# Wrapping df.index in a Series gives a positional lookup: the Series' own integer index
# is the row position, and the value is the product name found at that row.
p_name = pd.Series(df.index)
p_name

0                 DOVE BOTANIC BREAKAGE PROTECT SHAMPOO
1               DOVE BOTANIC SPLIT ENDS PROTECT SHAMPOO
2                         DOVE LIGHT MOIST PURE SHAMPOO
3                      DOVE JAPAN AIRY MOISTURE SHAMPOO
4                      DOVE JAPAN RICH MOISTURE SHAMPOO
5                           DOVE SHAMPOO STRAIGHT&SILKY
6             VIDAL SASSOON LIGHT & SOFT SMOOTH SHAMPOO
7                 VIDAL SASSOON MOISTURIZING AD SHAMPOO
8             VIDAL SASSOON TEXTURIZED STRAIGHT SHAMPOO
9           VIDAL SASSOON MOISTURING TRT REPAIR SHAMPOO
10                         REJOICE VOLUME FRESH SHAMPOO
11                                  REJOICE OIL REMOVAL
12                            REJOICE OLIVE OIL SHAMPOO
13                      REJOICE MINT REFRESHING SHAMPOO
14                       REJOICE LEMON BALANCED SHAMPOO
15                            REJOICE 3IN1 CARE SHAMPOO
16        ESSENTIAL NOURISHING BREAKAGE DEFENSE SHAMPOO
17            ESSENTIAL MOISTURIZING FRIZZ FREE 

In [7]:
# Create function for Product Recommendations.
def product_recommendations(Product_Name, cos_sim = cos_sim, top_n = 10):
    """Return the names of the products most similar to ``Product_Name``.

    Parameters
    ----------
    Product_Name : str
        Product name to look up; it must match a value in ``p_name`` exactly.
    cos_sim : array-like
        Square similarity matrix whose rows/columns follow the row order of ``df``.
        Defaults to the notebook-level ``cos_sim`` as it exists when this cell runs.
    top_n : int, default 10
        Maximum number of recommendations to return (negative values are treated as 0).

    Returns
    -------
    list
        Product names ordered from most to least similar. The queried product itself
        is never part of the result.

    Raises
    ------
    IndexError
        If ``Product_Name`` is not present in ``p_name``.
    """
    matches = p_name[p_name == Product_Name].index
    if len(matches) == 0:
        # Same exception type as the bare ``.index[0]`` lookup, but with a usable message.
        raise IndexError(f"Product_Name {Product_Name!r} was not found in p_name")
    idx = matches[0]

    # Remove the queried product by position instead of assuming it sorts first: products
    # with identical text also score 1.0, and a tie could otherwise leave the queried
    # product in the list while discarding a genuine match.
    scores = pd.Series(cos_sim[idx]).drop(idx)
    top_indexes = scores.sort_values(ascending = False).iloc[:max(top_n, 0)].index

    # df.index maps row positions back to product names.
    return [df.index[i] for i in top_indexes]

In [8]:
# Perform Product Recommendations. Similarity here comes from the TF-IDF matrix built on the
# 'key_phrase_extract' column (key phrases of Product Details), not from Ingredients.
product_recommendations('DOVE BOTANIC BREAKAGE PROTECT SHAMPOO')

['DOVE BOTANIC SPLIT ENDS PROTECT SHAMPOO',
 'ESSENTIAL NOURISHING BREAKAGE DEFENSE SHAMPOO',
 'DOVE JAPAN RICH MOISTURE SHAMPOO',
 'DOVE JAPAN AIRY MOISTURE SHAMPOO',
 'DOVE LIGHT MOIST PURE SHAMPOO',
 'ESSENTIAL MOISTURIZING FRIZZ FREE SHAMPOO',
 'VIDAL SASSOON LIGHT & SOFT SMOOTH SHAMPOO',
 'VIDAL SASSOON MOISTURIZING AD SHAMPOO',
 'VIDAL SASSOON MOISTURING TRT REPAIR SHAMPOO',
 'Essential Purify Weightlessly Smooth Care Shampoo']

**Perform TF-IDF analysis for Product Details**

In [4]:
# Convert 'Product_Details' to list.
# The list keeps the row order of df, one description string per product.
list_of_product__details = df['Product_Details'].tolist()

In [5]:
# Initialise TfidfVectorizer.
# Word-level uni-, bi- and tri-grams with English stop words removed.
# min_df=1 keeps every term, exactly as the former min_df=0 did, but is also accepted by
# newer scikit-learn releases, which reject an integer min_df below 1.
vectoriser = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=1, stop_words='english')

# Obtain weights of each term to each document in Product Details (TF-IDF scores).
# Result is a sparse matrix with one row per product description and one column per term.
tf_idf_scores = vectoriser.fit_transform(list_of_product__details)

In [6]:
# Get vocabulary of terms.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so prefer
# get_feature_names_out() and fall back to the old method only on older installations.
if hasattr(vectoriser, 'get_feature_names_out'):
    feature_names = vectoriser.get_feature_names_out().tolist()
else:
    feature_names = vectoriser.get_feature_names()
# Independent copy of the description list (same items, same order).
list_of_product__details_index = list(list_of_product__details)

In [7]:
# Create pandas DataFrame with TF-IDF scores (Term-Document Matrix).
# Transposed so that rows are terms and columns are product descriptions.
# toarray() returns a plain ndarray; todense() would return the discouraged np.matrix type.
# NOTE(review): the column labels are the full description strings, and several products
# share identical text, so the labels are not unique — selecting such a column by label
# returns more than one column.
df_product_details_tf_id = pd.DataFrame(tf_idf_scores.T.toarray(), index = feature_names, columns = list_of_product__details)
df_product_details_tf_id

Unnamed: 0,Japan Origin. Combat hair breakage with Dove Japan Hair Breakage Protection range and bring back a head of silky hair stronger and smoother.\n,Japan Origin. Looking for smooth and straight hair that is easy to look after Dove Japan Straight and Split Ends Protection range is designed for frizzy hair knot and split ends hair type leave your hair smooth and shiny silkness.,China Origin. Dove Light Moisture & Purifying range infused with natural sourced lotus extract replenish essential nutrients. From the 1st use it keeps hair smooth.,Japan Origin. Featuring Oxygen 1 that attracts moisture.\nAdvanced Moisture Oxygen formula unique to Dove infused with oxygen-combined smooth moisture ingredient 2.\n 1 Oxygen atom in the ingredient structure 2 Cation Guargum\nTransparent shampoo washes your hair soft and fresh preparing your hair for the next step of care after shampoo.\n\nSubtle burst of elegant floral and fresh fruits.\n\nEven the dry spread and frizzy hair becomes this manageable.\nAiry finish that is so lightly smooth and manageable.,Japan Origin. Dove’s unique advanced moisture oil formula infused with moisture lipid 1 which is same as the component that naturally exists in hair.\nInfused with moisture lipid that penetrates into the hair and Squalane 2 which cares for the surface of the hair.\n 1 Oleic acid: moisture ingredient 2 Moisture ingredient\nTransparent shampoo washes your hair soft and fresh preparing your hair for the next step of care after shampoo.\n\nSubtle burst of elegant floral and fresh fruits.\n\nEven the dry spread and frizzy hair becomes this manageable.\nRich finish that is soft and manageable.,Thailand Origin. With Pro-Moisture Complex leaves hair straighter & manageable,China Origin. Contains PPT can protect dry hair moisturizing ingredients can clean and fully penetrate dry fragile hair.,China Origin. Contains PPT can protect dry hair moisturizing ingredients can clean and fully penetrate dry fragile hair..1,China Origin. 
Light texture moisturizing formula makes hair feeling light and smooth and can with any hair style .,China Origin. Contains PPT can protect dry hair moisturizing ingredients can clean and fully penetrate dry fragile hair..2,...,China Origin. Rejoice Innovative Hair make-up removal \n1. Inspired by French Micellar Water Hair Make-up technology mild to the skin\n2. Micellar micron particle easily get into the scalp pore the clean the grease hidden in the deep pore\n3. Natural white tea and lemon grass essence. Purify and remove oil deliver volume up smoothness.\n4.Free of Paraben preservatives Mineral oils Paraffins,China Origin. Olive Oil Shampoo,China Origin. Mint Refreshing Shampoo,China Origin. Lemon Balanced Conditioning Shampoo,China Origin. 3-in-1 Care Shampoo,Taiwan Origin. Upgraded 360 cuticle restructure essence could form a thin uniform protective layer along every hair strand reduce the fiction between hair and thus protect cuticle from external damage.\n\nSuitable for severely damaged hair which always undergo coloring & perming intensively repair damaged cuticle anti- hair breakage & split end with ultra smooth & shiny hair,Taiwan Origin. Upgraded 360 cuticle restructure essence could form a thin uniform protective layer along every hair strand reduce the fiction between hair and thus protect cuticle from external damage.\n\nSuitable for dry and frizzy hair which always treated with dryer & curler refill hair moisture from cuticle loss hair is moisturized &shiny without fizziness,Taiwan Origin. Watery 0% silicone protect hair moisture\n4times water lock & natural repair\n\nLong-lasting oil control formula\nIntensively deep cleansing the excessive oil. Retain scalp and hair oil & water balance long time control oil. Hair root is fresh and without stickiness hair end not dry. Hair can keep moisture and smooth always.\n\nMeans not adding silicone applies to shampoo products only.,Taiwan Origin. 
Watery 0% silicone protect hair moisture\n4times water lock & natural repair\n\nQuick Dry Airy formula\nEffective to reduce tangled hair and also add airy feeling to hair. No flat hair top while hair tip does not tangle. Hair become distinctly seperated when hair dryer lightly blow air pass through hair make hair easy to dry save 20%# of the hair blowing time and protect hair from heat and friction.\n\nMeans not adding silicone applies to shampoo products only,Taiwan Origin. Watery 0% silicone protect hair moisture\n4times water lock & natural repair\n\nHighly effective lock moisture anti-dandruff formulation\nContains Piroctone Olamine which is an active anti-dandruff ingredient that effectively remove dandruff prevent itchy. After use scalp can keep refresh and moisturized no more dandruff problem.\n\nMeans not adding silicone applies to shampoo products only
1st,0.0,0.0,0.142,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000
1st use,0.0,0.0,0.142,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000
1st use keeps,0.0,0.0,0.142,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000
20,0.0,0.0,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.079654,0.000000
20 hair,0.0,0.0,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.079654,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
watery silicone protect,0.0,0.0,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.070935,0.063405,0.072545
white,0.0,0.0,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.085504,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000
white tea,0.0,0.0,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.085504,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000
white tea lemon,0.0,0.0,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.097089,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000
