In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings("ignore")

In [2]:
products = pd.read_csv('skincare_products_clean.csv')
chem_df = pd.read_csv('chemicals.csv')

In [3]:
print(products.isna().sum())
print(chem_df.isna().sum())

product_name     0
product_url      0
product_type     0
clean_ingreds    0
price            0
dtype: int64
Chemical_Name    0
Skin_Type        0
Description      0
dtype: int64


In [4]:
def find_matches(ingredient, chemicals):
    for chemical in chemicals:
        if chemical.lower() in ingredient.lower():
            return chemical
    return None

In [5]:
products['matched_chemical'] = products['clean_ingreds'].apply(lambda x: find_matches(x, chem_df['Chemical_Name']))

In [6]:
skin_products = pd.merge(products, chem_df, how='left', left_on='matched_chemical', right_on='Chemical_Name')

In [7]:
skin_products = skin_products.drop(columns=['product_url', 'price', 'Chemical_Name'])

In [22]:
ingred_matrix = pd.get_dummies(skin_products['matched_chemical'].str.split("|").apply(pd.Series).stack()).sum(level=1)
ingred_matrix

Unnamed: 0,"alcohol denat',","alcohol', 'glycerin',",aloe barbadenis,"ammonium lactate',",anthemis nobilis,"ascorbic acid',","butylene glycol',","butyrospermum parkii',","c12-15', 'tapioca","capric triglyceride',",...,"sodium chloride',","sodium hyaluronate',","sodium lactate',",sodium laureth,"sodium palmate',","sodium polyacrylate',","squalene', 'glycerin',","triethanolamine', 'sodium",vitis vinifera,"water\\aqua\\eau ',"
0,92.0,9.0,3744.0,4.0,3.0,2.0,2880.0,16.0,1.0,9963.0,...,20.0,717.0,5.0,912.0,2.0,2.0,5.0,1.0,75.0,10.0


In [8]:
skin_products.dropna(inplace=True)

In [9]:
grouped_chem = skin_products.groupby('clean_ingreds')['matched_chemical'].apply(lambda x: ', '.join(x)).reset_index()

In [10]:
# Create final dataframe
final_df = skin_products.merge(grouped_chem, on='matched_chemical', how='left')
final_df = final_df[['product_name','product_type', 'matched_chemical', 'Skin_Type', 'Description']]

In [11]:
def add_name_to_skin(row):
    if pd.isnull(row['Skin_Type']):
        return row['product_name'].replace("|", ",")
    else:
        return row['Skin_Type'] + "," + row['product_name'].replace("|", ",")

In [19]:
final_df


Unnamed: 0,product_name,product_type,matched_chemical,Skin_Type,Description
0,The Ordinary Natural Moisturising Factors + HA...,Moisturiser,"capric triglyceride',","Dry Skin,The Ordinary Natural Moisturising Fac...",A moisturizing agent derived from coconut oil....
1,The Ordinary Natural Moisturising Factors + HA...,Moisturiser,"capric triglyceride',","Dry Skin,The Ordinary Natural Moisturising Fac...",A moisturizing agent derived from coconut oil....
2,The Ordinary Natural Moisturising Factors + HA...,Moisturiser,"capric triglyceride',","Dry Skin,The Ordinary Natural Moisturising Fac...",A moisturizing agent derived from coconut oil....
3,The Ordinary Natural Moisturising Factors + HA...,Moisturiser,"capric triglyceride',","Dry Skin,The Ordinary Natural Moisturising Fac...",A moisturizing agent derived from coconut oil....
4,The Ordinary Natural Moisturising Factors + HA...,Moisturiser,"capric triglyceride',","Dry Skin,The Ordinary Natural Moisturising Fac...",A moisturizing agent derived from coconut oil....
...,...,...,...,...,...
22698,Connock London Kukui Oil Soothing Bath & Showe...,Bath Oil,"capric triglyceride',","Dry Skin,Connock London Kukui Oil Soothing Bat...",A moisturizing agent derived from coconut oil....
22699,Connock London Kukui Oil Soothing Bath & Showe...,Bath Oil,"capric triglyceride',","Dry Skin,Connock London Kukui Oil Soothing Bat...",A moisturizing agent derived from coconut oil....
22700,Connock London Kukui Oil Soothing Bath & Showe...,Bath Oil,"capric triglyceride',","Dry Skin,Connock London Kukui Oil Soothing Bat...",A moisturizing agent derived from coconut oil....
22701,Weleda Baby Calendula Cream Bath (200ml),Bath Oil,"glycerin', 'glyceryl","Oily Skin,Weleda Baby Calendula Cream Bath (20...",A fatty alcohol used as an emollient and thick...


In [12]:
final_df['Skin_Type'] = final_df.apply(lambda row: add_name_to_skin(row), axis=1)

In [13]:
product_names = final_df['product_name'].tolist()
skin_types = final_df['Skin_Type'].str.strip().str.split(",").tolist()

In [14]:
# Create bag of words
def create_bow(sktype_list):
    bow = {}
    if not isinstance(sktype_list, float):
        for Skin_Type in sktype_list:
            bow[Skin_Type] = 1
    return bow

In [15]:
bags_of_words = [create_bow(skin_type) for skin_type in skin_types]
sktype_df = pd.DataFrame(bags_of_words, index=product_names).fillna(0)

In [16]:
# Calculate cosine similarity
cosine_sim = cosine_similarity(sktype_df)
similarity_df = pd.DataFrame(cosine_sim, index=sktype_df.index, columns=sktype_df.index)

In [20]:
# User input for recommendation
user_input = input('What is your skin type or product name: ')

What is your skin type or product name: Dry Skin,The Ordinary Natural Moisturising


In [21]:

# Find the index of the product in the similarity dataframe
try:
    product_index = similarity_df.index.get_loc(user_input)
    top_10 = similarity_df.iloc[product_index].sort_values(ascending=False)[1:11]

    # Print the top 10 most similar products
    print(f'Top 10 similar products to {user_input}:')
    print(top_10)
except KeyError:
    print("The input provided does not match any product or skin type in the dataset.")


The input provided does not match any product or skin type in the dataset.
