# Identify and examine labels to find safe-for-you beauty products

In [149]:
import pandas as pd
import numpy as np
from sklearn import base
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.neighbors import NearestNeighbors
import re
import spacy
from sklearn.decomposition import TruncatedSVD
from spacy.lang.en.stop_words import STOP_WORDS

In [150]:
#1.user input a skincare product
#2.user identify an allergen/irritation
#3.recommend products

In [151]:
#conda install -c conda-forge spacy
#python -m spacy download en
#pip install streamlit-extras

In [259]:
ulta=pd.read_csv('Clean_data')
ulta.sample(3)

Unnamed: 0,Brand,Name,Rating,Price,Link,ID,Reviews,Ingredient,Main_Category,Category,Details,Image,Ig_list,len
2403,CHANEL,N°1 DE CHANEL Revitalizing Lotion,3.8,$65.00,https://www.ulta.com/p/n1-de-chanel-revitalizi...,2589596,10.0,"camellia japonica flower water, aqua (water), ...",Skincare,Moisturizers,Details\nKey Ingredients\nAt the heart of the ...,https://media.ulta.com/i/ulta/2589596?w=78&h=78,"['2-hexanediol', 'acrylates/c10-30 alkyl acryl...",24
1408,Clinique,Fresh Pressed Renewing Powder Cleanser with Pu...,4.8,$39.00,https://www.ulta.com/p/fresh-pressed-renewing-...,2506735,51.0,"""water\\aqua\\eau, ascorbic acid, glycerin, di...",Skincare,Cleansers,Details\nBenefits\nAll Skin Types.\nUltrafine ...,https://media.ulta.com/i/ulta/2506735?w=156&h=156,"['""water\\\\aqua\\\\eau', 'acetyl glucosamine'...",30
4797,ColourPop,Sorbet Eyeshadow Palette Quad,4.0,$10.00,https://www.ulta.com/p/sorbet-eyeshadow-palett...,2583819,25.0,"glazey: calcium sodium borosilicate, caprylic/...",Makeup,Eyes,Details\nShades\nGlazey (metallic icy pastel p...,https://media.ulta.com/i/ulta/2557?w=582&h=582...,"['caprylic/capric triglyceride', 'caprylyl gly...",11


In [261]:
product=ulta['Name'][2403]
ig=ulta['Ingredient'][2403]
product,ig

('N°1 DE CHANEL Revitalizing Lotion',
 'camellia japonica flower water, aqua (water), glycerin, pentylene glycol, faex (yeast extract), camellia japonica flower extract, decyl glucoside, chlorphenesin, caprylyl glycol, maltodextrin, acrylates/c10-30 alkyl acrylate crosspolymer, propanediol, sodium hyaluronate, parfum (fragrance), sodium hydroxide, glycol distearate, biosaccharide gum-1, saccharide isomerate, coco-glucoside, glyceryl oleate, glyceryl stearate, 1,2-hexanediol, citric acid, benzoic acid, il58a.')

In [262]:
allergen='il58a'

In [263]:
#Check if Ingredient list contains allergen item

def containsAllergen(df,allergen):
    return df[df['Ingredient'].apply(lambda x: allergen not in x)]

## Model Selection:

### Model 1: CountVectorizer, NearestNeighbors

In [264]:
%%time
#CountVectorizer,NearestNeighbors
#based on ingredients of previously working-well products
#check products for problematic ingredients

def recommender_1(df,product, allergen):
    safe_df= df.copy()
    
    bag_of_words_vectorizer=CountVectorizer(min_df=0,
                             ngram_range=(1,2), 
                             stop_words='english')
    counts=bag_of_words_vectorizer.fit_transform(safe_df['Ingredient'])
    nn = NearestNeighbors(n_neighbors=20).fit(counts) 
    index=safe_df[safe_df['Name']==product].index
    #safe_df.index[safe_df['Name']==product][0]
    dists, indices = nn.kneighbors(counts[index[0]])
    prod_nbr=safe_df.iloc[indices[0]]
    
    prod_nbr=prod_nbr[prod_nbr['Main_Category']==df['Main_Category'][index[0]]]
    return containsAllergen(prod_nbr,allergen).head(5)
    #return #.reset_index(drop=True)

prod_1=recommender_1(ulta,product,allergen)
prod_1

Int64Index([2403], dtype='int64')
CPU times: user 867 ms, sys: 35.4 ms, total: 902 ms
Wall time: 906 ms


Unnamed: 0,Brand,Name,Rating,Price,Link,ID,Reviews,Ingredient,Main_Category,Category,Details,Image,Ig_list,len
2435,CHANEL,N°1 DE CHANEL Revitalizing Essence Lotion,5.0,$125.00,https://www.ulta.com/p/n1-de-chanel-revitalizi...,2600331,1.0,"camellia japonica flower water, aqua (water), ...",Skincare,Moisturizers,Details\nFeatures\nN°1 DE CHANEL REVITALIZING ...,https://media.ulta.com/i/ulta/2600331?w=78&h=78,"['2-hexanediol', 'acrylates/c10-30 alkyl acryl...",20
559,CHANEL,N°1 DE CHANEL Revitalizing Serum,4.1,$125.00 - $165.00,https://www.ulta.com/p/n1-de-chanel-revitalizi...,2589598,41.0,"camellia japonica flower water, propanediol, g...",Skincare,Treatment & Serums,Details\nKey Ingredients\nAt the heart of the ...,"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAA...","['2-hexanediol', 'adenosine', 'camellia japoni...",23
1033,CHANEL,HYDRA BEAUTY MICRO SÉRUM Intense Replenishing ...,4.5,$110.00,https://www.ulta.com/p/hydra-beauty-micro-seru...,2528709,11.0,"aqua (water), glycerin, propanediol, butylene ...",Skincare,Treatment & Serums,Details\nKey Ingredients\nPatented micro-dropl...,https://media.ulta.com/i/ulta/2528709?w=78&h=78,"['amodimethicone', 'butylene glycol', 'camelli...",18
2706,KYLIE SKIN,Clarifying Exfoliating Toner,4.1,$14.00 - $28.00,https://www.ulta.com/p/clarifying-exfoliating-...,2595369,32.0,"aqua/water/eau, propanediol, butylene glycol, ...",Skincare,Cleansers,"Details\nBenefits\nLeave-on, exfoliating toner...",https://media.ulta.com/i/ulta/2595369?w=78&h=78,"['2-hexanediol', 'butylene glycol', 'caprylhyd...",8
2676,Kiehl's Since 1851,Ultra Pure High-Potency 1.5% Hyaluronic Acid S...,4.7,$35.00,https://www.ulta.com/p/ultra-pure-high-potency...,2607903,378.0,"aqua/water, glycerin, sodium hyaluronate, phen...",Skincare,Treatment & Serums,Details\nBenefits\nAdd a drop of Kiehl's Hyalu...,https://media.ulta.com/i/ulta/2607903?w=78&h=78,"['chlorphenesin', 'citric acid', 'glycerin', '...",6


### Model 2: CountVectorizer, Cosine similarity

In [265]:
%%time
#CountVectorizer,cosine similarity
"""Calculate similarity scores: Use a content-based filtering algorithm to calculate similarity scores 
between products based on their ingredient lists."""

def cos_similarity(row1, row2):
    dot = np.dot(row1, row2.T)[0, 0]
    len1 = np.sqrt(np.dot(row1, row1.T))[0, 0]
    len2 = np.sqrt(np.dot(row2, row2.T))[0, 0]
    return dot / len1 / len2


def recommender_2(df,product, allergen):
    bag_of_words_vectorizer=CountVectorizer(min_df=0,
                             ngram_range=(1,2), 
                             stop_words='english')
    counts=bag_of_words_vectorizer.fit_transform(df['Ingredient'])
    #print(counts.shape)
    prod_nbr={}
    for j in range(len(counts.toarray())):
        prod_nbr[j]=cos_similarity(counts[2147],counts[j])
    nbr=sorted(prod_nbr.items(),key=lambda x:x[1],reverse=True )
    indices=[x[0] for x in nbr[0:30]]

    prod_nbr=df.iloc[indices] 
    index=df.index[df['Name']==product][0]
    prod_nbr=prod_nbr[prod_nbr['Main_Category']==df['Main_Category'][index]]
    return containsAllergen(prod_nbr,allergen).head(5)

prod_2=recommender_2(ulta,product,allergen)
prod_2

CPU times: user 7.78 s, sys: 517 ms, total: 8.3 s
Wall time: 8.57 s


Unnamed: 0,Brand,Name,Rating,Price,Link,ID,Reviews,Ingredient,Main_Category,Category,Details,Image,Ig_list,len
2147,belif,Problem Solution Moisturizer,4.3,$38.00,https://www.ulta.com/p/problem-solution-moistu...,2557464,10.0,"water, glycerin, glyceryl stearate, cetyl ethy...",Skincare,Moisturizers,Details\nBenefits\nThis moisturizer gently soo...,https://media.ulta.com/i/ulta/2557464?w=78&h=78,"['2-hexanediol', 'acrylates/c10-30 alkyl acryl...",44
1863,belif,Problem Solution Toner,4.6,$30.00,https://www.ulta.com/p/problem-solution-toner-...,2557461,24.0,"water, dipropylene glycol, butylene glycol, 1,...",Skincare,Global Skin Care,Details\nBenefits\nInfused with proven effecti...,https://media.ulta.com/i/ulta/2557461?w=78&h=78,"['2-hexanediol', 'alanyl glutamine', 'arctium ...",44
2300,The Crème Shop,Klean Beauty Essence Sheet Mask,4.5,$4.00,https://www.ulta.com/p/klean-beauty-essence-sh...,2605375,2.0,"aqua/water, glycerin dipropylene glycol collag...",Skincare,Global Skin Care,Details\nBenefits\nFree of 11+ ingredients to ...,Image is not available,"['2-hexanediol tromethamine', 'allantoin', 'as...",18
114,belif,Aqua Bomb Sleeping Mask,4.6,$38.00,https://www.ulta.com/p/aqua-bomb-sleeping-mask...,2557477,957.0,"water, dipropylene glycol, polyglycerin-3, 1,2...",Skincare,Moisturizers,Details\nBenefits\nThe 'memory' formula of thi...,https://media.ulta.com/i/ulta/2557477?w=156&h=156,"['2-hexanediol', 'acrylates/c10-30 alkyl acryl...",46
1398,belif,Peat Miracle Revital Eye Cream,4.7,$58.00,https://www.ulta.com/p/peat-miracle-revital-ey...,2557462,20.0,"water, glycerin, dipropylene glycol, cyclopent...",Skincare,Global Skin Care,Details\nBenefits\nThe velvety-textured eye cr...,https://media.ulta.com/i/ulta/2557462?w=156&h=156,"['2-hexanediol', 'biosaccharide gum-1', 'butyr...",17


### Model 3: DictVectorizer, NearestNeighbors

In [266]:
%%time
#Feature engineering
class DictEncoder(base.BaseEstimator, base.TransformerMixin):
    
    def __init__(self, col):
        self.col = col
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        
        def to_dict(l):
            try:
                return {x.strip(): 1 for x in l}
            except TypeError:
                return {}        
        return X[self.col].apply(to_dict)


def recommender_3(df,product, allergen):
    df_1=df.copy()
    #df_1['Ig_list']=df.apply(lambda x:x['Ingredient'].split(','),axis=1)
    ig_pipe = Pipeline([('encoder', DictEncoder('Ig_list')),
                     ('vectorizer', DictVectorizer())#,('svd', TruncatedSVD(n_components=300))
                   ])
    #dt_pipe=Pipeline([("encoder",DictEncoder('Dt_list')),('vectorizer',DictVectorizer())])
    #union=FeatureUnion([('Ingredient',ig_pipe),('Detail',dt_pipe)])
    features = ig_pipe.fit_transform(df_1)
    nn = NearestNeighbors(n_neighbors=20).fit(features)
    index=df_1.index[df_1['Name']==product][0]
    dists, indices = nn.kneighbors(features[index])
    
    prod_nbr=df_1.iloc[indices[0]]    
    prod_nbr=prod_nbr[prod_nbr['Main_Category']==df_1['Main_Category'][index]]
    
   
    return containsAllergen(prod_nbr,allergen).head(5)

prod_3=recommender_3(ulta,product,allergen)
prod_3

CPU times: user 384 ms, sys: 9.58 ms, total: 393 ms
Wall time: 397 ms


Unnamed: 0,Brand,Name,Rating,Price,Link,ID,Reviews,Ingredient,Main_Category,Category,Details,Image,Ig_list,len
946,fresh,Tea Elixir Skin Resilience Activating Serum,4.8,$80.00 - $110.00,https://www.ulta.com/p/tea-elixir-skin-resilie...,2598820,1919.0,"aqua (water), glycerin, simmondsia chinensis (...",Skincare,Treatment & Serums,Details\nBenefits\nThis breakthrough formula h...,https://media.ulta.com/i/ulta/2598820?w=78&h=78,"['2-hexanediol', 'acrylates/c10-30 alkyl acryl...",35
367,First Aid Beauty,Ultra Repair Hydra-Firm Night Cream,4.7,$44.00,https://www.ulta.com/p/ultra-repair-hydra-firm...,2575650,287.0,"aqua/water/eau, glycerin, caprylic/capric trig...",Skincare,Moisturizers,"Details\nBenefits\nRich, buttery texture\nForm...",https://media.ulta.com/i/ulta/2575650?w=156&h=156,"['acacia decurrens flower wax', 'acrylates/c10...",37
1044,StriVectin,Peptight Tightening & Brightening Face Serum,4.5,$99.00,https://www.ulta.com/p/peptight-tightening-bri...,2557473,351.0,"aqua (water, eau), c12-15 alkyl benzoate, glyc...",Skincare,Treatment & Serums,Details\nBenefits\nVisibly tightens and improv...,https://media.ulta.com/i/ulta/2557473?w=78&h=78,"['2-hexanediol', 'acetyl hexapeptide-1', 'acry...",74
860,Eczema Honey,Multi-Peptide Eye Cream,3.8,$14.95,https://www.ulta.com/p/multi-peptide-eye-cream...,2598806,38.0,"water (aqua), glycerin, caprylic/capric trigly...",Skincare,Eye Treatments,Details\nBenefits\nRevitilizes skin by softeni...,https://media.ulta.com/i/ulta/2598806?w=78&h=78,"['aloe barbadensis leaf juice', 'avena sativa ...",42
1593,KORRES,Santorini Grape Poreless Skin Cream,4.5,$38.50,https://www.ulta.com/p/santorini-grape-poreles...,2588999,27.0,"aqua/water/eau, glycerin, simmondsia chinensis...",Skincare,Moisturizers,Details are not available,https://media.ulta.com/i/ulta/2588999?w=78&h=78,['acrylates/c10-30 alkyl acrylate crosspolymer...,38


In [267]:
prod_1.index,prod_2.index,prod_3.index

(Int64Index([2435, 559, 1033, 2706, 2676], dtype='int64'),
 Int64Index([2147, 1863, 2300, 114, 1398], dtype='int64'),
 Int64Index([946, 367, 1044, 860, 1593], dtype='int64'))

In [268]:
prod_1['Name'],prod_2['Name'],prod_3['Name']

(2435            N°1 DE CHANEL Revitalizing Essence Lotion
 559                      N°1 DE CHANEL Revitalizing Serum
 1033    HYDRA BEAUTY MICRO SÉRUM Intense Replenishing ...
 2706                         Clarifying Exfoliating Toner
 2676    Ultra Pure High-Potency 1.5% Hyaluronic Acid S...
 Name: Name, dtype: object,
 2147       Problem Solution Moisturizer
 1863             Problem Solution Toner
 2300    Klean Beauty Essence Sheet Mask
 114             Aqua Bomb Sleeping Mask
 1398     Peat Miracle Revital Eye Cream
 Name: Name, dtype: object,
 946      Tea Elixir Skin Resilience Activating Serum
 367              Ultra Repair Hydra-Firm Night Cream
 1044    Peptight Tightening & Brightening Face Serum
 860                          Multi-Peptide Eye Cream
 1593             Santorini Grape Poreless Skin Cream
 Name: Name, dtype: object)

#### Model 1 generates the best result with the fastest speed.