In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns

In [2]:
fragrance_data = pd.read_csv('../data/fra_cleaned.csv', sep = ';', on_bad_lines='skip', encoding_errors='ignore')
df = pd.DataFrame(fragrance_data)
df.head()

Unnamed: 0,url,Perfume,Brand,Country,Gender,Rating Value,Rating Count,Year,Top,Middle,Base,Perfumer1,Perfumer2,mainaccord1,mainaccord2,mainaccord3,mainaccord4,mainaccord5
0,https://www.fragrantica.com/perfume/xerjoff/ac...,accento-overdose-pride-edition,xerjoff,Italy,unisex,142,201,2022.0,"fruity notes, aldehydes, green notes","bulgarian rose, egyptian jasmine, lily-of-the-...","eucalyptus, pine",unknown,,rose,woody,fruity,aromatic,floral
1,https://www.fragrantica.com/perfume/jean-paul-...,classique-pride-2024,jean-paul-gaultier,France,women,186,70,2024.0,"yuzu, citruses","orange blossom, neroli","musk, blonde woods",unknown,,citrus,white floral,sweet,fresh,musky
2,https://www.fragrantica.com/perfume/jean-paul-...,classique-pride-2023,jean-paul-gaultier,France,unisex,191,285,2023.0,"blood orange, yuzu","neroli, orange blossom","musk, white woods",natalie gracia-cetto,quentin bisch,citrus,white floral,sweet,fresh spicy,musky
3,https://www.fragrantica.com/perfume/bruno-bana...,pride-edition-man,bruno-banani,Germany,men,192,59,2019.0,"guarana, grapefruit, red apple","walnut, lavender, guava","vetiver, benzoin, amber",unknown,,fruity,nutty,woody,tropical,
4,https://www.fragrantica.com/perfume/jean-paul-...,le-male-pride-collector,jean-paul-gaultier,France,men,193,632,2020.0,"mint, lavender, cardamom, artemisia, bergamot","caraway, cinnamon, orange blossom","vanilla, sandalwood, amber, cedar, tonka bean",francis kurkdjian,,aromatic,warm spicy,fresh spicy,cinnamon,vanilla


In [3]:
df['Notes'] = df[df.columns[8:10]].apply(lambda x: ','.join(x.dropna().astype(str)), axis=1)


In [4]:
df.head()

Unnamed: 0,url,Perfume,Brand,Country,Gender,Rating Value,Rating Count,Year,Top,Middle,Base,Perfumer1,Perfumer2,mainaccord1,mainaccord2,mainaccord3,mainaccord4,mainaccord5,Notes
0,https://www.fragrantica.com/perfume/xerjoff/ac...,accento-overdose-pride-edition,xerjoff,Italy,unisex,142,201,2022.0,"fruity notes, aldehydes, green notes","bulgarian rose, egyptian jasmine, lily-of-the-...","eucalyptus, pine",unknown,,rose,woody,fruity,aromatic,floral,"fruity notes, aldehydes, green notes,bulgarian..."
1,https://www.fragrantica.com/perfume/jean-paul-...,classique-pride-2024,jean-paul-gaultier,France,women,186,70,2024.0,"yuzu, citruses","orange blossom, neroli","musk, blonde woods",unknown,,citrus,white floral,sweet,fresh,musky,"yuzu, citruses,orange blossom, neroli"
2,https://www.fragrantica.com/perfume/jean-paul-...,classique-pride-2023,jean-paul-gaultier,France,unisex,191,285,2023.0,"blood orange, yuzu","neroli, orange blossom","musk, white woods",natalie gracia-cetto,quentin bisch,citrus,white floral,sweet,fresh spicy,musky,"blood orange, yuzu,neroli, orange blossom"
3,https://www.fragrantica.com/perfume/bruno-bana...,pride-edition-man,bruno-banani,Germany,men,192,59,2019.0,"guarana, grapefruit, red apple","walnut, lavender, guava","vetiver, benzoin, amber",unknown,,fruity,nutty,woody,tropical,,"guarana, grapefruit, red apple,walnut, lavende..."
4,https://www.fragrantica.com/perfume/jean-paul-...,le-male-pride-collector,jean-paul-gaultier,France,men,193,632,2020.0,"mint, lavender, cardamom, artemisia, bergamot","caraway, cinnamon, orange blossom","vanilla, sandalwood, amber, cedar, tonka bean",francis kurkdjian,,aromatic,warm spicy,fresh spicy,cinnamon,vanilla,"mint, lavender, cardamom, artemisia, bergamot,..."


In [5]:
df["Notes"] = df["Notes"].str.split(",")
Perfume_Notes = df["Perfume"], df["Notes"]
Perfume_Notes = pd.DataFrame(Perfume_Notes).T
Perfume_Notes.columns = ["Name", "Notes"]

In [6]:
Perfume_Notes.head()

Unnamed: 0,Name,Notes
0,accento-overdose-pride-edition,"[fruity notes, aldehydes, green notes, bulga..."
1,classique-pride-2024,"[yuzu, citruses, orange blossom, neroli]"
2,classique-pride-2023,"[blood orange, yuzu, neroli, orange blossom]"
3,pride-edition-man,"[guarana, grapefruit, red apple, walnut, la..."
4,le-male-pride-collector,"[mint, lavender, cardamom, artemisia, berg..."


In [7]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(Perfume_Notes["Notes"].apply(lambda x: ' '.join(x)))
vectorizer.get_feature_names_out()

array(['absinthe', 'absolute', 'acai', ..., 'zefir', 'zest', 'zinnia'],
      dtype=object)

In [8]:
count_vect_df = pd.DataFrame(X.todense(), columns=vectorizer.get_feature_names_out())
pd.concat([Perfume_Notes["Name"], count_vect_df], axis=1).head()
cosine_sim = cosine_similarity(X, X)
cosine_sim_df = pd.DataFrame(cosine_sim, index=Perfume_Notes["Name"], columns=Perfume_Notes["Name"])
cosine_sim_df.head()

Name,accento-overdose-pride-edition,classique-pride-2024,classique-pride-2023,pride-edition-man,le-male-pride-collector,le-male-pride-2023,le-male-pride-2024,polo-red-pride-edition,ralph-pride-edition,waffle,...,lavish,clipping,xchange-wonderman,citizen-jack-parfum,art-collection-la-nuit-de-l-homme,floratta,cheval-d-arabie,khaox,aoud-no-1,narmar-extrait-de-parfum
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
accento-overdose-pride-edition,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.086066,0.0,...,0.11547,0.0,0.0,0.069007,0.0,0.63901,0.167542,0.326599,0.09759,0.223607
classique-pride-2024,0.0,1.0,0.790569,0.0,0.298142,0.790569,1.0,0.0,0.149071,0.169031,...,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.338062,0.0
classique-pride-2023,0.0,0.790569,1.0,0.0,0.353553,1.0,0.790569,0.0,0.117851,0.133631,...,0.316228,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.400892,0.0
pride-edition-man,0.0,0.0,0.0,1.0,0.125988,0.0,0.0,0.267261,0.125988,0.0,...,0.0,0.0,0.133631,0.0,0.188982,0.0,0.0,0.0,0.142857,0.109109
le-male-pride-collector,0.0,0.298142,0.353553,0.125988,1.0,0.353553,0.298142,0.117851,0.111111,0.125988,...,0.298142,0.136083,0.353553,0.089087,0.5,0.0,0.0,0.210819,0.251976,0.3849


In [None]:
def get_recommendations(name="althair", cosine_sim=cosine_sim_df):
    '''
    This function takes a perfume name and returns the top 10 most similar perfumes based on cosine similarity.
    It uses the cosine similarity matrix to find the most similar perfumes and returns their names.
    Using the Levenshtein distance, it finds similar names if the exact name is not found in the dataset.
    If the name is not found in the dataset, it returns an error message.
    Parameters:
        name (str): The name of the perfume to find recommendations for.
        cosine_sim (DataFrame): The cosine similarity matrix.
    Returns:
        DataFrame: A DataFrame containing the top 10 most similar perfumes.

    The code commented out below is another way to clean up the name input, but it seems to create a loop, 
    and it is not necessary as the get_close_matches function already handles similar names.
    '''
    # Clean up the name input
    # replacements = {" ": "", ".": "", "'": "z", "-": "", "_": "", ",": "", ";": "", ":": "", "(": "", ")": "", "[": "", "]": ""}
    # translate_table = str.maketrans(replacements)
    # name = name.translate(translate_table)
    if name not in cosine_sim.index:
        # Find similar names using the similarity it has to other perfume names through Levenshtein distance
        from difflib import get_close_matches
        similar_names = get_close_matches(name, cosine_sim.index, n=5, cutoff=0.6)
        print(f"Similar names found: {similar_names}")
        if not similar_names == []:
            name = similar_names[0]  # Use the first similar name found
            # Provide a recommendation message
            return get_recommendations(name, cosine_sim)
        else:
            # If no similar names are found, return an error message
            return f"Error: '{name}' not found in the dataset."
    # Get the pairwise similarity scores of all perfumes with that perfume
    sim_scores = list(enumerate(cosine_sim[name]))
    # Sort the perfumes based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    # Get the scores of the 10 most similar perfumes
    sim_scores = sim_scores[1:11]
    # Get the perfume indices
    perfume_indices = [i[0] for i in sim_scores]
    return Perfume_Notes["Name"].iloc[perfume_indices]

In [23]:
# Example usage
perfume_input = input("Enter the name of the perfume: ")
# perfume_input = "althair"
# perfume_input = "al haramain"
# perfume_input = "dior"
# perfume_input = "red tobacco"
recommendations = get_recommendations(perfume_input)
print(f"Recommendations for '{perfume_input}': \n{recommendations}")

Similar names found: ['althair', 'lothair', 'altamir', 'salt-air', 'althea']
Recommendations for 'althaair': 
24044             liquid-brun
12754           signature-man
19798         paris-sao-paulo
2231              oud-couture
3281          calypso-vanille
13093                 kachgar
13483    fuel-for-life-spirit
23732                il-homme
14070              yunnan-tea
18675               boundless
Name: Name, dtype: object
