# Data Exploration of Fragrance Dataset

## Import Libraries

In [4]:
!pip install openpyxl

Collecting openpyxl
  Downloading openpyxl-3.1.2-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-1.1.0-py3-none-any.whl.metadata (1.8 kB)
Downloading openpyxl-3.1.2-py2.py3-none-any.whl (249 kB)
   ---------------------------------------- 0.0/250.0 kB ? eta -:--:--
   - -------------------------------------- 10.2/250.0 kB ? eta -:--:--
   ------ -------------------------------- 41.0/250.0 kB 487.6 kB/s eta 0:00:01
   -------------- ------------------------ 92.2/250.0 kB 744.7 kB/s eta 0:00:01
   --------------------------- ------------ 174.1/250.0 kB 1.2 MB/s eta 0:00:01
   ---------------------------------------  245.8/250.0 kB 1.4 MB/s eta 0:00:01
   ---------------------------------------- 250.0/250.0 kB 1.0 MB/s eta 0:00:00
Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.1.2


In [1]:
import pandas as pd
import numpy as np

## Open Data

In [2]:
df = pd.read_excel('perfume_database.xlsx', 
                   usecols=['brand', 'perfume', 'notes', 'longevity', 'sillage', 'gender'])
df

Unnamed: 0,brand,perfume,notes,longevity,sillage,gender
0,18 21 Man Made,Sweet Tobacco Spirits,"[""Citruses"", ""Saffron"", ""Tonka Bean"", ""Vanilla...","[4, 0, 9, 14, 31]","[10, 19, 28, 11]",unisex
1,40 Notes Perfume,Cashmere Musk,"[""Sandalwood"", ""Cedar"", ""White Musk"", ""Cashmer...","[1, 0, 0, 1, 1]","[1, 3, 0, 2]",unisex
2,40 Notes Perfume,Exotic Ylang Ylang,"[""Ylang-Ylang"", ""Gardenia"", ""Musk""]","[1, 0, 0, 0, 2]","[0, 2, 1, 2]",unisex
3,40 Notes Perfume,Exquisite Amber,"[""Labdanum"", ""Styrax"", ""Benzoin"", ""Vanilla"", ""...","[0, 0, 0, 0, 1]","[1, 0, 3, 1]",female
4,40 Notes Perfume,Oudwood Veil,"[""Kephalis"", ""Agarwood (Oud)""]","[1, 1, 1, 0, 6]","[1, 4, 0, 5]",unisex
...,...,...,...,...,...,...
37921,Urban Rituelle,Lemongrass Blend,"[""Lemongrass"", ""Myrtle"", ""Grapefruit"", ""Eucaly...","[0, 0, 0, 0, 0]","[0, 0, 0, 0]",male
37922,Urban Rituelle,Peach Blossom,"[""Peach"", ""Honey"", ""Sweet Pea"", ""Mimosa""]","[0, 0, 0, 0, 0]","[0, 0, 0, 0]",female
37923,Urban Rituelle,Pomegranate,"[""Pomegranate"", ""Citruses"", ""Red Berries""]","[0, 0, 0, 0, 0]","[0, 0, 0, 0]",unisex
37924,Urban Rituelle,Vanilla,"[""Vanilla"", ""Caramel"", ""Milk""]","[0, 0, 0, 0, 0]","[0, 0, 0, 0]",female


### Process Data - Longevity and Sillage
* We are given a vector of 5 numbers for longevity: `[very weak, weak, moderate, long lasting, eternal]` The numbers represent the votes from users. The max of these numbers will be used to determine how long the fragrance lasts. The newly processed data will just hold a string that displays 1 of the 5 options.

* We are given a vector of 4 numbers for sillage: `[intimate, moderate, strong, enormous]` Again the max will be taken to represent the sillage - or scent trail. The newly processed data will hold a string that displays 1 of the 4 options.

* The image and launch years will be removed

In [3]:
dataset_longevity = pd.DataFrame(df['longevity'])

longevity_classification = {
    0 : "very weak",
    1 : "weak",
    2 : "moderate",
    3 : "long lasting",
    4 : "eternal"
}

def classify_longevity(votes):
    if isinstance(votes, str):
        votes = eval(votes)
    index = np.argmax(votes)
    # Return the corresponding classification
    return longevity_classification.get(index, "unknown")


dataset_longevity['longevity'] = dataset_longevity['longevity'].apply(classify_longevity)

print(dataset_longevity.head())
    
    
    

   longevity
0    eternal
1  very weak
2    eternal
3    eternal
4    eternal


In [4]:
df["longevity"] = dataset_longevity['longevity']
df

Unnamed: 0,brand,perfume,notes,longevity,sillage,gender
0,18 21 Man Made,Sweet Tobacco Spirits,"[""Citruses"", ""Saffron"", ""Tonka Bean"", ""Vanilla...",eternal,"[10, 19, 28, 11]",unisex
1,40 Notes Perfume,Cashmere Musk,"[""Sandalwood"", ""Cedar"", ""White Musk"", ""Cashmer...",very weak,"[1, 3, 0, 2]",unisex
2,40 Notes Perfume,Exotic Ylang Ylang,"[""Ylang-Ylang"", ""Gardenia"", ""Musk""]",eternal,"[0, 2, 1, 2]",unisex
3,40 Notes Perfume,Exquisite Amber,"[""Labdanum"", ""Styrax"", ""Benzoin"", ""Vanilla"", ""...",eternal,"[1, 0, 3, 1]",female
4,40 Notes Perfume,Oudwood Veil,"[""Kephalis"", ""Agarwood (Oud)""]",eternal,"[1, 4, 0, 5]",unisex
...,...,...,...,...,...,...
37921,Urban Rituelle,Lemongrass Blend,"[""Lemongrass"", ""Myrtle"", ""Grapefruit"", ""Eucaly...",very weak,"[0, 0, 0, 0]",male
37922,Urban Rituelle,Peach Blossom,"[""Peach"", ""Honey"", ""Sweet Pea"", ""Mimosa""]",very weak,"[0, 0, 0, 0]",female
37923,Urban Rituelle,Pomegranate,"[""Pomegranate"", ""Citruses"", ""Red Berries""]",very weak,"[0, 0, 0, 0]",unisex
37924,Urban Rituelle,Vanilla,"[""Vanilla"", ""Caramel"", ""Milk""]",very weak,"[0, 0, 0, 0]",female


In [8]:
sillage_classification = {
    0 : "intimate",
    1 : "moderate",
    2 : "strong",
    3 : "enormous"
}

dataset_sillage = pd.DataFrame(df['sillage'])


def classify_sillage(votes):
    if isinstance(votes, str):
        votes = eval(votes)
    index = np.argmax(votes)
    # Return the corresponding classification
    return sillage_classification.get(index, "unknown")


dataset_sillage['sillage'] = dataset_sillage['sillage'].apply(classify_sillage)

print(dataset_sillage.head())

    sillage
0    strong
1  moderate
2  moderate
3    strong
4  enormous


In [9]:
df["sillage"] = dataset_sillage['sillage']
df

Unnamed: 0,brand,perfume,notes,longevity,sillage,gender
0,18 21 Man Made,Sweet Tobacco Spirits,"citruses, saffron, tonka bean, vanilla, exotic...",eternal,strong,unisex
1,40 Notes Perfume,Cashmere Musk,"sandalwood, cedar, white musk, cashmere wood",very weak,moderate,unisex
2,40 Notes Perfume,Exotic Ylang Ylang,"ylang-ylang, gardenia, musk",eternal,moderate,unisex
3,40 Notes Perfume,Exquisite Amber,"labdanum, styrax, benzoin, vanilla, musk",eternal,strong,female
4,40 Notes Perfume,Oudwood Veil,"kephalis, agarwood (oud)",eternal,enormous,unisex
...,...,...,...,...,...,...
37921,Urban Rituelle,Lemongrass Blend,"lemongrass, myrtle, grapefruit, eucalyptus",very weak,intimate,male
37922,Urban Rituelle,Peach Blossom,"peach, honey, sweet pea, mimosa",very weak,intimate,female
37923,Urban Rituelle,Pomegranate,"pomegranate, citruses, red berries",very weak,intimate,unisex
37924,Urban Rituelle,Vanilla,"vanilla, caramel, milk",very weak,intimate,female


### Process Data - Clean Notes
* We will remove the list seen in the data, the quotes and lower case all the notes.
* Cleaning the notes column is provided by https://github.com/rdemarqui/perfume_recommender/blob/main/perfume_similarity.ipynb

In [10]:
df["notes"] = pd.DataFrame(df['notes'])

itens_to_remove = [
    '[', ']', '"', '{', '}',
    'middle: ', 'top: ', 'base: ', 'null'
]
def remove_items(text):
    for item in itens_to_remove:
        text = text.replace(item, "")
    return text

df['notes'] = df['notes'].astype(str)
df['notes'] = df['notes'].str.lower()
df['notes'] = df['notes'].apply(remove_items)

In [11]:
df

Unnamed: 0,brand,perfume,notes,longevity,sillage,gender
0,18 21 Man Made,Sweet Tobacco Spirits,"citruses, saffron, tonka bean, vanilla, exotic...",eternal,strong,unisex
1,40 Notes Perfume,Cashmere Musk,"sandalwood, cedar, white musk, cashmere wood",very weak,moderate,unisex
2,40 Notes Perfume,Exotic Ylang Ylang,"ylang-ylang, gardenia, musk",eternal,moderate,unisex
3,40 Notes Perfume,Exquisite Amber,"labdanum, styrax, benzoin, vanilla, musk",eternal,strong,female
4,40 Notes Perfume,Oudwood Veil,"kephalis, agarwood (oud)",eternal,enormous,unisex
...,...,...,...,...,...,...
37921,Urban Rituelle,Lemongrass Blend,"lemongrass, myrtle, grapefruit, eucalyptus",very weak,intimate,male
37922,Urban Rituelle,Peach Blossom,"peach, honey, sweet pea, mimosa",very weak,intimate,female
37923,Urban Rituelle,Pomegranate,"pomegranate, citruses, red berries",very weak,intimate,unisex
37924,Urban Rituelle,Vanilla,"vanilla, caramel, milk",very weak,intimate,female


In [41]:
df.to_excel('perfume_database_cleaned.xlsx', index=False)