<a href="https://colab.research.google.com/github/fadwa-chb/Sephora_Analysis_Project/blob/main/Sephora__Data_Cleaning_and_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [379]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler,MinMaxScaler,OrdinalEncoder,OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

In [380]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


First data

In [381]:
df=pd.read_csv('/content/drive/MyDrive/sephora/brands_w_m_products.csv')

In [382]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Unnamed: 0    100 non-null    int64 
 1   brand         100 non-null    object
 2   COUNT(brand)  100 non-null    int64 
dtypes: int64(2), object(1)
memory usage: 2.5+ KB


In [383]:
df.head(10)


Unnamed: 0.1,Unnamed: 0,brand,COUNT(brand)
0,0,CLINIQUE,64
1,1,Murad,54
2,2,Perricone MD,52
3,3,Shiseido,52
4,4,Origins,45
5,5,Kiehl's Since 1851,42
6,6,Kate Somerville,41
7,7,Peter Thomas Roth,40
8,8,Est√©e Lauder,39
9,9,Fresh,39


In [384]:
df.tail(10)

Unnamed: 0.1,Unnamed: 0,brand,COUNT(brand)
90,90,COOLA,4
91,91,COVER FX,4
92,92,Evian,4
93,93,Indie Lee,4
94,94,J.One,4
95,95,Naturally Serious,4
96,96,Nurse Jamie,4
97,97,Wander Beauty,4
98,98,BECCA,3
99,99,Briogeo,3


In [385]:
df.shape


(100, 3)

Check null values


In [386]:
df.isnull().sum()

Unnamed: 0,0
Unnamed: 0,0
brand,0
COUNT(brand),0


Check Duplicate Rows

In [387]:
df.duplicated().sum()

0

Drop Unnecessary Columns

In [388]:
df.drop(['id'], axis=1, inplace=True, errors='ignore')

Remove Missing Brands

In [389]:
df.dropna(subset=['brand'], inplace=True)

Convert Count to Numeric

In [390]:
df['COUNT(brand)'] = pd.to_numeric(df['COUNT(brand)'], errors='coerce')

  Rename the column

In [391]:
df.rename(columns={'COUNT(brand)': 'COUNT'}, inplace=True)

In [392]:
df


Unnamed: 0.1,Unnamed: 0,brand,COUNT
0,0,CLINIQUE,64
1,1,Murad,54
2,2,Perricone MD,52
3,3,Shiseido,52
4,4,Origins,45
...,...,...,...
95,95,Naturally Serious,4
96,96,Nurse Jamie,4
97,97,Wander Beauty,4
98,98,BECCA,3


In [393]:
df.drop(['Unnamed: 0'], axis=1, inplace=True, errors='ignore')

In [394]:
df

Unnamed: 0,brand,COUNT
0,CLINIQUE,64
1,Murad,54
2,Perricone MD,52
3,Shiseido,52
4,Origins,45
...,...,...
95,Naturally Serious,4
96,Nurse Jamie,4
97,Wander Beauty,4
98,BECCA,3


Data types

In [395]:
df.dtypes

Unnamed: 0,0
brand,object
COUNT,int64


Handle NaN values created by the conversion

In [396]:
df.dropna(subset=['COUNT'], inplace=True)

DataFrame after dropping NaN values

In [397]:
df

Unnamed: 0,brand,COUNT
0,CLINIQUE,64
1,Murad,54
2,Perricone MD,52
3,Shiseido,52
4,Origins,45
...,...,...
95,Naturally Serious,4
96,Nurse Jamie,4
97,Wander Beauty,4
98,BECCA,3


Standardize Brand Names

In [398]:
df['brand'] = df['brand'].str.strip().str.lower()

In [399]:
df

Unnamed: 0,brand,COUNT
0,clinique,64
1,murad,54
2,perricone md,52
3,shiseido,52
4,origins,45
...,...,...
95,naturally serious,4
96,nurse jamie,4
97,wander beauty,4
98,becca,3


Second data


In [400]:
df = pd.read_json('/content/drive/MyDrive/sephora/csvjson.json')


In [401]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0           100 non-null    int64 
 1   brand   100 non-null    object
 2   name    100 non-null    object
 3   price   100 non-null    int64 
dtypes: int64(2), object(2)
memory usage: 3.3+ KB


In [402]:
df.head()

Unnamed: 0,Unnamed: 1,brand,name,price
0,0,Perricone MD,Neuropeptide Smoothing Facial Conformer,495
1,1,Guerlain,Orchid√©e Imp√©riale The Cream,460
2,2,SK-II,Ultimate Revival Cream,385
3,3,La Mer,The Concentrate,370
4,4,La Mer,The Regenerating Serum,345


In [403]:
df.shape

(100, 4)

Check null Values

In [404]:
df.isnull().sum()

Unnamed: 0,0
,0
brand,0
name,0
price,0


Remove rows with missing brand names

In [405]:
df.dropna(subset=['brand', 'price'], how='all', inplace=True)  # Removes rows where BOTH 'brand' AND 'price' are NaN

In [406]:
df

Unnamed: 0,Unnamed: 1,brand,name,price
0,0,Perricone MD,Neuropeptide Smoothing Facial Conformer,495
1,1,Guerlain,Orchid√©e Imp√©riale The Cream,460
2,2,SK-II,Ultimate Revival Cream,385
3,3,La Mer,The Concentrate,370
4,4,La Mer,The Regenerating Serum,345
...,...,...,...,...
95,95,StackedSkincare,PSC Peptide Serum,150
96,96,Eve Lom,WHITE Advanced Brightening Serum,150
97,97,Peter Thomas Roth,Un-Wrinkle¬Æ Turbo Face Serum,150
98,98,Perricone MD,Cold Plasma+ Face,149


In [407]:
df.dropna(subset=['brand'], inplace=True)

Remove rows with percentage values in the 'brand' column (and other non-product rows)

In [408]:
df = df[~df['brand'].astype(str).str.contains('%')] #remove rows that contains % in the brand column

In [409]:
df=df[~df['brand'].astype(str).str.contains('Other')]#remove rows that contains Other in the brand column

In [410]:
df = df[~df['name'].astype(str).str.contains('√©')]
df = df[~df['name'].astype(str).str.contains('¬Æ')]

In [411]:
df = df[df['price'].notna()] #remove rows that contains NaN in the price column

In [412]:
df

Unnamed: 0,Unnamed: 1,brand,name,price
0,0,Perricone MD,Neuropeptide Smoothing Facial Conformer,495
2,2,SK-II,Ultimate Revival Cream,385
3,3,La Mer,The Concentrate,370
4,4,La Mer,The Regenerating Serum,345
6,6,Shiseido,Future Solution LX Intensive Firming Contour S...,306
...,...,...,...,...
94,94,Eve Lom,Age Defying Smoothing Treatment,150
95,95,StackedSkincare,PSC Peptide Serum,150
96,96,Eve Lom,WHITE Advanced Brightening Serum,150
98,98,Perricone MD,Cold Plasma+ Face,149


In [413]:
df

Unnamed: 0,Unnamed: 1,brand,name,price
0,0,Perricone MD,Neuropeptide Smoothing Facial Conformer,495
2,2,SK-II,Ultimate Revival Cream,385
3,3,La Mer,The Concentrate,370
4,4,La Mer,The Regenerating Serum,345
6,6,Shiseido,Future Solution LX Intensive Firming Contour S...,306
...,...,...,...,...
94,94,Eve Lom,Age Defying Smoothing Treatment,150
95,95,StackedSkincare,PSC Peptide Serum,150
96,96,Eve Lom,WHITE Advanced Brightening Serum,150
98,98,Perricone MD,Cold Plasma+ Face,149


Combine multi-line product names

In [414]:
 df['name'] = df.groupby('brand')['name'].transform(lambda x: '\n'.join(x))
df.drop_duplicates(subset=['brand','price'],inplace=True)

Convert 'price' to numeric

In [415]:
df['price'] = pd.to_numeric(df['price'], errors='coerce')

remove extra spaces from brands

In [416]:
df['brand'] = df['brand'].str.strip()


remove extra spaces from names

In [417]:
df['name'] = df['name'].str.strip()

Drop the first column

In [418]:
df = df.reset_index(drop=True)

In [419]:
df

Unnamed: 0,Unnamed: 1,brand,name,price
0,0,Perricone MD,Neuropeptide Smoothing Facial Conformer\nNeuro...,495
1,2,SK-II,Ultimate Revival Cream\nUltimate Revival Essen...,385
2,3,La Mer,The Concentrate\nThe Regenerating Serum\nThe M...,370
3,4,La Mer,The Concentrate\nThe Regenerating Serum\nThe M...,345
4,6,Shiseido,Future Solution LX Intensive Firming Contour S...,306
...,...,...,...,...
65,89,Peter Thomas Roth,FIRMx Growth Factor Extreme Neuropeptide Serum,150
66,90,StackedSkincare,EGF Activating Serum\nPSC Peptide Serum,150
67,94,Eve Lom,Age Defying Smoothing Treatment\nWHITE Advance...,150
68,98,Perricone MD,Neuropeptide Smoothing Facial Conformer\nNeuro...,149


In [420]:
df.rename(columns={'': 'index'}, inplace=True)

In [421]:
df

Unnamed: 0,index,brand,name,price
0,0,Perricone MD,Neuropeptide Smoothing Facial Conformer\nNeuro...,495
1,2,SK-II,Ultimate Revival Cream\nUltimate Revival Essen...,385
2,3,La Mer,The Concentrate\nThe Regenerating Serum\nThe M...,370
3,4,La Mer,The Concentrate\nThe Regenerating Serum\nThe M...,345
4,6,Shiseido,Future Solution LX Intensive Firming Contour S...,306
...,...,...,...,...
65,89,Peter Thomas Roth,FIRMx Growth Factor Extreme Neuropeptide Serum,150
66,90,StackedSkincare,EGF Activating Serum\nPSC Peptide Serum,150
67,94,Eve Lom,Age Defying Smoothing Treatment\nWHITE Advance...,150
68,98,Perricone MD,Neuropeptide Smoothing Facial Conformer\nNeuro...,149


In [422]:
df = df.reset_index(drop=True)

In [423]:
df

Unnamed: 0,index,brand,name,price
0,0,Perricone MD,Neuropeptide Smoothing Facial Conformer\nNeuro...,495
1,2,SK-II,Ultimate Revival Cream\nUltimate Revival Essen...,385
2,3,La Mer,The Concentrate\nThe Regenerating Serum\nThe M...,370
3,4,La Mer,The Concentrate\nThe Regenerating Serum\nThe M...,345
4,6,Shiseido,Future Solution LX Intensive Firming Contour S...,306
...,...,...,...,...
65,89,Peter Thomas Roth,FIRMx Growth Factor Extreme Neuropeptide Serum,150
66,90,StackedSkincare,EGF Activating Serum\nPSC Peptide Serum,150
67,94,Eve Lom,Age Defying Smoothing Treatment\nWHITE Advance...,150
68,98,Perricone MD,Neuropeptide Smoothing Facial Conformer\nNeuro...,149


In [424]:
df.drop('index', axis=1, inplace=True)  # Drop the 'index' column

In [425]:
df

Unnamed: 0,brand,name,price
0,Perricone MD,Neuropeptide Smoothing Facial Conformer\nNeuro...,495
1,SK-II,Ultimate Revival Cream\nUltimate Revival Essen...,385
2,La Mer,The Concentrate\nThe Regenerating Serum\nThe M...,370
3,La Mer,The Concentrate\nThe Regenerating Serum\nThe M...,345
4,Shiseido,Future Solution LX Intensive Firming Contour S...,306
...,...,...,...
65,Peter Thomas Roth,FIRMx Growth Factor Extreme Neuropeptide Serum,150
66,StackedSkincare,EGF Activating Serum\nPSC Peptide Serum,150
67,Eve Lom,Age Defying Smoothing Treatment\nWHITE Advance...,150
68,Perricone MD,Neuropeptide Smoothing Facial Conformer\nNeuro...,149


In [426]:
df

Unnamed: 0,brand,name,price
0,Perricone MD,Neuropeptide Smoothing Facial Conformer\nNeuro...,495
1,SK-II,Ultimate Revival Cream\nUltimate Revival Essen...,385
2,La Mer,The Concentrate\nThe Regenerating Serum\nThe M...,370
3,La Mer,The Concentrate\nThe Regenerating Serum\nThe M...,345
4,Shiseido,Future Solution LX Intensive Firming Contour S...,306
...,...,...,...
65,Peter Thomas Roth,FIRMx Growth Factor Extreme Neuropeptide Serum,150
66,StackedSkincare,EGF Activating Serum\nPSC Peptide Serum,150
67,Eve Lom,Age Defying Smoothing Treatment\nWHITE Advance...,150
68,Perricone MD,Neuropeptide Smoothing Facial Conformer\nNeuro...,149


Third data


In [427]:
df=pd.read_csv('/content/drive/MyDrive/sephora/product_info.csv')

In [428]:
df.head()

Unnamed: 0,product_id,product_name,brand_id,brand_name,loves_count,rating,reviews,size,variation_type,variation_value,...,online_only,out_of_stock,sephora_exclusive,highlights,primary_category,secondary_category,tertiary_category,child_count,child_max_price,child_min_price
0,P473671,Fragrance Discovery Set,6342,19-69,6320,3.6364,11.0,,,,...,1,0,0,"['Unisex/ Genderless Scent', 'Warm &Spicy Scen...",Fragrance,Value & Gift Sets,Perfume Gift Sets,0,,
1,P473668,La Habana Eau de Parfum,6342,19-69,3827,4.1538,13.0,3.4 oz/ 100 mL,Size + Concentration + Formulation,3.4 oz/ 100 mL,...,1,0,0,"['Unisex/ Genderless Scent', 'Layerable Scent'...",Fragrance,Women,Perfume,2,85.0,30.0
2,P473662,Rainbow Bar Eau de Parfum,6342,19-69,3253,4.25,16.0,3.4 oz/ 100 mL,Size + Concentration + Formulation,3.4 oz/ 100 mL,...,1,0,0,"['Unisex/ Genderless Scent', 'Layerable Scent'...",Fragrance,Women,Perfume,2,75.0,30.0
3,P473660,Kasbah Eau de Parfum,6342,19-69,3018,4.4762,21.0,3.4 oz/ 100 mL,Size + Concentration + Formulation,3.4 oz/ 100 mL,...,1,0,0,"['Unisex/ Genderless Scent', 'Layerable Scent'...",Fragrance,Women,Perfume,2,75.0,30.0
4,P473658,Purple Haze Eau de Parfum,6342,19-69,2691,3.2308,13.0,3.4 oz/ 100 mL,Size + Concentration + Formulation,3.4 oz/ 100 mL,...,1,0,0,"['Unisex/ Genderless Scent', 'Layerable Scent'...",Fragrance,Women,Perfume,2,75.0,30.0


In [429]:
df.columns

Index(['product_id', 'product_name', 'brand_id', 'brand_name', 'loves_count',
       'rating', 'reviews', 'size', 'variation_type', 'variation_value',
       'variation_desc', 'ingredients', 'price_usd', 'value_price_usd',
       'sale_price_usd', 'limited_edition', 'new', 'online_only',
       'out_of_stock', 'sephora_exclusive', 'highlights', 'primary_category',
       'secondary_category', 'tertiary_category', 'child_count',
       'child_max_price', 'child_min_price'],
      dtype='object')

In [430]:
df.dtypes

Unnamed: 0,0
product_id,object
product_name,object
brand_id,int64
brand_name,object
loves_count,int64
rating,float64
reviews,float64
size,object
variation_type,object
variation_value,object


In [431]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8494 entries, 0 to 8493
Data columns (total 27 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   product_id          8494 non-null   object 
 1   product_name        8494 non-null   object 
 2   brand_id            8494 non-null   int64  
 3   brand_name          8494 non-null   object 
 4   loves_count         8494 non-null   int64  
 5   rating              8216 non-null   float64
 6   reviews             8216 non-null   float64
 7   size                6863 non-null   object 
 8   variation_type      7050 non-null   object 
 9   variation_value     6896 non-null   object 
 10  variation_desc      1250 non-null   object 
 11  ingredients         7549 non-null   object 
 12  price_usd           8494 non-null   float64
 13  value_price_usd     451 non-null    float64
 14  sale_price_usd      270 non-null    float64
 15  limited_edition     8494 non-null   int64  
 16  new   

In [432]:
df.shape

(8494, 27)

Handle Missing Values

In [433]:
df.isnull().sum()

Unnamed: 0,0
product_id,0
product_name,0
brand_id,0
brand_name,0
loves_count,0
rating,278
reviews,278
size,1631
variation_type,1444
variation_value,1598


Drop unnecessary columns

In [434]:
columns_to_drop = ['variation_desc', 'ingredients', 'child_max_price', 'child_min_price']
df.drop(columns=columns_to_drop, inplace=True, errors='ignore') # errors='ignore' prevents errors if a column doesn't exist

Standardize text columns (lowercase, remove extra spaces)

In [435]:
text_cols = ['product_name', 'brand_name', 'primary_category', 'secondary_category', 'tertiary_category', 'size', 'variation_type', 'variation_value', 'highlights']

for col in text_cols:
    if col in df.columns:  # Check if column exists
        df[col] = df[col].astype(str).str.lower().str.strip()
        df[col] = df[col].replace(r'\s+', ' ', regex=True)

Handling Missing Value

In [436]:
#rating and reviews (since they have the same missing values)
df['rating'].fillna(0, inplace=True)  # Fill with 0 (meaning no rating)
df['reviews'].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['rating'].fillna(0, inplace=True)  # Fill with 0 (meaning no rating)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['reviews'].fillna(0, inplace=True)


In [437]:
df

Unnamed: 0,product_id,product_name,brand_id,brand_name,loves_count,rating,reviews,size,variation_type,variation_value,...,limited_edition,new,online_only,out_of_stock,sephora_exclusive,highlights,primary_category,secondary_category,tertiary_category,child_count
0,P473671,fragrance discovery set,6342,19-69,6320,3.6364,11.0,,,,...,0,0,1,0,0,"['unisex/ genderless scent', 'warm &spicy scen...",fragrance,value & gift sets,perfume gift sets,0
1,P473668,la habana eau de parfum,6342,19-69,3827,4.1538,13.0,3.4 oz/ 100 ml,size + concentration + formulation,3.4 oz/ 100 ml,...,0,0,1,0,0,"['unisex/ genderless scent', 'layerable scent'...",fragrance,women,perfume,2
2,P473662,rainbow bar eau de parfum,6342,19-69,3253,4.2500,16.0,3.4 oz/ 100 ml,size + concentration + formulation,3.4 oz/ 100 ml,...,0,0,1,0,0,"['unisex/ genderless scent', 'layerable scent'...",fragrance,women,perfume,2
3,P473660,kasbah eau de parfum,6342,19-69,3018,4.4762,21.0,3.4 oz/ 100 ml,size + concentration + formulation,3.4 oz/ 100 ml,...,0,0,1,0,0,"['unisex/ genderless scent', 'layerable scent'...",fragrance,women,perfume,2
4,P473658,purple haze eau de parfum,6342,19-69,2691,3.2308,13.0,3.4 oz/ 100 ml,size + concentration + formulation,3.4 oz/ 100 ml,...,0,0,1,0,0,"['unisex/ genderless scent', 'layerable scent'...",fragrance,women,perfume,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8489,P467659,couture clutch eyeshadow palette,1070,yves saint laurent,2790,4.4286,7.0,,,,...,0,0,0,0,0,,makeup,eye,eye palettes,0
8490,P500874,l'homme eau de parfum,1070,yves saint laurent,2319,4.6367,556.0,2 oz / 60 ml,size + concentration + formulation,2 oz / 60 ml eau de parfum spray,...,0,0,0,0,0,"['layerable scent', 'woody & earthy scent']",fragrance,men,cologne,1
8491,P504428,mon paris eau de parfum gift set,1070,yves saint laurent,1475,5.0000,2.0,,,,...,1,1,1,1,0,,fragrance,value & gift sets,perfume gift sets,0
8492,P504448,y eau de parfum gift set,1070,yves saint laurent,840,0.0000,0.0,,,,...,1,1,1,0,0,,fragrance,value & gift sets,cologne gift sets,0


Handling missing values

In [438]:
missing_value_formats = ["nan", " ", ""] #Add all the representations of missing values
for col in ['size','variation_type', 'variation_value', 'highlights']:
    if col in df.columns:
        df[col] = df[col].replace(missing_value_formats, np.nan)

# --- Handling Missing Values ---
fillna_cols = ['size','variation_type', 'variation_value', 'highlights']
for col in fillna_cols:
    if col in df.columns:
        df[col] = df[col].fillna('not specified')

In [439]:
# --- Handling Missing Values (using SimpleImputer for 'most_frequent') ---
imputer = SimpleImputer(missing_values='not specified', strategy='most_frequent')

df[['variation_type']] = imputer.fit_transform(df[['variation_type']])
df[['variation_value']] = imputer.fit_transform(df[['variation_value']])
df[['highlights']] = imputer.fit_transform(df[['highlights']])

In [440]:
df

Unnamed: 0,product_id,product_name,brand_id,brand_name,loves_count,rating,reviews,size,variation_type,variation_value,...,limited_edition,new,online_only,out_of_stock,sephora_exclusive,highlights,primary_category,secondary_category,tertiary_category,child_count
0,P473671,fragrance discovery set,6342,19-69,6320,3.6364,11.0,not specified,size,1.7 oz/ 50 ml,...,0,0,1,0,0,"['unisex/ genderless scent', 'warm &spicy scen...",fragrance,value & gift sets,perfume gift sets,0
1,P473668,la habana eau de parfum,6342,19-69,3827,4.1538,13.0,3.4 oz/ 100 ml,size + concentration + formulation,3.4 oz/ 100 ml,...,0,0,1,0,0,"['unisex/ genderless scent', 'layerable scent'...",fragrance,women,perfume,2
2,P473662,rainbow bar eau de parfum,6342,19-69,3253,4.2500,16.0,3.4 oz/ 100 ml,size + concentration + formulation,3.4 oz/ 100 ml,...,0,0,1,0,0,"['unisex/ genderless scent', 'layerable scent'...",fragrance,women,perfume,2
3,P473660,kasbah eau de parfum,6342,19-69,3018,4.4762,21.0,3.4 oz/ 100 ml,size + concentration + formulation,3.4 oz/ 100 ml,...,0,0,1,0,0,"['unisex/ genderless scent', 'layerable scent'...",fragrance,women,perfume,2
4,P473658,purple haze eau de parfum,6342,19-69,2691,3.2308,13.0,3.4 oz/ 100 ml,size + concentration + formulation,3.4 oz/ 100 ml,...,0,0,1,0,0,"['unisex/ genderless scent', 'layerable scent'...",fragrance,women,perfume,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8489,P467659,couture clutch eyeshadow palette,1070,yves saint laurent,2790,4.4286,7.0,not specified,size,1.7 oz/ 50 ml,...,0,0,0,0,0,"['layerable scent', 'floral scent']",makeup,eye,eye palettes,0
8490,P500874,l'homme eau de parfum,1070,yves saint laurent,2319,4.6367,556.0,2 oz / 60 ml,size + concentration + formulation,2 oz / 60 ml eau de parfum spray,...,0,0,0,0,0,"['layerable scent', 'woody & earthy scent']",fragrance,men,cologne,1
8491,P504428,mon paris eau de parfum gift set,1070,yves saint laurent,1475,5.0000,2.0,not specified,size,1.7 oz/ 50 ml,...,1,1,1,1,0,"['layerable scent', 'floral scent']",fragrance,value & gift sets,perfume gift sets,0
8492,P504448,y eau de parfum gift set,1070,yves saint laurent,840,0.0000,0.0,not specified,size,1.7 oz/ 50 ml,...,1,1,1,0,0,"['layerable scent', 'floral scent']",fragrance,value & gift sets,cologne gift sets,0


In [441]:
# --- Handling Missing Values in Numeric Columns (filling with 0) ---
numeric_cols_with_missing = ['rating', 'reviews']
for col in numeric_cols_with_missing:
    if col in df.columns:
        df[col] = df[col].fillna(0)

In [442]:
# Price related
df['value_price_usd'] = df['value_price_usd'].fillna(df['price_usd'])
df['sale_price_usd'] = df['sale_price_usd'].fillna(0)




In [443]:
df

Unnamed: 0,product_id,product_name,brand_id,brand_name,loves_count,rating,reviews,size,variation_type,variation_value,...,limited_edition,new,online_only,out_of_stock,sephora_exclusive,highlights,primary_category,secondary_category,tertiary_category,child_count
0,P473671,fragrance discovery set,6342,19-69,6320,3.6364,11.0,not specified,size,1.7 oz/ 50 ml,...,0,0,1,0,0,"['unisex/ genderless scent', 'warm &spicy scen...",fragrance,value & gift sets,perfume gift sets,0
1,P473668,la habana eau de parfum,6342,19-69,3827,4.1538,13.0,3.4 oz/ 100 ml,size + concentration + formulation,3.4 oz/ 100 ml,...,0,0,1,0,0,"['unisex/ genderless scent', 'layerable scent'...",fragrance,women,perfume,2
2,P473662,rainbow bar eau de parfum,6342,19-69,3253,4.2500,16.0,3.4 oz/ 100 ml,size + concentration + formulation,3.4 oz/ 100 ml,...,0,0,1,0,0,"['unisex/ genderless scent', 'layerable scent'...",fragrance,women,perfume,2
3,P473660,kasbah eau de parfum,6342,19-69,3018,4.4762,21.0,3.4 oz/ 100 ml,size + concentration + formulation,3.4 oz/ 100 ml,...,0,0,1,0,0,"['unisex/ genderless scent', 'layerable scent'...",fragrance,women,perfume,2
4,P473658,purple haze eau de parfum,6342,19-69,2691,3.2308,13.0,3.4 oz/ 100 ml,size + concentration + formulation,3.4 oz/ 100 ml,...,0,0,1,0,0,"['unisex/ genderless scent', 'layerable scent'...",fragrance,women,perfume,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8489,P467659,couture clutch eyeshadow palette,1070,yves saint laurent,2790,4.4286,7.0,not specified,size,1.7 oz/ 50 ml,...,0,0,0,0,0,"['layerable scent', 'floral scent']",makeup,eye,eye palettes,0
8490,P500874,l'homme eau de parfum,1070,yves saint laurent,2319,4.6367,556.0,2 oz / 60 ml,size + concentration + formulation,2 oz / 60 ml eau de parfum spray,...,0,0,0,0,0,"['layerable scent', 'woody & earthy scent']",fragrance,men,cologne,1
8491,P504428,mon paris eau de parfum gift set,1070,yves saint laurent,1475,5.0000,2.0,not specified,size,1.7 oz/ 50 ml,...,1,1,1,1,0,"['layerable scent', 'floral scent']",fragrance,value & gift sets,perfume gift sets,0
8492,P504448,y eau de parfum gift set,1070,yves saint laurent,840,0.0000,0.0,not specified,size,1.7 oz/ 50 ml,...,1,1,1,0,0,"['layerable scent', 'floral scent']",fragrance,value & gift sets,cologne gift sets,0


Data Type Conversion

In [444]:
numeric_cols = ['loves_count', 'rating', 'reviews', 'price_usd', 'value_price_usd', 'sale_price_usd', 'child_count']
for col in numeric_cols:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

In [445]:
df.head()

Unnamed: 0,product_id,product_name,brand_id,brand_name,loves_count,rating,reviews,size,variation_type,variation_value,...,limited_edition,new,online_only,out_of_stock,sephora_exclusive,highlights,primary_category,secondary_category,tertiary_category,child_count
0,P473671,fragrance discovery set,6342,19-69,6320,3.6364,11.0,not specified,size,1.7 oz/ 50 ml,...,0,0,1,0,0,"['unisex/ genderless scent', 'warm &spicy scen...",fragrance,value & gift sets,perfume gift sets,0
1,P473668,la habana eau de parfum,6342,19-69,3827,4.1538,13.0,3.4 oz/ 100 ml,size + concentration + formulation,3.4 oz/ 100 ml,...,0,0,1,0,0,"['unisex/ genderless scent', 'layerable scent'...",fragrance,women,perfume,2
2,P473662,rainbow bar eau de parfum,6342,19-69,3253,4.25,16.0,3.4 oz/ 100 ml,size + concentration + formulation,3.4 oz/ 100 ml,...,0,0,1,0,0,"['unisex/ genderless scent', 'layerable scent'...",fragrance,women,perfume,2
3,P473660,kasbah eau de parfum,6342,19-69,3018,4.4762,21.0,3.4 oz/ 100 ml,size + concentration + formulation,3.4 oz/ 100 ml,...,0,0,1,0,0,"['unisex/ genderless scent', 'layerable scent'...",fragrance,women,perfume,2
4,P473658,purple haze eau de parfum,6342,19-69,2691,3.2308,13.0,3.4 oz/ 100 ml,size + concentration + formulation,3.4 oz/ 100 ml,...,0,0,1,0,0,"['unisex/ genderless scent', 'layerable scent'...",fragrance,women,perfume,2


In [446]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8494 entries, 0 to 8493
Data columns (total 23 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   product_id          8494 non-null   object 
 1   product_name        8494 non-null   object 
 2   brand_id            8494 non-null   int64  
 3   brand_name          8494 non-null   object 
 4   loves_count         8494 non-null   int64  
 5   rating              8494 non-null   float64
 6   reviews             8494 non-null   float64
 7   size                8494 non-null   object 
 8   variation_type      8494 non-null   object 
 9   variation_value     8494 non-null   object 
 10  price_usd           8494 non-null   float64
 11  value_price_usd     8494 non-null   float64
 12  sale_price_usd      8494 non-null   float64
 13  limited_edition     8494 non-null   int64  
 14  new                 8494 non-null   int64  
 15  online_only         8494 non-null   int64  
 16  out_of

In [447]:
df.head()

Unnamed: 0,product_id,product_name,brand_id,brand_name,loves_count,rating,reviews,size,variation_type,variation_value,...,limited_edition,new,online_only,out_of_stock,sephora_exclusive,highlights,primary_category,secondary_category,tertiary_category,child_count
0,P473671,fragrance discovery set,6342,19-69,6320,3.6364,11.0,not specified,size,1.7 oz/ 50 ml,...,0,0,1,0,0,"['unisex/ genderless scent', 'warm &spicy scen...",fragrance,value & gift sets,perfume gift sets,0
1,P473668,la habana eau de parfum,6342,19-69,3827,4.1538,13.0,3.4 oz/ 100 ml,size + concentration + formulation,3.4 oz/ 100 ml,...,0,0,1,0,0,"['unisex/ genderless scent', 'layerable scent'...",fragrance,women,perfume,2
2,P473662,rainbow bar eau de parfum,6342,19-69,3253,4.25,16.0,3.4 oz/ 100 ml,size + concentration + formulation,3.4 oz/ 100 ml,...,0,0,1,0,0,"['unisex/ genderless scent', 'layerable scent'...",fragrance,women,perfume,2
3,P473660,kasbah eau de parfum,6342,19-69,3018,4.4762,21.0,3.4 oz/ 100 ml,size + concentration + formulation,3.4 oz/ 100 ml,...,0,0,1,0,0,"['unisex/ genderless scent', 'layerable scent'...",fragrance,women,perfume,2
4,P473658,purple haze eau de parfum,6342,19-69,2691,3.2308,13.0,3.4 oz/ 100 ml,size + concentration + formulation,3.4 oz/ 100 ml,...,0,0,1,0,0,"['unisex/ genderless scent', 'layerable scent'...",fragrance,women,perfume,2


In [448]:
df.tail()

Unnamed: 0,product_id,product_name,brand_id,brand_name,loves_count,rating,reviews,size,variation_type,variation_value,...,limited_edition,new,online_only,out_of_stock,sephora_exclusive,highlights,primary_category,secondary_category,tertiary_category,child_count
8489,P467659,couture clutch eyeshadow palette,1070,yves saint laurent,2790,4.4286,7.0,not specified,size,1.7 oz/ 50 ml,...,0,0,0,0,0,"['layerable scent', 'floral scent']",makeup,eye,eye palettes,0
8490,P500874,l'homme eau de parfum,1070,yves saint laurent,2319,4.6367,556.0,2 oz / 60 ml,size + concentration + formulation,2 oz / 60 ml eau de parfum spray,...,0,0,0,0,0,"['layerable scent', 'woody & earthy scent']",fragrance,men,cologne,1
8491,P504428,mon paris eau de parfum gift set,1070,yves saint laurent,1475,5.0,2.0,not specified,size,1.7 oz/ 50 ml,...,1,1,1,1,0,"['layerable scent', 'floral scent']",fragrance,value & gift sets,perfume gift sets,0
8492,P504448,y eau de parfum gift set,1070,yves saint laurent,840,0.0,0.0,not specified,size,1.7 oz/ 50 ml,...,1,1,1,0,0,"['layerable scent', 'floral scent']",fragrance,value & gift sets,cologne gift sets,0
8493,P505461,candy glaze lip gloss stick duo with hyaluroni...,1070,yves saint laurent,193,0.0,0.0,.11 oz / 3.2 ml,color,1.7 oz/ 50 ml,...,0,1,1,0,1,"['hyaluronic acid', 'high shine finish', 'plum...",makeup,lip,lip gloss,0
