## 1. Loading Dataset and Overview

In [1]:
# Import necessary libraries
import pandas as pd

### 1.1 Loading Dataset

In [None]:
# Read the raw dataset from its CSV file into a DataFrame.
raw_data_path = '../data/data_raw.csv'
raw_data = pd.read_csv(filepath_or_buffer=raw_data_path)

# Leave the frame as the cell's last expression so the notebook renders it
# (pandas abbreviates the display, as the elided rows in the output show).
raw_data


Unnamed: 0,ProductID,ProductName,ProductBrand,Gender,Price (INR),NumImages,Description,PrimaryColor
0,10017413,DKNY Unisex Black & Grey Printed Medium Trolle...,DKNY,Unisex,11745,7,"Black and grey printed medium trolley bag, sec...",Black
1,10016283,EthnoVogue Women Beige & Grey Made to Measure ...,EthnoVogue,Women,5810,7,Beige & Grey made to measure kurta with churid...,Beige
2,10009781,SPYKAR Women Pink Alexa Super Skinny Fit High-...,SPYKAR,Women,899,7,Pink coloured wash 5-pocket high-rise cropped ...,Pink
3,10015921,Raymond Men Blue Self-Design Single-Breasted B...,Raymond,Men,5599,5,Blue self-design bandhgala suitBlue self-desig...,Blue
4,10017833,Parx Men Brown & Off-White Slim Fit Printed Ca...,Parx,Men,759,5,"Brown and off-white printed casual shirt, has ...",White
...,...,...,...,...,...,...,...,...
12486,10262843,Pepe Jeans Men Black Hammock Slim Fit Low-Rise...,Pepe Jeans,Men,1299,7,"Black dark wash 5-pocket low-rise jeans, clean...",Black
12487,10261721,Mochi Women Gold-Toned Solid Heels,Mochi,Women,1990,5,"A pair of gold-toned open toe heels, has regul...",Gold
12488,10261607,612 league Girls Navy Blue & White Printed Reg...,612 league,Girls,602,4,Navy Blue and White printed mid-rise denim sho...,Blue
12489,10266621,Bvlgari Men Aqva Pour Homme Marine Eau de Toil...,Bvlgari,Men,8950,2,Bvlgari Men Aqva Pour Homme Marine Eau de Toil...,


### 1.2 Data Overview

In [4]:
# Report the (rows, columns) dimensions of the raw dataset.
print(f"Shape of DataFrame: {raw_data.shape}")

Shape of DataFrame: (12491, 8)


In [5]:

# Count the distinct values in every column.
# The header and the Series are printed with separate print() calls: passing
# both to a single call puts print's default " " separator right after the
# "\n", which indents the first row and breaks the column alignment.
print("Number of unique values in each column:")
print(raw_data.nunique())

Number of unique values in each column:
 ProductID       12491
ProductName     10761
ProductBrand      677
Gender              6
Price (INR)      1543
NumImages          10
Description     10435
PrimaryColor       27
dtype: int64


In [6]:
# Summarize the raw dataset's structure: index range, per-column non-null
# counts and dtypes, and the approximate memory usage.
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12491 entries, 0 to 12490
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   ProductID     12491 non-null  int64 
 1   ProductName   12491 non-null  object
 2   ProductBrand  12491 non-null  object
 3   Gender        12491 non-null  object
 4   Price (INR)   12491 non-null  int64 
 5   NumImages     12491 non-null  int64 
 6   Description   12491 non-null  object
 7   PrimaryColor  11597 non-null  object
dtypes: int64(3), object(5)
memory usage: 780.8+ KB


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# NOTE(review): ProductID is an integer identifier (it is cast to string during
# cleaning), so its statistics here carry no analytical meaning.
raw_data.describe()

Unnamed: 0,ProductID,Price (INR),NumImages
count,12491.0,12491.0,12491.0
mean,9917160.0,1452.660956,4.913698
std,1438006.0,2118.503976,1.092333
min,101206.0,90.0,1.0
25%,10062150.0,649.0,5.0
50%,10154630.0,920.0,5.0
75%,10215650.0,1499.0,5.0
max,10275140.0,63090.0,10.0


In [8]:
# Count rows that are exact duplicates of an earlier row (all columns compared).
# NOTE(review): the unique-value counts above show ProductID is distinct for
# every row, so this full-row comparison can only return 0. To detect repeated
# products, consider comparing on a subset of columns that excludes ProductID.
raw_data.duplicated().sum()

np.int64(0)

## 2. Data Cleaning

- Handle missing values

- Rename columns

- Correct data types

- Check for duplicates (none are found, so nothing is removed)



In [9]:
# Restrict the working frame to the columns used in the analysis
# (NumImages is left out). `.copy()` gives `df` its own data, independent
# of `raw_data`.
columns_to_keep = [
    "ProductID",
    "ProductName",
    "Description",
    "ProductBrand",
    "Gender",
    "Price (INR)",
    "PrimaryColor",
]
df = raw_data.loc[:, columns_to_keep].copy()

df

Unnamed: 0,ProductID,ProductName,Description,ProductBrand,Gender,Price (INR),PrimaryColor
0,10017413,DKNY Unisex Black & Grey Printed Medium Trolle...,"Black and grey printed medium trolley bag, sec...",DKNY,Unisex,11745,Black
1,10016283,EthnoVogue Women Beige & Grey Made to Measure ...,Beige & Grey made to measure kurta with churid...,EthnoVogue,Women,5810,Beige
2,10009781,SPYKAR Women Pink Alexa Super Skinny Fit High-...,Pink coloured wash 5-pocket high-rise cropped ...,SPYKAR,Women,899,Pink
3,10015921,Raymond Men Blue Self-Design Single-Breasted B...,Blue self-design bandhgala suitBlue self-desig...,Raymond,Men,5599,Blue
4,10017833,Parx Men Brown & Off-White Slim Fit Printed Ca...,"Brown and off-white printed casual shirt, has ...",Parx,Men,759,White
...,...,...,...,...,...,...,...
12486,10262843,Pepe Jeans Men Black Hammock Slim Fit Low-Rise...,"Black dark wash 5-pocket low-rise jeans, clean...",Pepe Jeans,Men,1299,Black
12487,10261721,Mochi Women Gold-Toned Solid Heels,"A pair of gold-toned open toe heels, has regul...",Mochi,Women,1990,Gold
12488,10261607,612 league Girls Navy Blue & White Printed Reg...,Navy Blue and White printed mid-rise denim sho...,612 league,Girls,602,Blue
12489,10266621,Bvlgari Men Aqva Pour Homme Marine Eau de Toil...,Bvlgari Men Aqva Pour Homme Marine Eau de Toil...,Bvlgari,Men,8950,


In [63]:

# Drop every row that has at least one missing value.
# Per the raw `info()` output, PrimaryColor is the only column with nulls
# (11597 of 12491 non-null), so this discards the 894 rows without a colour.
# The original index labels are kept (the index is not reset).
df = df.dropna()

In [64]:
# Switch the column labels to snake_case (keys listed in the frame's column order).
column_renames = {
    "ProductID": "product_id",
    "ProductName": "product_name",
    "Description": "description",
    "ProductBrand": "product_brand",
    "Gender": "gender",
    "Price (INR)": "price_inr",
    "PrimaryColor": "primary_color",
}
df = df.rename(columns=column_renames)

In [65]:
# Cast every column to its intended dtype in one pass:
#   - identifier and free-text columns -> pandas "string"
#   - low-cardinality label columns    -> "category"
#   - price                            -> float64
df = df.astype({
    "product_id": "string",
    "product_name": "string",
    "product_brand": "string",
    "description": "string",
    "gender": "category",
    "primary_color": "category",
    "price_inr": "float64",
})

# Preview the first rows of the cleaned DataFrame.
df.head()

Unnamed: 0,product_id,product_name,description,product_brand,gender,price_inr,primary_color
0,10017413,DKNY Unisex Black & Grey Printed Medium Trolle...,"Black and grey printed medium trolley bag, sec...",DKNY,Unisex,11745.0,Black
1,10016283,EthnoVogue Women Beige & Grey Made to Measure ...,Beige & Grey made to measure kurta with churid...,EthnoVogue,Women,5810.0,Beige
2,10009781,SPYKAR Women Pink Alexa Super Skinny Fit High-...,Pink coloured wash 5-pocket high-rise cropped ...,SPYKAR,Women,899.0,Pink
3,10015921,Raymond Men Blue Self-Design Single-Breasted B...,Blue self-design bandhgala suitBlue self-desig...,Raymond,Men,5599.0,Blue
4,10017833,Parx Men Brown & Off-White Slim Fit Printed Ca...,"Brown and off-white printed casual shirt, has ...",Parx,Men,759.0,White


In [66]:
# Trim leading and trailing whitespace from every text-like column.
# The list includes the two category columns (gender, primary_color); their
# dtype is re-applied in the following cell.
string_columns = ["product_id", "product_name", "product_brand", "description", "gender", "primary_color"]

df = df.assign(**{name: df[name].str.strip() for name in string_columns})

In [67]:
# Re-apply the category dtype to the two label columns after the whitespace cleanup.
df = df.astype({"gender": "category", "primary_color": "category"})

In [68]:
# Count fully duplicated rows in the cleaned frame (all columns compared).
# NOTE(review): product_id is part of the comparison; confirm whether duplicates
# should instead be checked on the descriptive columns only.
df.duplicated().sum()

np.int64(0)

In [69]:
# Verify the cleaning result: the non-null counts should match the row count
# for every column, and the dtypes should be string / category / float64.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11597 entries, 0 to 12490
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   product_id     11597 non-null  string  
 1   product_name   11597 non-null  string  
 2   description    11597 non-null  string  
 3   product_brand  11597 non-null  string  
 4   gender         11597 non-null  category
 5   price_inr      11597 non-null  float64 
 6   primary_color  11597 non-null  category
dtypes: category(2), float64(1), string(4)
memory usage: 567.7 KB


In [70]:
# Summary statistics for the cleaned frame. price_inr is the only numeric
# column left, so it is the only one reported.
df.describe()

Unnamed: 0,price_inr
count,11597.0
mean,1460.912995
std,2159.003043
min,153.0
25%,649.0
50%,939.0
75%,1499.0
max,63090.0


## 3. Save the Cleaned Data

In [71]:
# Persist the cleaned frame as CSV; index=False leaves the row index out of the file.
# NOTE(review): CSV stores no dtype information, so the string/category dtypes
# set during cleaning must be re-applied when this file is read back.
df.to_csv("../data/data_cleaned.csv", index=False)