## 1. Loading Dataset

In [1]:
# Import necessary libraries
import pandas as pd

In [2]:
# Read the cleaned product data from its CSV file into a DataFrame
cleaned_data = pd.read_csv(filepath_or_buffer='../data/data_cleaned.csv')

# Bare last expression so the notebook renders the (display-truncated) frame
cleaned_data

Unnamed: 0,product_id,product_name,description,product_brand,gender,price_inr,primary_color
0,10017413,DKNY Unisex Black & Grey Printed Medium Trolle...,"Black and grey printed medium trolley bag, sec...",DKNY,Unisex,11745.0,Black
1,10016283,EthnoVogue Women Beige & Grey Made to Measure ...,Beige & Grey made to measure kurta with churid...,EthnoVogue,Women,5810.0,Beige
2,10009781,SPYKAR Women Pink Alexa Super Skinny Fit High-...,Pink coloured wash 5-pocket high-rise cropped ...,SPYKAR,Women,899.0,Pink
3,10015921,Raymond Men Blue Self-Design Single-Breasted B...,Blue self-design bandhgala suitBlue self-desig...,Raymond,Men,5599.0,Blue
4,10017833,Parx Men Brown & Off-White Slim Fit Printed Ca...,"Brown and off-white printed casual shirt, has ...",Parx,Men,759.0,White
...,...,...,...,...,...,...,...
11592,10261595,Laceandme Black Lace Non-Wired Lightly Padded ...,Black lace full-coverage Bralette bra Lightly ...,Laceandme,Women,569.0,Red
11593,10262843,Pepe Jeans Men Black Hammock Slim Fit Low-Rise...,"Black dark wash 5-pocket low-rise jeans, clean...",Pepe Jeans,Men,1299.0,Black
11594,10261721,Mochi Women Gold-Toned Solid Heels,"A pair of gold-toned open toe heels, has regul...",Mochi,Women,1990.0,Gold
11595,10261607,612 league Girls Navy Blue & White Printed Reg...,Navy Blue and White printed mid-rise denim sho...,612 league,Girls,602.0,Blue


## 2. Data Preprocessing

- Color grouping

- Full text generation

### 2.1 Color Grouping

In [3]:
# Lookup from a specific shade to its broader colour family.
# Written as family -> member shades and flattened into shade -> family.
color_mapping = {
    shade: family
    for family, shades in {
        'Grey': ('Grey', 'Silver', 'Charcoal'),
        'Yellow': ('Yellow', 'Gold', 'Mustard', 'Orange', 'Khaki'),
        'Red': ('Red', 'Burgundy', 'Maroon'),
        'Blue': ('Blue', 'Navy'),
        'Pink': ('Pink',),
    }.items()
    for shade in shades
}

# Shades without an entry in the lookup keep their original primary_color value
cleaned_data['color_group'] = (
    cleaned_data['primary_color']
    .map(color_mapping)
    .fillna(cleaned_data['primary_color'])
)

### 2.2 Full Text Generation

In [4]:
# Build 'full_text' in one pass: "<product_name>. <description>" from the
# whitespace-trimmed fields, then lowercase the result and trim it once more
cleaned_data['full_text'] = (
    (
        cleaned_data['product_name'].str.strip()
        + ". "
        + cleaned_data['description'].str.strip()
    )
    .str.lower()
    .str.strip()
)


In [5]:
# Word count per record: split each text on whitespace and count the tokens
cleaned_data['text_length'] = [
    len(text.split()) for text in cleaned_data['full_text']
]

# Summary statistics of the word counts (rendered as the cell output)
cleaned_data['text_length'].describe()


count    11597.000000
mean        30.141847
std         17.836388
min          6.000000
25%         22.000000
50%         27.000000
75%         32.000000
max        346.000000
Name: text_length, dtype: float64

- Most product records contain between 20 and 35 words, with an average length of about 30 words.

- The maximum text length is 346 words, so a soft truncation threshold of 400 words is applied as a safeguard against future data growth; no current record is truncated.

In [6]:
# Default upper bound on the number of words kept per text entry
MAX_WORD_LENGTH = 400


def truncate_text(text, max_words=None):
    """Return `text` cut down to at most `max_words` whitespace-separated words.

    The text is split on runs of whitespace and re-joined with single spaces,
    so leading/trailing whitespace is dropped and internal runs are collapsed
    even when nothing is cut off.

    Parameters
    ----------
    text : str
        The text to truncate.
    max_words : int, optional
        Maximum number of words to keep. When omitted (None), the module-level
        MAX_WORD_LENGTH is used, looked up at call time.

    Returns
    -------
    str
        The first `max_words` words of `text`, joined by single spaces.

    Raises
    ------
    ValueError
        If the effective limit is negative.
    """
    limit = MAX_WORD_LENGTH if max_words is None else max_words
    if limit < 0:
        # A negative slice bound would silently drop words from the end instead
        raise ValueError(f"max_words must be non-negative, got {limit}")
    words = text.split()
    return ' '.join(words[:limit])

# Run every 'full_text' entry through truncate_text and store the result back
cleaned_data['full_text'] = cleaned_data['full_text'].map(truncate_text)

In [7]:
# Spot-check a few random rows to verify that 'full_text' was built as intended.
# random_state is fixed so the same five rows are shown on every run
# (keeps the cell output reproducible under Restart & Run All).
cleaned_data[['product_name', 'description', 'full_text']].sample(5, random_state=42)

Unnamed: 0,product_name,description,full_text
5781,Indian Terrain Boys Black Solid Double-Breaste...,"Black nehru jacket, has a mandarin collar, no ...",indian terrain boys black solid double-breaste...
8538,GNIST Women Black Solid Suede Open Toe Flats,"A pair of black open-toed flats, has high-top ...",gnist women black solid suede open toe flats. ...
7587,Van Heusen Men Blue Tight Slim Fit Printed For...,"Blue printed formal shirt, has a spread collar...",van heusen men blue tight slim fit printed for...
4615,ROMEE Multicoloured Geometric 144 TC Cotton 1 ...,Set content: 1 Queen bedsheet with 2 pillow co...,romee multicoloured geometric 144 tc cotton 1 ...
10785,SG YUVRAJ Boys Black Solid Kurta with Pyjamas,Black solid kurta with pyjamas Black pathani k...,sg yuvraj boys black solid kurta with pyjamas....


## 3. Embedding Data Generation

In [10]:
# Columns carried forward to the embedding stage
final_fields = [
    'product_id', 'product_name', 'full_text', 'product_brand',
    'gender', 'price_inr', 'primary_color', 'color_group',
]

# Take an independent copy of just those columns
data_for_embedding = cleaned_data.loc[:, final_fields].copy()

# Write the result to CSV, leaving out the DataFrame index
data_for_embedding.to_csv("../data/data_for_embedding.csv", index=False)