## 1. Loading Dataset

In [None]:
# Import necessary libraries
import pandas as pd

In [None]:
# Read the cleaned dataset from the CSV file under ../data
cleaned_data = pd.read_csv(filepath_or_buffer='../data/data_cleaned.csv')

# Show the loaded frame as this cell's output
cleaned_data

## 2. Data Preprocessing

- Color grouping

- Full text generation

### 2.1 Color Grouping

In [None]:
# Each color family lists the raw 'primary_color' values that belong to it
# (the family name itself is listed too, so it maps to itself).
color_families = {
    'Grey': ['Grey', 'Silver', 'Charcoal'],
    'Yellow': ['Yellow', 'Gold', 'Mustard', 'Orange', 'Khaki'],
    'Red': ['Red', 'Burgundy', 'Maroon'],
    'Blue': ['Blue', 'Navy'],
    'Pink': ['Pink'],
}

# Invert the families into a flat lookup: raw color -> family name
color_mapping = {
    raw_color: family
    for family, members in color_families.items()
    for raw_color in members
}

# Colors that are not in the lookup keep their original 'primary_color' value
grouped_colors = cleaned_data['primary_color'].map(color_mapping)
cleaned_data['color_group'] = grouped_colors.fillna(cleaned_data['primary_color'])

### 2.2 Full Text Generation

In [None]:
# Trim each source column, then join them as "<product_name>. <description>".
# A row where either column is missing stays NaN after the concatenation.
name_part = cleaned_data['product_name'].str.strip()
description_part = cleaned_data['description'].str.strip()
combined_text = name_part + ". " + description_part

# Store the lowercased, whitespace-trimmed result as 'full_text'
cleaned_data['full_text'] = combined_text.str.lower().str.strip()


In [None]:
# Calculate the length of each text entry in words.
# The vectorized string accessor splits on whitespace and counts the pieces
# without a Python-level lambda; a missing 'full_text' value produces a
# missing count instead of raising AttributeError.
cleaned_data['text_length'] = cleaned_data['full_text'].str.split().str.len()


# Display the descriptive statistics
cleaned_data['text_length'].describe()


- Most product records contain between 20 and 35 words, with an average length of 30 words.

- The maximum text length is 346 words, so a soft truncation threshold of 400 words is applied as a safeguard against potential future data growth.

In [None]:
# Default upper bound on the number of words kept per text entry
MAX_WORD_LENGTH = 400


def truncate_text(text, max_words=None):
    """Return ``text`` limited to its first ``max_words`` words.

    The text is split on whitespace and re-joined with single spaces, so
    runs of whitespace are collapsed and leading/trailing whitespace is
    dropped even when no words are cut.

    Parameters
    ----------
    text : str
        The text to truncate.
    max_words : int, optional
        Maximum number of words to keep. When omitted, the current value of
        ``MAX_WORD_LENGTH`` is used (looked up at call time).

    Returns
    -------
    str
        The first ``max_words`` words of ``text`` joined by single spaces.

    Raises
    ------
    ValueError
        If ``max_words`` is negative.
    """
    limit = MAX_WORD_LENGTH if max_words is None else max_words
    if limit < 0:
        # A negative slice bound would silently drop words from the end
        raise ValueError("max_words must be non-negative")
    words = text.split()
    return ' '.join(words[:limit])

# Run every 'full_text' entry through truncate_text and write the result back
full_text_truncated = cleaned_data['full_text'].apply(truncate_text)
cleaned_data['full_text'] = full_text_truncated

In [None]:
# Check sample entries to verify the changes.
# A fixed random_state selects the same five rows on every run, so the
# preview is reproducible after Restart Kernel -> Run All.
cleaned_data[['product_name', 'description', 'full_text']].sample(5, random_state=42)

## 3. Embedding Data Generation

In [None]:
# Select relevant fields for embedding
final_fields = [
    'product_id',
    'product_name',
    'full_text',
    'product_brand',
    'gender',
    'primary_color',
    'color_group'
]

data_for_embedding = cleaned_data[final_fields].copy()

# Save the processed data for embedding
data_for_embedding.to_csv("../data/data_for_embedding.csv", index=False)