In [1]:
import pandas as pd
import numpy as np
import os

# --- 1. Load the Raw Datasets ---

print("Loading raw datasets...")
# Input locations, given relative to the current working directory.
styles_path = '../data/raw/styles.csv'
reviews_path = '../data/raw/Womens Clothing E-Commerce Reviews.csv'

# Rows of styles.csv that the parser rejects are skipped (on_bad_lines='skip')
# rather than aborting the load; the reviews file is read with default settings.
catalog_df_raw = pd.read_csv(filepath_or_buffer=styles_path, on_bad_lines='skip')
reviews_df_raw = pd.read_csv(filepath_or_buffer=reviews_path)
print("Raw datasets loaded successfully.")

# --- 2. Process the Product Catalog ---

print("Processing product catalog...")
# Columns carried over from the raw catalog, and the new names given to them.
# 'subCategory' has no entry in the rename map, so it keeps its original name.
catalog_columns = ['id', 'productDisplayName', 'masterCategory', 'subCategory', 'articleType', 'baseColour']
catalog_renames = {
    'id': 'product_id',
    'productDisplayName': 'product_name',
    'masterCategory': 'category',
    'articleType': 'article_type',
    'baseColour': 'color',
}

# Select, rename, and append the image filename (the product_id followed by
# '.jpg') in one chain; every step returns a new frame, so the raw catalog is
# left untouched.
catalog_df = (
    catalog_df_raw[catalog_columns]
    .rename(columns=catalog_renames)
    .assign(image_filename=lambda frame: frame['product_id'].astype(str) + '.jpg')
)

# Make sure the output folder exists, then write the catalog without the index.
os.makedirs('../data/processed', exist_ok=True)
catalog_df.to_csv(path_or_buf='../data/processed/catalog.csv', index=False)
print("Processed catalog created and saved to 'data/processed/catalog.csv'")

# --- 3. Process the Reviews to Create Returns Data ---

print("Processing reviews to create returns data...")
# A review is treated as a "return" when its rating is 2 or lower and it has
# review text, because that text is used as the complaint.
is_negative = reviews_df_raw['Rating'] <= 2
has_complaint = reviews_df_raw['Review Text'].notna()

# Keep the three columns needed downstream and give them snake_case names.
returns_df = (
    reviews_df_raw
    .loc[is_negative & has_complaint, ['Clothing ID', 'Class Name', 'Review Text']]
    .rename(columns={
        'Clothing ID': 'original_product_id',
        'Class Name': 'class_name',
        'Review Text': 'return_reason_text',
    })
)
print(f"Filtered down to {len(returns_df)} negative reviews to act as returns.")

# --- 4. Link Returns to Products in our Catalog (Optimized Method) ---

print("Linking returns data to the catalog (optimized method)...")

# Review class name -> catalog article type. Class names with no entry here
# map to a missing value below.
category_mapping = {
    'Tops': 'Tshirts',
    'Dresses': 'Dresses',
    'Jeans': 'Jeans',
    'Pants': 'Trousers',
    'Shorts': 'Shorts',
    'Skirts': 'Skirts',
    'Jackets': 'Jackets'
}

# Step 1: Collect, per article type, the product IDs and image filenames as
# two lists, then convert to a nested dict of the form
# {article_type: {'product_id': [...], 'image_filename': [...]}}.
products_by_type = catalog_df.groupby('article_type').agg({
    'product_id': list,
    'image_filename': list
})
product_lookup = products_by_type.to_dict(orient='index')

# Step 2: Translate each review's class name into a catalog article type.
returns_df = returns_df.assign(
    article_type=returns_df['class_name'].map(category_mapping)
)

# Step 3: Define a function to get a random product for a given row.
def get_random_product(row):
    """Pick a random catalog product matching the row's article type.

    Reads ``row['article_type']`` and looks it up in the module-level
    ``product_lookup`` dictionary.

    Returns:
        A ``(product_id, image_filename)`` tuple taken from the same position
        of the two lists stored for that article type, or ``(None, None)``
        when the article type is missing or has no entry in
        ``product_lookup``.
    """
    wanted_type = row['article_type']

    # Guard clause: the missing-value check comes first, so the lookup is only
    # consulted for a real article type.
    if pd.isna(wanted_type) or wanted_type not in product_lookup:
        return None, None

    candidates = product_lookup[wanted_type]
    # One random position is used for both lists so the id and filename
    # belong to the same product.
    pick = np.random.randint(0, len(candidates['product_id']))
    return candidates['product_id'][pick], candidates['image_filename'][pick]

# Step 4: Call get_random_product once per row of the returns dataframe;
# result_type='expand' spreads each returned pair over two columns.
linked_products = returns_df.apply(get_random_product, axis='columns', result_type='expand')
returns_df[['product_id', 'image_filename']] = linked_products

# --- The rest of the processing is the same ---
# Keep only the returns that were matched to a catalog product, then cast the
# ids to int now that no missing values remain in that column.
was_matched = returns_df['product_id'].notna()
returns_df = returns_df.loc[was_matched].copy()
returns_df['product_id'] = returns_df['product_id'].astype(int)

# Write the final, linked returns file without the index column.
returns_df.to_csv(path_or_buf='../data/processed/returns.csv', index=False)

print("Returns have been successfully linked and saved to 'data/processed/returns.csv'")
print("\n--- DATA PREPARATION COMPLETE ---")

Loading raw datasets...
Raw datasets loaded successfully.
Processing product catalog...
Processed catalog created and saved to 'data/processed/catalog.csv'
Processing reviews to create returns data...
Filtered down to 2370 negative reviews to act as returns.
Linking returns data to the catalog (optimized method)...
Returns have been successfully linked and saved to 'data/processed/returns.csv'

--- DATA PREPARATION COMPLETE ---
