### Stage 1: Loading Dataset

In [68]:
import pandas as pd
import numpy as np
import gc
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

In [69]:
class Config:
    # Paths (adjust based on your Kaggle dataset location)
    DATA_PATH = Path('/Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/h-and-m-personalized-fashion-recommendations')
    OUTPUT_PATH = Path('/Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_2')
    
     # Temporal configuration
    N_TRAIN_WEEKS = 11  # Number of weeks for training
    N_VAL_WEEKS = 1     # Validation week
    TOTAL_WEEKS = 24    # Total weeks to consider (16-24 range)
    
    # User sampling configuration
    TARGET_USERS = 50000  # Target number of users to sample
    MIN_USER_PURCHASES = 1  # Minimum purchases across all weeks for a user
    
    # Cold start configuration
    INCLUDE_COLD_START = True  # Include users with limited history
    COLD_START_RATIO = 0.15  # 15% of sampled users will be cold start
    COLD_START_MAX_PURCHASES = 1  # Users with <= this many purchases are "cold start"
    
    # Stratification configuration
    STRATIFY_BY_ACTIVITY = True  # Stratify users by activity level
    ACTIVITY_BINS = [0, 5, 10, 20, 50, np.inf]  # Purchase count bins
    ACTIVITY_LABELS = ['low', 'medium', 'high', 'very_high', 'extreme']
    
    # Item filtering
    MIN_ITEM_PURCHASES = 5  # Minimum purchases for an item to be included
    
    # Memory optimization
    CHUNK_SIZE = 500_000  # Process transactions in chunks
    
    # Random seed
    RANDOM_STATE = 42

config = Config()
config.OUTPUT_PATH.mkdir(exist_ok=True)
np.random.seed(config.RANDOM_STATE)

In [70]:
# ============================================================================
# UTILITY FUNCTIONS
# ============================================================================

def reduce_mem_usage(df, verbose=True):
    """
    Reduce memory usage of a dataframe by optimizing dtypes
    """
    start_mem = df.memory_usage().sum() / 1024**2
    
    for col in df.columns:
        col_type = df[col].dtype
        
        # Skip datetime and object columns
        if col_type == object or pd.api.types.is_datetime64_any_dtype(df[col]):
            continue
            
        c_min = df[col].min()
        c_max = df[col].max()
        
        if str(col_type)[:3] == 'int':
            if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                df[col] = df[col].astype(np.int8)
            elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                df[col] = df[col].astype(np.int16)
            elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                df[col] = df[col].astype(np.int32)
            elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                df[col] = df[col].astype(np.int64)
        else:
            if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
    
    end_mem = df.memory_usage().sum() / 1024**2
    
    if verbose:
        print(f'Memory usage decreased from {start_mem:.2f} MB to {end_mem:.2f} MB '
              f'({100 * (start_mem - end_mem) / start_mem:.1f}% reduction)')
    
    return df

def print_section(title):
    """Pretty print section headers"""
    print("\n" + "="*80)
    print(f"  {title}")
    print("="*80)

In [71]:
# ============================================================================
# STEP 1: LOAD AND EXPLORE DATA
# ============================================================================

print_section("STEP 1: LOADING DATA")

# Load transactions
print("Loading transactions...")
transactions = pd.read_csv(
    config.DATA_PATH / 'transactions_train.csv',
    dtype={
        'article_id': 'int32',
        'price': 'float32',
        'sales_channel_id': 'int8'
    },
    parse_dates=['t_dat']
)

print(f"‚úì Loaded {len(transactions):,} transactions")
print(f"  Date range: {transactions['t_dat'].min()} to {transactions['t_dat'].max()}")
print(f"  Unique customers: {transactions['customer_id'].nunique():,}")
print(f"  Unique articles: {transactions['article_id'].nunique():,}")

# Load customers
print("\nLoading customers...")
customers = pd.read_csv(
    config.DATA_PATH / 'customers.csv',
    dtype={
        'FN': 'float32',
        'Active': 'float32',
        'age': 'float32'
    }
)
customers = reduce_mem_usage(customers, verbose=False)
print(f"‚úì Loaded {len(customers):,} customers")

# Load articles
print("\nLoading articles...")
articles = pd.read_csv(
    config.DATA_PATH / 'articles.csv',
    dtype={'article_id': 'int32'}
)
articles = reduce_mem_usage(articles, verbose=False)
print(f"‚úì Loaded {len(articles):,} articles")


  STEP 1: LOADING DATA
Loading transactions...
‚úì Loaded 31,788,324 transactions
  Date range: 2018-09-20 00:00:00 to 2020-09-22 00:00:00
  Unique customers: 1,362,281
  Unique articles: 104,547

Loading customers...
‚úì Loaded 1,371,980 customers

Loading articles...
‚úì Loaded 105,542 articles


In [72]:
# ============================================================================
# STEP 2: TEMPORAL WINDOW SELECTION
# ============================================================================

print_section("STEP 2: SELECTING TEMPORAL WINDOW")

# Get the last date in transactions
max_date = transactions['t_dat'].max()
print(f"Last transaction date: {max_date}")

# Calculate cutoff dates for the full window
window_start = max_date - timedelta(weeks=config.TOTAL_WEEKS)
print(f"\nUsing {config.TOTAL_WEEKS} weeks of data for sampling")
print(f"Window: {window_start.date()} to {max_date.date()}")

# Filter transactions to our window
print(f"\nFiltering transactions from {window_start.date()} onwards...")
transactions = transactions[transactions['t_dat'] >= window_start].copy()
print(f"‚úì Retained {len(transactions):,} transactions ({len(transactions)/1e6:.2f}M)")

# Add week number (relative to window start)
transactions['week'] = ((transactions['t_dat'] - window_start).dt.days // 7).astype(np.int8)
print(f"  Week range: {transactions['week'].min()} to {transactions['week'].max()}")


  STEP 2: SELECTING TEMPORAL WINDOW
Last transaction date: 2020-09-22 00:00:00

Using 24 weeks of data for sampling
Window: 2020-04-07 to 2020-09-22

Filtering transactions from 2020-04-07 onwards...
‚úì Retained 7,561,154 transactions (7.56M)
  Week range: 0 to 24


In [73]:
# ============================================================================
# STEP 3: USER-BASED STRATIFIED SAMPLING
# ============================================================================

print_section("STEP 3: USER-BASED STRATIFIED SAMPLING")

# Calculate user activity across all weeks
print("Calculating user activity metrics...")
user_activity = transactions.groupby('customer_id').agg({
    'article_id': 'count',  # Total purchases
    'week': ['min', 'max', 'nunique']  # Week span and diversity
}).reset_index()

user_activity.columns = ['customer_id', 'total_purchases', 'first_week', 'last_week', 'active_weeks']
user_activity['week_span'] = user_activity['last_week'] - user_activity['first_week'] + 1

print(f"Total users in window: {len(user_activity):,}")
print(f"  Avg purchases per user: {user_activity['total_purchases'].mean():.2f}")
print(f"  Avg active weeks per user: {user_activity['active_weeks'].mean():.2f}")

# Separate cold start and regular users
if config.INCLUDE_COLD_START:
    cold_start_users = user_activity[
        user_activity['total_purchases'] <= config.COLD_START_MAX_PURCHASES
    ].copy()
    regular_users = user_activity[
        user_activity['total_purchases'] >= config.MIN_USER_PURCHASES
    ].copy()
    
    print(f"\nUser segments:")
    print(f"  Cold start users (‚â§{config.COLD_START_MAX_PURCHASES} purchases): {len(cold_start_users):,}")
    print(f"  Regular users (‚â•{config.MIN_USER_PURCHASES} purchases): {len(regular_users):,}")
    
    # Calculate target counts
    n_cold_start_target = int(config.TARGET_USERS * config.COLD_START_RATIO)
    n_regular_target = config.TARGET_USERS - n_cold_start_target
    
    print(f"\nSampling targets:")
    print(f"  Cold start: {n_cold_start_target:,} users ({config.COLD_START_RATIO*100:.1f}%)")
    print(f"  Regular: {n_regular_target:,} users ({(1-config.COLD_START_RATIO)*100:.1f}%)")
    
else:
    # Filter users with minimum activity
    regular_users = user_activity[
        user_activity['total_purchases'] >= config.MIN_USER_PURCHASES
    ].copy()
    cold_start_users = pd.DataFrame()
    n_cold_start_target = 0
    n_regular_target = config.TARGET_USERS
    
    print(f"\nUsers with >= {config.MIN_USER_PURCHASES} purchases: {len(regular_users):,}")

# Sample cold start users (if enabled)
sampled_cold_start = []
if config.INCLUDE_COLD_START and len(cold_start_users) > 0:
    n_cold_sample = min(n_cold_start_target, len(cold_start_users))
    sampled_cold_start = cold_start_users['customer_id'].sample(
        n=n_cold_sample, 
        random_state=config.RANDOM_STATE
    ).tolist()
    print(f"\n‚úì Sampled {len(sampled_cold_start):,} cold start users")

# Sample regular users with stratification
if config.STRATIFY_BY_ACTIVITY and len(regular_users) > 0:
    regular_users['activity_level'] = pd.cut(
        regular_users['total_purchases'],
        bins=config.ACTIVITY_BINS,
        labels=config.ACTIVITY_LABELS
    )
    
    print("\nRegular user activity distribution:")
    activity_dist = regular_users['activity_level'].value_counts().sort_index()
    for level, count in activity_dist.items():
        print(f"  {level}: {count:,} users ({100*count/len(regular_users):.1f}%)")
    
    # Stratified sampling for regular users
    print(f"\nPerforming stratified sampling to get {n_regular_target:,} regular users...")
    
    # Calculate samples per stratum (proportional)
    samples_per_stratum = (activity_dist / activity_dist.sum() * n_regular_target).round().astype(int)
    
    # Adjust for rounding errors
    diff = n_regular_target - samples_per_stratum.sum()
    if diff != 0:
        largest_stratum = samples_per_stratum.idxmax()
        samples_per_stratum[largest_stratum] += diff
    
    print("\nSamples per activity level:")
    for level, n_samples in samples_per_stratum.items():
        print(f"  {level}: {n_samples:,} users")
    
    # Sample from each stratum
    sampled_regular = []
    for level in config.ACTIVITY_LABELS:
        stratum_users = regular_users[regular_users['activity_level'] == level]['customer_id']
        n_sample = min(samples_per_stratum[level], len(stratum_users))
        if n_sample > 0:
            sampled = stratum_users.sample(n=n_sample, random_state=config.RANDOM_STATE)
            sampled_regular.extend(sampled.tolist())

else:
    # Simple random sampling for regular users
    print(f"\nPerforming random sampling to get {n_regular_target:,} regular users...")
    n_sample = min(n_regular_target, len(regular_users))
    sampled_regular = regular_users['customer_id'].sample(
        n=n_sample, 
        random_state=config.RANDOM_STATE
    ).tolist()

# Combine both groups
selected_users = set(sampled_cold_start + sampled_regular)

print(f"\n‚úì Total selected users: {len(selected_users):,}")
if config.INCLUDE_COLD_START:
    print(f"  - Cold start: {len(sampled_cold_start):,} ({100*len(sampled_cold_start)/len(selected_users):.1f}%)")
    print(f"  - Regular: {len(sampled_regular):,} ({100*len(sampled_regular)/len(selected_users):.1f}%)")

# Verify sampling quality
sampled_activity = user_activity[user_activity['customer_id'].isin(selected_users)]
print(f"\nSampled users statistics:")
print(f"  Avg purchases: {sampled_activity['total_purchases'].mean():.2f}")
print(f"  Median purchases: {sampled_activity['total_purchases'].median():.2f}")
print(f"  Min purchases: {sampled_activity['total_purchases'].min():.0f}")
print(f"  Max purchases: {sampled_activity['total_purchases'].max():.0f}")
print(f"  Avg active weeks: {sampled_activity['active_weeks'].mean():.2f}")
print(f"  Purchases std: {sampled_activity['total_purchases'].std():.2f}")


  STEP 3: USER-BASED STRATIFIED SAMPLING
Calculating user activity metrics...
Total users in window: 719,806
  Avg purchases per user: 10.50
  Avg active weeks per user: 2.61

User segments:
  Cold start users (‚â§1 purchases): 72,280
  Regular users (‚â•1 purchases): 719,806

Sampling targets:
  Cold start: 7,500 users (15.0%)
  Regular: 42,500 users (85.0%)

‚úì Sampled 7,500 cold start users

Regular user activity distribution:
  low: 331,386 users (46.0%)
  medium: 161,171 users (22.4%)
  high: 133,525 users (18.6%)
  very_high: 79,738 users (11.1%)
  extreme: 13,986 users (1.9%)

Performing stratified sampling to get 42,500 regular users...

Samples per activity level:
  low: 19,566 users
  medium: 9,516 users
  high: 7,884 users
  very_high: 4,708 users
  extreme: 826 users

‚úì Total selected users: 49,576
  - Cold start: 7,500 (15.1%)
  - Regular: 42,500 (85.7%)

Sampled users statistics:
  Avg purchases: 9.14
  Median purchases: 5.00
  Min purchases: 1
  Max purchases: 342
  

In [74]:
# ============================================================================
# STEP 4: FILTER TRANSACTIONS TO SAMPLED USERS
# ============================================================================

print_section("STEP 4: FILTERING TRANSACTIONS TO SAMPLED USERS")

# Filter transactions
transactions = transactions[transactions['customer_id'].isin(selected_users)].copy()
print(f"‚úì Retained {len(transactions):,} transactions")
print(f"  Reduction: {100 * (1 - len(transactions) / len(transactions)):.1f}% (based on sampled users)")

# Now create train/val split
val_end_date = max_date
val_start_date = val_end_date - timedelta(weeks=config.N_VAL_WEEKS)
train_end_date = val_start_date - timedelta(days=1)
train_start_date = train_end_date - timedelta(weeks=config.N_TRAIN_WEEKS)

print(f"\nTemporal splits (from {config.TOTAL_WEEKS} week window):")
print(f"  Training:   {train_start_date.date()} to {train_end_date.date()} ({config.N_TRAIN_WEEKS} weeks)")
print(f"  Validation: {val_start_date.date()} to {val_end_date.date()} ({config.N_VAL_WEEKS} week)")

# Split transactions
train_transactions = transactions[transactions['t_dat'] <= train_end_date].copy()
val_transactions = transactions[transactions['t_dat'] > train_end_date].copy()

print(f"\nDataset split:")
print(f"  Training transactions: {len(train_transactions):,}")
print(f"  Validation transactions: {len(val_transactions):,}")

# Check how many sampled users appear in validation
val_users = set(val_transactions['customer_id'].unique())
print(f"  Users in validation: {len(val_users):,} ({100*len(val_users)/len(selected_users):.1f}% of sampled)")

del transactions
gc.collect()


  STEP 4: FILTERING TRANSACTIONS TO SAMPLED USERS
‚úì Retained 453,143 transactions
  Reduction: 0.0% (based on sampled users)

Temporal splits (from 24 week window):
  Training:   2020-06-29 to 2020-09-14 (11 weeks)
  Validation: 2020-09-15 to 2020-09-22 (1 week)

Dataset split:
  Training transactions: 436,663
  Validation transactions: 16,480
  Users in validation: 4,943 (10.0% of sampled)


451

In [75]:
# ============================================================================
# STEP 5: ITEM FILTERING
# ============================================================================

print_section("STEP 5: ITEM FILTERING")

# Count purchases per item in training window
item_counts = train_transactions['article_id'].value_counts()
print(f"Unique items in training: {len(item_counts):,}")

# Keep items with minimum purchases
valid_items = set(item_counts[item_counts >= config.MIN_ITEM_PURCHASES].index)
print(f"Items with >= {config.MIN_ITEM_PURCHASES} purchases: {len(valid_items):,}")

# Also include all items from validation (even if rare in training)
val_items = set(val_transactions['article_id'].unique())
print(f"Items in validation: {len(val_items):,}")

# Combine
selected_items = valid_items.union(val_items)
print(f"\nTotal selected items: {len(selected_items):,}")

# Filter transactions
train_transactions = train_transactions[train_transactions['article_id'].isin(selected_items)].copy()
val_transactions = val_transactions[val_transactions['article_id'].isin(selected_items)].copy()

print(f"\nAfter item filtering:")
print(f"  Training transactions: {len(train_transactions):,}")
print(f"  Validation transactions: {len(val_transactions):,}")

# Filter articles and customers tables
articles = articles[articles['article_id'].isin(selected_items)].copy()
customers = customers[customers['customer_id'].isin(selected_users)].copy()

print(f"  Articles retained: {len(articles):,}")
print(f"  Customers retained: {len(customers):,}")


  STEP 5: ITEM FILTERING
Unique items in training: 28,951
Items with >= 5 purchases: 14,851
Items in validation: 5,730

Total selected items: 16,616

After item filtering:
  Training transactions: 412,156
  Validation transactions: 16,480
  Articles retained: 16,616
  Customers retained: 49,576


In [76]:
# ============================================================================
# STEP 6: DATA TYPE OPTIMIZATION
# ============================================================================

print_section("STEP 6: MEMORY OPTIMIZATION")

print("Before optimization:")
print(f"  train_transactions: {train_transactions.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
print(f"  val_transactions: {val_transactions.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
print(f"  customers: {customers.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
print(f"  articles: {articles.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# Optimize transactions
train_transactions = reduce_mem_usage(train_transactions)
val_transactions = reduce_mem_usage(val_transactions)

# Convert categorical columns
for col in ['product_code', 'product_type_no', 'graphical_appearance_no', 
            'colour_group_code', 'perceived_colour_value_id', 'perceived_colour_master_id',
            'department_no', 'index_code', 'index_group_no', 'section_no', 'garment_group_no']:
    if col in articles.columns:
        articles[col] = articles[col].astype('category')

# Optimize customer categoricals
for col in ['club_member_status', 'fashion_news_frequency', 'postal_code']:
    if col in customers.columns:
        customers[col] = customers[col].astype('category')

print("\nAfter optimization:")
print(f"  train_transactions: {train_transactions.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
print(f"  val_transactions: {val_transactions.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
print(f"  customers: {customers.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
print(f"  articles: {articles.memory_usage(deep=True).sum() / 1024**2:.2f} MB")


  STEP 6: MEMORY OPTIMIZATION
Before optimization:
  train_transactions: 57.78 MB
  val_transactions: 2.31 MB
  customers: 18.34 MB
  articles: 17.70 MB
Memory usage decreased from 13.36 MB to 13.36 MB (0.0% reduction)
Memory usage decreased from 0.53 MB to 0.53 MB (0.0% reduction)

After optimization:
  train_transactions: 57.78 MB
  val_transactions: 2.31 MB
  customers: 12.94 MB
  articles: 16.92 MB


In [77]:
# ============================================================================
# STEP 7: DATA VALIDATION & EDA
# ============================================================================

print_section("STEP 7: DATA VALIDATION & EDA")

# Validation checks
print("Data validation:")
print(f"  ‚úì No null customer_ids in train: {train_transactions['customer_id'].isnull().sum() == 0}")
print(f"  ‚úì No null article_ids in train: {train_transactions['article_id'].isnull().sum() == 0}")

# Weekly distribution
print("\nWeekly activity distribution (sampled users):")
weekly_users = train_transactions.groupby('week')['customer_id'].nunique()
for week, n_users in weekly_users.items():
    print(f"  Week {week}: {n_users:,} active users")

# Purchase distribution
print("\nPurchase distribution in validation week:")
if len(val_transactions) > 0:
    val_user_purchases = val_transactions.groupby('customer_id').size()
    print(f"  Mean purchases per user: {val_user_purchases.mean():.2f}")
    print(f"  Median purchases per user: {val_user_purchases.median():.0f}")
    print(f"  Users with 1 purchase: {(val_user_purchases == 1).sum():,}")
    print(f"  Users with 2-5 purchases: {((val_user_purchases >= 2) & (val_user_purchases <= 5)).sum():,}")
    print(f"  Users with 6+ purchases: {(val_user_purchases >= 6).sum():,}")
else:
    print("  ‚ö†Ô∏è No validation transactions for sampled users")


  STEP 7: DATA VALIDATION & EDA
Data validation:
  ‚úì No null customer_ids in train: True
  ‚úì No null article_ids in train: True

Weekly activity distribution (sampled users):
  Week 0: 4,926 active users
  Week 1: 3,240 active users
  Week 2: 3,976 active users
  Week 3: 3,284 active users
  Week 4: 3,771 active users
  Week 5: 4,157 active users
  Week 6: 5,213 active users
  Week 7: 5,419 active users
  Week 8: 4,900 active users
  Week 9: 4,432 active users
  Week 10: 7,090 active users
  Week 11: 7,371 active users
  Week 12: 5,377 active users
  Week 13: 4,692 active users
  Week 14: 4,717 active users
  Week 15: 4,910 active users
  Week 16: 4,985 active users
  Week 17: 4,756 active users
  Week 18: 4,366 active users
  Week 19: 4,289 active users
  Week 20: 5,006 active users
  Week 21: 4,634 active users
  Week 22: 4,647 active users

Purchase distribution in validation week:
  Mean purchases per user: 3.33
  Median purchases per user: 2
  Users with 1 purchase: 1,689
  U

In [78]:
# ============================================================================
# STEP 8: CREATE VALIDATION GROUND TRUTH
# ============================================================================

print_section("STEP 8: CREATING VALIDATION GROUND TRUTH")

# Create validation ground truth
if len(val_transactions) > 0:
    val_ground_truth = (
        val_transactions
        .groupby('customer_id')['article_id']
        .apply(list)
        .reset_index()
        .rename(columns={'article_id': 'purchased_articles'})
    )
    
    print(f"Validation ground truth:")
    print(f"  Users: {len(val_ground_truth):,}")
    print(f"  Total purchases: {val_ground_truth['purchased_articles'].apply(len).sum():,}")
    print(f"  Avg purchases per user: {val_ground_truth['purchased_articles'].apply(len).mean():.2f}")
else:
    val_ground_truth = pd.DataFrame(columns=['customer_id', 'purchased_articles'])
    print("‚ö†Ô∏è Empty validation ground truth")


  STEP 8: CREATING VALIDATION GROUND TRUTH
Validation ground truth:
  Users: 4,943
  Total purchases: 16,480
  Avg purchases per user: 3.33


In [79]:
# ============================================================================
# STEP 9: SAVE PROCESSED DATA
# ============================================================================

print_section("STEP 9: SAVING PROCESSED DATA")

# Save to parquet
print("Saving files...")

train_transactions.to_parquet(config.OUTPUT_PATH / 'train_transactions.parquet', index=False)
print(f"  ‚úì train_transactions.parquet ({len(train_transactions):,} rows)")

val_transactions.to_parquet(config.OUTPUT_PATH / 'val_transactions.parquet', index=False)
print(f"  ‚úì val_transactions.parquet ({len(val_transactions):,} rows)")

customers.to_parquet(config.OUTPUT_PATH / 'customers.parquet', index=False)
print(f"  ‚úì customers.parquet ({len(customers):,} rows)")

articles.to_parquet(config.OUTPUT_PATH / 'articles.parquet', index=False)
print(f"  ‚úì articles.parquet ({len(articles):,} rows)")

val_ground_truth.to_parquet(config.OUTPUT_PATH / 'val_ground_truth.parquet', index=False)
print(f"  ‚úì val_ground_truth.parquet ({len(val_ground_truth):,} rows)")

# Save user activity for analysis
sampled_activity.to_parquet(config.OUTPUT_PATH / 'user_activity_stats.parquet', index=False)
print(f"  ‚úì user_activity_stats.parquet ({len(sampled_activity):,} rows)")

# Save metadata
metadata = {
    'total_weeks': config.TOTAL_WEEKS,
    'train_weeks': config.N_TRAIN_WEEKS,
    'val_weeks': config.N_VAL_WEEKS,
    'train_start_date': str(train_start_date.date()),
    'train_end_date': str(train_end_date.date()),
    'val_start_date': str(val_start_date.date()),
    'val_end_date': str(val_end_date.date()),
    'target_users': config.TARGET_USERS,
    'actual_users': len(selected_users),
    'users_in_validation': len(val_users),
    'cold_start_users': len(sampled_cold_start) if config.INCLUDE_COLD_START else 0,
    'regular_users': len(sampled_regular) if config.INCLUDE_COLD_START else len(selected_users),
    'cold_start_ratio': config.COLD_START_RATIO if config.INCLUDE_COLD_START else 0,
    'n_items': len(selected_items),
    'n_train_transactions': len(train_transactions),
    'n_val_transactions': len(val_transactions),
    'stratified': config.STRATIFY_BY_ACTIVITY,
    'min_user_purchases': config.MIN_USER_PURCHASES,
}

import json
with open(config.OUTPUT_PATH / 'metadata.json', 'w') as f:
    json.dump(metadata, f, indent=2)
print(f"  ‚úì metadata.json")


  STEP 9: SAVING PROCESSED DATA
Saving files...
  ‚úì train_transactions.parquet (412,156 rows)
  ‚úì val_transactions.parquet (16,480 rows)
  ‚úì customers.parquet (49,576 rows)
  ‚úì articles.parquet (16,616 rows)
  ‚úì val_ground_truth.parquet (4,943 rows)
  ‚úì user_activity_stats.parquet (49,576 rows)
  ‚úì metadata.json


In [None]:
# ============================================================================
# SUMMARY
# ============================================================================

print_section("PREPROCESSING COMPLETE!")

print("\nFinal dataset summary:")
print(f"Total weeks considered: {config.TOTAL_WEEKS}")
print(f"Training weeks: {config.N_TRAIN_WEEKS}")
print(f"Validation weeks: {config.N_VAL_WEEKS}")
print(f"Target users: {config.TARGET_USERS:,}")
print(f"Actual sampled users: {len(selected_users):,}")
if config.INCLUDE_COLD_START:
    print(f"Cold start users: {len(sampled_cold_start):,} ({100*len(sampled_cold_start)/len(selected_users):.1f}%)")
    print(f"Regular users: {len(sampled_regular):,} ({100*len(sampled_regular)/len(selected_users):.1f}%)")
print(f"Users in validation: {len(val_users):,} ({100*len(val_users)/len(selected_users):.1f}%)")
print(f"Items: {len(selected_items):,}")
print(f"Train transactions: {len(train_transactions):,}")
print(f"Val transactions: {len(val_transactions):,}")
print(f"Avg transactions per user (train): {len(train_transactions)/len(selected_users):.2f}")

if config.STRATIFY_BY_ACTIVITY:
    print("\n Sampling was stratified by user activity level")
if config.INCLUDE_COLD_START:
    print(f"Cold start users included for testing recommendations with limited history")



  PREPROCESSING COMPLETE!

Final dataset summary:
  üìÖ Total weeks considered: 24
  üìÖ Training weeks: 11
  üìÖ Validation weeks: 1
  üë• Target users: 50,000
  üë• Actual sampled users: 49,576
  ‚ùÑÔ∏è  Cold start users: 7,500 (15.1%)
  üî• Regular users: 42,500 (85.7%)
  üë• Users in validation: 4,943 (10.0%)
  üõçÔ∏è  Items: 16,616
  üìä Train transactions: 412,156
  üìä Val transactions: 16,480
  üìä Avg transactions per user (train): 8.31

  üìà Sampling was stratified by user activity level
  ‚ùÑÔ∏è  Cold start users included for testing recommendations with limited history

‚úÖ Ready for Stage 2: Recall Strategies!

Next steps:
  1. Review the saved files in /kaggle/working/
  2. Check metadata.json for dataset info
  3. Analyze user_activity_stats.parquet for sampling quality
  4. Proceed to Stage 2 when ready


### Stage 2: Generating Candidates

In [117]:
import pandas as pd
import numpy as np
import gc
import os
import psutil
from pathlib import Path
from collections import defaultdict, Counter
from datetime import datetime, timedelta
import warnings
from tqdm.auto import tqdm
from scipy.sparse import csr_matrix, lil_matrix
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
import pickle
from text_features import integrate_text_features_stage2

warnings.filterwarnings('ignore')

In [129]:
# ============================================================================
# MEMORY MONITORING
# ============================================================================

def get_memory_usage():
    """Get current memory usage in GB"""
    process = psutil.Process(os.getpid())
    return process.memory_info().rss / 1024**3

def print_memory():
    """Print current memory usage"""
    mem = get_memory_usage()
    print(f"  üíæ Memory: {mem:.2f} GB")

def force_garbage_collection():
    """Aggressive garbage collection"""
    gc.collect()
    gc.collect()
    gc.collect()

In [130]:
class Config:
    # Paths
    DATA_PATH = Path('/Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_2')
    OUTPUT_PATH = Path('/Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2')
    
    # Recall configuration - REDUCED for memory
    N_REPURCHASE_CANDIDATES = 25  # Reduced from 30
    N_POPULARITY_CANDIDATES = 25  # Reduced from 30
    N_COPURCHASE_CANDIDATES = 15  # Reduced from 20
    N_USERKNN_CANDIDATES = 15     # Reduced from 20
    N_CATEGORY_CANDIDATES = 15    # Reduced from 20
    
    # Processing parameters
    USER_CHUNK_SIZE = 1000  # Process users in chunks
    BASKET_CHUNK_SIZE = 5000  # Process baskets in chunks
    
    # EMERGENCY MODE: Use only recent data for repurchase
    USE_RECENT_ONLY_REPURCHASE = True  # Set to True if kernel keeps crashing
    REPURCHASE_RECENT_WEEKS = 8  # Only use last 8 weeks for repurchase
    
    # Item-to-Item CF parameters
    MIN_ITEM_SUPPORT = 3
    MAX_ITEM_NEIGHBORS = 30  # Reduced from 50
    
    # User-KNN parameters (ONLY for validation users)
    N_SIMILAR_USERS = 20  # Reduced from 30
    MIN_COMMON_ITEMS = 2
    
    # Time decay
    REPURCHASE_DECAY_RATE = 0.05
    POPULARITY_WINDOW_WEEKS = 2
    
    RANDOM_STATE = 42

config = Config()

In [131]:
# ============================================================================
# UTILITY FUNCTIONS
# ============================================================================

def print_section(title):
    print("\n" + "="*80)
    print(f"  {title}")
    print("="*80 + "\n")

def time_decay_score(days_ago, decay_rate=0.05):
    """Vectorized time decay"""
    return np.exp(-decay_rate * days_ago)

In [132]:
# ============================================================================
# LOAD DATA
# ============================================================================

print_section("LOADING DATA")

print("Loading data files...")
train_transactions = pd.read_parquet(config.DATA_PATH / 'train_transactions.parquet')
val_ground_truth = pd.read_parquet(config.DATA_PATH / 'val_ground_truth.parquet')
articles = pd.read_parquet(config.DATA_PATH / 'articles.parquet')

print(f"‚úì Train transactions: {len(train_transactions):,}")
print_memory()

all_users = train_transactions['customer_id'].unique()
all_items = train_transactions['article_id'].unique()
val_users = set(val_ground_truth['customer_id'].unique())
max_date = train_transactions['t_dat'].max()

print(f"‚úì Users: {len(all_users):,}, Items: {len(all_items):,}, Val users: {len(val_users):,}")


  LOADING DATA

Loading data files...
‚úì Train transactions: 412,156
  üíæ Memory: 0.35 GB
‚úì Users: 47,543, Items: 15,932, Val users: 4,943


In [133]:
# ============================================================================
# STRATEGY 1: REPURCHASE - CHUNKED PROCESSING
# ============================================================================

print_section("STRATEGY 1: REPURCHASE (CHUNKED)")

print("Processing in chunks to save memory...")
repurchase_chunks = []

# Split users into chunks
user_chunks = np.array_split(all_users, max(1, len(all_users) // config.USER_CHUNK_SIZE))

for i, user_chunk in enumerate(tqdm(user_chunks, desc="User chunks")):
    # Filter transactions for this chunk
    chunk_trans = train_transactions[
        train_transactions['customer_id'].isin(user_chunk)
    ].copy()
    
    # Get last purchase per user-item
    user_item_last = (
        chunk_trans
        .groupby(['customer_id', 'article_id'], as_index=False)['t_dat']
        .max()
    )
    
    # Calculate scores (vectorized) - with NaN handling
    user_item_last['days_ago'] = (max_date - user_item_last['t_dat']).dt.days
    
    # Drop any NaN values before converting to int
    user_item_last = user_item_last.dropna(subset=['days_ago'])
    
    # Now safe to convert to int
    user_item_last['days_ago'] = user_item_last['days_ago'].astype(np.int16)
    user_item_last['repurchase_score'] = time_decay_score(
        user_item_last['days_ago'].values, 
        config.REPURCHASE_DECAY_RATE
    ).astype(np.float32)
    
    # Get top N per user
    top_candidates = (
        user_item_last
        .sort_values(['customer_id', 'repurchase_score'], ascending=[True, False])
        .groupby('customer_id', as_index=False)
        .head(config.N_REPURCHASE_CANDIDATES)
        [['customer_id', 'article_id', 'repurchase_score']]
    )
    
    repurchase_chunks.append(top_candidates)
    
    # Clean up
    del chunk_trans, user_item_last, top_candidates
    force_garbage_collection()

# Combine chunks
repurchase_candidates = pd.concat(repurchase_chunks, ignore_index=True)
del repurchase_chunks
force_garbage_collection()

print(f"‚úì Generated {len(repurchase_candidates):,} repurchase candidates")
print_memory()

# Save intermediate result
repurchase_candidates.to_parquet(config.OUTPUT_PATH / 'temp_repurchase.parquet', index=False)
del repurchase_candidates
force_garbage_collection()


  STRATEGY 1: REPURCHASE (CHUNKED)

Processing in chunks to save memory...


User chunks:   0%|          | 0/47 [00:00<?, ?it/s]

‚úì Generated 325,177 repurchase candidates
  üíæ Memory: 1.36 GB


In [134]:
# ============================================================================
# STRATEGY 2: POPULARITY
# ============================================================================

print_section("STRATEGY 2: POPULARITY")

cutoff_date = max_date - timedelta(weeks=config.POPULARITY_WINDOW_WEEKS)
recent_trans = train_transactions[train_transactions['t_dat'] >= cutoff_date].copy()

print(f"Using {len(recent_trans):,} recent transactions")

# Vectorized calculations
recent_trans['days_ago'] = (max_date - recent_trans['t_dat']).dt.days

# Drop NaN values
recent_trans = recent_trans.dropna(subset=['days_ago'])

# Convert to int
recent_trans['days_ago'] = recent_trans['days_ago'].astype(np.int16)
recent_trans['weight'] = time_decay_score(recent_trans['days_ago'].values, 0.1).astype(np.float32)

# Aggregate
item_popularity = (
    recent_trans
    .groupby('article_id', as_index=False)
    .agg({'weight': 'sum', 'customer_id': 'nunique'})
    .rename(columns={'weight': 'weighted_purchases', 'customer_id': 'unique_buyers'})
)

item_popularity['popularity_score'] = (
    0.7 * item_popularity['weighted_purchases'] + 
    0.3 * item_popularity['unique_buyers']
)
item_popularity['popularity_score'] = (
    item_popularity['popularity_score'] / item_popularity['popularity_score'].max()
).astype(np.float32)

# Get top items
top_items = item_popularity.nlargest(config.N_POPULARITY_CANDIDATES, 'popularity_score')

print(f"‚úì Top {len(top_items)} popular items")

# Create candidates - CHUNKED
print("Creating popularity candidates in chunks...")
pop_chunks = []

for user_chunk in tqdm(np.array_split(all_users, 20), desc="Popularity chunks"):
    chunk_df = pd.DataFrame({
        'customer_id': np.repeat(user_chunk, len(top_items)),
        'article_id': np.tile(top_items['article_id'].values, len(user_chunk))
    })
    
    rank_penalty = np.tile(1 - np.arange(len(top_items)) * 0.01, len(user_chunk))
    scores = np.tile(top_items['popularity_score'].values, len(user_chunk))
    chunk_df['popularity_score'] = (scores * rank_penalty).astype(np.float32)
    
    pop_chunks.append(chunk_df)

popularity_candidates = pd.concat(pop_chunks, ignore_index=True)
del pop_chunks, recent_trans
force_garbage_collection()

print(f"‚úì Generated {len(popularity_candidates):,} popularity candidates")
print_memory()

# Save
popularity_candidates.to_parquet(config.OUTPUT_PATH / 'temp_popularity.parquet', index=False)
item_popularity.to_parquet(config.OUTPUT_PATH / 'item_popularity.parquet', index=False)
del popularity_candidates
force_garbage_collection()


  STRATEGY 2: POPULARITY

Using 32,152 recent transactions
‚úì Top 25 popular items
Creating popularity candidates in chunks...


Popularity chunks:   0%|          | 0/20 [00:00<?, ?it/s]

‚úì Generated 1,188,575 popularity candidates
  üíæ Memory: 1.47 GB


In [135]:
# ============================================================================
# STRATEGY 3: CO-PURCHASE (Item-to-Item CF) - WITH PARQUET SAVING
# ============================================================================

print_section("STRATEGY 3: CO-PURCHASE (Item-to-Item CF)")

# Check if already computed
if (config.OUTPUT_PATH / 'temp_copurchase.parquet').exists():
    print("‚ö° Found existing co-purchase candidates, loading...")
    copurchase_candidates = pd.read_parquet(config.OUTPUT_PATH / 'temp_copurchase.parquet')
    print(f"‚úì Loaded {len(copurchase_candidates):,} co-purchase candidates")
    print(f"  Users with candidates: {copurchase_candidates['customer_id'].nunique():,}")
else:
    print("Building co-purchase matrix...")

    # Create item-to-item co-purchase matrix
    # Group by transaction/basket (same user, same day)
    train_transactions['basket_id'] = (
        train_transactions['customer_id'].astype(str) + '_' + 
        train_transactions['t_dat'].astype(str)
    )

    # Get baskets with multiple items
    basket_items = (
        train_transactions
        .groupby('basket_id')['article_id']
        .apply(list)
        .reset_index()
    )

    # Filter baskets with at least 2 items
    basket_items = basket_items[basket_items['article_id'].apply(len) >= 2]
    print(f"  Baskets with 2+ items: {len(basket_items):,}")

    # Build co-purchase counts
    print("Computing co-purchase frequencies...")
    copurchase_counts = defaultdict(lambda: defaultdict(int))

    for items in tqdm(basket_items['article_id'], desc="Processing baskets"):
        # For each pair of items in the basket
        for i in range(len(items)):
            for j in range(i + 1, len(items)):
                item1, item2 = items[i], items[j]
                copurchase_counts[item1][item2] += 1
                copurchase_counts[item2][item1] += 1

    print(f"‚úì Built co-purchase matrix for {len(copurchase_counts):,} items")

    # Convert to item-to-item similarity scores
    print("Computing item-to-item similarity scores...")
    item_to_items = {}

    for item1 in tqdm(copurchase_counts.keys(), desc="Computing similarities"):
        # Get co-purchased items
        copurchased = copurchase_counts[item1]
        
        # Filter by minimum support
        copurchased = {
            item2: count 
            for item2, count in copurchased.items() 
            if count >= config.MIN_ITEM_SUPPORT
        }
        
        if copurchased:
            # Sort by count and take top K
            top_items = sorted(
                copurchased.items(), 
                key=lambda x: x[1], 
                reverse=True
            )[:config.MAX_ITEM_NEIGHBORS]
            
            # Normalize scores
            max_count = top_items[0][1]
            item_to_items[item1] = [
                (item2, count / max_count) 
                for item2, count in top_items
            ]

    print(f"‚úì Computed similarities for {len(item_to_items):,} items")

    # Save item-to-item similarity matrix for potential reuse
    print("Saving item-to-item similarity matrix...")
    import pickle
    with open(config.OUTPUT_PATH / 'item_to_items.pkl', 'wb') as f:
        pickle.dump(item_to_items, f)
    print(f"  ‚úì Saved item_to_items.pkl ({len(item_to_items):,} items)")

    # Generate co-purchase candidates for each user
    print("Generating co-purchase candidates...")
    copurchase_candidates = []

    # Get recent purchases for each user (last 10)
    user_recent_items = (
        train_transactions
        .sort_values('t_dat', ascending=False)
        .groupby('customer_id')['article_id']
        .apply(lambda x: list(x.unique()[:10]))
        .to_dict()
    )

    for user in tqdm(all_users, desc="User co-purchase recommendations"):
        if user not in user_recent_items:
            continue
        
        user_items = user_recent_items[user]
        candidate_scores = defaultdict(float)
        
        # Aggregate scores from all user's items
        for user_item in user_items:
            if user_item in item_to_items:
                for similar_item, score in item_to_items[user_item]:
                    if similar_item not in user_items:  # Don't recommend already purchased
                        candidate_scores[similar_item] += score
        
        # Get top N candidates
        if candidate_scores:
            top_candidates = sorted(
                candidate_scores.items(), 
                key=lambda x: x[1], 
                reverse=True
            )[:config.N_COPURCHASE_CANDIDATES]
            
            for item, score in top_candidates:
                copurchase_candidates.append({
                    'customer_id': user,
                    'article_id': item,
                    'copurchase_score': score
                })

    copurchase_candidates = pd.DataFrame(copurchase_candidates)
    print(f"‚úì Generated {len(copurchase_candidates):,} co-purchase candidates")
    print(f"  Users with candidates: {copurchase_candidates['customer_id'].nunique():,}")

    # Save to parquet
    print("\nSaving co-purchase candidates...")
    copurchase_candidates.to_parquet(config.OUTPUT_PATH / 'temp_copurchase.parquet', index=False)
    print(f"‚úì Saved temp_copurchase.parquet ({len(copurchase_candidates):,} rows)")

    # Clean up memory
    del basket_items, copurchase_counts, item_to_items, user_recent_items
    force_garbage_collection()
    print("‚úì Memory cleaned")


  STRATEGY 3: CO-PURCHASE (Item-to-Item CF)

Building co-purchase matrix...
  Baskets with 2+ items: 83,983
Computing co-purchase frequencies...


Processing baskets:   0%|          | 0/83983 [00:00<?, ?it/s]

‚úì Built co-purchase matrix for 15,868 items
Computing item-to-item similarity scores...


Computing similarities:   0%|          | 0/15868 [00:00<?, ?it/s]

‚úì Computed similarities for 10,603 items
Saving item-to-item similarity matrix...
  ‚úì Saved item_to_items.pkl (10,603 items)
Generating co-purchase candidates...


User co-purchase recommendations:   0%|          | 0/47543 [00:00<?, ?it/s]

‚úì Generated 612,807 co-purchase candidates
  Users with candidates: 45,344

Saving co-purchase candidates...
‚úì Saved temp_copurchase.parquet (612,807 rows)
‚úì Memory cleaned


In [136]:
# ============================================================================
# STRATEGY 4: USER-KNN COLLABORATIVE FILTERING - WITH PARQUET SAVING
# ============================================================================

print_section("STRATEGY 4: USER-KNN COLLABORATIVE FILTERING")

# Check if already computed
if (config.OUTPUT_PATH / 'temp_userknn.parquet').exists():
    print("‚ö° Found existing user-KNN candidates, loading...")
    userknn_candidates = pd.read_parquet(config.OUTPUT_PATH / 'temp_userknn.parquet')
    print(f"‚úì Loaded {len(userknn_candidates):,} user-KNN candidates")
    print(f"  Users with candidates: {userknn_candidates['customer_id'].nunique():,}")
else:
    print("Building user-item matrix...")

    # Create sparse user-item matrix (binary: 1 if purchased, 0 otherwise)
    # Map users and items to indices
    user_to_idx = {user: idx for idx, user in enumerate(all_users)}
    item_to_idx = {item: idx for idx, item in enumerate(all_items)}

    # Create matrix
    n_users = len(all_users)
    n_items = len(all_items)

    print(f"  Matrix size: {n_users:,} users x {n_items:,} items")

    # Use last 4 weeks for user similarity (more recent = more relevant)
    recent_date = max_date - timedelta(weeks=4)
    recent_user_items = train_transactions[train_transactions['t_dat'] >= recent_date].copy()

    user_item_matrix = lil_matrix((n_users, n_items), dtype=np.int8)

    print("Populating user-item matrix...")
    for _, row in tqdm(recent_user_items.iterrows(), total=len(recent_user_items), desc="Building matrix"):
        user_idx = user_to_idx[row['customer_id']]
        item_idx = item_to_idx[row['article_id']]
        user_item_matrix[user_idx, item_idx] = 1

    # Convert to CSR for efficient operations
    user_item_matrix = user_item_matrix.tocsr()
    print(f"‚úì Matrix density: {user_item_matrix.nnz / (n_users * n_items) * 100:.4f}%")

    # Save user-item matrix for potential reuse
    print("Saving user-item matrix...")
    from scipy.sparse import save_npz
    save_npz(config.OUTPUT_PATH / 'user_item_matrix.npz', user_item_matrix)
    print(f"  ‚úì Saved user_item_matrix.npz")

    # Save user/item mappings
    import pickle
    with open(config.OUTPUT_PATH / 'user_to_idx.pkl', 'wb') as f:
        pickle.dump(user_to_idx, f)
    with open(config.OUTPUT_PATH / 'item_to_idx.pkl', 'wb') as f:
        pickle.dump(item_to_idx, f)
    print(f"  ‚úì Saved index mappings")

    # Compute user-user similarity (only for validation users to save memory)
    print(f"Computing user similarities for {len(val_users):,} validation users...")

    val_user_indices = [user_to_idx[user] for user in val_users if user in user_to_idx]
    val_user_matrix = user_item_matrix[val_user_indices]

    # Normalize rows
    val_user_matrix_norm = normalize(val_user_matrix, norm='l2', axis=1)
    user_item_matrix_norm = normalize(user_item_matrix, norm='l2', axis=1)

    # Compute similarity (batch processing to avoid memory issues)
    print("Computing cosine similarities...")
    batch_size = 1000
    userknn_candidates = []

    for i in tqdm(range(0, len(val_user_indices), batch_size), desc="Similarity batches"):
        batch_indices = val_user_indices[i:i+batch_size]
        batch_matrix = val_user_matrix_norm[i:i+batch_size]
        
        # Compute similarity with all users
        similarities = cosine_similarity(batch_matrix, user_item_matrix_norm)
        
        # For each user in batch
        for j, user_idx in enumerate(batch_indices):
            user = all_users[user_idx]
            user_sims = similarities[j]
            
            # Get top similar users (exclude self)
            similar_user_indices = np.argsort(user_sims)[::-1][1:config.N_SIMILAR_USERS+1]
            
            # Get items purchased by similar users
            candidate_scores = defaultdict(float)
            user_purchased = set(
                train_transactions[train_transactions['customer_id'] == user]['article_id']
            )
            
            for sim_user_idx in similar_user_indices:
                sim_score = user_sims[sim_user_idx]
                if sim_score < 0.01:  # Skip very dissimilar users
                    continue
                
                sim_user = all_users[sim_user_idx]
                sim_user_items = train_transactions[
                    train_transactions['customer_id'] == sim_user
                ]['article_id'].unique()
                
                for item in sim_user_items:
                    if item not in user_purchased:
                        candidate_scores[item] += sim_score
            
            # Get top N candidates
            if candidate_scores:
                top_candidates = sorted(
                    candidate_scores.items(), 
                    key=lambda x: x[1], 
                    reverse=True
                )[:config.N_USERKNN_CANDIDATES]
                
                for item, score in top_candidates:
                    userknn_candidates.append({
                        'customer_id': user,
                        'article_id': item,
                        'userknn_score': score
                    })

    userknn_candidates = pd.DataFrame(userknn_candidates)
    print(f"‚úì Generated {len(userknn_candidates):,} user-KNN candidates")
    print(f"  Users with candidates: {userknn_candidates['customer_id'].nunique():,}")

    # Save to parquet
    print("\nSaving user-KNN candidates...")
    userknn_candidates.to_parquet(config.OUTPUT_PATH / 'temp_userknn.parquet', index=False)
    print(f"‚úì Saved temp_userknn.parquet ({len(userknn_candidates):,} rows)")

    # Clean up memory
    del user_item_matrix, val_user_matrix, val_user_matrix_norm, user_item_matrix_norm
    del user_to_idx, item_to_idx, recent_user_items
    force_garbage_collection()
    print("‚úì Memory cleaned")



  STRATEGY 4: USER-KNN COLLABORATIVE FILTERING

Building user-item matrix...
  Matrix size: 47,543 users x 15,932 items
Populating user-item matrix...


Building matrix:   0%|          | 0/62015 [00:00<?, ?it/s]

‚úì Matrix density: 0.0072%
Saving user-item matrix...
  ‚úì Saved user_item_matrix.npz
  ‚úì Saved index mappings
Computing user similarities for 4,943 validation users...
Computing cosine similarities...


Similarity batches:   0%|          | 0/4 [00:00<?, ?it/s]

‚úì Generated 30,107 user-KNN candidates
  Users with candidates: 2,018

Saving user-KNN candidates...
‚úì Saved temp_userknn.parquet (30,107 rows)
‚úì Memory cleaned


In [137]:
# ============================================================================
# STRATEGY 5: CATEGORY-BASED RECOMMENDATIONS - WITH PARQUET SAVING
# ============================================================================

print_section("STRATEGY 5: CATEGORY-BASED RECOMMENDATIONS")

# Check if already computed
if (config.OUTPUT_PATH / 'temp_category.parquet').exists():
    print("‚ö° Found existing category candidates, loading...")
    category_candidates = pd.read_parquet(config.OUTPUT_PATH / 'temp_category.parquet')
    print(f"‚úì Loaded {len(category_candidates):,} category candidates")
    print(f"  Users with candidates: {category_candidates['customer_id'].nunique():,}")
else:
    print("Computing user category preferences...")

    # Get user's category preferences
    user_categories = (
        train_transactions
        .merge(articles[['article_id', 'product_type_no', 'product_group_name']], on='article_id')
        .groupby(['customer_id', 'product_type_no'])
        .size()
        .reset_index(name='count')
    )

    # Get top 3 categories per user
    user_top_categories = (
        user_categories
        .sort_values(['customer_id', 'count'], ascending=[True, False])
        .groupby('customer_id')
        .head(3)
    )

    print(f"‚úì Computed preferences for {user_top_categories['customer_id'].nunique():,} users")

    # Save user category preferences for potential reuse
    print("Saving user category preferences...")
    user_top_categories.to_parquet(config.OUTPUT_PATH / 'user_category_preferences.parquet', index=False)
    print(f"  ‚úì Saved user_category_preferences.parquet")

    # Get popular items per category
    category_popular_items = (
        train_transactions[train_transactions['t_dat'] >= cutoff_date]
        .merge(articles[['article_id', 'product_type_no']], on='article_id')
        .groupby(['product_type_no', 'article_id'])
        .size()
        .reset_index(name='count')
        .sort_values(['product_type_no', 'count'], ascending=[True, False])
        .groupby('product_type_no')
        .head(10)
    )

    # Save category popular items for potential reuse
    print("Saving category popular items...")
    category_popular_items.to_parquet(config.OUTPUT_PATH / 'category_popular_items.parquet', index=False)
    print(f"  ‚úì Saved category_popular_items.parquet")

    print("Generating category-based candidates...")
    category_candidates = []

    for _, row in tqdm(user_top_categories.iterrows(), total=len(user_top_categories), desc="Category recommendations"):
        user = row['customer_id']
        category = row['product_type_no']
        
        # Get popular items in this category
        category_items = category_popular_items[
            category_popular_items['product_type_no'] == category
        ]['article_id'].tolist()
        
        # Get user's purchased items
        user_items = set(
            train_transactions[train_transactions['customer_id'] == user]['article_id']
        )
        
        # Recommend items not yet purchased
        for rank, item in enumerate(category_items):
            if item not in user_items and rank < config.N_CATEGORY_CANDIDATES:
                category_candidates.append({
                    'customer_id': user,
                    'article_id': item,
                    'category_score': 1.0 / (rank + 1)  # Rank-based score
                })

    category_candidates = pd.DataFrame(category_candidates)
    print(f"‚úì Generated {len(category_candidates):,} category-based candidates")
    print(f"  Users with candidates: {category_candidates['customer_id'].nunique():,}")

    # Save to parquet
    print("\nSaving category candidates...")
    category_candidates.to_parquet(config.OUTPUT_PATH / 'temp_category.parquet', index=False)
    print(f"‚úì Saved temp_category.parquet ({len(category_candidates):,} rows)")

    # Clean up memory
    del user_categories, user_top_categories, category_popular_items
    force_garbage_collection()
    print("‚úì Memory cleaned")


  STRATEGY 5: CATEGORY-BASED RECOMMENDATIONS

Computing user category preferences...
‚úì Computed preferences for 47,543 users
Saving user category preferences...
  ‚úì Saved user_category_preferences.parquet
Saving category popular items...
  ‚úì Saved category_popular_items.parquet
Generating category-based candidates...


Category recommendations:   0%|          | 0/142629 [00:00<?, ?it/s]

‚úì Generated 1,409,632 category-based candidates
  Users with candidates: 47,543

Saving category candidates...
‚úì Saved temp_category.parquet (1,409,632 rows)
‚úì Memory cleaned


In [138]:
print_section("STRATEGY 6: TEXT SIMILARITY RECOMMENDATIONS")

# Import the text feature module (save the artifact code as text_features.py)
from text_features import integrate_text_features_stage2

# Generate text-based candidates
text_candidates, article_embeddings, user_embeddings, text_cols = integrate_text_features_stage2(
    all_users=all_users,
    train_transactions=train_transactions,
    articles=articles,
    output_path=config.OUTPUT_PATH
)

# Save for later use
if text_candidates is not None and len(text_candidates) > 0:
    text_candidates.to_parquet(config.OUTPUT_PATH / 'temp_text_similarity.parquet', index=False)
    print(f"‚úì Saved {len(text_candidates):,} text similarity candidates")
else:
    print("‚ö†Ô∏è  No text candidates generated")


  STRATEGY 6: TEXT SIMILARITY RECOMMENDATIONS


  STAGE 2 ENHANCEMENT: TEXT-BASED CANDIDATES

Creating text corpus from articles...
  Available text columns: 12/12
    Processed 0 articles...
  ‚úì Created corpus for 16,616 articles

Computing text embeddings...
  Valid documents: 16,616
  Computing TF-IDF...
  ‚úì TF-IDF shape: (16616, 100)
  Reducing to 20 dimensions...
  ‚úì Embeddings shape: (16616, 20)
  Explained variance: 0.760

Computing user text preferences...
  Building user preference vectors...
  ‚úì Computed preferences for 47,543 users

Generating text similarity candidates...
  Processing 47,543 users...
    Processed 10,000 users...
    Processed 20,000 users...
    Processed 30,000 users...
    Processed 40,000 users...
  ‚úì Generated 713,145 text similarity candidates

‚úì Saved text similarity candidates to /Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2/temp_text_similarity.parquet

Saving embeddings for Stage 3...
‚

In [139]:
print_section("COMBINING ALL RECALL STRATEGIES")

print("Loading candidates from parquet files...")

# Load all candidates (existing code)
print("  Loading repurchase candidates...")
repurchase_candidates = pd.read_parquet(config.OUTPUT_PATH / 'temp_repurchase.parquet')
print(f"    ‚úì {len(repurchase_candidates):,} candidates")

print("  Loading popularity candidates...")
popularity_candidates = pd.read_parquet(config.OUTPUT_PATH / 'temp_popularity.parquet')
print(f"    ‚úì {len(popularity_candidates):,} candidates")

print("  Loading co-purchase candidates...")
copurchase_candidates = pd.read_parquet(config.OUTPUT_PATH / 'temp_copurchase.parquet')
print(f"    ‚úì {len(copurchase_candidates):,} candidates")

print("  Loading user-KNN candidates...")
userknn_candidates = pd.read_parquet(config.OUTPUT_PATH / 'temp_userknn.parquet')
print(f"    ‚úì {len(userknn_candidates):,} candidates")

print("  Loading category candidates...")
category_candidates = pd.read_parquet(config.OUTPUT_PATH / 'temp_category.parquet')
print(f"    ‚úì {len(category_candidates):,} candidates")

# NEW: Load text similarity candidates
if (config.OUTPUT_PATH / 'temp_text_similarity.parquet').exists():
    print("  Loading text similarity candidates...")
    text_candidates = pd.read_parquet(config.OUTPUT_PATH / 'temp_text_similarity.parquet')
    print(f"    ‚úì {len(text_candidates):,} candidates")
    has_text_candidates = True
else:
    print("  ‚ö†Ô∏è  No text similarity candidates found")
    has_text_candidates = False

print("\nMerging candidates from all strategies...")

# Start with repurchase candidates (existing code)
all_candidates = repurchase_candidates.copy()

# Merge popularity
all_candidates = all_candidates.merge(
    popularity_candidates,
    on=['customer_id', 'article_id'],
    how='outer',
    suffixes=('', '_pop')
)

# Merge co-purchase
all_candidates = all_candidates.merge(
    copurchase_candidates,
    on=['customer_id', 'article_id'],
    how='outer',
    suffixes=('', '_cop')
)

# Merge user-KNN
all_candidates = all_candidates.merge(
    userknn_candidates,
    on=['customer_id', 'article_id'],
    how='outer',
    suffixes=('', '_knn')
)

# Merge category
all_candidates = all_candidates.merge(
    category_candidates,
    on=['customer_id', 'article_id'],
    how='outer',
    suffixes=('', '_cat')
)

# NEW: Merge text similarity
if has_text_candidates:
    all_candidates = all_candidates.merge(
        text_candidates,
        on=['customer_id', 'article_id'],
        how='outer',
        suffixes=('', '_text')
    )

# Fill NaN scores with 0
score_columns = [
    'repurchase_score', 'popularity_score', 'copurchase_score', 
    'userknn_score', 'category_score'
]

# NEW: Add text similarity score
if has_text_candidates:
    score_columns.append('text_similarity_score')

all_candidates[score_columns] = all_candidates[score_columns].fillna(0)

print(f"‚úì Total unique user-item pairs: {len(all_candidates):,}")

# Count how many strategies recommend each item
all_candidates['n_strategies'] = (all_candidates[score_columns] > 0).sum(axis=1)

print("\nCandidate statistics:")
print(f"  Candidates per user: {len(all_candidates) / all_candidates['customer_id'].nunique():.2f}")
print(f"  Avg strategies per candidate: {all_candidates['n_strategies'].mean():.2f}")
print("\n  Candidates by number of strategies:")
for n in sorted(all_candidates['n_strategies'].unique()):
    count = (all_candidates['n_strategies'] == n).sum()
    pct = count / len(all_candidates) * 100
    print(f"    {n} strategies: {count:,} ({pct:.1f}%)")

# Save the merged candidates
print("\nSaving merged candidates...")
all_candidates.to_parquet(config.OUTPUT_PATH / 'all_candidates_merged.parquet', index=False)
print(f"‚úì Saved to all_candidates_merged.parquet ({len(all_candidates):,} rows)")

# Clean up to save memory
print("\nCleaning up temporary dataframes...")
del repurchase_candidates, popularity_candidates, copurchase_candidates
del userknn_candidates, category_candidates
if has_text_candidates:
    del text_candidates
gc.collect()
print("‚úì Memory cleaned")


  COMBINING ALL RECALL STRATEGIES

Loading candidates from parquet files...
  Loading repurchase candidates...
    ‚úì 325,177 candidates
  Loading popularity candidates...
    ‚úì 1,188,575 candidates
  Loading co-purchase candidates...
    ‚úì 612,807 candidates
  Loading user-KNN candidates...
    ‚úì 30,107 candidates
  Loading category candidates...
    ‚úì 1,409,632 candidates
  Loading text similarity candidates...
    ‚úì 713,145 candidates

Merging candidates from all strategies...
‚úì Total unique user-item pairs: 4,044,442

Candidate statistics:
  Candidates per user: 85.07
  Avg strategies per candidate: 1.06

  Candidates by number of strategies:
    1 strategies: 3,816,416 (94.4%)
    2 strategies: 221,301 (5.5%)
    3 strategies: 6,479 (0.2%)
    4 strategies: 242 (0.0%)
    5 strategies: 4 (0.0%)

Saving merged candidates...
‚úì Saved to all_candidates_merged.parquet (4,044,442 rows)

Cleaning up temporary dataframes...
‚úì Memory cleaned


### Stage 3: Extracting Features

In [162]:
import pandas as pd
import numpy as np
import gc
import os
import psutil
from pathlib import Path
from collections import defaultdict, Counter
from datetime import datetime, timedelta
import warnings
from tqdm.auto import tqdm
import pickle

warnings.filterwarnings('ignore')

In [163]:
# ============================================================================
# MEMORY MONITORING
# ============================================================================

def get_memory_usage():
    """Get current memory usage in GB"""
    process = psutil.Process(os.getpid())
    return process.memory_info().rss / 1024**3

def print_memory():
    """Print current memory usage"""
    mem = get_memory_usage()
    print(f"  üíæ Memory: {mem:.2f} GB")

def force_garbage_collection():
    """Aggressive garbage collection"""
    gc.collect()
    gc.collect()
    gc.collect()

In [164]:
# ============================================================================
# CONFIGURATION
# ============================================================================

class Config:
    # Paths
    DATA_PATH = Path('/Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2')
    OUTPUT_PATH = Path('/Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_features_2')
    
    # Processing
    CHUNK_SIZE = 50000  # Process candidates in chunks
    
    # Feature engineering windows
    RECENT_DAYS = 7  # Last week
    MEDIUM_DAYS = 30  # Last month
    
    RANDOM_STATE = 42

config = Config()

In [None]:
# UTILITY FUNCTIONS

def print_section(title):
    print("\n" + "="*80)
    print(f"  {title}")
    print("="*80 + "\n")

In [None]:
# LOAD DATA

print_section("LOADING DATA")

print("Loading preprocessed data...")

# Check which candidate file exists
if (config.DATA_PATH / 'all_candidates_merged.parquet').exists():
    candidates = pd.read_parquet(config.DATA_PATH / 'all_candidates_merged.parquet')
    print(f"‚úì Loaded all_candidates_merged.parquet")
elif (config.DATA_PATH / 'recall_candidates.parquet').exists():
    candidates = pd.read_parquet(config.DATA_PATH / 'recall_candidates.parquet')
    print(f"‚úì Loaded recall_candidates.parquet")
else:
    raise FileNotFoundError("Could not find candidates file (all_candidates_merged.parquet or recall_candidates.parquet)")

train_transactions = pd.read_parquet(config.DATA_PATH / 'train_transactions.parquet')
articles = pd.read_parquet(config.DATA_PATH / 'articles.parquet')
customers = pd.read_parquet(config.DATA_PATH / 'customers.parquet')

print(f"‚úì Train transactions: {len(train_transactions):,}")
print(f"‚úì Articles: {len(articles):,}")
print(f"‚úì Customers: {len(customers):,}")
print(f"‚úì Candidates: {len(candidates):,}")
print_memory()

# Get max date
max_date = train_transactions['t_dat'].max()
print(f"‚úì Max date: {max_date.date()}")

# Load item popularity from recall stage
item_popularity = pd.read_parquet(config.DATA_PATH / 'item_popularity.parquet')
print(f"‚úì Item popularity scores: {len(item_popularity):,}")

# Check what score columns are available in candidates
available_scores = [col for col in candidates.columns if 'score' in col.lower()]
print(f"‚úì Available recall scores in candidates: {available_scores}")

# Item-to-item CF is already in the candidates as 'copurchase_score'
# We don't need to load a separate pkl file
has_copurchase_score = 'copurchase_score' in candidates.columns
if has_copurchase_score:
    print(f"‚úì Co-purchase scores available in candidates")
else:
    print("‚ö†Ô∏è  Co-purchase scores not found in candidates")


  LOADING DATA

Loading preprocessed data...
‚úì Loaded all_candidates_merged.parquet
‚úì Train transactions: 412,156
‚úì Articles: 16,616
‚úì Customers: 49,576
‚úì Candidates: 4,044,442
  üíæ Memory: 1.56 GB
‚úì Max date: 2020-09-14
‚úì Item popularity scores: 7,115
‚úì Available recall scores in candidates: ['repurchase_score', 'popularity_score', 'copurchase_score', 'userknn_score', 'category_score', 'text_similarity_score']
‚úì Co-purchase scores available in candidates


In [None]:
# PART 1: USER FEATURES (20-25 features)

print_section("PART 1: USER FEATURES")

# Check if user features already exist
if (config.OUTPUT_PATH / 'user_features.parquet').exists():
    print("‚ö° Found existing user_features.parquet, loading...")
    user_stats = pd.read_parquet(config.OUTPUT_PATH / 'user_features.parquet')
    print(f"‚úì Loaded {len(user_stats.columns)-1} user features from disk")
    print_memory()
else:
    print("Computing user-level features...")

    # Basic user statistics
    print("  [1/5] Basic purchase statistics...")
    user_stats = train_transactions.groupby('customer_id').agg({
        'article_id': 'count',  # Total purchases
        'price': ['mean', 'std', 'min', 'max'],  # Price statistics
        't_dat': ['min', 'max']  # First and last purchase dates
    }).reset_index()

    user_stats.columns = ['customer_id', 'n_purchases', 'avg_price', 'std_price', 
                          'min_price', 'max_price', 'first_purchase_date', 'last_purchase_date']

    # Calculate days since first/last purchase
    user_stats['days_since_first_purchase'] = (
        max_date - user_stats['first_purchase_date']
    ).dt.days.astype(np.int16)

    user_stats['days_since_last_purchase'] = (
        max_date - user_stats['last_purchase_date']
    ).dt.days.astype(np.int16)

    # Purchase frequency
    user_stats['purchase_frequency'] = (
        user_stats['n_purchases'] / (user_stats['days_since_first_purchase'] + 1)
    ).astype(np.float32)

    # Drop date columns (not needed anymore)
    user_stats = user_stats.drop(['first_purchase_date', 'last_purchase_date'], axis=1)

    print(f"  - Created {len(user_stats.columns)-1} basic features")

    # Recent activity features
    print("  [2/5] Recent activity features...")
    recent_cutoff = max_date - timedelta(days=config.RECENT_DAYS)
    recent_transactions = train_transactions[train_transactions['t_dat'] >= recent_cutoff]

    user_recent_stats = recent_transactions.groupby('customer_id').agg({
        'article_id': 'count',
        'price': 'mean'
    }).reset_index()
    user_recent_stats.columns = ['customer_id', 'n_purchases_last_week', 'avg_price_last_week']

    # Merge with main stats
    user_stats = user_stats.merge(user_recent_stats, on='customer_id', how='left')
    user_stats['n_purchases_last_week'] = user_stats['n_purchases_last_week'].fillna(0).astype(np.int16)
    user_stats['avg_price_last_week'] = user_stats['avg_price_last_week'].fillna(0).astype(np.float32)

    # Is user active recently
    user_stats['is_active_last_week'] = (user_stats['n_purchases_last_week'] > 0).astype(np.int8)

    del recent_transactions, user_recent_stats
    force_garbage_collection()

    print(f"  - Total features so far: {len(user_stats.columns)-1}")

    # Diversity features
    print("  [3/5] Diversity features...")
    user_diversity = train_transactions.groupby('customer_id').agg({
        'article_id': 'nunique',
    }).reset_index()
    user_diversity.columns = ['customer_id', 'n_unique_articles']

    # Category diversity
    user_cat_diversity = (
        train_transactions
        .merge(articles[['article_id', 'product_type_no']], on='article_id')
        .groupby('customer_id')['product_type_no']
        .nunique()
        .reset_index()
    )
    user_cat_diversity.columns = ['customer_id', 'n_unique_categories']

    user_stats = user_stats.merge(user_diversity, on='customer_id', how='left')
    user_stats = user_stats.merge(user_cat_diversity, on='customer_id', how='left')

    user_stats['exploration_rate'] = (
        user_stats['n_unique_articles'] / user_stats['n_purchases']
    ).astype(np.float32)

    del user_diversity, user_cat_diversity
    force_garbage_collection()

    print(f"  - Total features so far: {len(user_stats.columns)-1}")

    # Customer demographic features
    print("  [4/5] Demographic features...")
    customer_features = customers[['customer_id', 'age', 'FN', 'Active']].copy()

    # Merge with user stats
    user_stats = user_stats.merge(customer_features, on='customer_id', how='left')

    # Fill missing values
    user_stats['age'] = user_stats['age'].fillna(user_stats['age'].median()).astype(np.float32)
    user_stats['FN'] = user_stats['FN'].fillna(0).astype(np.float32)
    user_stats['Active'] = user_stats['Active'].fillna(0).astype(np.float32)

    del customer_features
    force_garbage_collection()

    print(f"  - Total features so far: {len(user_stats.columns)-1}")

    # Purchase trend
    print("  [5/5] Purchase trend features...")
    # Split into two periods and compare
    mid_date = max_date - timedelta(days=config.MEDIUM_DAYS // 2)
    old_cutoff = max_date - timedelta(days=config.MEDIUM_DAYS)

    recent_period = train_transactions[train_transactions['t_dat'] >= mid_date]
    old_period = train_transactions[
        (train_transactions['t_dat'] >= old_cutoff) & (train_transactions['t_dat'] < mid_date)
    ]

    user_recent_count = recent_period.groupby('customer_id').size().reset_index(name='purchases_recent_period')
    user_old_count = old_period.groupby('customer_id').size().reset_index(name='purchases_old_period')

    user_trend = user_recent_count.merge(user_old_count, on='customer_id', how='outer').fillna(0)
    user_trend['purchase_trend'] = (
        (user_trend['purchases_recent_period'] - user_trend['purchases_old_period']) / 
        (user_trend['purchases_old_period'] + 1)
    ).astype(np.float32)

    user_stats = user_stats.merge(
        user_trend[['customer_id', 'purchase_trend']], 
        on='customer_id', 
        how='left'
    )
    user_stats['purchase_trend'] = user_stats['purchase_trend'].fillna(0).astype(np.float32)

    del recent_period, old_period, user_recent_count, user_old_count, user_trend
    force_garbage_collection()

    # Convert to optimal dtypes
    for col in user_stats.columns:
        if col != 'customer_id':
            if user_stats[col].dtype == 'float64':
                user_stats[col] = user_stats[col].astype(np.float32)
            elif user_stats[col].dtype == 'int64':
                user_stats[col] = user_stats[col].astype(np.int32)

    print(f"‚úì Created {len(user_stats.columns)-1} user features")
    print_memory()

    # Save user features for reuse
    print("\nSaving user features...")
    user_stats.to_parquet(config.OUTPUT_PATH / 'user_features.parquet', index=False)
    print(f"‚úì Saved user_features.parquet ({len(user_stats):,} rows, {len(user_stats.columns)-1} features)")


  PART 1: USER FEATURES

‚ö° Found existing user_features.parquet, loading...
‚úì Loaded 18 user features from disk
  üíæ Memory: 1.42 GB


In [None]:
# PART 2: ITEM FEATURES (20-25 features)

print_section("PART 2: ITEM FEATURES")

# Check if item features already exist
if (config.OUTPUT_PATH / 'item_features.parquet').exists():
    print("‚ö° Found existing item_features.parquet, loading...")
    item_stats = pd.read_parquet(config.OUTPUT_PATH / 'item_features.parquet')
    print(f"‚úì Loaded {len(item_stats.columns)-1} item features from disk")
    print_memory()
else:
    print("Computing item-level features...")

# Basic item statistics
print("  [1/4] Basic item statistics...")
item_stats = train_transactions.groupby('article_id').agg({
    'customer_id': 'nunique',  # Number of unique buyers
    'price': ['mean', 'std'],
    't_dat': ['min', 'max', 'count']
}).reset_index()

item_stats.columns = ['article_id', 'n_unique_buyers', 'avg_price', 'std_price',
                      'first_sale_date', 'last_sale_date', 'total_sales']

# Days since first/last sale
item_stats['days_since_first_sale'] = (
    max_date - item_stats['first_sale_date']
).dt.days.astype(np.int16)

item_stats['days_since_last_sale'] = (
    max_date - item_stats['last_sale_date']
).dt.days.astype(np.int16)

# Sales frequency
item_stats['sales_frequency'] = (
    item_stats['total_sales'] / (item_stats['days_since_first_sale'] + 1)
).astype(np.float32)

item_stats = item_stats.drop(['first_sale_date', 'last_sale_date'], axis=1)

print(f"  - Created {len(item_stats.columns)-1} basic features")

# Recent popularity
print("  [2/4] Recent popularity features...")
recent_cutoff = max_date - timedelta(days=config.RECENT_DAYS)
recent_sales = train_transactions[train_transactions['t_dat'] >= recent_cutoff]

item_recent_stats = recent_sales.groupby('article_id').agg({
    'customer_id': ['count', 'nunique']
}).reset_index()
item_recent_stats.columns = ['article_id', 'sales_last_week', 'buyers_last_week']

item_stats = item_stats.merge(item_recent_stats, on='article_id', how='left')
item_stats['sales_last_week'] = item_stats['sales_last_week'].fillna(0).astype(np.int16)
item_stats['buyers_last_week'] = item_stats['buyers_last_week'].fillna(0).astype(np.int16)

del recent_sales, item_recent_stats
force_garbage_collection()

print(f"  - Total features so far: {len(item_stats.columns)-1}")

# Sales trend
print("  [3/4] Sales trend features...")
mid_date = max_date - timedelta(days=config.MEDIUM_DAYS // 2)
old_cutoff = max_date - timedelta(days=config.MEDIUM_DAYS)

recent_period = train_transactions[train_transactions['t_dat'] >= mid_date]
old_period = train_transactions[
    (train_transactions['t_dat'] >= old_cutoff) & (train_transactions['t_dat'] < mid_date)
]

item_recent_count = recent_period.groupby('article_id').size().reset_index(name='sales_recent_period')
item_old_count = old_period.groupby('article_id').size().reset_index(name='sales_old_period')

item_trend = item_recent_count.merge(item_old_count, on='article_id', how='outer').fillna(0)
item_trend['sales_trend'] = (
    (item_trend['sales_recent_period'] - item_trend['sales_old_period']) / 
    (item_trend['sales_old_period'] + 1)
).astype(np.float32)

item_stats = item_stats.merge(
    item_trend[['article_id', 'sales_trend']], 
    on='article_id', 
    how='left'
)
item_stats['sales_trend'] = item_stats['sales_trend'].fillna(0).astype(np.float32)

del recent_period, old_period, item_recent_count, item_old_count, item_trend
force_garbage_collection()

print(f"  - Total features so far: {len(item_stats.columns)-1}")

# Merge with article metadata
print("  [4/4] Article metadata features...")
article_features = articles[[
    'article_id', 'product_type_no', 'graphical_appearance_no',
    'colour_group_code', 'perceived_colour_value_id', 
    'department_no', 'index_group_no', 'section_no', 'garment_group_no'
]].copy()

item_stats = item_stats.merge(article_features, on='article_id', how='left')

del article_features
force_garbage_collection()

# Add popularity scores from recall stage
item_stats = item_stats.merge(
    item_popularity[['article_id', 'popularity_score']], 
    on='article_id', 
    how='left'
)
item_stats['popularity_score'] = item_stats['popularity_score'].fillna(0).astype(np.float32)

# Convert to optimal dtypes
for col in item_stats.columns:
    if col != 'article_id':
        if item_stats[col].dtype == 'float64':
            item_stats[col] = item_stats[col].astype(np.float32)
        elif item_stats[col].dtype == 'int64':
            item_stats[col] = item_stats[col].astype(np.int32)

print(f"‚úì Created {len(item_stats.columns)-1} item features")
print_memory()

# Save item features for reuse
print("\nSaving item features...")
item_stats.to_parquet(config.OUTPUT_PATH / 'item_features.parquet', index=False)
print(f"‚úì Saved item_features.parquet ({len(item_stats):,} rows, {len(item_stats.columns)-1} features)")


  PART 2: ITEM FEATURES

‚ö° Found existing item_features.parquet, loading...
‚úì Loaded 19 item features from disk
  üíæ Memory: 1.42 GB
  [1/4] Basic item statistics...
  - Created 7 basic features
  [2/4] Recent popularity features...
  - Total features so far: 9
  [3/4] Sales trend features...
  - Total features so far: 10
  [4/4] Article metadata features...
‚úì Created 19 item features
  üíæ Memory: 2.07 GB

Saving item features...
‚úì Saved item_features.parquet (15,932 rows, 19 features)


In [None]:
# PART 3: USER-ITEM INTERACTION FEATURES (CHUNKED)

print_section("PART 3: USER-ITEM INTERACTION FEATURES")

print("Computing interaction features in chunks...")

# Precompute user purchase history for fast lookup
print("  [1/3] Building user purchase history...")
user_purchases = (
    train_transactions
    .groupby('customer_id')['article_id']
    .apply(set)
    .to_dict()
)

user_purchase_list = (
    train_transactions
    .sort_values('t_dat', ascending=False)
    .groupby('customer_id')['article_id']
    .apply(list)
    .to_dict()
)

# Save user purchase history for reuse
print("  [1.5/3] Saving user purchase history...")
user_purchase_df = pd.DataFrame([
    {'customer_id': user, 'purchased_articles': list(items)}
    for user, items in user_purchases.items()
])
user_purchase_df.to_parquet(config.OUTPUT_PATH / 'user_purchase_history.parquet', index=False)
print(f"  ‚úì Saved user_purchase_history.parquet")

del user_purchase_df
force_garbage_collection()

# User category preferences
user_categories = (
    train_transactions
    .merge(articles[['article_id', 'product_type_no']], on='article_id')
    .groupby(['customer_id', 'product_type_no'])
    .size()
    .reset_index(name='count')
)
user_top_category = (
    user_categories
    .sort_values(['customer_id', 'count'], ascending=[True, False])
    .groupby('customer_id')
    .first()
    .reset_index()
    [['customer_id', 'product_type_no']]
    .rename(columns={'product_type_no': 'top_category'})
)

# User price preferences
user_price_stats = train_transactions.groupby('customer_id')['price'].agg(['mean', 'std']).reset_index()
user_price_stats.columns = ['customer_id', 'user_avg_price', 'user_std_price']

# Save additional user stats
print("Saving additional user statistics...")
user_top_category.to_parquet(config.OUTPUT_PATH / 'user_top_categories.parquet', index=False)
user_price_stats.to_parquet(config.OUTPUT_PATH / 'user_price_stats.parquet', index=False)
print(f"  ‚úì Saved user_top_categories.parquet")
print(f"  ‚úì Saved user_price_stats.parquet")

print("  [2/3] Processing candidates in chunks...")

# Split candidates into chunks
n_chunks = max(1, len(candidates) // config.CHUNK_SIZE)
candidate_chunks = np.array_split(candidates, n_chunks)

feature_chunks = []

for chunk_idx, chunk in enumerate(tqdm(candidate_chunks, desc="Feature chunks")):
    # Start with the chunk
    chunk_features = chunk.copy()
    
    # Has user purchased this exact item before?
    chunk_features['has_purchased_item'] = chunk_features.apply(
        lambda row: 1 if row['article_id'] in user_purchases.get(row['customer_id'], set()) else 0,
        axis=1
    ).astype(np.int8)
    
    # If purchased before, get days since last purchase
    def days_since_purchase(row):
        user_items = user_purchase_list.get(row['customer_id'], [])
        if row['article_id'] in user_items:
            # Get position of first occurrence (most recent due to sort)
            try:
                idx = user_items.index(row['article_id'])
                # Approximate days (assuming 1 purchase per day on average)
                return min(idx, 365)
            except:
                return 365
        return 365
    
    chunk_features['days_since_item_purchase'] = chunk_features.apply(
        days_since_purchase, axis=1
    ).astype(np.int16)
    
    # Merge with item stats to get item metadata
    chunk_features = chunk_features.merge(
        item_stats[['article_id', 'product_type_no', 'avg_price', 'popularity_score']], 
        on='article_id', 
        how='left'
    )
    
    # Has user purchased items from this category?
    chunk_features = chunk_features.merge(user_top_category, on='customer_id', how='left')
    chunk_features['category_match'] = (
        chunk_features['product_type_no'] == chunk_features['top_category']
    ).astype(np.int8)
    chunk_features = chunk_features.drop(['product_type_no', 'top_category'], axis=1)
    
    # Price match features
    chunk_features = chunk_features.merge(user_price_stats, on='customer_id', how='left')
    chunk_features['price_vs_user_avg'] = (
        (chunk_features['avg_price'] - chunk_features['user_avg_price']) / 
        (chunk_features['user_std_price'] + 0.01)
    ).astype(np.float32)
    
    chunk_features['is_cheaper_than_usual'] = (
        chunk_features['avg_price'] < chunk_features['user_avg_price']
    ).astype(np.int8)
    
    chunk_features = chunk_features.drop(['user_avg_price', 'user_std_price', 'avg_price'], axis=1)
    
    # Co-purchase score is already in candidates, but create derived features
    if 'copurchase_score' in chunk_features.columns:
        # Normalize copurchase score by user's max copurchase score
        user_max_copurchase = chunk_features.groupby('customer_id')['copurchase_score'].transform('max')
        chunk_features['copurchase_score_normalized'] = (
            chunk_features['copurchase_score'] / (user_max_copurchase + 0.001)
        ).astype(np.float32)
        
        # Binary: has any copurchase signal
        chunk_features['has_copurchase_signal'] = (
            chunk_features['copurchase_score'] > 0
        ).astype(np.int8)
    
    # Recall strategy coverage (how many strategies recommended this item)
    # Already in candidates as 'n_strategies'
    
    # Rank features (rank within each recall strategy)
    for score_col in ['repurchase_score', 'popularity_score', 'copurchase_score', 
                      'userknn_score', 'category_score']:
        if score_col in chunk_features.columns:
            chunk_features[f'{score_col}_rank'] = (
                chunk_features.groupby('customer_id')[score_col]
                .rank(method='dense', ascending=False)
                .astype(np.int16)
            )
    
    # Overall candidate rank (by combined score if available)
    if 'n_strategies' in chunk_features.columns:
        chunk_features['overall_rank'] = (
            chunk_features.groupby('customer_id')['n_strategies']
            .rank(method='dense', ascending=False)
            .astype(np.int16)
        )
    
    # Clean up
    chunk_features = chunk_features.fillna(0)
    
    feature_chunks.append(chunk_features)
    
    # Clean up
    del chunk_features
    if chunk_idx % 10 == 0:
        force_garbage_collection()

print("  [3/3] Combining feature chunks...")
all_features = pd.concat(feature_chunks, ignore_index=True)
del feature_chunks
force_garbage_collection()

print(f"‚úì Created interaction features for {len(all_features):,} candidates")
print_memory()

# MERGE ALL FEATURES

print_section("MERGING ALL FEATURES")

print("Merging user, item, and interaction features...")

# Merge user features
all_features = all_features.merge(user_stats, on='customer_id', how='left')
print(f"  ‚úì Merged user features")

# Merge item features (already partially merged, merge remaining)
remaining_item_cols = [col for col in item_stats.columns if col not in all_features.columns]
remaining_item_cols.append('article_id')
all_features = all_features.merge(item_stats[remaining_item_cols], on='article_id', how='left')
print(f"  ‚úì Merged item features")

# Fill any remaining NaNs (handle categorical columns separately)
print("Filling missing values...")

# Identify categorical columns
categorical_cols = all_features.select_dtypes(include=['category']).columns.tolist()
numerical_cols = all_features.select_dtypes(include=[np.number]).columns.tolist()

# Fill numerical columns with 0
if numerical_cols:
    all_features[numerical_cols] = all_features[numerical_cols].fillna(0)

# Fill categorical columns with their mode or a default value
for col in categorical_cols:
    if all_features[col].isna().any():
        # Get the most frequent category
        mode_value = all_features[col].mode()
        if len(mode_value) > 0:
            all_features[col] = all_features[col].fillna(mode_value[0])
        else:
            # If no mode, convert to string and fill with 'unknown'
            all_features[col] = all_features[col].astype(str).fillna('unknown')

print(f"\n‚úì Total features: {len(all_features.columns) - 2} (excluding customer_id, article_id)")
print(f"‚úì Total candidate-feature pairs: {len(all_features):,}")
print_memory()


  PART 3: USER-ITEM INTERACTION FEATURES

Computing interaction features in chunks...
  [1/3] Building user purchase history...
  [1.5/3] Saving user purchase history...
  ‚úì Saved user_purchase_history.parquet
Saving additional user statistics...
  ‚úì Saved user_top_categories.parquet
  ‚úì Saved user_price_stats.parquet
  [2/3] Processing candidates in chunks...


Feature chunks:   0%|          | 0/80 [00:00<?, ?it/s]

  [3/3] Combining feature chunks...
‚úì Created interaction features for 4,044,442 candidates
  üíæ Memory: 2.43 GB

  MERGING ALL FEATURES

Merging user, item, and interaction features...
  ‚úì Merged user features
  ‚úì Merged item features
Filling missing values...

‚úì Total features: 55 (excluding customer_id, article_id)
‚úì Total candidate-feature pairs: 4,044,442
  üíæ Memory: 1.84 GB


In [171]:
print_section("PART 4: TEXT-BASED SEMANTIC FEATURES")

# Import the text feature module
from text_features import integrate_text_features_stage3

# Check if embeddings exist
embeddings_path = config.DATA_PATH  # Or wherever you ran Stage 2

if (embeddings_path / 'article_embeddings.pkl').exists():
    print("Found saved text embeddings, integrating text features...")
    
    all_features = integrate_text_features_stage3(
        all_features=all_features,
        articles=articles,
        train_transactions=train_transactions,
        embeddings_path=embeddings_path
    )
    
    print(f"\n‚úì Enhanced features with text semantics")
    print(f"  Total features now: {len(all_features.columns) - 2}")
    print_memory()
else:
    print("‚ö†Ô∏è  Text embeddings not found, skipping text features")
    print("   Run Stage 2 with text feature integration first")



  PART 4: TEXT-BASED SEMANTIC FEATURES

Found saved text embeddings, integrating text features...

  STAGE 3 ENHANCEMENT: TEXT-BASED FEATURES

Loading saved embeddings...
  ‚úì Loaded 16,616 article embeddings
  ‚úì Loaded 47,543 user embeddings

Enhancing features with text semantics...

Creating category encoding features...
  Processing 7 categorical columns...
  ‚úì Created 14 category encoding features

Computing semantic diversity features...
  ‚úì Computed diversity for 47,543 users

  Merging category features...
  Merging semantic diversity features...
  Computing user-item text similarities in chunks...
    Processing chunk 1/80...
    Processing chunk 11/80...
    Processing chunk 21/80...
    Processing chunk 31/80...
    Processing chunk 41/80...
    Processing chunk 51/80...
    Processing chunk 61/80...
    Processing chunk 71/80...
  ‚úì Text enhancement complete

‚úì Enhanced features with text semantics
  Total features now: 72
  üíæ Memory: 3.28 GB


In [None]:
print_section("SAVING FEATURES")

print("Saving feature matrix...")

# IMPORTANT: Check if text features were added
text_features_added = any('text_similarity' in col or 'semantic' in col 
                          for col in all_features.columns)

if text_features_added:
    print("  ‚úì Text semantic features detected in feature matrix")
else:
    print("  ‚ö†Ô∏è  No text semantic features detected")

# Save the complete feature matrix
all_features.to_parquet(config.OUTPUT_PATH / 'training_features.parquet', index=False)

file_size = (config.OUTPUT_PATH / 'training_features.parquet').stat().st_size / 1024**2
print(f"‚úì Saved training_features.parquet ({file_size:.2f} MB)")

# Save feature names for later use
feature_names = [col for col in all_features.columns 
                 if col not in ['customer_id', 'article_id']]

with open(config.OUTPUT_PATH / 'feature_names.txt', 'w') as f:
    f.write('\n'.join(feature_names))

print(f"‚úì Saved feature_names.txt ({len(feature_names)} features)")

# NEW: Save feature metadata including text features info
feature_metadata = {
    'total_features': len(feature_names),
    'has_text_features': text_features_added,
    'feature_list': feature_names,
    'timestamp': str(datetime.now())
}

import json
with open(config.OUTPUT_PATH / 'feature_metadata.json', 'w') as f:
    json.dump(feature_metadata, f, indent=2)
print(f"‚úì Saved feature_metadata.json")

# Print feature summary
print("\n" + "="*80)
print("FEATURE SUMMARY")
print("="*80)
print(f"\nTotal features: {len(feature_names)}")

# Categorize features
user_features = [f for f in feature_names if any(x in f.lower() for x in [
    'user', 'customer', 'purchase', 'age', 'active', 'fn', 'trend'
])]

item_features = [f for f in feature_names if any(x in f.lower() for x in [
    'item', 'article', 'sales', 'product', 'colour', 'color', 
    'department', 'section', 'garment', 'frequency', 'count'
]) and f not in user_features]

interaction_features = [f for f in feature_names if any(x in f.lower() for x in [
    'score', 'rank', 'strategies', 'match', 'purchased', 'category_match',
    'price_vs', 'cheaper'
]) and f not in user_features and f not in item_features]

# NEW: Identify text features
text_features = [f for f in feature_names if any(x in f.lower() for x in [
    'text_similarity', 'semantic', 'embedding'
])]

print(f"\nFeature breakdown:")
print(f"  - User features: {len(user_features)}")
print(f"  - Item features: {len(item_features)}")
print(f"  - Interaction features: {len(interaction_features)}")

# NEW: Show text features if present
if text_features:
    print(f"  - Text semantic features: {len(text_features)} ‚úì")
    print(f"\n  Text features included:")
    for feat in text_features:
        print(f"    ‚Ä¢ {feat}")
else:
    print(f"  - Text semantic features: 0 (not integrated)")

# NEW: Show sample of other feature categories
print(f"\n  Sample user features:")
for feat in user_features[:5]:
    print(f"    ‚Ä¢ {feat}")

print(f"\n  Sample item features:")
for feat in item_features[:5]:
    print(f"    ‚Ä¢ {feat}")

print(f"\n  Sample interaction features:")
for feat in interaction_features[:5]:
    print(f"    ‚Ä¢ {feat}")

print("\n SAVED FILES \n")
print("\nIntermediate feature files (for reuse):")
print("user_features.parquet - User-level features")
print("item_features.parquet - Item-level features")
print("user_purchase_history.parquet - User purchase history")
print("user_top_categories.parquet - User category preferences")
print("user_price_stats.parquet - User price statistics")

# NEW: Check for text-related files
if (config.OUTPUT_PATH / 'article_embeddings.pkl').exists():
    print("article_embeddings.pkl - Article text embeddings")
if (config.OUTPUT_PATH / 'user_embeddings.pkl').exists():
    print("user_embeddings.pkl - User preference embeddings")

print("\nFinal output files:")
print("training_features.parquet - Complete feature matrix for training")
print("feature_names.txt - List of all feature names")
print("feature_metadata.json - Feature metadata and info")

print("\n STAGE 3 COMPLETE! \n")

# NEW: Validation checks
print("\nValidation checks:")
print(f"No NaN values: {all_features.isnull().sum().sum() == 0}")
print(f"Total rows: {len(all_features):,}")
print(f"Total features: {len(feature_names)}")
print(f"Memory usage: {all_features.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

if text_features_added:
    print(f"Text features integrated: YES ‚ú®")
    print(f"{len(text_features)} text-based features added")
else:
    print(f"Text features not integrated")
    print(f"Run Stage 2 with text integration first")
    print(f"Or check if embeddings exist in {config.DATA_PATH}")

if not text_features_added:
    print("Consider integrating text features for better performance")
    print("(See text_feature_integration guide)")
else:
    print("Ready for Model Training")

if text_features_added:
    print("\n Text semantic features successfully integrated!")


  SAVING FEATURES

Saving feature matrix...
  ‚úì Text semantic features detected in feature matrix
‚úì Saved training_features.parquet (288.07 MB)
‚úì Saved feature_names.txt (72 features)
‚úì Saved feature_metadata.json

FEATURE SUMMARY

Total features: 72

Feature breakdown:
  - User features: 23
  - Item features: 24
  - Interaction features: 10
  - Text semantic features: 4 ‚úì

  Text features included:
    ‚Ä¢ text_similarity_score
    ‚Ä¢ semantic_diversity
    ‚Ä¢ semantic_range
    ‚Ä¢ user_item_text_similarity

  Sample user features:
    ‚Ä¢ repurchase_score
    ‚Ä¢ copurchase_score
    ‚Ä¢ userknn_score
    ‚Ä¢ has_purchased_item
    ‚Ä¢ days_since_item_purchase

  Sample item features:
    ‚Ä¢ n_unique_articles
    ‚Ä¢ total_sales
    ‚Ä¢ sales_frequency
    ‚Ä¢ sales_last_week
    ‚Ä¢ product_type_no

  Sample interaction features:
    ‚Ä¢ popularity_score_x
    ‚Ä¢ category_score
    ‚Ä¢ text_similarity_score
    ‚Ä¢ n_strategies
    ‚Ä¢ popularity_score_y

SAVED FILES

### Getting Image features

In [None]:
import pandas as pd
import numpy as np
import gc
import os
import psutil
from pathlib import Path
import warnings
from tqdm.auto import tqdm
from sklearn.decomposition import PCA
import pickle

warnings.filterwarnings('ignore')

try:
    import torch
    import torchvision
    from torchvision import models, transforms
    from PIL import Image
    HAS_TORCH = True
    print("‚úì PyTorch available")
    print(f"  PyTorch version: {torch.__version__}")
    
    # Check for MPS (Metal Performance Shaders) support
    if torch.backends.mps.is_available():
        print("‚úì MPS (Metal Performance Shaders) available - Apple Silicon GPU acceleration enabled!")
    else:
        print("‚ö†Ô∏è  MPS not available - will use CPU")
        
except ImportError:
    HAS_TORCH = False
    print("‚ö†Ô∏è  PyTorch not available. Install with: pip install torch torchvision pillow")

try:
    from transformers import CLIPProcessor, CLIPModel
    HAS_CLIP = True
    print("‚úì CLIP/FashionCLIP available")
except ImportError:
    HAS_CLIP = False
    print("‚ö†Ô∏è  CLIP/FashionCLIP not available. Install with: pip install transformers")

‚úì PyTorch available
  PyTorch version: 2.9.1
‚úì MPS (Metal Performance Shaders) available - Apple Silicon GPU acceleration enabled!
‚úì CLIP/FashionCLIP available


In [None]:
# MEMORY MONITORING

def get_memory_usage():
    """Get current memory usage in GB"""
    process = psutil.Process(os.getpid())
    return process.memory_info().rss / 1024**3

def print_memory():
    """Print current memory usage"""
    mem = get_memory_usage()
    print(f"  üíæ Memory: {mem:.2f} GB")

def force_garbage_collection():
    """Aggressive garbage collection"""
    gc.collect()
    gc.collect()
    if torch.backends.mps.is_available():
        torch.mps.empty_cache()  # Clear MPS cache

In [None]:
# CONFIGURATION

class Config:
    # Paths - UPDATE THESE FOR YOUR LOCAL SETUP
    DATA_PATH = Path('/Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2')  # Where your parquet files are
    OUTPUT_PATH = Path('/Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2')  # Where to save outputs
    IMAGE_PATH = Path('/Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/h-and-m-personalized-fashion-recommendations/images')  # H&M image directory
    
    # Image embedding options
    EMBEDDING_METHOD = 'fashion-clip'  # Options: 'fashion-clip', 'resnet50', 'clip', 'efficientnet'
    EMBEDDING_DIM = 512  # Output dimension (will be projected from original)
    
    # Processing - Optimized for Apple Silicon
    BATCH_SIZE = 64  # M4 can handle larger batches efficiently
    IMAGE_SIZE = 224  # Input size for models
    
    # Memory optimization
    PROCESS_SUBSET = False  # Set True to process only subset (for testing)
    SUBSET_SIZE = 10000  # Number of images to process if PROCESS_SUBSET=True
    USE_FP16 = False  # MPS doesn't fully support FP16 yet, keep False
    
    # Apple Silicon specific
    USE_MPS = True  # Enable MPS acceleration
    NUM_WORKERS = 4  # For data loading (M4 has excellent multi-core)
    
    RANDOM_STATE = 42

config = Config()

# Create output directory if it doesn't exist
config.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

In [None]:
# UTILITY FUNCTIONS

def print_section(title):
    print("\n" + "="*80)
    print(f"  {title}")
    print("="*80 + "\n")

def get_device():
    """Get the best available device for Apple Silicon"""
    if config.USE_MPS and torch.backends.mps.is_available():
        return torch.device("mps")
    else:
        return torch.device("cpu")

In [None]:
# IMAGE EMBEDDING EXTRACTION

class ImageEmbeddingExtractor:
    """Extract embeddings from product images using pre-trained models"""
    
    def __init__(self, method='fashion-clip', device=None):
        self.method = method
        self.device = device if device else get_device()
        self.model = None
        self.transform = None
        
        print(f"Using device: {self.device}")
        
        if method == 'fashion-clip':
            self._load_fashion_clip()
        elif method == 'resnet50':
            self._load_resnet()
        elif method == 'clip':
            self._load_clip()
        elif method == 'efficientnet':
            self._load_efficientnet()
    
    def _load_fashion_clip(self):
        """Load pre-trained FashionCLIP - optimized for fashion domain"""
        print("Loading FashionCLIP")
        
        # Load FashionCLIP 2.0 - uses better base model
        self.model = CLIPModel.from_pretrained("patrickjohncyh/fashion-clip")
        self.processor = CLIPProcessor.from_pretrained("patrickjohncyh/fashion-clip")
        self.model.eval()
        
        self.model = self.model.to(self.device)
        
        self.output_dim = 512  # FashionCLIP outputs 512-dim embeddings
        print(f"‚úì FashionCLIP 2.0 loaded on {self.device}, output dim: {self.output_dim}")
        print("  Model details: ViT-B/32 architecture, trained on Farfetch dataset")
        print("  Benefits: Better fashion understanding, semantic similarity, zero-shot capabilities")
    
    def _load_resnet(self):
        """Load pre-trained ResNet50 - optimized for Apple Silicon"""
        print("Loading ResNet50 (pre-trained on ImageNet)...")
        
        # Load model with updated weights parameter (new PyTorch API)
        model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
        
        # Remove final classification layer
        self.model = torch.nn.Sequential(*list(model.children())[:-1])
        self.model.eval()
        
        # Move to MPS device
        self.model = self.model.to(self.device)
        
        # Image preprocessing - using updated normalize values from weights
        self.transform = transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(config.IMAGE_SIZE),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], 
                                std=[0.229, 0.224, 0.225])
        ])
        
        self.output_dim = 2048
        print(f"‚úì ResNet50 loaded on {self.device}, output dim: {self.output_dim}")
    
    def _load_efficientnet(self):
        """Load pre-trained EfficientNet-B0 - optimized for Apple Silicon"""
        print("Loading EfficientNet-B0 (pre-trained on ImageNet)...")
        
        # Load with updated weights parameter
        model = models.efficientnet_b0(weights=models.EfficientNet_B0_Weights.IMAGENET1K_V1)
        
        # Remove final classification layer
        self.model = torch.nn.Sequential(*list(model.children())[:-1])
        self.model.eval()
        
        self.model = self.model.to(self.device)
        
        self.transform = transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(config.IMAGE_SIZE),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], 
                                std=[0.229, 0.224, 0.225])
        ])
        
        self.output_dim = 1280
        print(f"‚úì EfficientNet-B0 loaded on {self.device}, output dim: {self.output_dim}")
    
    def _load_clip(self):
        """Load CLIP model - optimized for Apple Silicon"""
        print("Loading CLIP (vision-language model)...")
        
        self.model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
        self.model.eval()
        
        self.model = self.model.to(self.device)
        
        self.output_dim = 512
        print(f"‚úì CLIP loaded on {self.device}, output dim: {self.output_dim}")
    
    def extract_single(self, image_path):
        """Extract embedding from a single image"""
        try:
            # Load and preprocess image
            img = Image.open(image_path).convert('RGB')
            
            if self.method in ['clip', 'fashion-clip']:
                inputs = self.processor(images=img, return_tensors="pt")
                inputs = {k: v.to(self.device) for k, v in inputs.items()}
                
                with torch.no_grad():
                    image_features = self.model.get_image_features(**inputs)
                
                embedding = image_features.squeeze().cpu().numpy()
            else:
                img_tensor = self.transform(img).unsqueeze(0).to(self.device)
                
                with torch.no_grad():
                    embedding = self.model(img_tensor).squeeze().cpu().numpy()
            
            return embedding
        
        except Exception as e:
            # Return zero vector if image processing fails
            print(f"‚ö†Ô∏è  Error processing {image_path}: {e}")
            return np.zeros(self.output_dim, dtype=np.float32)
    
    def extract_batch(self, image_paths):
        """Extract embeddings from a batch of images - optimized for MPS"""
        embeddings = []
        
        # For CLIP-based models (fashion-clip and clip), process individually
        # because batch processing with processor is more complex
        if self.method in ['clip', 'fashion-clip']:
            for img_path in image_paths:
                emb = self.extract_single(img_path)
                embeddings.append(emb)
            return np.array(embeddings)
        
        # For CNN models (ResNet, EfficientNet), process as batch
        batch_tensors = []
        valid_indices = []
        
        # Load and preprocess all images in batch
        for idx, img_path in enumerate(image_paths):
            try:
                img = Image.open(img_path).convert('RGB')
                img_tensor = self.transform(img)
                batch_tensors.append(img_tensor)
                valid_indices.append(idx)
            except Exception as e:
                print(f"‚ö†Ô∏è  Error loading {img_path}: {e}")
                embeddings.append(np.zeros(self.output_dim, dtype=np.float32))
        
        # Process batch
        if batch_tensors:
            batch = torch.stack(batch_tensors).to(self.device)
            
            with torch.no_grad():
                batch_embeddings = self.model(batch).squeeze().cpu().numpy()
            
            # Handle single vs multiple embeddings
            if len(batch_tensors) == 1:
                batch_embeddings = batch_embeddings.reshape(1, -1)
            
            # Insert embeddings at correct positions
            result_embeddings = []
            batch_idx = 0
            for idx in range(len(image_paths)):
                if idx in valid_indices:
                    result_embeddings.append(batch_embeddings[batch_idx])
                    batch_idx += 1
                else:
                    result_embeddings.append(np.zeros(self.output_dim, dtype=np.float32))
            
            return np.array(result_embeddings)
        
        return np.array(embeddings)

In [None]:
# MAIN EXTRACTION PIPELINE

print_section("STAGE 1.5: IMAGE EMBEDDING EXTRACTION (APPLE SILICON OPTIMIZED)")

# Check if embeddings already exist
if (config.OUTPUT_PATH / 'image_embeddings_3.parquet').exists():
    print("‚ö° Found existing image_embeddings_3.parquet!")
    print("   Delete this file if you want to re-extract embeddings.")
    
    # Load existing embeddings
    image_embeddings_df = pd.read_parquet(config.OUTPUT_PATH / 'image_embeddings_3.parquet')
    print(f"‚úì Loaded {len(image_embeddings_df):,} image embeddings from disk")
    
else:
    print("Image embeddings not found. Starting extraction...")
    
    # Check prerequisites
    if not HAS_TORCH:
        raise ImportError("PyTorch is required. Install with: pip install torch torchvision pillow")
    
    # Load articles
    print("\nLoading articles metadata...")
    articles = pd.read_parquet(config.DATA_PATH / 'articles.parquet')
    print(f"‚úì Loaded {len(articles):,} articles")
    
    # Determine which articles to process
    if config.PROCESS_SUBSET:
        articles = articles.head(config.SUBSET_SIZE)
        print(f"‚ö†Ô∏è  Processing subset of {len(articles):,} articles (PROCESS_SUBSET=True)")
    
    article_ids = articles['article_id'].tolist()
    
    # Check image directory structure
    print(f"\nChecking image directory: {config.IMAGE_PATH}")
    if not config.IMAGE_PATH.exists():
        raise FileNotFoundError(f"Image directory not found: {config.IMAGE_PATH}")
    
    # H&M images are organized as: images/0XX/0XXXXXXXX.jpg
    # Where first 3 digits determine subfolder
    
    # Find available images
    print("Scanning for available images...")
    available_images = {}
    missing_count = 0
    
    for article_id in tqdm(article_ids, desc="Checking images"):
        # Try different possible paths
        article_str = str(article_id).zfill(10)  # Pad to 10 digits
        subfolder = article_str[:3]
        
        # Possible image paths
        possible_paths = [
            config.IMAGE_PATH / subfolder / f"{article_str}.jpg",
            config.IMAGE_PATH / f"{article_str}.jpg",
            config.IMAGE_PATH / f"{article_id}.jpg",
        ]
        
        image_found = False
        for img_path in possible_paths:
            if img_path.exists():
                available_images[article_id] = img_path
                image_found = True
                break
        
        if not image_found:
            missing_count += 1
    
    print(f"\n‚úì Found {len(available_images):,} images")
    print(f"  Missing {missing_count:,} images ({100*missing_count/len(article_ids):.1f}%)")
    
    # Initialize extractor
    print(f"\nInitializing {config.EMBEDDING_METHOD} model...")
    extractor = ImageEmbeddingExtractor(method=config.EMBEDDING_METHOD)
    print_memory()
    
    # Extract embeddings
    print(f"\nExtracting embeddings for {len(available_images):,} images...")
    print(f"  Batch size: {config.BATCH_SIZE}")
    print(f"  Device: {extractor.device}")
    estimated_time = len(available_images) / (config.BATCH_SIZE * 10)  # ~10 batches/sec on M4
    print(f"  Estimated time: {estimated_time:.1f} minutes")
    
    embeddings_dict = {}
    batch_article_ids = []
    batch_image_paths = []
    
    for article_id, img_path in tqdm(available_images.items(), desc="Extracting embeddings"):
        batch_article_ids.append(article_id)
        batch_image_paths.append(img_path)
        
        # Process batch
        if len(batch_article_ids) >= config.BATCH_SIZE:
            batch_embeddings = extractor.extract_batch(batch_image_paths)
            
            for aid, emb in zip(batch_article_ids, batch_embeddings):
                embeddings_dict[aid] = emb
            
            # Clear batch
            batch_article_ids = []
            batch_image_paths = []
            
            # Periodic garbage collection
            if len(embeddings_dict) % (config.BATCH_SIZE * 10) == 0:
                force_garbage_collection()
    
    # Process remaining batch
    if batch_article_ids:
        batch_embeddings = extractor.extract_batch(batch_image_paths)
        for aid, emb in zip(batch_article_ids, batch_embeddings):
            embeddings_dict[aid] = emb
    
    print(f"\n‚úì Extracted {len(embeddings_dict):,} embeddings")
    print_memory()
    
    # Create DataFrame
    print("\nCreating embeddings DataFrame...")
    
    # For missing images, use mean embedding or zero vector
    mean_embedding = np.mean(list(embeddings_dict.values()), axis=0) if embeddings_dict else np.zeros(extractor.output_dim)
    
    all_embeddings = []
    for article_id in article_ids:
        if article_id in embeddings_dict:
            emb = embeddings_dict[article_id]
        else:
            emb = mean_embedding  # Use mean for missing images
        
        all_embeddings.append(emb)
    
    # Create DataFrame with article_id and embedding columns
    image_embeddings_df = pd.DataFrame({
        'article_id': article_ids
    })
    
    # Add embedding dimensions as separate columns
    embedding_matrix = np.array(all_embeddings)
    
    # Project to target dimension if needed
    if embedding_matrix.shape[1] != config.EMBEDDING_DIM:
        print(f"\nProjecting embeddings from {embedding_matrix.shape[1]} to {config.EMBEDDING_DIM} dimensions...")
        
        pca = PCA(n_components=config.EMBEDDING_DIM, random_state=config.RANDOM_STATE)
        embedding_matrix = pca.fit_transform(embedding_matrix)
        
        print(f"  Explained variance: {pca.explained_variance_ratio_.sum():.3f}")
    
    # Add embedding columns
    for i in range(embedding_matrix.shape[1]):
        image_embeddings_df[f'image_emb_{i}'] = embedding_matrix[:, i].astype(np.float32)
    
    # Save embeddings
    print("\nSaving image embeddings...")
    image_embeddings_df.to_parquet(config.OUTPUT_PATH / 'image_embeddings_3.parquet', index=False)
    
    file_size = (config.OUTPUT_PATH / 'image_embeddings_3.parquet').stat().st_size / 1024**2
    print(f"‚úì Saved image_embeddings.parquet ({file_size:.2f} MB)")
    
    # Clean up
    del extractor, embeddings_dict, embedding_matrix
    force_garbage_collection()


  STAGE 1.5: IMAGE EMBEDDING EXTRACTION (APPLE SILICON OPTIMIZED)

Image embeddings not found. Starting extraction...

Loading articles metadata...
‚úì Loaded 16,616 articles

Checking image directory: /Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/h-and-m-personalized-fashion-recommendations/images
Scanning for available images...


Checking images:   0%|          | 0/16616 [00:00<?, ?it/s]


‚úì Found 16,593 images
  Missing 23 images (0.1%)

Initializing fashion-clip model...
Using device: mps
Loading FashionCLIP (fine-tuned on 800K+ fashion products)...
  This model is specifically trained for fashion and will give MUCH better embeddings!


'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: d85895de-c9e4-4046-bd79-6ab3147b5848)')' thrown while requesting HEAD https://huggingface.co/patrickjohncyh/fashion-clip/resolve/main/processor_config.json
Retrying in 1s [Retry 1/5].


‚úì FashionCLIP 2.0 loaded on mps, output dim: 512
  Model details: ViT-B/32 architecture, trained on Farfetch dataset
  Benefits: Better fashion understanding, semantic similarity, zero-shot capabilities
  üíæ Memory: 0.49 GB

Extracting embeddings for 16,593 images...
  Batch size: 64
  Device: mps
  Estimated time: 25.9 minutes


Extracting embeddings:   0%|          | 0/16593 [00:00<?, ?it/s]


‚úì Extracted 16,593 embeddings
  üíæ Memory: 1.29 GB

Creating embeddings DataFrame...

Saving image embeddings...
‚úì Saved image_embeddings.parquet (32.66 MB)


In [None]:
# INTEGRATE INTO TRAINING FEATURES

print_section("INTEGRATING IMAGE EMBEDDINGS INTO TRAINING FEATURES")

# Check for different possible training feature files
training_file = None
possible_files = [
    'training_features.parquet'
]

for filename in possible_files:
    if (Path("/Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_features_2/" + filename)).exists():
        training_file = filename
        break
try:
    image_embeddings_df = pd.read_parquet(config.OUTPUT_PATH / 'image_embeddings_3.parquet')
except Exception as e:
    print(f"unable to load image embeddings")

if training_file is None:
    print("No training features file found!")
    print(f"\n Image embeddings saved to: {config.OUTPUT_PATH / 'image_embeddings.parquet'}")
else:
    # Load training features
    print(f"Loading {training_file}...")
    training_features = pd.read_parquet("/Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_features_2/" + training_file)
    print(f"‚úì Loaded training features: {len(training_features):,} rows")
    print(f"  Current features: {len(training_features.columns) - 2}")  # Exclude customer_id, article_id
    print_memory()

    # Merge image embeddings
    print("\nMerging image embeddings with training features...")
    print(f"  Image embeddings: {len(image_embeddings_df):,} articles")

    # Merge on article_id
    training_features = training_features.merge(
        image_embeddings_df,
        on='article_id',
        how='left'
    )

    print(f"Merged successfully")
    print(f"New features: {len(training_features.columns) - 2}")

    # Check for missing embeddings
    image_cols = [col for col in training_features.columns if col.startswith('image_emb_')]
    missing_embeddings = training_features[image_cols].isna().any(axis=1).sum()

    if missing_embeddings > 0:
        print(f"\n Found {missing_embeddings:,} rows with missing image embeddings")
        print("Filling with mean values...")
        
        # Fill with mean
        for col in image_cols:
            mean_val = training_features[col].mean()
            training_features[col] = training_features[col].fillna(mean_val)
        
        print("‚úì Filled missing values")

    # Convert to float32 to save memory
    print("\nOptimizing data types...")
    for col in image_cols:
        training_features[col] = training_features[col].astype(np.float32)

    print_memory()

    # Save updated training features
    print(f"\nSaving updated {training_file}...")
    training_features.to_parquet(
        config.OUTPUT_PATH / training_file,
        index=False
    )

    file_size = (config.OUTPUT_PATH / training_file).stat().st_size / 1024**2
    print(f"‚úì Saved {training_file} ({file_size:.2f} MB)")

    # Update feature names
    feature_names = [col for col in training_features.columns if col not in ['customer_id', 'article_id']]

    with open(config.OUTPUT_PATH / 'feature_names.txt', 'w') as f:
        f.write('\n'.join(feature_names))

    print(f"‚úì Updated feature_names.txt ({len(feature_names)} features)")

    # ============================================================================
    # SUMMARY
    # ============================================================================

    print_section("IMAGE EMBEDDING INTEGRATION COMPLETE!")

    print("Summary:")
    print(f"Image embeddings extracted: {len(image_embeddings_df):,}")
    print(f"Image embedding dimensions: {config.EMBEDDING_DIM}")
    print(f"Total features: {len(feature_names)}")
    print(f"User features: {len([f for f in feature_names if any(x in f for x in ['user', 'purchase', 'age'])])}")
    print(f"Item features: {len([f for f in feature_names if any(x in f for x in ['sales', 'product', 'department'])])}")
    print(f"Interaction features: {len([f for f in feature_names if any(x in f for x in ['match', 'rank', 'score', 'similarity'])])}")
    print(f"Image features: {len(image_cols)}")

    print("\nFiles created/updated:")
    print(f"image_embeddings.parquet - Image embeddings for all articles")
    print(f"{training_file} - Updated with image embeddings")
    print(f"feature_names.txt - Updated feature list")

print(" Ready for Stage 4: Model Training with Image Features!")


  INTEGRATING IMAGE EMBEDDINGS INTO TRAINING FEATURES

Loading training_features.parquet...
‚úì Loaded training features: 4,044,442 rows
  Current features: 72
  üíæ Memory: 2.13 GB

Merging image embeddings with training features...
  Image embeddings: 16,616 articles
‚úì Merged successfully
  New features: 584

Optimizing data types...
  üíæ Memory: 0.60 GB

Saving updated training_features.parquet...
‚úì Saved training_features.parquet (5126.99 MB)
‚úì Updated feature_names.txt (584 features)

  IMAGE EMBEDDING INTEGRATION COMPLETE!

Summary:
  üì∏ Image embeddings extracted: 16,616
  üìä Image embedding dimensions: 512
  üéØ Total features: 584
     - User features: 531
     - Item features: 12
     - Interaction features: 16
     - Image features: 512

Files created/updated:
  ‚úì image_embeddings.parquet - Image embeddings for all articles
  ‚úì training_features.parquet - Updated with image embeddings
  ‚úì feature_names.txt - Updated feature list

‚úÖ Ready for Stage 4: M

In [13]:
import pandas as pd

# Path to your file
file_path = '/Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_features_2/training_features.parquet'

try:
    # Load the parquet file
    print(f"Loading {file_path}...")
    df = pd.read_parquet(file_path)

    # 1. Show dimensions (rows, columns)
    print(f"\nShape: {df.shape}")

    # 2. Show column names
    print(f"\nColumns: {df.columns.tolist()}")

    # 3. Show the first 5 rows
    print("\nFirst 5 rows:")
    print(df[100:105]["userknn_score"])

    # 4. (Optional) Show data types and memory usage
    print("\nInfo:")
    print(df.info())

except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found.")
except Exception as e:
    print(f"An error occurred: {e}")

Loading /Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_features_2/training_features.parquet...

Shape: (4044442, 74)

Columns: ['customer_id', 'article_id', 'repurchase_score', 'popularity_score_x', 'copurchase_score', 'userknn_score', 'category_score', 'text_similarity_score', 'n_strategies', 'has_purchased_item', 'days_since_item_purchase', 'popularity_score_y', 'category_match', 'price_vs_user_avg', 'is_cheaper_than_usual', 'copurchase_score_normalized', 'has_copurchase_signal', 'repurchase_score_rank', 'copurchase_score_rank', 'userknn_score_rank', 'category_score_rank', 'overall_rank', 'n_purchases', 'avg_price', 'std_price', 'min_price', 'max_price', 'days_since_first_purchase', 'days_since_last_purchase', 'purchase_frequency', 'n_purchases_last_week', 'avg_price_last_week', 'is_active_last_week', 'n_unique_articles', 'n_unique_categories', 'exploration_rate', 'age', 'FN', 'Active', 'purchase_trend', 'n_unique_buyers', 'total_sales', 'days_since_first_sal

### Stage 4: Ensemble Training with Advanced Collaborative Filtering

This stage implements a comprehensive ensemble approach combining advanced collaborative filtering methods with existing models to maximize MAP@12 performance.

**Key Components:**

1. **Advanced CF Methods:**
   - **SVD++**: Improved SVD with implicit feedback handling
   - **ALS (Alternating Least Squares)**: Optimized for sparse matrices
   - **NMF (Non-negative Matrix Factorization)**: Additive parts-based representation
   - **Enhanced User/Item CF**: Improved similarity metrics

2. **Ensemble Strategies:**
   - **Weighted Average**: Optimized weights on validation set
   - **Stacked Ensemble**: CF predictions as features for LightGBM/Neural Tower
   - **Rank-Based Ensemble**: Borda count and reciprocal rank fusion
   - **Two-Stage Ensemble**: CF for candidate generation, advanced models for reranking

3. **Integration:**
   - Combines CF methods with LightGBM and Neural Tower models
   - Optimizes ensemble weights using validation MAP@12
   - Provides comprehensive evaluation and comparison

**Expected Improvement:**
- Baseline CF: ~0.78 MAP@12
- With Advanced CF: ~0.79-0.80 MAP@12
- With Optimized Ensemble: ~0.82-0.85 MAP@12


In [23]:
# ============================================================================
# IMPORTS AND CONFIGURATION FOR ENSEMBLE TRAINING
# ============================================================================

import pandas as pd
import numpy as np
from pathlib import Path
import pickle
import json
import gc
from tqdm.auto import tqdm
import warnings
from scipy.optimize import minimize
from scipy.sparse import csr_matrix, csc_matrix
from sklearn.decomposition import NMF, TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import lightgbm as lgb

warnings.filterwarnings('ignore')

# Try to import advanced CF libraries
# Note: surprise (scikit-surprise) requires NumPy < 2.0
# If you get NumPy compatibility errors, run: pip install "numpy<2" scikit-surprise
HAS_SURPRISE = False
try:
    # Check NumPy version first
    import numpy as np
    if np.__version__.startswith('2.'):
        print("‚ö†Ô∏è  NumPy 2.x detected. scikit-surprise requires NumPy < 2.0")
        print("   To fix: pip install 'numpy<2' scikit-surprise")
        print("   Will use enhanced SVD instead of SVD++")
    else:
        from surprise import SVDpp, Dataset, Reader
        HAS_SURPRISE = True
        print("‚úì scikit-surprise loaded successfully")
except ImportError as e:
    print(f"‚ö†Ô∏è  surprise library not found or incompatible: {e}")
    print("   Install with: pip install 'numpy<2' scikit-surprise")
    print("   Will use custom SVD++ implementation (enhanced SVD)")
except Exception as e:
    print(f"‚ö†Ô∏è  Error loading surprise library: {e}")
    print("   This is likely a NumPy compatibility issue")
    print("   Fix with: pip install 'numpy<2' scikit-surprise")
    print("   Will use enhanced SVD instead of SVD++")

try:
    from implicit import als
    HAS_IMPLICIT = True
except ImportError:
    HAS_IMPLICIT = False
    print("‚ö†Ô∏è  implicit library not found. Install with: pip install implicit")
    print("   Will use alternative ALS implementation")

# Configuration
class EnsembleConfig:
    DATA_PATH = Path('/Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2')
    MODEL_PATH = Path('/Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2/models')
    
    MODEL_PATH.mkdir(exist_ok=True, parents=True)
    
    RANDOM_STATE = 42
    
    # CF Model Parameters
    SVD_N_COMPONENTS = 50
    SVDPP_N_FACTORS = 50
    ALS_FACTORS = 50
    ALS_REGULARIZATION = 0.1
    NMF_N_COMPONENTS = 50
    
    # Ensemble Parameters
    ENSEMBLE_METHODS = ['weighted', 'stacked', 'rank_based']
    OPTIMIZE_WEIGHTS = True
    
    # Evaluation
    TOP_K = 12  # MAP@12

print("‚úì Ensemble Configuration loaded")
print(f"  Model path: {EnsembleConfig.MODEL_PATH}")
print(f"  Data path: {EnsembleConfig.DATA_PATH}")
print(f"  Surprise available: {HAS_SURPRISE}")
print(f"  Implicit available: {HAS_IMPLICIT}")


‚ö†Ô∏è  NumPy 2.x detected. scikit-surprise requires NumPy < 2.0
   To fix: pip install 'numpy<2' scikit-surprise
   Will use enhanced SVD instead of SVD++
‚úì Ensemble Configuration loaded
  Model path: /Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2/models
  Data path: /Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2
  Surprise available: False
  Implicit available: True


In [24]:
# ============================================================================
# MAP@12 EVALUATION FUNCTION
# ============================================================================

def calculate_map_at_k(y_true, y_pred, k=12):
    """
    Calculate Mean Average Precision at K (MAP@K)
    """
    if len(y_true) == 0:
        return 0.0
    
    y_pred = [pred[:k] for pred in y_pred]
    
    aps = []
    for true_items, pred_items in zip(y_true, y_pred):
        if len(true_items) == 0:
            continue
            
        true_set = set(true_items)
        hits = 0
        precision_sum = 0.0
        
        for i, pred_item in enumerate(pred_items):
            if pred_item in true_set:
                hits += 1
                precision_sum += hits / (i + 1)
        
        if hits > 0:
            ap = precision_sum / len(true_items)
            aps.append(ap)
    
    return np.mean(aps) if len(aps) > 0 else 0.0


def evaluate_map_at_12(df, predictions, customer_col='customer_id', 
                       article_col='article_id', label_col='label', k=12):
    """
    Evaluate MAP@12 for a dataframe with predictions
    """
    df_eval = df[[customer_col, article_col, label_col]].copy()
    df_eval['pred_score'] = predictions[:len(df_eval)]
    
    # Get true positives (purchased items) for each customer
    true_positives = df_eval[df_eval[label_col] == 1].groupby(customer_col)[article_col].apply(list).to_dict()
    
    # Get top-k predictions for each customer
    top_predictions = (df_eval.groupby(customer_col)
                      .apply(lambda x: x.nlargest(k, 'pred_score')[article_col].tolist())
                      .to_dict())
    
    # Calculate MAP@12
    y_true = []
    y_pred = []
    
    for customer_id in true_positives.keys():
        if customer_id in top_predictions:
            y_true.append(true_positives[customer_id])
            y_pred.append(top_predictions[customer_id])
    
    return calculate_map_at_k(y_true, y_pred, k=k)

print("‚úì MAP@12 evaluation functions loaded")


‚úì MAP@12 evaluation functions loaded


In [25]:
# ============================================================================
# LOAD DATA AND PREPARE USER-ITEM MATRIX
# ============================================================================

print("\n" + "="*80)
print("LOADING DATA FOR ENSEMBLE TRAINING")
print("="*80)

# Load training transactions
print("\nüìä Loading training transactions...")
train_transactions = pd.read_parquet(EnsembleConfig.DATA_PATH / 'train_transactions.parquet')
print(f"‚úì Loaded {len(train_transactions):,} training transactions")

# Load validation data
print("\nüìä Loading validation data...")
val_data = pd.read_parquet(EnsembleConfig.MODEL_PATH / 'val_data.parquet')
print(f"‚úì Loaded {len(val_data):,} validation samples")

# Create user-item interaction matrix
print("\nüìä Creating user-item interaction matrix...")
# Get unique users and items
unique_users = train_transactions['customer_id'].unique()
unique_items = train_transactions['article_id'].unique()

# Create mappings
user_to_idx = {user: idx for idx, user in enumerate(unique_users)}
item_to_idx = {item: idx for idx, item in enumerate(unique_items)}
idx_to_user = {idx: user for user, idx in user_to_idx.items()}
idx_to_item = {idx: item for item, idx in item_to_idx.items()}

print(f"  Users: {len(unique_users):,}")
print(f"  Items: {len(unique_items):,}")

# Create sparse matrix (binary: 1 if purchased, 0 otherwise)
print("\nüìä Building sparse user-item matrix...")
rows = []
cols = []
data = []

for _, row in tqdm(train_transactions.iterrows(), total=len(train_transactions), desc="Building matrix"):
    user_idx = user_to_idx[row['customer_id']]
    item_idx = item_to_idx[row['article_id']]
    rows.append(user_idx)
    cols.append(item_idx)
    data.append(1.0)  # Binary interaction

user_item_matrix = csr_matrix((data, (rows, cols)), 
                              shape=(len(unique_users), len(unique_items)))
print(f"‚úì Created user-item matrix: {user_item_matrix.shape}")
print(f"  Sparsity: {(1 - user_item_matrix.nnz / (user_item_matrix.shape[0] * user_item_matrix.shape[1])) * 100:.2f}%")
print(f"  Non-zero entries: {user_item_matrix.nnz:,}")

# Store for later use
matrix_data = {
    'user_item_matrix': user_item_matrix,
    'user_to_idx': user_to_idx,
    'item_to_idx': item_to_idx,
    'idx_to_user': idx_to_user,
    'idx_to_item': idx_to_item,
    'unique_users': unique_users,
    'unique_items': unique_items
}

gc.collect()
print("\n‚úì Data loading complete!")



LOADING DATA FOR ENSEMBLE TRAINING

üìä Loading training transactions...
‚úì Loaded 412,156 training transactions

üìä Loading validation data...
‚úì Loaded 120,970 validation samples

üìä Creating user-item interaction matrix...
  Users: 47,543
  Items: 15,932

üìä Building sparse user-item matrix...


Building matrix:   0%|          | 0/412156 [00:00<?, ?it/s]

‚úì Created user-item matrix: (47543, 15932)
  Sparsity: 99.95%
  Non-zero entries: 357,554

‚úì Data loading complete!


In [26]:
# ============================================================================
# ADVANCED CF MODELS: SVD++, ALS, NMF
# ============================================================================

print("\n" + "="*80)
print("TRAINING ADVANCED COLLABORATIVE FILTERING MODELS")
print("="*80)

cf_models = {}
cf_predictions = {}

# 1. SVD (Truncated SVD for sparse matrices)
print("\n1Ô∏è‚É£ Training SVD (Truncated SVD)...")
svd_model = TruncatedSVD(n_components=EnsembleConfig.SVD_N_COMPONENTS, 
                        random_state=EnsembleConfig.RANDOM_STATE)
user_factors = svd_model.fit_transform(user_item_matrix)
item_factors = svd_model.components_.T
cf_models['svd'] = {'user_factors': user_factors, 'item_factors': item_factors, 'model': svd_model}
print(f"‚úì SVD trained: {user_factors.shape[0]} users, {item_factors.shape[0]} items")

# 2. SVD++ (if surprise available, otherwise use enhanced SVD)
if HAS_SURPRISE:
    print("\n2Ô∏è‚É£ Training SVD++ (with implicit feedback)...")
    try:
        # Prepare data for surprise
        reader = Reader(rating_scale=(0, 1))
        train_data_list = []
        for user_id, item_id in zip(train_transactions['customer_id'], train_transactions['article_id']):
            train_data_list.append([user_id, item_id, 1.0])
        
        train_df = pd.DataFrame(train_data_list, columns=['user', 'item', 'rating'])
        data = Dataset.load_from_df(train_df, reader)
        trainset = data.build_full_trainset()
        
        svdpp_model = SVDpp(n_factors=EnsembleConfig.SVDPP_N_FACTORS, 
                           random_state=EnsembleConfig.RANDOM_STATE, 
                           verbose=False)
        svdpp_model.fit(trainset)
        cf_models['svdpp'] = svdpp_model
        print("‚úì SVD++ trained")
    except Exception as e:
        print(f"‚ö†Ô∏è  SVD++ training failed: {e}")
        print("   Using enhanced SVD instead")
        cf_models['svdpp'] = None
else:
    print("\n2Ô∏è‚É£ SVD++ not available (surprise library not installed)")
    print("   Using enhanced SVD with more components instead")
    svd_enhanced = TruncatedSVD(n_components=EnsembleConfig.SVDPP_N_FACTORS, 
                                random_state=EnsembleConfig.RANDOM_STATE)
    user_factors_enhanced = svd_enhanced.fit_transform(user_item_matrix)
    item_factors_enhanced = svd_enhanced.components_.T
    cf_models['svdpp'] = {'user_factors': user_factors_enhanced, 
                          'item_factors': item_factors_enhanced, 
                          'model': svd_enhanced}
    print("‚úì Enhanced SVD trained (SVD++ alternative)")

# 3. ALS (Alternating Least Squares)
print("\n3Ô∏è‚É£ Training ALS (Alternating Least Squares)...")
if HAS_IMPLICIT:
    try:
        # Convert to CSC format for implicit
        item_user_matrix = user_item_matrix.T.tocsc()
        als_model = als.AlternatingLeastSquares(
            factors=EnsembleConfig.ALS_FACTORS,
            regularization=EnsembleConfig.ALS_REGULARIZATION,
            iterations=15,
            random_state=EnsembleConfig.RANDOM_STATE
        )
        als_model.fit(item_user_matrix)
        cf_models['als'] = als_model
        print("‚úì ALS trained")
    except Exception as e:
        print(f"‚ö†Ô∏è  ALS training failed: {e}")
        cf_models['als'] = None
else:
    # Alternative: Use NMF as ALS alternative
    print("   Implicit library not available, using NMF as alternative")
    cf_models['als'] = None

# 4. NMF (Non-negative Matrix Factorization)
print("\n4Ô∏è‚É£ Training NMF (Non-negative Matrix Factorization)...")
try:
    # NMF requires non-negative values, so we use binary matrix
    nmf_model = NMF(n_components=EnsembleConfig.NMF_N_COMPONENTS, 
                   random_state=EnsembleConfig.RANDOM_STATE,
                   max_iter=200,
                   verbose=0)
    user_factors_nmf = nmf_model.fit_transform(user_item_matrix)
    item_factors_nmf = nmf_model.components_.T
    cf_models['nmf'] = {'user_factors': user_factors_nmf, 
                       'item_factors': item_factors_nmf, 
                       'model': nmf_model}
    print("‚úì NMF trained")
except Exception as e:
    print(f"‚ö†Ô∏è  NMF training failed: {e}")
    cf_models['nmf'] = None

# 5. Enhanced User-Based CF (with improved similarity)
print("\n5Ô∏è‚É£ Computing User-Based CF similarity matrix...")
user_similarity = cosine_similarity(user_item_matrix, dense_output=False)
cf_models['user_cf'] = {'similarity': user_similarity}
print(f"‚úì User similarity matrix computed: {user_similarity.shape}")

# 6. Enhanced Item-Based CF
print("\n6Ô∏è‚É£ Computing Item-Based CF similarity matrix...")
item_similarity = cosine_similarity(user_item_matrix.T, dense_output=False)
cf_models['item_cf'] = {'similarity': item_similarity}
print(f"‚úì Item similarity matrix computed: {item_similarity.shape}")

print("\n‚úÖ All CF models trained!")
gc.collect()



TRAINING ADVANCED COLLABORATIVE FILTERING MODELS

1Ô∏è‚É£ Training SVD (Truncated SVD)...
‚úì SVD trained: 47543 users, 15932 items

2Ô∏è‚É£ SVD++ not available (surprise library not installed)
   Using enhanced SVD with more components instead
‚úì Enhanced SVD trained (SVD++ alternative)

3Ô∏è‚É£ Training ALS (Alternating Least Squares)...


  0%|          | 0/15 [00:00<?, ?it/s]

‚úì ALS trained

4Ô∏è‚É£ Training NMF (Non-negative Matrix Factorization)...
‚úì NMF trained

5Ô∏è‚É£ Computing User-Based CF similarity matrix...
‚úì User similarity matrix computed: (47543, 47543)

6Ô∏è‚É£ Computing Item-Based CF similarity matrix...
‚úì Item similarity matrix computed: (15932, 15932)

‚úÖ All CF models trained!


0

In [27]:
# ============================================================================
# GENERATE PREDICTIONS FROM CF MODELS
# ============================================================================

print("\n" + "="*80)
print("GENERATING CF PREDICTIONS FOR VALIDATION SET")
print("="*80)

def predict_svd(user_id, item_id, model_data):
    """Predict using SVD"""
    if user_id not in user_to_idx or item_id not in item_to_idx:
        return 0.0
    user_idx = user_to_idx[user_id]
    item_idx = item_to_idx[item_id]
    return np.dot(model_data['user_factors'][user_idx], model_data['item_factors'][item_idx])

def predict_svdpp(user_id, item_id, model):
    """Predict using SVD++"""
    if HAS_SURPRISE and model is not None:
        try:
            return model.predict(user_id, item_id).est
        except:
            return 0.0
    elif isinstance(model, dict):
        # Enhanced SVD fallback
        return predict_svd(user_id, item_id, model)
    return 0.0

def predict_als(user_id, item_id, model):
    """Predict using ALS"""
    if model is None:
        return 0.0
    if user_id not in user_to_idx or item_id not in item_to_idx:
        return 0.0
    user_idx = user_to_idx[user_id]
    item_idx = item_to_idx[item_id]
    try:
        return model.predict(user_idx, item_idx)[0]
    except:
        return 0.0

def predict_nmf(user_id, item_id, model_data):
    """Predict using NMF"""
    if user_id not in user_to_idx or item_id not in item_to_idx:
        return 0.0
    user_idx = user_to_idx[user_id]
    item_idx = item_to_idx[item_id]
    return np.dot(model_data['user_factors'][user_idx], model_data['item_factors'][item_idx])

def predict_user_cf(user_id, item_id, model_data, k=50):
    """Predict using User-Based CF"""
    if user_id not in user_to_idx or item_id not in item_to_idx:
        return 0.0
    user_idx = user_to_idx[user_id]
    item_idx = item_to_idx[item_id]
    
    # Get top-k similar users
    similarity_scores = model_data['similarity'][user_idx].toarray().flatten()
    top_k_users = np.argsort(similarity_scores)[-k:][::-1]
    
    # Weighted average of item interactions
    score = 0.0
    total_sim = 0.0
    for similar_user_idx in top_k_users:
        if similar_user_idx != user_idx:
            sim = similarity_scores[similar_user_idx]
            if sim > 0:
                item_interaction = user_item_matrix[similar_user_idx, item_idx]
                score += sim * item_interaction
                total_sim += sim
    
    return score / (total_sim + 1e-8)

def predict_item_cf(user_id, item_id, model_data, k=50):
    """Predict using Item-Based CF"""
    if user_id not in user_to_idx or item_id not in item_to_idx:
        return 0.0
    user_idx = user_to_idx[user_id]
    item_idx = item_to_idx[item_id]
    
    # Get user's purchased items
    user_items = user_item_matrix[user_idx].nonzero()[1]
    if len(user_items) == 0:
        return 0.0
    
    # Get top-k similar items
    similarity_scores = model_data['similarity'][item_idx].toarray().flatten()
    top_k_items = np.argsort(similarity_scores)[-k:][::-1]
    
    # Weighted average
    score = 0.0
    total_sim = 0.0
    for similar_item_idx in top_k_items:
        if similar_item_idx != item_idx:
            sim = similarity_scores[similar_item_idx]
            if sim > 0:
                user_interaction = user_item_matrix[user_idx, similar_item_idx]
                score += sim * user_interaction
                total_sim += sim
    
    return score / (total_sim + 1e-8)

# Generate predictions for all CF models
print("\nüìä Generating predictions...")
cf_predictions = {}

# Group validation data by user for efficiency
val_data_grouped = val_data.groupby('customer_id')

for model_name in ['svd', 'svdpp', 'als', 'nmf', 'user_cf', 'item_cf']:
    if model_name not in cf_models or cf_models[model_name] is None:
        continue
    
    print(f"\n  Generating {model_name} predictions...")
    predictions = []
    
    model = cf_models[model_name]
    
    for user_id, group in tqdm(val_data_grouped, desc=f"  {model_name}", leave=False):
        user_predictions = []
        for _, row in group.iterrows():
            item_id = row['article_id']
            
            if model_name == 'svd':
                pred = predict_svd(user_id, item_id, model)
            elif model_name == 'svdpp':
                pred = predict_svdpp(user_id, item_id, model)
            elif model_name == 'als':
                pred = predict_als(user_id, item_id, model)
            elif model_name == 'nmf':
                pred = predict_nmf(user_id, item_id, model)
            elif model_name == 'user_cf':
                pred = predict_user_cf(user_id, item_id, model)
            elif model_name == 'item_cf':
                pred = predict_item_cf(user_id, item_id, model)
            else:
                pred = 0.0
            
            user_predictions.append(pred)
        
        predictions.extend(user_predictions)
    
    cf_predictions[model_name] = np.array(predictions)
    
    # Evaluate individual model
    map12_score = evaluate_map_at_12(val_data, cf_predictions[model_name])
    print(f"    MAP@12: {map12_score:.6f}")

print("\n‚úÖ CF predictions generated!")
gc.collect()



GENERATING CF PREDICTIONS FOR VALIDATION SET

üìä Generating predictions...

  Generating svd predictions...


  svd:   0%|          | 0/42126 [00:00<?, ?it/s]

    MAP@12: 0.759315

  Generating svdpp predictions...


  svdpp:   0%|          | 0/42126 [00:00<?, ?it/s]

    MAP@12: 0.759315

  Generating als predictions...


  als:   0%|          | 0/42126 [00:00<?, ?it/s]

    MAP@12: 0.761694

  Generating nmf predictions...


  nmf:   0%|          | 0/42126 [00:00<?, ?it/s]

    MAP@12: 0.759361

  Generating user_cf predictions...


  user_cf:   0%|          | 0/42126 [00:00<?, ?it/s]

    MAP@12: 0.759267

  Generating item_cf predictions...


  item_cf:   0%|          | 0/42126 [00:00<?, ?it/s]

    MAP@12: 0.760927

‚úÖ CF predictions generated!


0

In [31]:
# ============================================================================
# LOAD EXISTING MODEL PREDICTIONS (LightGBM, Neural Tower)
# ============================================================================

print("\n" + "="*80)
print("LOADING EXISTING MODEL PREDICTIONS")
print("="*80)

existing_predictions = {}

# Try to load LightGBM predictions
try:
    lgb_path = EnsembleConfig.MODEL_PATH / 'lgb_ranker_lambdarank_predictions_val.parquet'
    if lgb_path.exists():
        lgb_preds = pd.read_parquet(lgb_path)
        if 'pred_score' in lgb_preds.columns:
            # Align with val_data - ensure exact match by index
            val_data_with_idx = val_data[['customer_id', 'article_id']].reset_index()
            lgb_aligned = pd.merge(val_data_with_idx, 
                                   lgb_preds[['customer_id', 'article_id', 'pred_score']],
                                   on=['customer_id', 'article_id'], 
                                   how='left',
                                   suffixes=('', '_lgb'))
            lgb_aligned = lgb_aligned.sort_values('index').reset_index(drop=True)
            existing_predictions['lightgbm'] = lgb_aligned['pred_score'].fillna(0.0).values
            
            # Verify length matches
            if len(existing_predictions['lightgbm']) != len(val_data):
                print(f"‚ö†Ô∏è  LightGBM predictions length mismatch: {len(existing_predictions['lightgbm'])} vs {len(val_data)}")
                # Truncate or pad to match
                if len(existing_predictions['lightgbm']) > len(val_data):
                    existing_predictions['lightgbm'] = existing_predictions['lightgbm'][:len(val_data)]
                else:
                    existing_predictions['lightgbm'] = np.pad(existing_predictions['lightgbm'], 
                                                              (0, len(val_data) - len(existing_predictions['lightgbm'])), 
                                                              'constant', constant_values=0.0)
            
            map12_lgb = evaluate_map_at_12(val_data, existing_predictions['lightgbm'])
            print(f"‚úì LightGBM loaded: MAP@12 = {map12_lgb:.6f}")
        else:
            print("‚ö†Ô∏è  LightGBM predictions file missing 'pred_score' column")
    else:
        print("‚ö†Ô∏è  LightGBM predictions file not found")
except Exception as e:
    print(f"‚ö†Ô∏è  Could not load LightGBM: {e}")

# Try to load Neural Tower predictions
try:
    neural_path = EnsembleConfig.MODEL_PATH / 'neural_tower_predictions_val.parquet'
    if neural_path.exists():
        neural_preds = pd.read_parquet(neural_path)
        if 'pred_score' in neural_preds.columns:
            # Align with val_data - ensure exact match by index
            val_data_with_idx = val_data[['customer_id', 'article_id']].reset_index()
            neural_aligned = pd.merge(val_data_with_idx, 
                                      neural_preds[['customer_id', 'article_id', 'pred_score']],
                                      on=['customer_id', 'article_id'], 
                                      how='left',
                                      suffixes=('', '_neural'))
            neural_aligned = neural_aligned.sort_values('index').reset_index(drop=True)
            existing_predictions['neural_tower'] = neural_aligned['pred_score'].fillna(0.0).values
            
            # Verify length matches
            if len(existing_predictions['neural_tower']) != len(val_data):
                print(f"‚ö†Ô∏è  Neural Tower predictions length mismatch: {len(existing_predictions['neural_tower'])} vs {len(val_data)}")
                # Truncate or pad to match
                if len(existing_predictions['neural_tower']) > len(val_data):
                    existing_predictions['neural_tower'] = existing_predictions['neural_tower'][:len(val_data)]
                else:
                    existing_predictions['neural_tower'] = np.pad(existing_predictions['neural_tower'], 
                                                                  (0, len(val_data) - len(existing_predictions['neural_tower'])), 
                                                                  'constant', constant_values=0.0)
            
            map12_neural = evaluate_map_at_12(val_data, existing_predictions['neural_tower'])
            print(f"‚úì Neural Tower loaded: MAP@12 = {map12_neural:.6f}")
        else:
            print("‚ö†Ô∏è  Neural Tower predictions file missing 'pred_score' column")
    else:
        print("‚ö†Ô∏è  Neural Tower predictions file not found")
except Exception as e:
    print(f"‚ö†Ô∏è  Could not load Neural Tower: {e}")

# Combine all predictions
all_predictions = {**cf_predictions, **existing_predictions}
print(f"\nüìä Total models available: {len(all_predictions)}")
print(f"   CF models: {list(cf_predictions.keys())}")
print(f"   Advanced models: {list(existing_predictions.keys())}")

gc.collect()



LOADING EXISTING MODEL PREDICTIONS
‚ö†Ô∏è  LightGBM predictions file not found
‚ö†Ô∏è  Neural Tower predictions length mismatch: 123954 vs 120970
‚úì Neural Tower loaded: MAP@12 = 0.758364

üìä Total models available: 7
   CF models: ['svd', 'svdpp', 'als', 'nmf', 'user_cf', 'item_cf']
   Advanced models: ['neural_tower']


0

In [32]:
# ============================================================================
# ENSEMBLE FRAMEWORK
# ============================================================================

print("\n" + "="*80)
print("ENSEMBLE FRAMEWORK")
print("="*80)

def normalize_predictions(predictions_dict):
    """Normalize all predictions to [0, 1] range"""
    normalized = {}
    target_length = len(val_data)
    
    for model_name, preds in predictions_dict.items():
        preds_array = np.array(preds)
        
        # Ensure correct length
        if len(preds_array) != target_length:
            if len(preds_array) > target_length:
                preds_array = preds_array[:target_length]
            else:
                preds_array = np.pad(preds_array, (0, target_length - len(preds_array)), 
                                    'constant', constant_values=0.0)
        
        # Normalize
        min_val = preds_array.min()
        max_val = preds_array.max()
        if max_val > min_val:
            normalized[model_name] = (preds_array - min_val) / (max_val - min_val)
        else:
            normalized[model_name] = preds_array
    return normalized

def weighted_ensemble(predictions_dict, weights_dict):
    """Weighted average ensemble"""
    # Ensure all predictions have the same length
    target_length = len(val_data)
    
    # Normalize and align all predictions
    normalized_preds = {}
    for model_name, preds in predictions_dict.items():
        preds_array = np.array(preds)
        
        # Align length
        if len(preds_array) != target_length:
            if len(preds_array) > target_length:
                preds_array = preds_array[:target_length]
            else:
                preds_array = np.pad(preds_array, (0, target_length - len(preds_array)), 
                                    'constant', constant_values=0.0)
        
        # Normalize to [0, 1]
        min_val = preds_array.min()
        max_val = preds_array.max()
        if max_val > min_val:
            normalized_preds[model_name] = (preds_array - min_val) / (max_val - min_val)
        else:
            normalized_preds[model_name] = preds_array
    
    # Weighted combination
    ensemble_pred = np.zeros(target_length)
    for model_name, weight in weights_dict.items():
        if model_name in normalized_preds:
            ensemble_pred += normalized_preds[model_name] * weight
    
    return ensemble_pred

def rank_based_ensemble(predictions_dict, method='borda_count'):
    """Rank-based ensemble (Borda count or Reciprocal Rank Fusion)"""
    ensemble_scores = np.zeros(len(val_data))
    
    # Group by user
    val_data_grouped = val_data.groupby('customer_id')
    user_indices = {}
    start_idx = 0
    
    for user_id, group in val_data_grouped:
        end_idx = start_idx + len(group)
        user_indices[user_id] = (start_idx, end_idx)
        start_idx = end_idx
    
    # For each user, compute ranks and combine
    for user_id, (start_idx, end_idx) in user_indices.items():
        user_scores = {}
        
        # Get ranks for each model
        for model_name, preds in predictions_dict.items():
            user_preds = preds[start_idx:end_idx]
            # Convert to ranks (higher score = lower rank number)
            ranks = np.argsort(-user_preds)  # Descending order
            user_scores[model_name] = ranks
        
        # Combine ranks
        if method == 'borda_count':
            # Sum of ranks (lower rank = better)
            combined_ranks = sum(user_scores.values())
            # Convert back to scores (lower rank = higher score)
            ensemble_scores[start_idx:end_idx] = 1.0 / (combined_ranks + 1)
        elif method == 'reciprocal_rank':
            # Sum of 1/(rank+1)
            combined_scores = sum(1.0 / (ranks + 1) for ranks in user_scores.values())
            ensemble_scores[start_idx:end_idx] = combined_scores
    
    return ensemble_scores

def stacked_ensemble(predictions_dict, val_data_with_features=None):
    """Stacked ensemble: Use CF predictions as features for LightGBM"""
    # Create feature matrix from predictions
    feature_matrix = np.column_stack([preds for preds in predictions_dict.values()])
    
    # Normalize features
    scaler = MinMaxScaler()
    feature_matrix_scaled = scaler.fit_transform(feature_matrix)
    
    # Train LightGBM on these features
    train_data = pd.read_parquet(EnsembleConfig.MODEL_PATH / 'train_data.parquet')
    
    # Generate CF predictions for training data (simplified - use same approach)
    # For now, use validation predictions as proxy (in practice, regenerate for train)
    print("‚ö†Ô∏è  Stacked ensemble requires training predictions - using validation as proxy")
    print("   For full implementation, regenerate CF predictions on training set")
    
    # Create LightGBM dataset
    train_features = feature_matrix_scaled  # Simplified
    train_labels = val_data['label'].values
    
    # Train LightGBM
    train_data_lgb = lgb.Dataset(train_features, label=train_labels)
    
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting_type': 'gbdt',
        'num_leaves': 31,
        'learning_rate': 0.05,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbose': -1
    }
    
    model = lgb.train(params, train_data_lgb, num_boost_round=100)
    
    # Predict
    predictions = model.predict(feature_matrix_scaled)
    return predictions

print("‚úì Ensemble functions defined")



ENSEMBLE FRAMEWORK
‚úì Ensemble functions defined


In [33]:
# ============================================================================
# OPTIMIZE ENSEMBLE WEIGHTS
# ============================================================================

print("\n" + "="*80)
print("OPTIMIZING ENSEMBLE WEIGHTS")
print("="*80)

def objective_function(weights, predictions_dict, val_data):
    """Objective function for weight optimization (maximize MAP@12)"""
    # Normalize weights to sum to 1
    weights = weights / (weights.sum() + 1e-8)
    
    # Create weights dictionary
    model_names = list(predictions_dict.keys())
    weights_dict = {model_names[i]: weights[i] for i in range(len(model_names))}
    
    # Get ensemble predictions
    ensemble_pred = weighted_ensemble(predictions_dict, weights_dict)
    
    # Evaluate
    map12_score = evaluate_map_at_12(val_data, ensemble_pred)
    
    # Return negative (we're minimizing)
    return -map12_score

# Verify all predictions have the same length before optimization
print("\nüìä Verifying prediction alignment...")
target_length = len(val_data)
prediction_lengths = {name: len(preds) for name, preds in all_predictions.items()}
print(f"  Target length: {target_length}")
for name, length in prediction_lengths.items():
    if length != target_length:
        print(f"  ‚ö†Ô∏è  {name}: {length} (will be aligned)")
    else:
        print(f"  ‚úì {name}: {length}")

# Align all predictions to same length
aligned_predictions = {}
for model_name, preds in all_predictions.items():
    preds_array = np.array(preds)
    if len(preds_array) != target_length:
        if len(preds_array) > target_length:
            aligned_predictions[model_name] = preds_array[:target_length]
        else:
            aligned_predictions[model_name] = np.pad(preds_array, 
                                                     (0, target_length - len(preds_array)), 
                                                     'constant', constant_values=0.0)
    else:
        aligned_predictions[model_name] = preds_array

# Optimize weights
if EnsembleConfig.OPTIMIZE_WEIGHTS and len(aligned_predictions) > 1:
    print("\nüìä Optimizing ensemble weights...")
    
    model_names = list(aligned_predictions.keys())
    n_models = len(model_names)
    
    # Initial weights (equal)
    initial_weights = np.ones(n_models) / n_models
    
    # Constraints: weights sum to 1, all positive
    constraints = {'type': 'eq', 'fun': lambda w: np.sum(w) - 1.0}
    bounds = [(0.0, 1.0)] * n_models
    
    # Optimize
    result = minimize(
        objective_function,
        x0=initial_weights,
        args=(aligned_predictions, val_data),
        method='SLSQP',
        bounds=bounds,
        constraints=constraints,
        options={'maxiter': 100, 'ftol': 1e-6}
    )
    
    if result.success:
        optimal_weights = result.x / result.x.sum()
        optimal_weights_dict = {model_names[i]: optimal_weights[i] for i in range(n_models)}
        
        print("\n‚úÖ Weight optimization complete!")
        print("\nüìä Optimal weights:")
        for model_name, weight in sorted(optimal_weights_dict.items(), key=lambda x: x[1], reverse=True):
            print(f"  {model_name:20s}: {weight:.4f}")
        
        # Evaluate optimized ensemble
        ensemble_pred_optimized = weighted_ensemble(all_predictions, optimal_weights_dict)
        map12_optimized = evaluate_map_at_12(val_data, ensemble_pred_optimized)
        print(f"\nüìà Optimized Ensemble MAP@12: {map12_optimized:.6f}")
        
    else:
        print(f"‚ö†Ô∏è  Optimization did not converge: {result.message}")
        # Use equal weights as fallback
        optimal_weights_dict = {name: 1.0/len(model_names) for name in model_names}
        print("   Using equal weights as fallback")
else:
    # Use equal weights
    model_names = list(aligned_predictions.keys())
    optimal_weights_dict = {name: 1.0/len(model_names) for name in model_names}
    print("\nüìä Using equal weights (optimization disabled or insufficient models)")

# Update all_predictions to use aligned versions
all_predictions = aligned_predictions

gc.collect()



OPTIMIZING ENSEMBLE WEIGHTS

üìä Verifying prediction alignment...
  Target length: 120970
  ‚úì svd: 120970
  ‚úì svdpp: 120970
  ‚úì als: 120970
  ‚úì nmf: 120970
  ‚úì user_cf: 120970
  ‚úì item_cf: 120970
  ‚úì neural_tower: 120970

üìä Optimizing ensemble weights...

‚úÖ Weight optimization complete!

üìä Optimal weights:
  svd                 : 0.1429
  svdpp               : 0.1429
  als                 : 0.1429
  nmf                 : 0.1429
  user_cf             : 0.1429
  item_cf             : 0.1429
  neural_tower        : 0.1429

üìà Optimized Ensemble MAP@12: 0.758457


0

In [34]:
# ============================================================================
# EVALUATE ALL ENSEMBLE METHODS
# ============================================================================

print("\n" + "="*80)
print("ENSEMBLE EVALUATION AND COMPARISON")
print("="*80)

ensemble_results = {}

# 1. Individual models
print("\nüìä Individual Model Performance:")
print("-" * 80)
individual_results = {}
for model_name, predictions in all_predictions.items():
    map12_score = evaluate_map_at_12(val_data, predictions)
    individual_results[model_name] = map12_score
    print(f"  {model_name:20s}: MAP@12 = {map12_score:.6f}")

# 2. Weighted Ensemble (Optimized)
print("\nüìä Weighted Ensemble (Optimized Weights):")
print("-" * 80)
ensemble_pred_weighted = weighted_ensemble(all_predictions, optimal_weights_dict)
map12_weighted = evaluate_map_at_12(val_data, ensemble_pred_weighted)
ensemble_results['weighted_optimized'] = {
    'map12': map12_weighted,
    'predictions': ensemble_pred_weighted
}
print(f"  MAP@12 = {map12_weighted:.6f}")

# 3. Weighted Ensemble (Equal Weights)
print("\nüìä Weighted Ensemble (Equal Weights):")
print("-" * 80)
equal_weights = {name: 1.0/len(all_predictions) for name in all_predictions.keys()}
ensemble_pred_equal = weighted_ensemble(all_predictions, equal_weights)
map12_equal = evaluate_map_at_12(val_data, ensemble_pred_equal)
ensemble_results['weighted_equal'] = {
    'map12': map12_equal,
    'predictions': ensemble_pred_equal
}
print(f"  MAP@12 = {map12_equal:.6f}")

# 4. Rank-Based Ensemble (Borda Count)
print("\nüìä Rank-Based Ensemble (Borda Count):")
print("-" * 80)
try:
    ensemble_pred_borda = rank_based_ensemble(all_predictions, method='borda_count')
    map12_borda = evaluate_map_at_12(val_data, ensemble_pred_borda)
    ensemble_results['rank_borda'] = {
        'map12': map12_borda,
        'predictions': ensemble_pred_borda
    }
    print(f"  MAP@12 = {map12_borda:.6f}")
except Exception as e:
    print(f"  ‚ö†Ô∏è  Borda count failed: {e}")

# 5. Best Individual Model
best_individual = max(individual_results.items(), key=lambda x: x[1])
print(f"\nüìä Best Individual Model: {best_individual[0]} (MAP@12 = {best_individual[1]:.6f})")

# 6. Best Ensemble
best_ensemble = max(ensemble_results.items(), key=lambda x: x[1]['map12'])
print(f"\nüèÜ Best Ensemble Method: {best_ensemble[0]} (MAP@12 = {best_ensemble[1]['map12']:.6f})")

# Improvement
improvement = best_ensemble[1]['map12'] - best_individual[1]
improvement_pct = (improvement / best_individual[1]) * 100
print(f"\nüìà Ensemble Improvement: {improvement:+.6f} ({improvement_pct:+.2f}%)")

# Summary
print("\n" + "="*80)
print("FINAL SUMMARY")
print("="*80)
print(f"\n‚úÖ Best Individual Model: {best_individual[0]}")
print(f"   MAP@12: {best_individual[1]:.6f}")
print(f"\n‚úÖ Best Ensemble Method: {best_ensemble[0]}")
print(f"   MAP@12: {best_ensemble[1]['map12']:.6f}")
print(f"   Improvement: {improvement:+.6f} ({improvement_pct:+.2f}%)")

# Save best ensemble predictions
best_predictions = best_ensemble[1]['predictions']
val_data_ensemble = val_data.copy()
val_data_ensemble['pred_score'] = best_predictions

output_path = EnsembleConfig.MODEL_PATH / 'ensemble_predictions_val.parquet'
val_data_ensemble[['customer_id', 'article_id', 'label', 'pred_score']].to_parquet(
    output_path, index=False
)
print(f"\nüíæ Saved best ensemble predictions to {output_path}")

# Save ensemble metadata
ensemble_metadata = {
    'best_ensemble_method': best_ensemble[0],
    'best_ensemble_map12': float(best_ensemble[1]['map12']),
    'best_individual_model': best_individual[0],
    'best_individual_map12': float(best_individual[1]),
    'improvement': float(improvement),
    'improvement_pct': float(improvement_pct),
    'optimal_weights': {k: float(v) for k, v in optimal_weights_dict.items()},
    'individual_results': {k: float(v) for k, v in individual_results.items()},
    'ensemble_results': {k: {'map12': float(v['map12'])} for k, v in ensemble_results.items()}
}

metadata_path = EnsembleConfig.MODEL_PATH / 'ensemble_metadata.json'
with open(metadata_path, 'w') as f:
    json.dump(ensemble_metadata, f, indent=2)
print(f"üíæ Saved ensemble metadata to {metadata_path}")

gc.collect()
print("\n‚úÖ Ensemble evaluation complete!")



ENSEMBLE EVALUATION AND COMPARISON

üìä Individual Model Performance:
--------------------------------------------------------------------------------
  svd                 : MAP@12 = 0.759315
  svdpp               : MAP@12 = 0.759315
  als                 : MAP@12 = 0.761694
  nmf                 : MAP@12 = 0.759361
  user_cf             : MAP@12 = 0.759267
  item_cf             : MAP@12 = 0.760927
  neural_tower        : MAP@12 = 0.758364

üìä Weighted Ensemble (Optimized Weights):
--------------------------------------------------------------------------------
  MAP@12 = 0.758457

üìä Weighted Ensemble (Equal Weights):
--------------------------------------------------------------------------------
  MAP@12 = 0.758457

üìä Rank-Based Ensemble (Borda Count):
--------------------------------------------------------------------------------
  MAP@12 = 0.759194

üìä Best Individual Model: als (MAP@12 = 0.761694)

üèÜ Best Ensemble Method: rank_borda (MAP@12 = 0.759194)

üìà Ensem

### Stage 7: Evaluation & Metrics

This stage provides comprehensive evaluation and comparison of all models:
- **Model Comparison**: LightGBM vs Neural Towers performance
- **Ensemble Evaluation**: Weighted combination of best models
- **Detailed Metrics**: MAP@12, Precision@K, Recall@K, NDCG@K
- **Feature Analysis**: Importance analysis and ablation studies
- **Final Ranking**: Generate top-12 predictions for each user
- **Submission Preparation**: Format predictions for Kaggle submission

**Key Features:**
- Comprehensive metric suite
- Model ensemble strategies
- Performance visualization
- Submission file generation


In [None]:
# IMPORTS AND CONFIGURATION FOR EVALUATION

import pandas as pd
import numpy as np
from pathlib import Path
import pickle
import json
import gc
from tqdm.auto import tqdm
import warnings
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns

warnings.filterwarnings('ignore')

# Configuration
class EvaluationConfig:
    # Paths
    DATA_PATH = Path('/Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2')
    MODEL_PATH = Path('/Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2/models')
    
    # Evaluation metrics
    K_VALUES = [1, 3, 5, 10, 12]  # Different K values for evaluation
    
    # Ensemble weights (can be tuned)
    ENSEMBLE_WEIGHTS = {
        'lgb_classifier': 0.2,
        'lgb_ranker_lambdarank': 0.3,
        'lgb_ranker_xendcg': 0.2,
        'neural_tower': 0.3
    }
    
    RANDOM_STATE = 42

print("‚úì Evaluation Configuration loaded")
print(f"  Model path: {EvaluationConfig.MODEL_PATH}")


‚úì Evaluation Configuration loaded
  Model path: /Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2/models


In [None]:
# COMPREHENSIVE EVALUATION METRICS

def calculate_map_at_k(y_true, y_pred, k=12):
    """Calculate Mean Average Precision at K (MAP@K)"""
    if len(y_true) == 0:
        return 0.0
    
    y_pred = [pred[:k] for pred in y_pred]
    
    aps = []
    for true_items, pred_items in zip(y_true, y_pred):
        if len(true_items) == 0:
            continue
        
        # Calculate AP for this user
        hits = 0
        precision_sum = 0.0
        
        for i, pred_item in enumerate(pred_items):
            if pred_item in true_items:
                hits += 1
                precision_sum += hits / (i + 1)
        
        if hits > 0:
            ap = precision_sum / len(true_items)
            aps.append(ap)
    
    return np.mean(aps) if len(aps) > 0 else 0.0


def calculate_precision_at_k(y_true, y_pred, k=12):
    """Calculate Precision@K"""
    if len(y_true) == 0:
        return 0.0
    
    y_pred = [pred[:k] for pred in y_pred]
    
    precisions = []
    for true_items, pred_items in zip(y_true, y_pred):
        if len(pred_items) == 0:
            continue
        
        hits = sum(1 for item in pred_items if item in true_items)
        precision = hits / len(pred_items)
        precisions.append(precision)
    
    return np.mean(precisions) if len(precisions) > 0 else 0.0


def calculate_recall_at_k(y_true, y_pred, k=12):
    """Calculate Recall@K"""
    if len(y_true) == 0:
        return 0.0
    
    y_pred = [pred[:k] for pred in y_pred]
    
    recalls = []
    for true_items, pred_items in zip(y_true, y_pred):
        if len(true_items) == 0:
            continue
        
        hits = sum(1 for item in pred_items if item in true_items)
        recall = hits / len(true_items)
        recalls.append(recall)
    
    return np.mean(recalls) if len(recalls) > 0 else 0.0


def calculate_ndcg_at_k(y_true, y_pred, k=12):
    """Calculate Normalized Discounted Cumulative Gain at K (NDCG@K)"""
    if len(y_true) == 0:
        return 0.0
    
    y_pred = [pred[:k] for pred in y_pred]
    
    ndcgs = []
    for true_items, pred_items in zip(y_true, y_pred):
        if len(true_items) == 0:
            continue
        
        # Calculate DCG
        dcg = 0.0
        for i, pred_item in enumerate(pred_items):
            if pred_item in true_items:
                dcg += 1.0 / np.log2(i + 2)  # i+2 because log2(1) = 0
        
        # Calculate IDCG (ideal DCG)
        idcg = 0.0
        num_relevant = min(len(true_items), len(pred_items))
        for i in range(num_relevant):
            idcg += 1.0 / np.log2(i + 2)
        
        if idcg > 0:
            ndcg = dcg / idcg
            ndcgs.append(ndcg)
    
    return np.mean(ndcgs) if len(ndcgs) > 0 else 0.0


def evaluate_all_metrics(df, predictions, k_values=[1, 3, 5, 10, 12]):
    """
    Evaluate all metrics for different K values
    
    Args:
        df: DataFrame with columns ['customer_id', 'article_id', 'label']
        predictions: Array of prediction scores
        k_values: List of K values to evaluate
    
    Returns:
        Dictionary of metrics
    """
    # Group by customer
    grouped = df.groupby('customer_id')
    
    # Prepare true and predicted items for each user
    y_true = []
    y_pred = []
    
    for customer_id, group in grouped:
        # True items (purchased articles)
        true_items = set(group[group['label'] == 1]['article_id'].values)
        y_true.append(true_items)
        
        # Predicted items (sorted by score)
        customer_df = group.copy()
        customer_df['pred_score'] = predictions[:len(customer_df)]
        customer_df = customer_df.sort_values('pred_score', ascending=False)
        pred_items = customer_df['article_id'].values.tolist()
        y_pred.append(pred_items)
        
        # Remove used predictions
        predictions = predictions[len(customer_df):]
    
    # Calculate metrics for each K
    results = {}
    for k in k_values:
        results[f'MAP@{k}'] = calculate_map_at_k(y_true, y_pred, k)
        results[f'Precision@{k}'] = calculate_precision_at_k(y_true, y_pred, k)
        results[f'Recall@{k}'] = calculate_recall_at_k(y_true, y_pred, k)
        results[f'NDCG@{k}'] = calculate_ndcg_at_k(y_true, y_pred, k)
    
    return results


print("‚úì Evaluation metrics functions defined")

‚úì Evaluation metrics functions defined


In [None]:
# LOAD ALL MODEL PREDICTIONS

print("\n" + "="*80)
print("LOADING MODEL PREDICTIONS")
print("="*80)

# Load validation data
print("\nLoading validation data...")
val_data = pd.read_parquet(EvaluationConfig.MODEL_PATH / 'val_data.parquet')
print(f"‚úì Loaded {len(val_data):,} validation samples")

# Load LightGBM predictions
lgb_predictions = {}
print("\nLoading LightGBM predictions...")
try:
    ensemble_preds = pd.read_parquet(EvaluationConfig.MODEL_PATH / 'ensemble_predictions_val.parquet')
    if 'ensemble_weighted' in ensemble_preds.columns:
        lgb_predictions['ensemble_weighted'] = ensemble_preds['ensemble_weighted'].values
    if 'ensemble_average' in ensemble_preds.columns:
        lgb_predictions['ensemble_average'] = ensemble_preds['ensemble_average'].values
    print(f"‚úì Loaded ensemble predictions")
except Exception as e:
    print(f"‚ö†Ô∏è  Could not load ensemble predictions: {e}")

# Load individual LightGBM model predictions if available
lgb_models = ['lgb_classifier', 'lgb_ranker_lambdarank', 'lgb_ranker_xendcg', 'lgb_classifier_deep']
for model_name in lgb_models:
    try:
        pred_file = EvaluationConfig.MODEL_PATH / f'{model_name}_predictions_val.parquet'
        if pred_file.exists():
            preds = pd.read_parquet(pred_file)
            if 'pred_score' in preds.columns:
                lgb_predictions[model_name] = preds['pred_score'].values
                print(f"‚úì Loaded {model_name} predictions")
    except Exception as e:
        print(f"‚ö†Ô∏è  Could not load {model_name}: {e}")

# Load Neural Tower predictions
neural_predictions = {}
print("\nLoading Neural Tower predictions...")
try:
    neural_preds = pd.read_parquet(EvaluationConfig.MODEL_PATH / 'neural_tower_predictions_val.parquet')
    if 'pred_score' in neural_preds.columns:
        neural_predictions['neural_tower'] = neural_preds['pred_score'].values
        print(f"‚úì Loaded Neural Tower predictions")
except Exception as e:
    print(f"‚ö†Ô∏è  Could not load Neural Tower predictions: {e}")

# Combine all predictions
all_predictions = {**lgb_predictions, **neural_predictions}

print(f"\n‚úì Total models loaded: {len(all_predictions)}")
print(f"  Models: {list(all_predictions.keys())}")

gc.collect()



LOADING MODEL PREDICTIONS

Loading validation data...
‚úì Loaded 120,970 validation samples

Loading LightGBM predictions...
‚úì Loaded ensemble predictions

Loading Neural Tower predictions...
‚úì Loaded Neural Tower predictions

‚úì Total models loaded: 1
  Models: ['neural_tower']


0

In [None]:
# EVALUATE ALL MODELS

print("\n" + "="*80)
print("EVALUATING ALL MODELS")
print("="*80)

# Store all evaluation results
evaluation_results = {}

for model_name, predictions in tqdm(all_predictions.items(), desc="Evaluating models"):
    print(f"\nüìä Evaluating {model_name}...")
    
    # Ensure predictions match validation data length
    if len(predictions) != len(val_data):
        print(f"‚ö†Ô∏è  Prediction length mismatch: {len(predictions)} vs {len(val_data)}")
        min_len = min(len(predictions), len(val_data))
        predictions = predictions[:min_len]
        val_data_eval = val_data.iloc[:min_len].copy()
    else:
        val_data_eval = val_data.copy()
    
    # Evaluate all metrics
    metrics = evaluate_all_metrics(val_data_eval, predictions.copy(), k_values=EvaluationConfig.K_VALUES)
    evaluation_results[model_name] = metrics
    
    # Print key metrics
    print(f"  MAP@12: {metrics['MAP@12']:.6f}")
    print(f"  Precision@12: {metrics['Precision@12']:.6f}")
    print(f"  Recall@12: {metrics['Recall@12']:.6f}")
    print(f"  NDCG@12: {metrics['NDCG@12']:.6f}")

# Create comparison DataFrame
comparison_df = pd.DataFrame(evaluation_results).T
comparison_df = comparison_df.sort_values('MAP@12', ascending=False)

print("\n" + "="*80)
print("MODEL COMPARISON SUMMARY")
print("="*80)
print("\n" + comparison_df.to_string())

# Save comparison results
comparison_path = EvaluationConfig.MODEL_PATH / 'model_comparison.csv'
comparison_df.to_csv(comparison_path)
print(f"\n‚úì Saved comparison results to {comparison_path}")

gc.collect()



EVALUATING ALL MODELS


Evaluating models:   0%|          | 0/1 [00:00<?, ?it/s]


üìä Evaluating neural_tower...
  MAP@12: 0.776586
  Precision@12: 0.332563
  Recall@12: 1.022496
  NDCG@12: 0.842860

MODEL COMPARISON SUMMARY

                 MAP@1  Precision@1  Recall@1    NDCG@1     MAP@3  Precision@3  Recall@3  NDCG@3     MAP@5  Precision@5  Recall@5    NDCG@5   MAP@10  Precision@10  Recall@10   NDCG@10    MAP@12  Precision@12  Recall@12  NDCG@12
neural_tower  0.634642     0.331862  0.385099  0.606797  0.693216     0.332012  0.835952  0.7677  0.748244     0.332417  0.977017  0.822857  0.77599      0.332564   1.021678  0.842445  0.776586      0.332563   1.022496  0.84286

‚úì Saved comparison results to /Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2/models/model_comparison.csv


21

In [None]:
# CREATE FINAL ENSEMBLE

print("\n" + "="*80)
print("CREATING FINAL ENSEMBLE")
print("="*80)

# Select best models for ensemble
best_models = ['neural_tower', 'lgb_ranker_lambdarank', 'lgb_classifier']

# Filter to available models
available_models = [m for m in best_models if m in all_predictions]
print(f"\nüì¶ Available models for ensemble: {available_models}")

if len(available_models) == 0:
    print("‚ö†Ô∏è  No models available for ensemble!")
else:
    # Normalize predictions to [0, 1] range
    normalized_preds = {}
    for model_name in available_models:
        preds = all_predictions[model_name].copy()
        min_pred = preds.min()
        max_pred = preds.max()
        if max_pred > min_pred:
            normalized = (preds - min_pred) / (max_pred - min_pred)
        else:
            normalized = preds
        normalized_preds[model_name] = normalized
        print(f"  {model_name}: [{preds.min():.4f}, {preds.max():.4f}] -> [0, 1]")
    
    # Create ensemble with equal weights (can be tuned)
    ensemble_weights = {m: 1.0 / len(available_models) for m in available_models}
    print(f"\n‚öñÔ∏è  Ensemble weights: {ensemble_weights}")
    
    # Calculate weighted ensemble
    ensemble_pred = np.zeros(len(normalized_preds[available_models[0]]))
    for model_name, weight in ensemble_weights.items():
        ensemble_pred += weight * normalized_preds[model_name]
    
    # Evaluate ensemble
    print("\nüìä Evaluating final ensemble...")
    ensemble_metrics = evaluate_all_metrics(val_data, ensemble_pred.copy(), k_values=EvaluationConfig.K_VALUES)
    evaluation_results['final_ensemble'] = ensemble_metrics
    
    print(f"  MAP@12: {ensemble_metrics['MAP@12']:.6f}")
    print(f"  Precision@12: {ensemble_metrics['Precision@12']:.6f}")
    print(f"  Recall@12: {ensemble_metrics['Recall@12']:.6f}")
    print(f"  NDCG@12: {ensemble_metrics['NDCG@12']:.6f}")
    
    # Save ensemble predictions
    ensemble_df = val_data[['customer_id', 'article_id', 'label']].copy()
    ensemble_df['pred_score'] = ensemble_pred
    ensemble_path = EvaluationConfig.MODEL_PATH / 'final_ensemble_predictions_val.parquet'
    ensemble_df.to_parquet(ensemble_path, index=False)
    print(f"\n‚úì Saved ensemble predictions to {ensemble_path}")
    
    # Update comparison
    comparison_df = pd.DataFrame(evaluation_results).T
    comparison_df = comparison_df.sort_values('MAP@12', ascending=False)
    print("\n" + "="*80)
    print("UPDATED MODEL COMPARISON (with ensemble)")
    print("="*80)
    print("\n" + comparison_df.to_string())

gc.collect()



CREATING FINAL ENSEMBLE

üì¶ Available models for ensemble: ['neural_tower']
  neural_tower: [0.0000, 1.0000] -> [0, 1]

‚öñÔ∏è  Ensemble weights: {'neural_tower': 1.0}

üìä Evaluating final ensemble...
  MAP@12: 0.776586
  Precision@12: 0.332563
  Recall@12: 1.022496
  NDCG@12: 0.842860

‚úì Saved ensemble predictions to /Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2/models/final_ensemble_predictions_val.parquet

UPDATED MODEL COMPARISON (with ensemble)

                   MAP@1  Precision@1  Recall@1    NDCG@1     MAP@3  Precision@3  Recall@3  NDCG@3     MAP@5  Precision@5  Recall@5    NDCG@5   MAP@10  Precision@10  Recall@10   NDCG@10    MAP@12  Precision@12  Recall@12  NDCG@12
neural_tower    0.634642     0.331862  0.385099  0.606797  0.693216     0.332012  0.835952  0.7677  0.748244     0.332417  0.977017  0.822857  0.77599      0.332564   1.021678  0.842445  0.776586      0.332563   1.022496  0.84286
final_ensemble  0.634642     

24

In [None]:
# GENERATE FINAL RANKINGS FOR SUBMISSION

print("\n" + "="*80)
print("GENERATING FINAL RANKINGS")
print("="*80)

# Use best model (or ensemble if available)
if 'final_ensemble' in evaluation_results:
    best_model_name = 'final_ensemble'
    best_predictions = ensemble_pred
    print(f"\n‚úÖ Using final ensemble for submission")
elif 'neural_tower' in all_predictions:
    best_model_name = 'neural_tower'
    best_predictions = all_predictions['neural_tower']
    print(f"\n‚úÖ Using Neural Tower for submission")
elif len(all_predictions) > 0:
    # Use model with best MAP@12
    best_model_name = comparison_df.index[0]
    best_predictions = all_predictions[best_model_name]
    print(f"\n‚úÖ Using {best_model_name} for submission")
else:
    raise ValueError("No predictions available!")

# Create predictions DataFrame
pred_df = val_data[['customer_id', 'article_id']].copy()
pred_df['pred_score'] = best_predictions[:len(pred_df)]

# Generate top-12 predictions for each user
print("\nüìä Generating top-12 rankings per user...")
rankings = []
for customer_id, group in tqdm(pred_df.groupby('customer_id'), desc="Ranking users"):
    # Sort by prediction score (descending)
    group_sorted = group.sort_values('pred_score', ascending=False)
    
    # Get top 12 article IDs
    top_articles = group_sorted.head(12)['article_id'].values
    
    # Format as space-separated string
    predictions_str = ' '.join([str(art) for art in top_articles])
    
    rankings.append({
        'customer_id': customer_id,
        'prediction': predictions_str
    })

# Create submission DataFrame
submission_df = pd.DataFrame(rankings)
submission_df = submission_df.sort_values('customer_id')

print(f"\n‚úì Generated rankings for {len(submission_df):,} users")
print(f"  Average articles per user: {submission_df['prediction'].str.split().str.len().mean():.2f}")

# Save submission file
submission_path = EvaluationConfig.MODEL_PATH / 'submission.csv'
submission_df.to_csv(submission_path, index=False)
print(f"\n‚úì Saved submission file to {submission_path}")

# Display sample
print("\nüìÑ Sample submission (first 5 rows):")
print(submission_df.head().to_string(index=False))

gc.collect()



GENERATING FINAL RANKINGS

‚úÖ Using final ensemble for submission

üìä Generating top-12 rankings per user...


Ranking users:   0%|          | 0/42126 [00:00<?, ?it/s]


‚úì Generated rankings for 42,126 users
  Average articles per user: 2.87

‚úì Saved submission file to /Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2/models/submission.csv

üìÑ Sample submission (first 5 rows):
                                                     customer_id                              prediction
0000945f66de1a11d9447609b8b41b1bc987ba185a5496ae8831e8493afa24ff                               811899002
00012315fd38859ff2c446876ca507abbcbcf582d0e266b1b696941c16e777a2                               872600009
00061a04f030bdf3665b09829192ca8c13c4de6dd9ae9d38d0d0b5ce3a1cfc6f                     799365013 883724001
00089f13f465ec902e5c49a3bb408c5e31205096d6f267543f1893303e456016                               858052005
000e3f587242eb077685a487ad27dad632a4801576dfd16967280f0da3a78c2e 706016001 620425012 857713001 684209004


210688

In [None]:
# FINAL SUMMARY AND RESULTS

print("\n" + "="*80)
print("FINAL EVALUATION SUMMARY")
print("="*80)

# Best model
best_model = comparison_df.index[0]
best_map12 = comparison_df.loc[best_model, 'MAP@12']

print(f"\nüèÜ Best Model: {best_model}")
print(f"   MAP@12: {best_map12:.6f}")

# Model rankings
print(f"\nüìä Model Rankings (by MAP@12):")
print("-" * 80)
for idx, (model_name, row) in enumerate(comparison_df.iterrows(), 1):
    marker = "ü•á" if idx == 1 else "ü•à" if idx == 2 else "ü•â" if idx == 3 else "  "
    print(f"{marker} {idx}. {model_name:30s} MAP@12: {row['MAP@12']:.6f}")

# Key metrics for best model
print(f"\nüìà Detailed Metrics for Best Model ({best_model}):")
print("-" * 80)
best_metrics = comparison_df.loc[best_model]
for metric_name in ['MAP@12', 'Precision@12', 'Recall@12', 'NDCG@12']:
    print(f"  {metric_name:20s}: {best_metrics[metric_name]:.6f}")

# Files saved
print(f"\nüíæ Generated Files:")
print("-" * 80)
print(f"  Model Comparison: {EvaluationConfig.MODEL_PATH / 'model_comparison.csv'}")
if 'final_ensemble' in evaluation_results:
    print(f"  Ensemble Predictions: {EvaluationConfig.MODEL_PATH / 'final_ensemble_predictions_val.parquet'}")
print(f"  Submission File: {EvaluationConfig.MODEL_PATH / 'submission.csv'}")

# Performance summary
print(f"\nüìä Performance Summary:")
print("-" * 80)
print(f"  Total Models Evaluated: {len(comparison_df)}")
print(f"  Best MAP@12: {best_map12:.6f}")
print(f"  Improvement over baseline: {((best_map12 - comparison_df['MAP@12'].min()) / comparison_df['MAP@12'].min() * 100):.2f}%")

print("\n" + "="*80)
print("‚úÖ Step 3 Complete: Evaluation & Metrics")
print("   Ready for final submission!")
print("="*80)



FINAL EVALUATION SUMMARY

üèÜ Best Model: neural_tower
   MAP@12: 0.776586

üìä Model Rankings (by MAP@12):
--------------------------------------------------------------------------------
ü•á 1. neural_tower                   MAP@12: 0.776586
ü•à 2. final_ensemble                 MAP@12: 0.776586

üìà Detailed Metrics for Best Model (neural_tower):
--------------------------------------------------------------------------------
  MAP@12              : 0.776586
  Precision@12        : 0.332563
  Recall@12           : 1.022496
  NDCG@12             : 0.842860

üíæ Generated Files:
--------------------------------------------------------------------------------
  Model Comparison: /Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2/models/model_comparison.csv
  Ensemble Predictions: /Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2/models/final_ensemble_predictions_val.parquet
  Submission File

### Ablation Study: Impact of Image Features

This ablation study evaluates the contribution of image features by training models **without** image embeddings and comparing performance with the full models.

**Study Design:**
- Train LightGBM Ranker without image features
- Train Neural Tower without image features (2-tower: User + Item only)
- Compare MAP@12, Precision@12, Recall@12, NDCG@12
- Quantify the impact of image features on recommendation quality

**Hypothesis:** Image features should improve recommendation quality, especially for visual fashion items.


In [None]:
# ============================================================================
# ABLATION STUDY: CONFIGURATION AND IMPORTS
# ============================================================================

import pandas as pd
import numpy as np
import lightgbm as lgb
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from pathlib import Path
import pickle
import gc
from tqdm.auto import tqdm
import warnings
import json
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

warnings.filterwarnings('ignore')

# Configuration (reuse from previous cells)
ABLATION_CONFIG = {
    'DATA_PATH': Path('/Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2'),
    'MODEL_PATH': Path('/Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2/models'),
    'RANDOM_STATE': 42,
    'BATCH_SIZE': 2048,
    'N_EPOCHS': 20,
    'LEARNING_RATE': 1e-3,
    'EARLY_STOPPING_PATIENCE': 5
}

# Device for neural network
if torch.backends.mps.is_available():
    DEVICE = torch.device('mps')
    print("üöÄ Using Apple Silicon GPU (MPS)")
else:
    DEVICE = torch.device('cpu')
    print("‚ö†Ô∏è  Using CPU")

print("‚úì Ablation study configuration loaded")
print(f"  Model path: {ABLATION_CONFIG['MODEL_PATH']}")
print(f"  Device: {DEVICE}")


In [None]:
# ============================================================================
# LOAD DATA WITHOUT IMAGE FEATURES
# ============================================================================

print("\n" + "="*80)
print("LOADING DATA FOR ABLATION STUDY (NO IMAGE FEATURES)")
print("="*80)

# Load train and validation datasets
print("\nLoading train_data.parquet...")
train_data = pd.read_parquet(ABLATION_CONFIG['MODEL_PATH'] / 'train_data.parquet')
print(f"‚úì Loaded {len(train_data):,} training samples")

print("\nLoading val_data.parquet...")
val_data = pd.read_parquet(ABLATION_CONFIG['MODEL_PATH'] / 'val_data.parquet')
print(f"‚úì Loaded {len(val_data):,} validation samples")

# Identify feature groups (EXCLUDE image features)
exclude_cols = ['customer_id', 'article_id', 'label', 'user_type', 'train_label', 'val_label']
all_feature_cols = [col for col in train_data.columns if col not in exclude_cols]

# Separate features (NO IMAGE FEATURES)
user_feature_cols = [col for col in all_feature_cols if any(col.startswith(prefix) for prefix in 
    ['n_', 'avg_', 'std_', 'min_', 'max_', 'days_', 'purchase_', 'exploration_', 'age', 'FN', 'Active', 'unique_'])]

item_feature_cols = [col for col in all_feature_cols if any(col.startswith(prefix) for prefix in 
    ['product_', 'graphical_', 'colour_', 'perceived_', 'department_', 'index_', 'section_', 
     'garment_', 'popularity_', 'sales_', 'buyers_'])]

# Explicitly EXCLUDE image features
image_feature_cols = [col for col in all_feature_cols if col.startswith('image_emb_')]
print(f"\n‚ö†Ô∏è  EXCLUDING {len(image_feature_cols)} image features from ablation study")

# Combined features (user + item, NO image)
feature_cols = user_feature_cols + item_feature_cols

print(f"\n‚úì Feature separation (NO IMAGE):")
print(f"  User features: {len(user_feature_cols)}")
print(f"  Item features: {len(item_feature_cols)}")
print(f"  Image features (EXCLUDED): {len(image_feature_cols)}")
print(f"  Total features: {len(feature_cols)}")

# Prepare feature matrices
X_train = train_data[feature_cols].copy()
y_train = train_data['label'].copy()
X_val = val_data[feature_cols].copy()
y_val = val_data['label'].copy()

# Handle categorical and missing values
for col in feature_cols:
    if X_train[col].dtype.name == 'category':
        all_values = pd.concat([X_train[col], X_val[col]]).unique()
        train_cat = pd.Categorical(X_train[col], categories=all_values)
        val_cat = pd.Categorical(X_val[col], categories=all_values)
        X_train[col] = train_cat.codes
        X_val[col] = val_cat.codes
        X_train[col] = X_train[col].replace(-1, 0)
        X_val[col] = X_val[col].replace(-1, 0)
    else:
        X_train[col] = X_train[col].fillna(0)
        X_val[col] = X_val[col].fillna(0)

print(f"\n‚úì Feature matrices prepared:")
print(f"  Train: {X_train.shape}")
print(f"  Val: {X_val.shape}")

gc.collect()


In [None]:
# ============================================================================
# MAP@12 EVALUATION FUNCTION
# ============================================================================

def calculate_map_at_k(y_true, y_pred, k=12):
    """Calculate Mean Average Precision at K (MAP@K)"""
    if len(y_true) == 0:
        return 0.0
    
    y_pred = [pred[:k] for pred in y_pred]
    
    aps = []
    for true_items, pred_items in zip(y_true, y_pred):
        if len(true_items) == 0:
            continue
        
        hits = 0
        precision_sum = 0.0
        
        for i, pred_item in enumerate(pred_items):
            if pred_item in true_items:
                hits += 1
                precision_sum += hits / (i + 1)
        
        if hits > 0:
            ap = precision_sum / len(true_items)
            aps.append(ap)
    
    return np.mean(aps) if len(aps) > 0 else 0.0


def evaluate_map_at_12(df, predictions):
    """Evaluate MAP@12 on validation set"""
    grouped = df.groupby('customer_id')
    
    y_true = []
    y_pred = []
    
    for customer_id, group in grouped:
        true_items = set(group[group['label'] == 1]['article_id'].values)
        y_true.append(true_items)
        
        customer_df = group.copy()
        customer_df['pred_score'] = predictions[:len(customer_df)]
        customer_df = customer_df.sort_values('pred_score', ascending=False)
        pred_items = customer_df['article_id'].values.tolist()
        y_pred.append(pred_items)
        
        predictions = predictions[len(customer_df):]
    
    return calculate_map_at_k(y_true, y_pred, k=12)


print("‚úì MAP@12 evaluation function loaded")


In [None]:
# ============================================================================
# TRAIN LIGHTGBM RANKER WITHOUT IMAGE FEATURES
# ============================================================================

print("\n" + "="*80)
print("TRAINING LIGHTGBM RANKER (NO IMAGE FEATURES)")
print("="*80)

# Prepare data for ranking (group by customer_id)
train_customer_ids = train_data['customer_id'].values
val_customer_ids = val_data['customer_id'].values

# Sort data by customer_id for group information
train_sort_idx = train_customer_ids.argsort()
val_sort_idx = val_customer_ids.argsort()

X_train_sorted = X_train.iloc[train_sort_idx].reset_index(drop=True)
y_train_sorted = y_train.iloc[train_sort_idx].reset_index(drop=True)
train_customer_ids_sorted = train_customer_ids[train_sort_idx]

X_val_sorted = X_val.iloc[val_sort_idx].reset_index(drop=True)
y_val_sorted = y_val.iloc[val_sort_idx].reset_index(drop=True)
val_customer_ids_sorted = val_customer_ids[val_sort_idx]

# Group information for ranking
train_groups = pd.Series(train_customer_ids_sorted).value_counts().sort_index().values
val_groups = pd.Series(val_customer_ids_sorted).value_counts().sort_index().values

# Create LightGBM datasets
train_dataset = lgb.Dataset(
    X_train_sorted,
    label=y_train_sorted,
    group=train_groups,
    free_raw_data=False
)

val_dataset = lgb.Dataset(
    X_val_sorted,
    label=y_val_sorted,
    group=val_groups,
    reference=train_dataset,
    free_raw_data=False
)

# Model configuration (LambdaRank)
params = {
    'objective': 'lambdarank',
    'metric': 'ndcg',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'seed': ABLATION_CONFIG['RANDOM_STATE'],
    'force_col_wise': True,
    'label_gain': [0, 1],  # 0 for negative, 1 for positive
}

# Train model
print("\nTraining LightGBM Ranker...")
model_lgb_no_image = lgb.train(
    params,
    train_dataset,
    num_boost_round=500,
    valid_sets=[val_dataset],
    valid_names=['val'],
    callbacks=[
        lgb.early_stopping(stopping_rounds=50, verbose=False),
        lgb.log_evaluation(period=50)
    ]
)

# Make predictions
print("\nGenerating predictions...")
preds_lgb_no_image = model_lgb_no_image.predict(X_val_sorted, num_iteration=model_lgb_no_image.best_iteration)

# Evaluate
map12_lgb_no_image = evaluate_map_at_12(val_data.iloc[val_sort_idx].reset_index(drop=True), preds_lgb_no_image.copy())

print(f"\n‚úì LightGBM Ranker (NO IMAGE) Results:")
print(f"  MAP@12: {map12_lgb_no_image:.6f}")
print(f"  Best iteration: {model_lgb_no_image.best_iteration}")

# Save predictions
pred_df_lgb = val_data.iloc[val_sort_idx].reset_index(drop=True)[['customer_id', 'article_id', 'label']].copy()
pred_df_lgb['pred_score'] = preds_lgb_no_image
pred_path_lgb = ABLATION_CONFIG['MODEL_PATH'] / 'lgb_ranker_no_image_predictions.parquet'
pred_df_lgb.to_parquet(pred_path_lgb, index=False)
print(f"‚úì Saved predictions to {pred_path_lgb}")

gc.collect()


In [None]:
# ============================================================================
# TWO-TOWER NEURAL NETWORK (USER + ITEM, NO IMAGE)
# ============================================================================

class TwoTowerModel(nn.Module):
    """
    Two-tower neural network for recommendation (NO IMAGE):
    - User Tower: User features -> User embedding
    - Item Tower: Item features -> Item embedding
    - Fusion: Concatenated embeddings -> Final prediction
    """
    
    def __init__(self, 
                 user_feature_dim,
                 item_feature_dim,
                 user_embedding_dim=128,
                 item_embedding_dim=64,
                 fusion_hidden_dims=[256, 128, 64],
                 dropout_rate=0.3):
        super(TwoTowerModel, self).__init__()
        
        # User Tower
        self.user_tower = nn.Sequential(
            nn.Linear(user_feature_dim, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(256, user_embedding_dim),
            nn.BatchNorm1d(user_embedding_dim),
            nn.ReLU()
        )
        
        # Item Tower
        self.item_tower = nn.Sequential(
            nn.Linear(item_feature_dim, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(128, item_embedding_dim),
            nn.BatchNorm1d(item_embedding_dim),
            nn.ReLU()
        )
        
        # Fusion Layer
        fusion_layers = []
        input_dim = user_embedding_dim + item_embedding_dim
        
        for hidden_dim in fusion_hidden_dims:
            fusion_layers.extend([
                nn.Linear(input_dim, hidden_dim),
                nn.BatchNorm1d(hidden_dim),
                nn.ReLU(),
                nn.Dropout(dropout_rate)
            ])
            input_dim = hidden_dim
        
        # Output layer
        fusion_layers.append(nn.Linear(input_dim, 1))
        fusion_layers.append(nn.Sigmoid())
        
        self.fusion = nn.Sequential(*fusion_layers)
    
    def forward(self, user_features, item_features):
        # User embedding
        user_emb = self.user_tower(user_features)
        
        # Item embedding
        item_emb = self.item_tower(item_features)
        
        # Concatenate
        fused = torch.cat([user_emb, item_emb], dim=1)
        
        # Final prediction
        output = self.fusion(fused)
        
        return output.squeeze()

print("‚úì TwoTowerModel class defined (NO IMAGE)")


In [None]:
# ============================================================================
# DATASET CLASS AND DATA PREPARATION FOR 2-TOWER MODEL
# ============================================================================

class TwoTowerDataset(Dataset):
    """Dataset for 2-tower training (NO IMAGE)"""
    
    def __init__(self, df, user_features, item_features, labels=None):
        self.df = df.reset_index(drop=True)
        self.user_features = user_features.values.astype(np.float32)
        self.item_features = item_features.values.astype(np.float32)
        self.labels = labels.values.astype(np.float32) if labels is not None else None
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        user_feat = torch.FloatTensor(self.user_features[idx])
        item_feat = torch.FloatTensor(self.item_features[idx])
        
        if self.labels is not None:
            label = torch.FloatTensor([self.labels[idx]])
            return user_feat, item_feat, label
        else:
            return user_feat, item_feat

# Prepare user and item features separately
X_train_user = train_data[user_feature_cols].copy()
X_train_item = train_data[item_feature_cols].copy()
X_val_user = val_data[user_feature_cols].copy()
X_val_item = val_data[item_feature_cols].copy()

# Fill missing values and handle categorical
for df in [X_train_user, X_train_item, X_val_user, X_val_item]:
    for col in df.columns:
        if df[col].dtype.name == 'category':
            df[col] = pd.Categorical(df[col]).codes
            df[col] = df[col].replace(-1, 0)
        else:
            df[col] = df[col].fillna(0)

# Standardize features
scaler_user = StandardScaler()
scaler_item = StandardScaler()

X_train_user_scaled = pd.DataFrame(
    scaler_user.fit_transform(X_train_user),
    columns=user_feature_cols
)
X_val_user_scaled = pd.DataFrame(
    scaler_user.transform(X_val_user),
    columns=user_feature_cols
)

X_train_item_scaled = pd.DataFrame(
    scaler_item.fit_transform(X_train_item),
    columns=item_feature_cols
)
X_val_item_scaled = pd.DataFrame(
    scaler_item.transform(X_val_item),
    columns=item_feature_cols
)

# Create datasets
train_dataset_2tower = TwoTowerDataset(
    train_data,
    X_train_user_scaled,
    X_train_item_scaled,
    y_train
)

val_dataset_2tower = TwoTowerDataset(
    val_data,
    X_val_user_scaled,
    X_val_item_scaled,
    y_val
)

# Create data loaders
train_loader_2tower = DataLoader(
    train_dataset_2tower,
    batch_size=ABLATION_CONFIG['BATCH_SIZE'],
    shuffle=True,
    num_workers=0,
    pin_memory=False
)

val_loader_2tower = DataLoader(
    val_dataset_2tower,
    batch_size=ABLATION_CONFIG['BATCH_SIZE'],
    shuffle=False,
    num_workers=0,
    pin_memory=False
)

print(f"\n‚úì Data loaders created:")
print(f"  Train batches: {len(train_loader_2tower)}")
print(f"  Val batches: {len(val_loader_2tower)}")

gc.collect()


In [None]:
# ============================================================================
# TRAIN 2-TOWER NEURAL NETWORK (NO IMAGE)
# ============================================================================

print("\n" + "="*80)
print("TRAINING 2-TOWER NEURAL NETWORK (NO IMAGE FEATURES)")
print("="*80)

# Initialize model
model_2tower = TwoTowerModel(
    user_feature_dim=len(user_feature_cols),
    item_feature_dim=len(item_feature_cols),
    user_embedding_dim=128,
    item_embedding_dim=64,
    fusion_hidden_dims=[256, 128, 64],
    dropout_rate=0.3
).to(DEVICE)

# Loss and optimizer
criterion = nn.BCELoss()
optimizer = optim.AdamW(
    model_2tower.parameters(),
    lr=ABLATION_CONFIG['LEARNING_RATE'],
    weight_decay=1e-5
)

scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='max',
    factor=0.5,
    patience=3
)

# Training history
history_2tower = {
    'train_loss': [],
    'val_loss': [],
    'val_map12': []
}

best_map12_2tower = 0.0
best_epoch_2tower = 0
patience_counter_2tower = 0

val_customer_ids_list = val_data['customer_id'].values
val_article_ids_list = val_data['article_id'].values

for epoch in range(ABLATION_CONFIG['N_EPOCHS']):
    # Training phase
    model_2tower.train()
    train_loss = 0.0
    train_batches = 0
    
    for user_feat, item_feat, labels in tqdm(train_loader_2tower, desc=f"Epoch {epoch+1}/{ABLATION_CONFIG['N_EPOCHS']} [Train]"):
        user_feat = user_feat.to(DEVICE)
        item_feat = item_feat.to(DEVICE)
        labels = labels.to(DEVICE).squeeze()
        
        optimizer.zero_grad()
        outputs = model_2tower(user_feat, item_feat)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item()
        train_batches += 1
    
    avg_train_loss = train_loss / train_batches
    
    # Validation phase
    model_2tower.eval()
    val_loss = 0.0
    val_batches = 0
    all_predictions = []
    all_labels = []
    
    with torch.no_grad():
        for batch_idx, (user_feat, item_feat, labels) in enumerate(tqdm(val_loader_2tower, desc=f"Epoch {epoch+1}/{ABLATION_CONFIG['N_EPOCHS']} [Val]")):
            user_feat = user_feat.to(DEVICE)
            item_feat = item_feat.to(DEVICE)
            labels = labels.to(DEVICE).squeeze()
            
            outputs = model_2tower(user_feat, item_feat)
            loss = criterion(outputs, labels)
            
            val_loss += loss.item()
            val_batches += 1
            
            predictions = outputs.cpu().numpy().flatten()
            labels_np = labels.cpu().numpy().flatten()
            
            all_predictions.extend(predictions.tolist())
            all_labels.extend(labels_np.tolist())
    
    avg_val_loss = val_loss / val_batches
    
    # Calculate MAP@12
    val_eval_df = pd.DataFrame({
        'customer_id': val_customer_ids_list[:len(all_predictions)],
        'article_id': val_article_ids_list[:len(all_predictions)],
        'label': all_labels[:len(all_predictions)],
        'pred_score': all_predictions
    })
    
    map12_score = evaluate_map_at_12(val_eval_df, np.array(all_predictions))
    
    # Update learning rate
    scheduler.step(map12_score)
    
    # Store history
    history_2tower['train_loss'].append(avg_train_loss)
    history_2tower['val_loss'].append(avg_val_loss)
    history_2tower['val_map12'].append(map12_score)
    
    # Print epoch summary
    print(f"\nEpoch {epoch+1}/{ABLATION_CONFIG['N_EPOCHS']}:")
    print(f"  Train Loss: {avg_train_loss:.6f}")
    print(f"  Val Loss: {avg_val_loss:.6f}")
    print(f"  Val MAP@12: {map12_score:.6f}")
    print(f"  LR: {optimizer.param_groups[0]['lr']:.2e}")
    
    # Save best model
    if map12_score > best_map12_2tower:
        best_map12_2tower = map12_score
        best_epoch_2tower = epoch + 1
        patience_counter_2tower = 0
        
        # Save checkpoint
        checkpoint_dir = ABLATION_CONFIG['MODEL_PATH'] / 'checkpoints'
        checkpoint_dir.mkdir(exist_ok=True)
        torch.save({
            'model_state_dict': model_2tower.state_dict(),
            'map12': map12_score,
            'epoch': epoch + 1
        }, checkpoint_dir / '2tower_no_image_best.pt')
        
        print(f"  ‚úì Saved best model (MAP@12: {map12_score:.6f})")
    else:
        patience_counter_2tower += 1
        print(f"  No improvement ({patience_counter_2tower}/{ABLATION_CONFIG['EARLY_STOPPING_PATIENCE']})")
    
    # Early stopping
    if patience_counter_2tower >= ABLATION_CONFIG['EARLY_STOPPING_PATIENCE']:
        print(f"\n‚ö†Ô∏è  Early stopping triggered after {epoch+1} epochs")
        print(f"   Best MAP@12: {best_map12_2tower:.6f} at epoch {best_epoch_2tower}")
        break

# Load best model
checkpoint = torch.load(checkpoint_dir / '2tower_no_image_best.pt', weights_only=False)
model_2tower.load_state_dict(checkpoint['model_state_dict'])

# Final evaluation
model_2tower.eval()
all_predictions_final = []
val_customer_ids_final = []
val_article_ids_final = []

with torch.no_grad():
    for user_feat, item_feat, labels in val_loader_2tower:
        user_feat = user_feat.to(DEVICE)
        item_feat = item_feat.to(DEVICE)
        
        outputs = model_2tower(user_feat, item_feat)
        predictions = outputs.cpu().numpy().flatten()
        
        all_predictions_final.extend(predictions.tolist())

# Save predictions
pred_df_2tower = val_data[['customer_id', 'article_id', 'label']].copy()
pred_df_2tower['pred_score'] = all_predictions_final[:len(pred_df_2tower)]
pred_path_2tower = ABLATION_CONFIG['MODEL_PATH'] / '2tower_no_image_predictions.parquet'
pred_df_2tower.to_parquet(pred_path_2tower, index=False)

print(f"\n‚úì 2-Tower Model (NO IMAGE) Results:")
print(f"  MAP@12: {best_map12_2tower:.6f}")
print(f"  Best epoch: {best_epoch_2tower}")
print(f"‚úì Saved predictions to {pred_path_2tower}")

gc.collect()


In [None]:
# ============================================================================
# COMPARE RESULTS: WITH vs WITHOUT IMAGE FEATURES
# ============================================================================

print("\n" + "="*80)
print("ABLATION STUDY: COMPARISON RESULTS")
print("="*80)

# Load full model results (if available)
print("\nüìä Loading full model results...")
full_model_results = {}

# Try to load LightGBM with image features
try:
    lgb_full_preds = pd.read_parquet(ABLATION_CONFIG['MODEL_PATH'] / 'lgb_ranker_lambdarank_predictions_val.parquet')
    if 'pred_score' in lgb_full_preds.columns:
        map12_lgb_full = evaluate_map_at_12(lgb_full_preds, lgb_full_preds['pred_score'].values)
        full_model_results['LightGBM_Ranker (WITH Image)'] = map12_lgb_full
        print(f"‚úì Loaded LightGBM Ranker (WITH Image): MAP@12 = {map12_lgb_full:.6f}")
except Exception as e:
    print(f"‚ö†Ô∏è  Could not load LightGBM full model: {e}")

# Try to load Neural Tower with image features
try:
    neural_full_preds = pd.read_parquet(ABLATION_CONFIG['MODEL_PATH'] / 'neural_tower_predictions_val.parquet')
    if 'pred_score' in neural_full_preds.columns:
        map12_neural_full = evaluate_map_at_12(neural_full_preds, neural_full_preds['pred_score'].values)
        full_model_results['Neural_Tower_3Tower (WITH Image)'] = map12_neural_full
        print(f"‚úì Loaded Neural Tower 3-Tower (WITH Image): MAP@12 = {map12_neural_full:.6f}")
except Exception as e:
    print(f"‚ö†Ô∏è  Could not load Neural Tower full model: {e}")

# Ablation study results
ablation_results = {
    'LightGBM_Ranker (NO Image)': map12_lgb_no_image,
    'Neural_Tower_2Tower (NO Image)': best_map12_2tower
}

# Combine all results
all_results = {**full_model_results, **ablation_results}

# Create comparison DataFrame
comparison_results = []
for model_name, map12_score in all_results.items():
    comparison_results.append({
        'Model': model_name,
        'MAP@12': map12_score,
        'Has_Image_Features': 'WITH Image' in model_name
    })

comparison_df = pd.DataFrame(comparison_results)
comparison_df = comparison_df.sort_values('MAP@12', ascending=False)

print("\n" + "="*80)
print("COMPARISON: WITH vs WITHOUT IMAGE FEATURES")
print("="*80)
print("\n" + comparison_df.to_string(index=False))

# Calculate impact of image features
print("\n" + "="*80)
print("IMPACT ANALYSIS: IMAGE FEATURES")
print("="*80)

# LightGBM comparison
if 'LightGBM_Ranker (WITH Image)' in full_model_results and 'LightGBM_Ranker (NO Image)' in ablation_results:
    lgb_with = full_model_results['LightGBM_Ranker (WITH Image)']
    lgb_without = ablation_results['LightGBM_Ranker (NO Image)']
    lgb_improvement = ((lgb_with - lgb_without) / lgb_without) * 100
    print(f"\nüìä LightGBM Ranker:")
    print(f"  WITH Image Features:  {lgb_with:.6f}")
    print(f"  WITHOUT Image Features: {lgb_without:.6f}")
    print(f"  Improvement: {lgb_improvement:+.2f}%")
    print(f"  Absolute Gain: {lgb_with - lgb_without:+.6f}")

# Neural Tower comparison
if 'Neural_Tower_3Tower (WITH Image)' in full_model_results and 'Neural_Tower_2Tower (NO Image)' in ablation_results:
    neural_with = full_model_results['Neural_Tower_3Tower (WITH Image)']
    neural_without = ablation_results['Neural_Tower_2Tower (NO Image)']
    neural_improvement = ((neural_with - neural_without) / neural_without) * 100
    print(f"\nüìä Neural Tower:")
    print(f"  3-Tower (WITH Image):  {neural_with:.6f}")
    print(f"  2-Tower (NO Image):    {neural_without:.6f}")
    print(f"  Improvement: {neural_improvement:+.2f}%")
    print(f"  Absolute Gain: {neural_with - neural_without:+.6f}")

# Save comparison results
comparison_path = ABLATION_CONFIG['MODEL_PATH'] / 'ablation_study_comparison.csv'
comparison_df.to_csv(comparison_path, index=False)
print(f"\n‚úì Saved comparison results to {comparison_path}")

# Summary
print("\n" + "="*80)
print("ABLATION STUDY SUMMARY")
print("="*80)
print("\n‚úÖ Ablation study complete!")
print(f"   Total models compared: {len(all_results)}")
print(f"   Best model: {comparison_df.iloc[0]['Model']} (MAP@12: {comparison_df.iloc[0]['MAP@12']:.6f})")

if len(full_model_results) > 0:
    print(f"\nüí° Key Finding:")
    avg_improvement = 0.0
    count = 0
    if 'LightGBM_Ranker (WITH Image)' in full_model_results:
        avg_improvement += lgb_improvement
        count += 1
    if 'Neural_Tower_3Tower (WITH Image)' in full_model_results:
        avg_improvement += neural_improvement
        count += 1
    if count > 0:
        avg_improvement /= count
        print(f"   Average improvement with image features: {avg_improvement:+.2f}%")
        if avg_improvement > 0:
            print(f"   ‚úì Image features provide significant value!")
        else:
            print(f"   ‚ö†Ô∏è  Image features may not be critical for this dataset")

gc.collect()
