### Stage 0: EDA and Baselines

In [None]:
# ============================================================================
# EDA: IMPORTS AND CONFIGURATION
# ============================================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
from datetime import datetime, timedelta
import gc

warnings.filterwarnings('ignore')

# Set style for presentation-quality plots
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['font.size'] = 11
plt.rcParams['axes.labelsize'] = 12
plt.rcParams['axes.titlesize'] = 14
plt.rcParams['xtick.labelsize'] = 10
plt.rcParams['ytick.labelsize'] = 10
plt.rcParams['legend.fontsize'] = 10

# Configuration
EDA_CONFIG = {
    'DATA_PATH': Path('/Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/h-and-m-personalized-fashion-recommendations'),
    'PROCESSED_PATH': Path('/Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2'),
    'MODEL_PATH': Path('/Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2/models'),
    'OUTPUT_DIR': Path('/Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/eda_plots')
}

EDA_CONFIG['OUTPUT_DIR'].mkdir(exist_ok=True, parents=True)

print("‚úì EDA Configuration loaded")
print(f"  Output directory: {EDA_CONFIG['OUTPUT_DIR']}")


In [None]:
# ============================================================================
# EDA 1: RAW H&M DATASET ANALYSIS
# ============================================================================

print("\n" + "="*80)
print("EDA: RAW H&M DATASET")
print("="*80)

# Load raw data
print("\n Loading raw dataset...")
try:
    transactions = pd.read_csv(EDA_CONFIG['DATA_PATH'] / 'transactions_train.csv', 
                              dtype={'article_id': str, 'customer_id': str})
    articles = pd.read_csv(EDA_CONFIG['DATA_PATH'] / 'articles.csv')
    customers = pd.read_csv(EDA_CONFIG['DATA_PATH'] / 'customers.csv')
    
    print(f"‚úì Transactions: {len(transactions):,} rows")
    print(f"‚úì Articles: {len(articles):,} items")
    print(f"‚úì Customers: {len(customers):,} users")
    
    # Convert date
    transactions['t_dat'] = pd.to_datetime(transactions['t_dat'])
    
    # Basic statistics
    print("\nüìà Dataset Statistics:")
    print(f"  Date range: {transactions['t_dat'].min()} to {transactions['t_dat'].max()}")
    print(f"  Unique users: {transactions['customer_id'].nunique():,}")
    print(f"  Unique items: {transactions['article_id'].nunique():,}")
    print(f"  Total purchases: {len(transactions):,}")
    print(f"  Avg purchases per user: {len(transactions) / transactions['customer_id'].nunique():.2f}")
    print(f"  Avg purchases per item: {len(transactions) / transactions['article_id'].nunique():.2f}")
    
    # 1. Transaction Volume Over Time
    print("\n Creating temporal analysis plots...")
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    
    # Daily transaction volume
    daily_trans = transactions.groupby(transactions['t_dat'].dt.date).size()
    axes[0, 0].plot(daily_trans.index, daily_trans.values, linewidth=2)
    axes[0, 0].set_title('Daily Transaction Volume', fontsize=14, fontweight='bold')
    axes[0, 0].set_xlabel('Date')
    axes[0, 0].set_ylabel('Number of Transactions')
    axes[0, 0].grid(True, alpha=0.3)
    axes[0, 0].tick_params(axis='x', rotation=45)
    
    # Weekly transaction volume
    weekly_trans = transactions.groupby(transactions['t_dat'].dt.isocalendar().week).size()
    axes[0, 1].bar(weekly_trans.index, weekly_trans.values, alpha=0.7)
    axes[0, 1].set_title('Weekly Transaction Volume', fontsize=14, fontweight='bold')
    axes[0, 1].set_xlabel('Week Number')
    axes[0, 1].set_ylabel('Number of Transactions')
    axes[0, 1].grid(True, alpha=0.3, axis='y')
    
    # User activity distribution
    user_activity = transactions.groupby('customer_id').size()
    axes[1, 0].hist(user_activity.values, bins=50, edgecolor='black', alpha=0.7)
    axes[1, 0].set_title('User Purchase Activity Distribution', fontsize=14, fontweight='bold')
    axes[1, 0].set_xlabel('Number of Purchases per User')
    axes[1, 0].set_ylabel('Number of Users')
    axes[1, 0].set_yscale('log')
    axes[1, 0].grid(True, alpha=0.3, axis='y')
    
    # Item popularity distribution
    item_popularity = transactions.groupby('article_id').size()
    axes[1, 1].hist(item_popularity.values, bins=50, edgecolor='black', alpha=0.7, color='orange')
    axes[1, 1].set_title('Item Popularity Distribution', fontsize=14, fontweight='bold')
    axes[1, 1].set_xlabel('Number of Purchases per Item')
    axes[1, 1].set_ylabel('Number of Items')
    axes[1, 1].set_yscale('log')
    axes[1, 1].grid(True, alpha=0.3, axis='y')
    
    plt.tight_layout()
    plt.savefig(EDA_CONFIG['OUTPUT_DIR'] / '1_raw_dataset_temporal.png', dpi=300, bbox_inches='tight')
    print(f"‚úì Saved: 1_raw_dataset_temporal.png")
    plt.close()
    
    # 2. Category Analysis
    print("\n Creating category analysis plots...")
    if 'article_id' in articles.columns and 'product_type_name' in articles.columns:
        # Convert article_id to string in articles to match transactions
        articles_for_merge = articles[['article_id', 'product_type_name', 'product_group_name']].copy()
        articles_for_merge['article_id'] = articles_for_merge['article_id'].astype(str)
        
        # Merge transactions with articles
        trans_articles = transactions.merge(articles_for_merge, 
                                           on='article_id', how='left')
        
        fig, axes = plt.subplots(1, 2, figsize=(16, 6))
        
        # Top product types
        top_product_types = trans_articles['product_type_name'].value_counts().head(15)
        axes[0].barh(range(len(top_product_types)), top_product_types.values, alpha=0.7)
        axes[0].set_yticks(range(len(top_product_types)))
        axes[0].set_yticklabels(top_product_types.index, fontsize=9)
        axes[0].set_title('Top 15 Product Types by Sales', fontsize=14, fontweight='bold')
        axes[0].set_xlabel('Number of Transactions')
        axes[0].grid(True, alpha=0.3, axis='x')
        
        # Top product groups
        top_product_groups = trans_articles['product_group_name'].value_counts().head(15)
        axes[1].barh(range(len(top_product_groups)), top_product_groups.values, alpha=0.7, color='green')
        axes[1].set_yticks(range(len(top_product_groups)))
        axes[1].set_yticklabels(top_product_groups.index, fontsize=9)
        axes[1].set_title('Top 15 Product Groups by Sales', fontsize=14, fontweight='bold')
        axes[1].set_xlabel('Number of Transactions')
        axes[1].grid(True, alpha=0.3, axis='x')
        
        plt.tight_layout()
        plt.savefig(EDA_CONFIG['OUTPUT_DIR'] / '2_raw_dataset_categories.png', dpi=300, bbox_inches='tight')
        print(f"‚úì Saved: 2_raw_dataset_categories.png")
        plt.close()
    
    # 3. User Behavior Analysis
    print("\n Creating user behavior analysis...")
    user_stats = transactions.groupby('customer_id').agg({
        'article_id': ['count', 'nunique'],
        't_dat': ['min', 'max']
    }).reset_index()
    user_stats.columns = ['customer_id', 'total_purchases', 'unique_items', 'first_purchase', 'last_purchase']
    user_stats['purchase_span_days'] = (user_stats['last_purchase'] - user_stats['first_purchase']).dt.days
    user_stats['avg_items_per_purchase'] = user_stats['total_purchases'] / user_stats['unique_items']
    
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    
    # Purchase count distribution
    axes[0, 0].hist(user_stats['total_purchases'], bins=50, edgecolor='black', alpha=0.7)
    axes[0, 0].set_title('Distribution of Total Purchases per User', fontsize=14, fontweight='bold')
    axes[0, 0].set_xlabel('Total Purchases')
    axes[0, 0].set_ylabel('Number of Users')
    axes[0, 0].set_yscale('log')
    axes[0, 0].grid(True, alpha=0.3, axis='y')
    
    # Unique items distribution
    axes[0, 1].hist(user_stats['unique_items'], bins=50, edgecolor='black', alpha=0.7, color='orange')
    axes[0, 1].set_title('Distribution of Unique Items per User', fontsize=14, fontweight='bold')
    axes[0, 1].set_xlabel('Unique Items Purchased')
    axes[0, 1].set_ylabel('Number of Users')
    axes[0, 1].set_yscale('log')
    axes[0, 1].grid(True, alpha=0.3, axis='y')
    
    # Purchase span
    axes[1, 0].hist(user_stats['purchase_span_days'], bins=50, edgecolor='black', alpha=0.7, color='green')
    axes[1, 0].set_title('Distribution of Purchase Span (Days)', fontsize=14, fontweight='bold')
    axes[1, 0].set_xlabel('Days Between First and Last Purchase')
    axes[1, 0].set_ylabel('Number of Users')
    axes[1, 0].grid(True, alpha=0.3, axis='y')
    
    # Scatter: Purchases vs Unique Items
    axes[1, 1].scatter(user_stats['total_purchases'], user_stats['unique_items'], 
                       alpha=0.3, s=10, edgecolors='none')
    axes[1, 1].set_title('Purchases vs Unique Items', fontsize=14, fontweight='bold')
    axes[1, 1].set_xlabel('Total Purchases')
    axes[1, 1].set_ylabel('Unique Items')
    axes[1, 1].set_xscale('log')
    axes[1, 1].set_yscale('log')
    axes[1, 1].grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.savefig(EDA_CONFIG['OUTPUT_DIR'] / '3_raw_dataset_user_behavior.png', dpi=300, bbox_inches='tight')
    print(f"‚úì Saved: 3_raw_dataset_user_behavior.png")
    plt.close()
    
    print("\n Raw Dataset EDA Complete!")
    
except Exception as e:
    print(f"‚ö†Ô∏è Error loading raw data: {e}")
    print("   Using processed data instead...")

gc.collect()


In [None]:
# ============================================================================
# EDA 2: TRAINING DATA ANALYSIS
# ============================================================================

print("\n" + "="*80)
print("EDA: TRAINING DATA")
print("="*80)

# Load training data
print("\n Loading training data...")
try:
    train_data = pd.read_parquet(EDA_CONFIG['MODEL_PATH'] / 'train_data.parquet')
    print(f"‚úì Loaded {len(train_data):,} training samples")
    
    # Basic statistics
    print("\n Training Data Statistics:")
    print(f"  Total samples: {len(train_data):,}")
    print(f"  Unique users: {train_data['customer_id'].nunique():,}")
    print(f"  Unique items: {train_data['article_id'].nunique():,}")
    print(f"  Positive samples: {train_data['label'].sum():,} ({100*train_data['label'].mean():.2f}%)")
    print(f"  Negative samples: {(train_data['label']==0).sum():,} ({100*(1-train_data['label'].mean()):.2f}%)")
    
    # 1. Label Distribution
    print("\n Creating label distribution plots...")
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))
    
    # Pie chart
    label_counts = train_data['label'].value_counts()
    axes[0].pie(label_counts.values, labels=['Negative (0)', 'Positive (1)'], 
                autopct='%1.1f%%', startangle=90, colors=['#ff9999', '#66b3ff'])
    axes[0].set_title('Label Distribution (Pie Chart)', fontsize=14, fontweight='bold')
    
    # Bar chart
    axes[1].bar(['Negative (0)', 'Positive (1)'], label_counts.values, 
                color=['#ff9999', '#66b3ff'], alpha=0.7, edgecolor='black')
    axes[1].set_title('Label Distribution (Bar Chart)', fontsize=14, fontweight='bold')
    axes[1].set_ylabel('Number of Samples')
    axes[1].grid(True, alpha=0.3, axis='y')
    for i, v in enumerate(label_counts.values):
        axes[1].text(i, v, f'{v:,}', ha='center', va='bottom', fontweight='bold')
    
    plt.tight_layout()
    plt.savefig(EDA_CONFIG['OUTPUT_DIR'] / '4_train_label_distribution.png', dpi=300, bbox_inches='tight')
    print(f"‚úì Saved: 4_train_label_distribution.png")
    plt.close()
    
    # 2. User Activity in Training Data
    print("\n Creating user activity analysis...")
    user_train_stats = train_data.groupby('customer_id').agg({
        'article_id': 'nunique',
        'label': 'sum'
    }).reset_index()
    user_train_stats.columns = ['customer_id', 'candidate_items', 'positive_items']
    user_train_stats['positive_ratio'] = user_train_stats['positive_items'] / user_train_stats['candidate_items']
    
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    
    # Candidate items per user
    axes[0, 0].hist(user_train_stats['candidate_items'], bins=50, edgecolor='black', alpha=0.7)
    axes[0, 0].set_title('Distribution of Candidate Items per User', fontsize=14, fontweight='bold')
    axes[0, 0].set_xlabel('Number of Candidate Items')
    axes[0, 0].set_ylabel('Number of Users')
    axes[0, 0].grid(True, alpha=0.3, axis='y')
    
    # Positive items per user
    axes[0, 1].hist(user_train_stats['positive_items'], bins=50, edgecolor='black', alpha=0.7, color='green')
    axes[0, 1].set_title('Distribution of Positive Items per User', fontsize=14, fontweight='bold')
    axes[0, 1].set_xlabel('Number of Positive Items')
    axes[0, 1].set_ylabel('Number of Users')
    axes[0, 1].grid(True, alpha=0.3, axis='y')
    
    # Positive ratio distribution
    axes[1, 0].hist(user_train_stats['positive_ratio'], bins=50, edgecolor='black', alpha=0.7, color='orange')
    axes[1, 0].set_title('Distribution of Positive Ratio per User', fontsize=14, fontweight='bold')
    axes[1, 0].set_xlabel('Positive Ratio (Positive / Total Candidates)')
    axes[1, 0].set_ylabel('Number of Users')
    axes[1, 0].grid(True, alpha=0.3, axis='y')
    
    # Scatter: Candidates vs Positives
    axes[1, 1].scatter(user_train_stats['candidate_items'], user_train_stats['positive_items'],
                       alpha=0.3, s=10, edgecolors='none')
    axes[1, 1].set_title('Candidate Items vs Positive Items', fontsize=14, fontweight='bold')
    axes[1, 1].set_xlabel('Number of Candidate Items')
    axes[1, 1].set_ylabel('Number of Positive Items')
    axes[1, 1].grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.savefig(EDA_CONFIG['OUTPUT_DIR'] / '5_train_user_activity.png', dpi=300, bbox_inches='tight')
    print(f"‚úì Saved: 5_train_user_activity.png")
    plt.close()
    
    # 3. Item Popularity in Training Data
    print("\n Creating item popularity analysis...")
    item_train_stats = train_data.groupby('article_id').agg({
        'customer_id': 'nunique',
        'label': 'sum'
    }).reset_index()
    item_train_stats.columns = ['article_id', 'unique_users', 'positive_labels']
    item_train_stats['positive_ratio'] = item_train_stats['positive_labels'] / item_train_stats['unique_users']
    
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    
    # Items per user distribution
    axes[0, 0].hist(item_train_stats['unique_users'], bins=50, edgecolor='black', alpha=0.7)
    axes[0, 0].set_title('Distribution of Users per Item', fontsize=14, fontweight='bold')
    axes[0, 0].set_xlabel('Number of Users')
    axes[0, 0].set_ylabel('Number of Items')
    axes[0, 0].set_yscale('log')
    axes[0, 0].grid(True, alpha=0.3, axis='y')
    
    # Positive labels per item
    axes[0, 1].hist(item_train_stats['positive_labels'], bins=50, edgecolor='black', alpha=0.7, color='green')
    axes[0, 1].set_title('Distribution of Positive Labels per Item', fontsize=14, fontweight='bold')
    axes[0, 1].set_xlabel('Number of Positive Labels')
    axes[0, 1].set_ylabel('Number of Items')
    axes[0, 1].set_yscale('log')
    axes[0, 1].grid(True, alpha=0.3, axis='y')
    
    # Top items by positive labels
    top_items = item_train_stats.nlargest(20, 'positive_labels')
    axes[1, 0].barh(range(len(top_items)), top_items['positive_labels'].values, alpha=0.7)
    axes[1, 0].set_yticks(range(len(top_items)))
    axes[1, 0].set_yticklabels([f"Item {idx}" for idx in top_items['article_id'].values], fontsize=8)
    axes[1, 0].set_title('Top 20 Items by Positive Labels', fontsize=14, fontweight='bold')
    axes[1, 0].set_xlabel('Number of Positive Labels')
    axes[1, 0].grid(True, alpha=0.3, axis='x')
    
    # Scatter: Users vs Positive Labels
    axes[1, 1].scatter(item_train_stats['unique_users'], item_train_stats['positive_labels'],
                       alpha=0.3, s=10, edgecolors='none')
    axes[1, 1].set_title('Users vs Positive Labels per Item', fontsize=14, fontweight='bold')
    axes[1, 1].set_xlabel('Number of Users')
    axes[1, 1].set_ylabel('Number of Positive Labels')
    axes[1, 1].grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.savefig(EDA_CONFIG['OUTPUT_DIR'] / '6_train_item_popularity.png', dpi=300, bbox_inches='tight')
    print(f"‚úì Saved: 6_train_item_popularity.png")
    plt.close()
    
    # 4. Feature Statistics (if available)
    print("\nüìä Creating feature statistics...")
    feature_cols = [col for col in train_data.columns if col not in 
                    ['customer_id', 'article_id', 'label', 'user_type', 'train_label', 'val_label']]
    
    if len(feature_cols) > 0:
        # Select numeric features for analysis
        numeric_features = train_data[feature_cols].select_dtypes(include=[np.number]).columns[:20]  # Top 20
        
        if len(numeric_features) > 0:
            fig, axes = plt.subplots(2, 2, figsize=(16, 12))
            
            # Feature value ranges
            feature_ranges = train_data[numeric_features].describe().loc[['min', 'max']].T
            feature_ranges['range'] = feature_ranges['max'] - feature_ranges['min']
            top_ranges = feature_ranges.nlargest(15, 'range')
            
            axes[0, 0].barh(range(len(top_ranges)), top_ranges['range'].values, alpha=0.7)
            axes[0, 0].set_yticks(range(len(top_ranges)))
            axes[0, 0].set_yticklabels(top_ranges.index, fontsize=8)
            axes[0, 0].set_title('Top 15 Features by Value Range', fontsize=14, fontweight='bold')
            axes[0, 0].set_xlabel('Value Range (Max - Min)')
            axes[0, 0].grid(True, alpha=0.3, axis='x')
            
            # Feature means
            feature_means = train_data[numeric_features].mean().sort_values(ascending=False).head(15)
            axes[0, 1].barh(range(len(feature_means)), feature_means.values, alpha=0.7, color='green')
            axes[0, 1].set_yticks(range(len(feature_means)))
            axes[0, 1].set_yticklabels(feature_means.index, fontsize=8)
            axes[0, 1].set_title('Top 15 Features by Mean Value', fontsize=14, fontweight='bold')
            axes[0, 1].set_xlabel('Mean Value')
            axes[0, 1].grid(True, alpha=0.3, axis='x')
            
            # Feature correlation with label (top correlated)
            correlations = train_data[numeric_features].corrwith(train_data['label']).abs().sort_values(ascending=False).head(15)
            axes[1, 0].barh(range(len(correlations)), correlations.values, alpha=0.7, color='orange')
            axes[1, 0].set_yticks(range(len(correlations)))
            axes[1, 0].set_yticklabels(correlations.index, fontsize=8)
            axes[1, 0].set_title('Top 15 Features Correlated with Label', fontsize=14, fontweight='bold')
            axes[1, 0].set_xlabel('Absolute Correlation with Label')
            axes[1, 0].grid(True, alpha=0.3, axis='x')
            
            # Feature variance
            feature_vars = train_data[numeric_features].var().sort_values(ascending=False).head(15)
            axes[1, 1].barh(range(len(feature_vars)), feature_vars.values, alpha=0.7, color='purple')
            axes[1, 1].set_yticks(range(len(feature_vars)))
            axes[1, 1].set_yticklabels(feature_vars.index, fontsize=8)
            axes[1, 1].set_title('Top 15 Features by Variance', fontsize=14, fontweight='bold')
            axes[1, 1].set_xlabel('Variance')
            axes[1, 1].grid(True, alpha=0.3, axis='x')
            
            plt.tight_layout()
            plt.savefig(EDA_CONFIG['OUTPUT_DIR'] / '7_train_feature_statistics.png', dpi=300, bbox_inches='tight')
            print(f"‚úì Saved: 7_train_feature_statistics.png")
            plt.close()
    
    print("\n Training Data EDA Complete!")
    
except Exception as e:
    print(f" Error loading training data: {e}")

gc.collect()


In [None]:
# ============================================================================
# EDA 3: VALIDATION DATA ANALYSIS
# ============================================================================

print("\n" + "="*80)
print("EDA: VALIDATION DATA")
print("="*80)
# Load validation data
print("\n Loading validation data...")
try:
    val_data = pd.read_parquet(EDA_CONFIG['MODEL_PATH'] / 'val_data.parquet')
    print(f"‚úì Loaded {len(val_data):,} validation samples")
    
    # Basic statistics
    print("\n Validation Data Statistics:")
    print(f"  Total samples: {len(val_data):,}")
    print(f"  Unique users: {val_data['customer_id'].nunique():,}")
    print(f"  Unique items: {val_data['article_id'].nunique():,}")
    print(f"  Positive samples: {val_data['label'].sum():,} ({100*val_data['label'].mean():.2f}%)")
    print(f"  Negative samples: {(val_data['label']==0).sum():,} ({100*(1-val_data['label'].mean()):.2f}%)")
    
    # 1. Label Distribution Comparison
    print("\n Creating train vs validation comparison...")
    try:
        train_data = pd.read_parquet(EDA_CONFIG['MODEL_PATH'] / 'train_data.parquet')
        
        fig, axes = plt.subplots(1, 2, figsize=(16, 6))
        
        # Train label distribution
        train_label_counts = train_data['label'].value_counts()
        axes[0].bar(['Negative (0)', 'Positive (1)'], train_label_counts.values,
                    color=['#ff9999', '#66b3ff'], alpha=0.7, edgecolor='black')
        axes[0].set_title('Training Data Label Distribution', fontsize=14, fontweight='bold')
        axes[0].set_ylabel('Number of Samples')
        axes[0].grid(True, alpha=0.3, axis='y')
        for i, v in enumerate(train_label_counts.values):
            axes[0].text(i, v, f'{v:,}', ha='center', va='bottom', fontweight='bold')
        
        # Validation label distribution
        val_label_counts = val_data['label'].value_counts()
        axes[1].bar(['Negative (0)', 'Positive (1)'], val_label_counts.values,
                    color=['#ff9999', '#66b3ff'], alpha=0.7, edgecolor='black')
        axes[1].set_title('Validation Data Label Distribution', fontsize=14, fontweight='bold')
        axes[1].set_ylabel('Number of Samples')
        axes[1].grid(True, alpha=0.3, axis='y')
        for i, v in enumerate(val_label_counts.values):
            axes[1].text(i, v, f'{v:,}', ha='center', va='bottom', fontweight='bold')
        
        plt.tight_layout()
        plt.savefig(EDA_CONFIG['OUTPUT_DIR'] / '8_train_val_label_comparison.png', dpi=300, bbox_inches='tight')
        print(f"‚úì Saved: 8_train_val_label_comparison.png")
        plt.close()
    except:
        pass
    
    # 2. Validation User Analysis
    print("\n Creating validation user analysis...")
    val_user_stats = val_data.groupby('customer_id').agg({
        'article_id': 'nunique',
        'label': 'sum'
    }).reset_index()
    val_user_stats.columns = ['customer_id', 'candidate_items', 'positive_items']
    val_user_stats['positive_ratio'] = val_user_stats['positive_items'] / val_user_stats['candidate_items']
    
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    
    # Candidate items per user
    axes[0, 0].hist(val_user_stats['candidate_items'], bins=50, edgecolor='black', alpha=0.7, color='purple')
    axes[0, 0].set_title('Distribution of Candidate Items per User (Validation)', fontsize=14, fontweight='bold')
    axes[0, 0].set_xlabel('Number of Candidate Items')
    axes[0, 0].set_ylabel('Number of Users')
    axes[0, 0].grid(True, alpha=0.3, axis='y')
    
    # Positive items per user
    axes[0, 1].hist(val_user_stats['positive_items'], bins=50, edgecolor='black', alpha=0.7, color='teal')
    axes[0, 1].set_title('Distribution of Positive Items per User (Validation)', fontsize=14, fontweight='bold')
    axes[0, 1].set_xlabel('Number of Positive Items')
    axes[0, 1].set_ylabel('Number of Users')
    axes[0, 1].grid(True, alpha=0.3, axis='y')
    
    # Positive ratio distribution
    axes[1, 0].hist(val_user_stats['positive_ratio'], bins=50, edgecolor='black', alpha=0.7, color='coral')
    axes[1, 0].set_title('Distribution of Positive Ratio per User (Validation)', fontsize=14, fontweight='bold')
    axes[1, 0].set_xlabel('Positive Ratio (Positive / Total Candidates)')
    axes[1, 0].set_ylabel('Number of Users')
    axes[1, 0].grid(True, alpha=0.3, axis='y')
    
    # Box plot: Positive items distribution
    axes[1, 1].boxplot([val_user_stats['positive_items'].values], vert=True, patch_artist=True,
                        boxprops=dict(facecolor='lightblue', alpha=0.7))
    axes[1, 1].set_title('Distribution of Positive Items per User (Box Plot)', fontsize=14, fontweight='bold')
    axes[1, 1].set_ylabel('Number of Positive Items')
    axes[1, 1].grid(True, alpha=0.3, axis='y')
    axes[1, 1].set_xticklabels(['Validation Users'])
    
    plt.tight_layout()
    plt.savefig(EDA_CONFIG['OUTPUT_DIR'] / '9_val_user_analysis.png', dpi=300, bbox_inches='tight')
    print(f"‚úì Saved: 9_val_user_analysis.png")
    plt.close()
    
    # 3. Train vs Validation Comparison
    print("\n Creating train vs validation comparison...")
    try:
        train_data = pd.read_parquet(EDA_CONFIG['MODEL_PATH'] / 'train_data.parquet')
        train_user_stats = train_data.groupby('customer_id').agg({
            'article_id': 'nunique',
            'label': 'sum'
        }).reset_index()
        train_user_stats.columns = ['customer_id', 'candidate_items', 'positive_items']
        
        fig, axes = plt.subplots(2, 2, figsize=(16, 12))
        
        # Candidate items comparison
        axes[0, 0].hist([train_user_stats['candidate_items'], val_user_stats['candidate_items']],
                        bins=30, alpha=0.7, label=['Train', 'Validation'], edgecolor='black')
        axes[0, 0].set_title('Candidate Items per User: Train vs Validation', fontsize=14, fontweight='bold')
        axes[0, 0].set_xlabel('Number of Candidate Items')
        axes[0, 0].set_ylabel('Number of Users')
        axes[0, 0].legend()
        axes[0, 0].grid(True, alpha=0.3, axis='y')
        
        # Positive items comparison
        axes[0, 1].hist([train_user_stats['positive_items'], val_user_stats['positive_items']],
                        bins=30, alpha=0.7, label=['Train', 'Validation'], edgecolor='black')
        axes[0, 1].set_title('Positive Items per User: Train vs Validation', fontsize=14, fontweight='bold')
        axes[0, 1].set_xlabel('Number of Positive Items')
        axes[0, 1].set_ylabel('Number of Users')
        axes[0, 1].legend()
        axes[0, 1].grid(True, alpha=0.3, axis='y')
        
        # Summary statistics comparison
        comparison_stats = pd.DataFrame({
            'Train': [
                train_data['label'].mean(),
                train_user_stats['candidate_items'].mean(),
                train_user_stats['positive_items'].mean(),
                len(train_data)
            ],
            'Validation': [
                val_data['label'].mean(),
                val_user_stats['candidate_items'].mean(),
                val_user_stats['positive_items'].mean(),
                len(val_data)
            ]
        }, index=['Positive Ratio', 'Avg Candidates/User', 'Avg Positives/User', 'Total Samples'])
        
        comparison_stats.plot(kind='bar', ax=axes[1, 0], alpha=0.7, edgecolor='black')
        axes[1, 0].set_title('Summary Statistics: Train vs Validation', fontsize=14, fontweight='bold')
        axes[1, 0].set_ylabel('Value')
        axes[1, 0].legend()
        axes[1, 0].grid(True, alpha=0.3, axis='y')
        axes[1, 0].tick_params(axis='x', rotation=45)
        
        # Sample size comparison
        size_comparison = pd.DataFrame({
            'Dataset': ['Train', 'Validation'],
            'Samples': [len(train_data), len(val_data)],
            'Users': [train_data['customer_id'].nunique(), val_data['customer_id'].nunique()],
            'Items': [train_data['article_id'].nunique(), val_data['article_id'].nunique()]
        })
        
        x = np.arange(len(size_comparison))
        width = 0.25
        axes[1, 1].bar(x - width, size_comparison['Samples']/1000, width, label='Samples (K)', alpha=0.7)
        axes[1, 1].bar(x, size_comparison['Users']/1000, width, label='Users (K)', alpha=0.7)
        axes[1, 1].bar(x + width, size_comparison['Items']/1000, width, label='Items (K)', alpha=0.7)
        axes[1, 1].set_title('Dataset Size Comparison', fontsize=14, fontweight='bold')
        axes[1, 1].set_ylabel('Count (Thousands)')
        axes[1, 1].set_xticks(x)
        axes[1, 1].set_xticklabels(size_comparison['Dataset'])
        axes[1, 1].legend()
        axes[1, 1].grid(True, alpha=0.3, axis='y')
        
        plt.tight_layout()
        plt.savefig(EDA_CONFIG['OUTPUT_DIR'] / '10_train_val_comparison.png', dpi=300, bbox_inches='tight')
        print(f"‚úì Saved: 10_train_val_comparison.png")
        plt.close()
    except Exception as e:
        print(f" Could not create comparison: {e}")
    
    print("\n Validation Data EDA Complete!")
    
except Exception as e:
    print(f" Error loading validation data: {e}")

gc.collect()


In [None]:
# ============================================================================
# EDA SUMMARY: ALL GENERATED PLOTS
# ============================================================================

print("\n" + "="*80)
print("EDA SUMMARY: ALL GENERATED PLOTS")
print("="*80)

import os

# List all generated plots
plot_files = sorted([f for f in os.listdir(EDA_CONFIG['OUTPUT_DIR']) if f.endswith('.png')])

print(f"\n Total plots generated: {len(plot_files)}")
print(f"\n Output directory: {EDA_CONFIG['OUTPUT_DIR']}")
print("\n Generated Plots:")

for i, plot_file in enumerate(plot_files, 1):
    file_path = EDA_CONFIG['OUTPUT_DIR'] / plot_file
    file_size = file_path.stat().st_size / 1024  # KB
    print(f"  {i:2d}. {plot_file:40s} ({file_size:.1f} KB)")

print("\n" + "="*80)
print(" EDA COMPLETE!")
print("="*80)
print("\n All plots are saved in high resolution (300 DPI) and ready for presentation!")
print(f"   Location: {EDA_CONFIG['OUTPUT_DIR']}")
print("\n Plot Categories:")
print("   ‚Ä¢ Raw Dataset: Temporal patterns, categories, user behavior")
print("   ‚Ä¢ Training Data: Label distribution, user activity, item popularity, features")
print("   ‚Ä¢ Validation Data: User analysis, train/val comparison")
print("\n All plots use presentation-quality styling with:")
print("   ‚Ä¢ Professional color schemes")
print("   ‚Ä¢ Clear labels and titles")
print("   ‚Ä¢ Grid lines for readability")
print("   ‚Ä¢ High resolution (300 DPI) for printing")


In [32]:
# ============================================================================
# BASELINE MODELS: IMPORTS AND CONFIGURATION
# ============================================================================

import pandas as pd
import numpy as np
from pathlib import Path
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.auto import tqdm
import warnings
import gc

warnings.filterwarnings('ignore')

# Reuse MAP@12 evaluation function
def calculate_map_at_k(y_true, y_pred, k=12):
    """Calculate Mean Average Precision at K (MAP@K)"""
    if len(y_true) == 0:
        return 0.0
    
    y_pred = [pred[:k] for pred in y_pred]
    
    aps = []
    for true_items, pred_items in zip(y_true, y_pred):
        if len(true_items) == 0:
            continue
        
        hits = 0
        precision_sum = 0.0
        
        for i, pred_item in enumerate(pred_items):
            if pred_item in true_items:
                hits += 1
                precision_sum += hits / (i + 1)
        
        if hits > 0:
            # Denominator must be the number of relevant items we evaluate at K,
            # i.e. min(#true_items, K). This keeps AP in [0, 1].
            denom = min(len(true_items), k)
            ap = precision_sum / denom
            aps.append(ap)
    
    return np.mean(aps) if len(aps) > 0 else 0.0

def evaluate_map_at_12(df, predictions):
    """Evaluate MAP@12 on validation set"""
    grouped = df.groupby('customer_id')
    
    y_true = []
    y_pred = []
    
    for customer_id, group in grouped:
        true_items = set(group[group['label'] == 1]['article_id'].values)
        y_true.append(true_items)
        
        customer_df = group.copy()
        customer_df['pred_score'] = predictions[:len(customer_df)]
        customer_df = customer_df.sort_values('pred_score', ascending=False)
        pred_items = customer_df['article_id'].values.tolist()
        y_pred.append(pred_items)
        
        predictions = predictions[len(customer_df):]
    
    return calculate_map_at_k(y_true, y_pred, k=12)

# Configuration
BASELINE_CONFIG = {
    'DATA_PATH': Path('/Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2'),
    'MODEL_PATH': Path('/Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2/models'),
    'RANDOM_STATE': 42
}

print("‚úì Baseline models configuration loaded")
print(f"  Model path: {BASELINE_CONFIG['MODEL_PATH']}")


‚úì Baseline models configuration loaded
  Model path: /Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2/models


In [33]:
# ============================================================================
# LOAD DATA FOR BASELINE MODELS
# ============================================================================

print("\n" + "="*80)
print("LOADING DATA FOR BASELINE MODELS")
print("="*80)

# Load training transactions for building user-item matrix
print("\nüìä Loading training transactions...")
train_transactions = pd.read_parquet(BASELINE_CONFIG['DATA_PATH'] / 'train_transactions.parquet')
print(f"‚úì Loaded {len(train_transactions):,} training transactions")

# Load validation data for evaluation
print("\nüìä Loading validation data...")
val_data = pd.read_parquet(BASELINE_CONFIG['MODEL_PATH'] / 'val_data.parquet')
print(f"‚úì Loaded {len(val_data):,} validation samples")
print(f"  Unique users: {val_data['customer_id'].nunique():,}")
print(f"  Unique items: {val_data['article_id'].nunique():,}")

# Get unique users and items
all_users = sorted(train_transactions['customer_id'].unique().tolist())
all_items = sorted(train_transactions['article_id'].unique().tolist())

print(f"\nüìà Data Statistics:")
print(f"  Training users: {len(all_users):,}")
print(f"  Training items: {len(all_items):,}")
print(f"  Validation users: {val_data['customer_id'].nunique():,}")
print(f"  Validation items: {val_data['article_id'].nunique():,}")

# Create user and item mappings
user_to_idx = {user: idx for idx, user in enumerate(all_users)}
item_to_idx = {item: idx for idx, item in enumerate(all_items)}
idx_to_user = {idx: user for user, idx in user_to_idx.items()}
idx_to_item = {idx: item for item, idx in item_to_idx.items()}

print(f"\n‚úì Created mappings:")
print(f"  User mappings: {len(user_to_idx):,}")
print(f"  Item mappings: {len(item_to_idx):,}")

gc.collect()



LOADING DATA FOR BASELINE MODELS

üìä Loading training transactions...
‚úì Loaded 412,156 training transactions

üìä Loading validation data...
‚úì Loaded 121,164 validation samples
  Unique users: 7,132
  Unique items: 13,925

üìà Data Statistics:
  Training users: 47,543
  Training items: 15,932
  Validation users: 7,132
  Validation items: 13,925

‚úì Created mappings:
  User mappings: 47,543
  Item mappings: 15,932


0

In [34]:
# ============================================================================
# BASELINE 1: POPULARITY-BASED RECOMMENDATION
# ============================================================================

print("\n" + "="*80)
print("BASELINE 1: POPULARITY-BASED RECOMMENDATION")
print("="*80)

# Calculate item popularity from training data
print("\nüìä Calculating item popularity...")
item_popularity = train_transactions.groupby('article_id').size().reset_index(name='popularity')
item_popularity = item_popularity.sort_values('popularity', ascending=False)

print(f"‚úì Calculated popularity for {len(item_popularity):,} items")
print(f"  Most popular item: {item_popularity.iloc[0]['article_id']} ({item_popularity.iloc[0]['popularity']} purchases)")

# Create popularity scores for validation candidates
print("\nüìä Generating popularity-based predictions...")
val_data_pop = val_data.copy()
val_data_pop = val_data_pop.merge(
    item_popularity[['article_id', 'popularity']],
    on='article_id',
    how='left'
)
val_data_pop['pred_score'] = val_data_pop['popularity'].fillna(0)

# Evaluate
map12_popularity = evaluate_map_at_12(val_data_pop, val_data_pop['pred_score'].values)

print(f"\n‚úì Popularity-Based Baseline Results:")
print(f"  MAP@12: {map12_popularity:.6f}")

# Save predictions
pred_path_pop = BASELINE_CONFIG['MODEL_PATH'] / 'baseline_popularity_predictions.parquet'
val_data_pop[['customer_id', 'article_id', 'label', 'pred_score']].to_parquet(pred_path_pop, index=False)
print(f"Saved predictions to {pred_path_pop}")

gc.collect()



BASELINE 1: POPULARITY-BASED RECOMMENDATION

üìä Calculating item popularity...
‚úì Calculated popularity for 15,932 items
  Most popular item: 706016001 (585 purchases)

üìä Generating popularity-based predictions...

‚úì Popularity-Based Baseline Results:
  MAP@12: 0.418288
Saved predictions to /Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2/models/baseline_popularity_predictions.parquet


24

In [None]:
# ============================================================================
# BASELINE 2: USER-BASED COLLABORATIVE FILTERING
# ============================================================================

print("\n" + "="*80)
print("BASELINE 2: USER-BASED COLLABORATIVE FILTERING")
print("="*80)

# Build user-item interaction matrix
print("\n Building user-item interaction matrix...")
train_transactions['user_idx'] = train_transactions['customer_id'].map(user_to_idx)
train_transactions['item_idx'] = train_transactions['article_id'].map(item_to_idx)

# Create sparse matrix (binary: 1 if user purchased item, 0 otherwise)
rows = train_transactions['user_idx'].values
cols = train_transactions['item_idx'].values
data = np.ones(len(train_transactions), dtype=np.float32)

user_item_matrix = csr_matrix((data, (rows, cols)), shape=(len(all_users), len(all_items)))
print(f"‚úì Created user-item matrix: {user_item_matrix.shape}")
print(f"  Sparsity: {(1 - user_item_matrix.nnz / (len(all_users) * len(all_items))) * 100:.2f}%")

# Compute user-user similarity matrix (cosine similarity)
print("\n Computing user-user similarity...")
print("  This may take a few minutes...")
user_similarity = cosine_similarity(user_item_matrix, dense_output=False)
print(f"Computed user similarity matrix: {user_similarity.shape}")

# Generate predictions for validation users
print("\n Generating user-based CF predictions...")
val_users = val_data['customer_id'].unique()
val_predictions = []

for user in tqdm(val_users, desc="Predicting for users"):
    if user not in user_to_idx:
        # Cold-start user: use popularity
        user_predictions = np.zeros(len(val_data[val_data['customer_id'] == user]))
    else:
        user_idx = user_to_idx[user]
        
        # Get similar users (top 50)
        user_sim = user_similarity[user_idx].toarray().flatten()
        similar_users_idx = np.argsort(user_sim)[::-1][1:51]  # Exclude self, top 50
        
        # Get items purchased by similar users
        similar_users_items = user_item_matrix[similar_users_idx].sum(axis=0).A1
        
        # Get user's candidate items
        user_candidates = val_data[val_data['customer_id'] == user]['article_id'].values
        user_candidate_idx = [item_to_idx.get(item, -1) for item in user_candidates]
        
        # Score candidates based on similar users' purchases
        user_scores = []
        for item_idx in user_candidate_idx:
            if item_idx >= 0:
                score = similar_users_items[item_idx]
            else:
                score = 0
            user_scores.append(score)
        
        user_predictions = np.array(user_scores)
    
    val_predictions.extend(user_predictions)

val_data_ubcf = val_data.copy()
val_data_ubcf['pred_score'] = val_predictions

# Evaluate
map12_ubcf = evaluate_map_at_12(val_data_ubcf, val_data_ubcf['pred_score'].values)

print(f"\n User-Based CF Results:")
print(f"  MAP@12: {map12_ubcf:.6f}")

# Save predictions
pred_path_ubcf = BASELINE_CONFIG['MODEL_PATH'] / 'baseline_ubcf_predictions.parquet'
val_data_ubcf[['customer_id', 'article_id', 'label', 'pred_score']].to_parquet(pred_path_ubcf, index=False)
print(f" Saved predictions to {pred_path_ubcf}")

gc.collect()


In [None]:
# ============================================================================
# BASELINE 3: ITEM-BASED COLLABORATIVE FILTERING
# ============================================================================

print("\n" + "="*80)
print("BASELINE 3: ITEM-BASED COLLABORATIVE FILTERING")
print("="*80)

# Compute item-item similarity matrix (cosine similarity on transposed matrix)
print("\n Computing item-item similarity...")
print("  This may take a few minutes...")
item_item_matrix = user_item_matrix.T  # Transpose: items x users
item_similarity = cosine_similarity(item_item_matrix, dense_output=False)
print(f"‚úì Computed item similarity matrix: {item_similarity.shape}")

# Generate predictions for validation users
print("\n Generating item-based CF predictions...")
val_users = val_data['customer_id'].unique()
val_predictions = []

for user in tqdm(val_users, desc="Predicting for users"):
    if user not in user_to_idx:
        # Cold-start user: use popularity
        user_predictions = np.zeros(len(val_data[val_data['customer_id'] == user]))
    else:
        user_idx = user_to_idx[user]
        
        # Get user's purchased items
        user_items = user_item_matrix[user_idx].nonzero()[1]  # Items user purchased
        
        if len(user_items) == 0:
            # User with no purchases: use popularity
            user_predictions = np.zeros(len(val_data[val_data['customer_id'] == user]))
        else:
            # Get user's candidate items
            user_candidates = val_data[val_data['customer_id'] == user]['article_id'].values
            user_candidate_idx = [item_to_idx.get(item, -1) for item in user_candidates]
            
            # Score candidates based on similarity to user's purchased items
            user_scores = []
            for candidate_idx in user_candidate_idx:
                if candidate_idx >= 0:
                    # Average similarity to all items user purchased
                    similarities = item_similarity[candidate_idx, user_items].toarray().flatten()
                    score = similarities.mean() if len(similarities) > 0 else 0
                else:
                    score = 0
                user_scores.append(score)
            
            user_predictions = np.array(user_scores)
    
    val_predictions.extend(user_predictions)

val_data_ibcf = val_data.copy()
val_data_ibcf['pred_score'] = val_predictions

# Evaluate
map12_ibcf = evaluate_map_at_12(val_data_ibcf, val_data_ibcf['pred_score'].values)

print(f"\n Item-Based CF Results:")
print(f"  MAP@12: {map12_ibcf:.6f}")

# Save predictions
pred_path_ibcf = BASELINE_CONFIG['MODEL_PATH'] / 'baseline_ibcf_predictions.parquet'
val_data_ibcf[['customer_id', 'article_id', 'label', 'pred_score']].to_parquet(pred_path_ibcf, index=False)
print(f" Saved predictions to {pred_path_ibcf}")

gc.collect()


In [None]:
# ============================================================================
# BASELINE 4: MATRIX FACTORIZATION (SVD)
# ============================================================================

print("\n" + "="*80)
print("BASELINE 4: MATRIX FACTORIZATION (SVD)")
print("="*80)

# Apply SVD for dimensionality reduction
print("\n Applying Truncated SVD...")
n_components = 50  # Number of latent factors
svd = TruncatedSVD(n_components=n_components, random_state=BASELINE_CONFIG['RANDOM_STATE'])
user_factors = svd.fit_transform(user_item_matrix)
item_factors = svd.components_.T  # Transpose to get items x factors

print(f"‚úì SVD decomposition complete")
print(f"  User factors shape: {user_factors.shape}")
print(f"  Item factors shape: {item_factors.shape}")
print(f"  Explained variance: {svd.explained_variance_ratio_.sum():.4f}")

# Generate predictions for validation users
print("\n Generating SVD-based predictions...")
val_users = val_data['customer_id'].unique()
val_predictions = []

for user in tqdm(val_users, desc="Predicting for users"):
    if user not in user_to_idx:
        # Cold-start user: use popularity
        user_predictions = np.zeros(len(val_data[val_data['customer_id'] == user]))
    else:
        user_idx = user_to_idx[user]
        user_vector = user_factors[user_idx]  # User's latent factors
        
        # Get user's candidate items
        user_candidates = val_data[val_data['customer_id'] == user]['article_id'].values
        user_candidate_idx = [item_to_idx.get(item, -1) for item in user_candidates]
        
        # Score candidates using dot product
        user_scores = []
        for candidate_idx in user_candidate_idx:
            if candidate_idx >= 0:
                item_vector = item_factors[candidate_idx]  # Item's latent factors
                score = np.dot(user_vector, item_vector)  # Dot product = predicted rating
            else:
                score = 0
            user_scores.append(score)
        
        user_predictions = np.array(user_scores)
    
    val_predictions.extend(user_predictions)

val_data_svd = val_data.copy()
val_data_svd['pred_score'] = val_predictions

# Evaluate
map12_svd = evaluate_map_at_12(val_data_svd, val_data_svd['pred_score'].values)

print(f"\n SVD Results:")
print(f"  MAP@12: {map12_svd:.6f}")

# Save predictions
pred_path_svd = BASELINE_CONFIG['MODEL_PATH'] / 'baseline_svd_predictions.parquet'
val_data_svd[['customer_id', 'article_id', 'label', 'pred_score']].to_parquet(pred_path_svd, index=False)
print(f" Saved predictions to {pred_path_svd}")

gc.collect()


In [None]:
# ============================================================================
# CF EVALUATION ON TEST DATA
# ============================================================================

from pathlib import Path

print("\n" + "="*80)
print("CF EVALUATION ON TEST DATA")
print("="*80)

# We expect test_data.parquet to be created by the Neural Tower / model training pipeline
# at the same MODEL_PATH used for val_data

test_data_path = EnsembleConfig.MODEL_PATH / 'test_data.parquet'
if not test_data_path.exists():
    print(f"\n‚ö†Ô∏è  test_data.parquet not found at {test_data_path}")
    print("   Skipping CF evaluation on test set. Run the data-splitting step to create it.")
else:
    print(f"\nüìä Loading test data from {test_data_path}...")
    test_data = pd.read_parquet(test_data_path)
    print(f"‚úì Loaded {len(test_data):,} test samples")

    # Generate CF predictions on test_data (same procedure as for val_data)
    print("\nüìä Generating CF predictions on test data...")
    cf_test_predictions = {}
    test_grouped = test_data.groupby('customer_id')

    for model_name in ['svd', 'svdpp', 'als', 'nmf', 'user_cf', 'item_cf']:
        if model_name not in cf_models or cf_models[model_name] is None:
            print(f"\n‚ö†Ô∏è  Skipping {model_name} (model not available)")
            continue

        print(f"\n  Generating {model_name} predictions on test set...")
        predictions = []
        for user_id, group in tqdm(test_grouped, desc=f'{model_name} (test)', leave=False):
            user_predictions = []
            for _, row in group.iterrows():
                item_id = row['article_id']

                if model_name == 'svd':
                    pred = predict_svd(user_id, item_id, cf_models['svd'])
                elif model_name == 'svdpp':
                    if HAS_SURPRISE and cf_models['svdpp'] is not None:
                        try:
                            pred = cf_models['svdpp'].predict(str(user_id), str(item_id)).est
                        except Exception:
                            pred = 0.0
                    else:
                        # Fallback: use SVD-style prediction
                        pred = predict_svd(user_id, item_id, cf_models['svd'])
                elif model_name == 'als':
                    pred = predict_als(user_id, item_id, cf_models['als'])
                elif model_name == 'nmf':
                    pred = predict_nmf(user_id, item_id, cf_models['nmf'])
                elif model_name == 'user_cf':
                    pred = predict_user_cf(user_id, item_id, cf_models['user_cf'])
                elif model_name == 'item_cf':
                    pred = predict_item_cf(user_id, item_id, cf_models['item_cf'])
                else:
                    pred = 0.0

                user_predictions.append(pred)

            predictions.extend(user_predictions)

        cf_test_predictions[model_name] = np.array(predictions)

        # Evaluate MAP@12 on test_data
        map12_score = evaluate_map_at_12(test_data, cf_test_predictions[model_name])
        print(f"    Test MAP@12: {map12_score:.6f}")

    print("\n‚úÖ CF test evaluation complete.")



### Stage 1: Loading Dataset

In [68]:
import pandas as pd
import numpy as np
import gc
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

In [69]:
class Config:
    # Paths (adjust based on your Kaggle dataset location)
    DATA_PATH = Path('/Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/h-and-m-personalized-fashion-recommendations')
    OUTPUT_PATH = Path('/Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_2')
    
     # Temporal configuration
    N_TRAIN_WEEKS = 11  # Number of weeks for training
    N_VAL_WEEKS = 1     # Validation week
    TOTAL_WEEKS = 24    # Total weeks to consider (16-24 range)
    
    # User sampling configuration
    TARGET_USERS = 50000  # Target number of users to sample
    MIN_USER_PURCHASES = 1  # Minimum purchases across all weeks for a user
    
    # Cold start configuration
    INCLUDE_COLD_START = True  # Include users with limited history
    COLD_START_RATIO = 0.15  # 15% of sampled users will be cold start
    COLD_START_MAX_PURCHASES = 1  # Users with <= this many purchases are "cold start"
    
    # Stratification configuration
    STRATIFY_BY_ACTIVITY = True  # Stratify users by activity level
    ACTIVITY_BINS = [0, 5, 10, 20, 50, np.inf]  # Purchase count bins
    ACTIVITY_LABELS = ['low', 'medium', 'high', 'very_high', 'extreme']
    
    # Item filtering
    MIN_ITEM_PURCHASES = 5  # Minimum purchases for an item to be included
    
    # Memory optimization
    CHUNK_SIZE = 500_000  # Process transactions in chunks
    
    # Random seed
    RANDOM_STATE = 42

config = Config()
config.OUTPUT_PATH.mkdir(exist_ok=True)
np.random.seed(config.RANDOM_STATE)

In [70]:
# ============================================================================
# UTILITY FUNCTIONS
# ============================================================================

def reduce_mem_usage(df, verbose=True):
    """
    Reduce memory usage of a dataframe by optimizing dtypes
    """
    start_mem = df.memory_usage().sum() / 1024**2
    
    for col in df.columns:
        col_type = df[col].dtype
        
        # Skip datetime and object columns
        if col_type == object or pd.api.types.is_datetime64_any_dtype(df[col]):
            continue
            
        c_min = df[col].min()
        c_max = df[col].max()
        
        if str(col_type)[:3] == 'int':
            if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                df[col] = df[col].astype(np.int8)
            elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                df[col] = df[col].astype(np.int16)
            elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                df[col] = df[col].astype(np.int32)
            elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                df[col] = df[col].astype(np.int64)
        else:
            if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
    
    end_mem = df.memory_usage().sum() / 1024**2
    
    if verbose:
        print(f'Memory usage decreased from {start_mem:.2f} MB to {end_mem:.2f} MB '
              f'({100 * (start_mem - end_mem) / start_mem:.1f}% reduction)')
    
    return df

def print_section(title):
    """Pretty print section headers"""
    print("\n" + "="*80)
    print(f"  {title}")
    print("="*80)

In [71]:
# ============================================================================
# STEP 1: LOAD AND EXPLORE DATA
# ============================================================================

print_section("STEP 1: LOADING DATA")

# Load transactions
print("Loading transactions...")
transactions = pd.read_csv(
    config.DATA_PATH / 'transactions_train.csv',
    dtype={
        'article_id': 'int32',
        'price': 'float32',
        'sales_channel_id': 'int8'
    },
    parse_dates=['t_dat']
)

print(f"‚úì Loaded {len(transactions):,} transactions")
print(f"  Date range: {transactions['t_dat'].min()} to {transactions['t_dat'].max()}")
print(f"  Unique customers: {transactions['customer_id'].nunique():,}")
print(f"  Unique articles: {transactions['article_id'].nunique():,}")

# Load customers
print("\nLoading customers...")
customers = pd.read_csv(
    config.DATA_PATH / 'customers.csv',
    dtype={
        'FN': 'float32',
        'Active': 'float32',
        'age': 'float32'
    }
)
customers = reduce_mem_usage(customers, verbose=False)
print(f"‚úì Loaded {len(customers):,} customers")

# Load articles
print("\nLoading articles...")
articles = pd.read_csv(
    config.DATA_PATH / 'articles.csv',
    dtype={'article_id': 'int32'}
)
articles = reduce_mem_usage(articles, verbose=False)
print(f"‚úì Loaded {len(articles):,} articles")


  STEP 1: LOADING DATA
Loading transactions...
‚úì Loaded 31,788,324 transactions
  Date range: 2018-09-20 00:00:00 to 2020-09-22 00:00:00
  Unique customers: 1,362,281
  Unique articles: 104,547

Loading customers...
‚úì Loaded 1,371,980 customers

Loading articles...
‚úì Loaded 105,542 articles


In [72]:
# ============================================================================
# STEP 2: TEMPORAL WINDOW SELECTION
# ============================================================================

print_section("STEP 2: SELECTING TEMPORAL WINDOW")

# Get the last date in transactions
max_date = transactions['t_dat'].max()
print(f"Last transaction date: {max_date}")

# Calculate cutoff dates for the full window
window_start = max_date - timedelta(weeks=config.TOTAL_WEEKS)
print(f"\nUsing {config.TOTAL_WEEKS} weeks of data for sampling")
print(f"Window: {window_start.date()} to {max_date.date()}")

# Filter transactions to our window
print(f"\nFiltering transactions from {window_start.date()} onwards...")
transactions = transactions[transactions['t_dat'] >= window_start].copy()
print(f"‚úì Retained {len(transactions):,} transactions ({len(transactions)/1e6:.2f}M)")

# Add week number (relative to window start)
transactions['week'] = ((transactions['t_dat'] - window_start).dt.days // 7).astype(np.int8)
print(f"  Week range: {transactions['week'].min()} to {transactions['week'].max()}")


  STEP 2: SELECTING TEMPORAL WINDOW
Last transaction date: 2020-09-22 00:00:00

Using 24 weeks of data for sampling
Window: 2020-04-07 to 2020-09-22

Filtering transactions from 2020-04-07 onwards...
‚úì Retained 7,561,154 transactions (7.56M)
  Week range: 0 to 24


In [73]:
# ============================================================================
# STEP 3: USER-BASED STRATIFIED SAMPLING
# ============================================================================

print_section("STEP 3: USER-BASED STRATIFIED SAMPLING")

# Calculate user activity across all weeks
print("Calculating user activity metrics...")
user_activity = transactions.groupby('customer_id').agg({
    'article_id': 'count',  # Total purchases
    'week': ['min', 'max', 'nunique']  # Week span and diversity
}).reset_index()

user_activity.columns = ['customer_id', 'total_purchases', 'first_week', 'last_week', 'active_weeks']
user_activity['week_span'] = user_activity['last_week'] - user_activity['first_week'] + 1

print(f"Total users in window: {len(user_activity):,}")
print(f"  Avg purchases per user: {user_activity['total_purchases'].mean():.2f}")
print(f"  Avg active weeks per user: {user_activity['active_weeks'].mean():.2f}")

# Separate cold start and regular users
if config.INCLUDE_COLD_START:
    cold_start_users = user_activity[
        user_activity['total_purchases'] <= config.COLD_START_MAX_PURCHASES
    ].copy()
    regular_users = user_activity[
        user_activity['total_purchases'] >= config.MIN_USER_PURCHASES
    ].copy()
    
    print(f"\nUser segments:")
    print(f"  Cold start users (‚â§{config.COLD_START_MAX_PURCHASES} purchases): {len(cold_start_users):,}")
    print(f"  Regular users (‚â•{config.MIN_USER_PURCHASES} purchases): {len(regular_users):,}")
    
    # Calculate target counts
    n_cold_start_target = int(config.TARGET_USERS * config.COLD_START_RATIO)
    n_regular_target = config.TARGET_USERS - n_cold_start_target
    
    print(f"\nSampling targets:")
    print(f"  Cold start: {n_cold_start_target:,} users ({config.COLD_START_RATIO*100:.1f}%)")
    print(f"  Regular: {n_regular_target:,} users ({(1-config.COLD_START_RATIO)*100:.1f}%)")
    
else:
    # Filter users with minimum activity
    regular_users = user_activity[
        user_activity['total_purchases'] >= config.MIN_USER_PURCHASES
    ].copy()
    cold_start_users = pd.DataFrame()
    n_cold_start_target = 0
    n_regular_target = config.TARGET_USERS
    
    print(f"\nUsers with >= {config.MIN_USER_PURCHASES} purchases: {len(regular_users):,}")

# Sample cold start users (if enabled)
sampled_cold_start = []
if config.INCLUDE_COLD_START and len(cold_start_users) > 0:
    n_cold_sample = min(n_cold_start_target, len(cold_start_users))
    sampled_cold_start = cold_start_users['customer_id'].sample(
        n=n_cold_sample, 
        random_state=config.RANDOM_STATE
    ).tolist()
    print(f"\n‚úì Sampled {len(sampled_cold_start):,} cold start users")

# Sample regular users with stratification
if config.STRATIFY_BY_ACTIVITY and len(regular_users) > 0:
    regular_users['activity_level'] = pd.cut(
        regular_users['total_purchases'],
        bins=config.ACTIVITY_BINS,
        labels=config.ACTIVITY_LABELS
    )
    
    print("\nRegular user activity distribution:")
    activity_dist = regular_users['activity_level'].value_counts().sort_index()
    for level, count in activity_dist.items():
        print(f"  {level}: {count:,} users ({100*count/len(regular_users):.1f}%)")
    
    # Stratified sampling for regular users
    print(f"\nPerforming stratified sampling to get {n_regular_target:,} regular users...")
    
    # Calculate samples per stratum (proportional)
    samples_per_stratum = (activity_dist / activity_dist.sum() * n_regular_target).round().astype(int)
    
    # Adjust for rounding errors
    diff = n_regular_target - samples_per_stratum.sum()
    if diff != 0:
        largest_stratum = samples_per_stratum.idxmax()
        samples_per_stratum[largest_stratum] += diff
    
    print("\nSamples per activity level:")
    for level, n_samples in samples_per_stratum.items():
        print(f"  {level}: {n_samples:,} users")
    
    # Sample from each stratum
    sampled_regular = []
    for level in config.ACTIVITY_LABELS:
        stratum_users = regular_users[regular_users['activity_level'] == level]['customer_id']
        n_sample = min(samples_per_stratum[level], len(stratum_users))
        if n_sample > 0:
            sampled = stratum_users.sample(n=n_sample, random_state=config.RANDOM_STATE)
            sampled_regular.extend(sampled.tolist())

else:
    # Simple random sampling for regular users
    print(f"\nPerforming random sampling to get {n_regular_target:,} regular users...")
    n_sample = min(n_regular_target, len(regular_users))
    sampled_regular = regular_users['customer_id'].sample(
        n=n_sample, 
        random_state=config.RANDOM_STATE
    ).tolist()

# Combine both groups
selected_users = set(sampled_cold_start + sampled_regular)

print(f"\n‚úì Total selected users: {len(selected_users):,}")
if config.INCLUDE_COLD_START:
    print(f"  - Cold start: {len(sampled_cold_start):,} ({100*len(sampled_cold_start)/len(selected_users):.1f}%)")
    print(f"  - Regular: {len(sampled_regular):,} ({100*len(sampled_regular)/len(selected_users):.1f}%)")

# Verify sampling quality
sampled_activity = user_activity[user_activity['customer_id'].isin(selected_users)]
print(f"\nSampled users statistics:")
print(f"  Avg purchases: {sampled_activity['total_purchases'].mean():.2f}")
print(f"  Median purchases: {sampled_activity['total_purchases'].median():.2f}")
print(f"  Min purchases: {sampled_activity['total_purchases'].min():.0f}")
print(f"  Max purchases: {sampled_activity['total_purchases'].max():.0f}")
print(f"  Avg active weeks: {sampled_activity['active_weeks'].mean():.2f}")
print(f"  Purchases std: {sampled_activity['total_purchases'].std():.2f}")


  STEP 3: USER-BASED STRATIFIED SAMPLING
Calculating user activity metrics...
Total users in window: 719,806
  Avg purchases per user: 10.50
  Avg active weeks per user: 2.61

User segments:
  Cold start users (‚â§1 purchases): 72,280
  Regular users (‚â•1 purchases): 719,806

Sampling targets:
  Cold start: 7,500 users (15.0%)
  Regular: 42,500 users (85.0%)

‚úì Sampled 7,500 cold start users

Regular user activity distribution:
  low: 331,386 users (46.0%)
  medium: 161,171 users (22.4%)
  high: 133,525 users (18.6%)
  very_high: 79,738 users (11.1%)
  extreme: 13,986 users (1.9%)

Performing stratified sampling to get 42,500 regular users...

Samples per activity level:
  low: 19,566 users
  medium: 9,516 users
  high: 7,884 users
  very_high: 4,708 users
  extreme: 826 users

‚úì Total selected users: 49,576
  - Cold start: 7,500 (15.1%)
  - Regular: 42,500 (85.7%)

Sampled users statistics:
  Avg purchases: 9.14
  Median purchases: 5.00
  Min purchases: 1
  Max purchases: 342
  

In [74]:
# ============================================================================
# STEP 4: FILTER TRANSACTIONS TO SAMPLED USERS
# ============================================================================

print_section("STEP 4: FILTERING TRANSACTIONS TO SAMPLED USERS")

# Filter transactions
transactions = transactions[transactions['customer_id'].isin(selected_users)].copy()
print(f"‚úì Retained {len(transactions):,} transactions")
print(f"  Reduction: {100 * (1 - len(transactions) / len(transactions)):.1f}% (based on sampled users)")

# Now create train/val split
val_end_date = max_date
val_start_date = val_end_date - timedelta(weeks=config.N_VAL_WEEKS)
train_end_date = val_start_date - timedelta(days=1)
train_start_date = train_end_date - timedelta(weeks=config.N_TRAIN_WEEKS)

print(f"\nTemporal splits (from {config.TOTAL_WEEKS} week window):")
print(f"  Training:   {train_start_date.date()} to {train_end_date.date()} ({config.N_TRAIN_WEEKS} weeks)")
print(f"  Validation: {val_start_date.date()} to {val_end_date.date()} ({config.N_VAL_WEEKS} week)")

# Split transactions
train_transactions = transactions[transactions['t_dat'] <= train_end_date].copy()
val_transactions = transactions[transactions['t_dat'] > train_end_date].copy()

print(f"\nDataset split:")
print(f"  Training transactions: {len(train_transactions):,}")
print(f"  Validation transactions: {len(val_transactions):,}")

# Check how many sampled users appear in validation
val_users = set(val_transactions['customer_id'].unique())
print(f"  Users in validation: {len(val_users):,} ({100*len(val_users)/len(selected_users):.1f}% of sampled)")

del transactions
gc.collect()


  STEP 4: FILTERING TRANSACTIONS TO SAMPLED USERS
‚úì Retained 453,143 transactions
  Reduction: 0.0% (based on sampled users)

Temporal splits (from 24 week window):
  Training:   2020-06-29 to 2020-09-14 (11 weeks)
  Validation: 2020-09-15 to 2020-09-22 (1 week)

Dataset split:
  Training transactions: 436,663
  Validation transactions: 16,480
  Users in validation: 4,943 (10.0% of sampled)


451

In [75]:
# ============================================================================
# STEP 5: ITEM FILTERING
# ============================================================================

print_section("STEP 5: ITEM FILTERING")

# Count purchases per item in training window
item_counts = train_transactions['article_id'].value_counts()
print(f"Unique items in training: {len(item_counts):,}")

# Keep items with minimum purchases
valid_items = set(item_counts[item_counts >= config.MIN_ITEM_PURCHASES].index)
print(f"Items with >= {config.MIN_ITEM_PURCHASES} purchases: {len(valid_items):,}")

# Also include all items from validation (even if rare in training)
val_items = set(val_transactions['article_id'].unique())
print(f"Items in validation: {len(val_items):,}")

# Combine
selected_items = valid_items.union(val_items)
print(f"\nTotal selected items: {len(selected_items):,}")

# Filter transactions
train_transactions = train_transactions[train_transactions['article_id'].isin(selected_items)].copy()
val_transactions = val_transactions[val_transactions['article_id'].isin(selected_items)].copy()

print(f"\nAfter item filtering:")
print(f"  Training transactions: {len(train_transactions):,}")
print(f"  Validation transactions: {len(val_transactions):,}")

# Filter articles and customers tables
articles = articles[articles['article_id'].isin(selected_items)].copy()
customers = customers[customers['customer_id'].isin(selected_users)].copy()

print(f"  Articles retained: {len(articles):,}")
print(f"  Customers retained: {len(customers):,}")


  STEP 5: ITEM FILTERING
Unique items in training: 28,951
Items with >= 5 purchases: 14,851
Items in validation: 5,730

Total selected items: 16,616

After item filtering:
  Training transactions: 412,156
  Validation transactions: 16,480
  Articles retained: 16,616
  Customers retained: 49,576


In [76]:
# ============================================================================
# STEP 6: DATA TYPE OPTIMIZATION
# ============================================================================

print_section("STEP 6: MEMORY OPTIMIZATION")

print("Before optimization:")
print(f"  train_transactions: {train_transactions.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
print(f"  val_transactions: {val_transactions.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
print(f"  customers: {customers.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
print(f"  articles: {articles.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# Optimize transactions
train_transactions = reduce_mem_usage(train_transactions)
val_transactions = reduce_mem_usage(val_transactions)

# Convert categorical columns
for col in ['product_code', 'product_type_no', 'graphical_appearance_no', 
            'colour_group_code', 'perceived_colour_value_id', 'perceived_colour_master_id',
            'department_no', 'index_code', 'index_group_no', 'section_no', 'garment_group_no']:
    if col in articles.columns:
        articles[col] = articles[col].astype('category')

# Optimize customer categoricals
for col in ['club_member_status', 'fashion_news_frequency', 'postal_code']:
    if col in customers.columns:
        customers[col] = customers[col].astype('category')

print("\nAfter optimization:")
print(f"  train_transactions: {train_transactions.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
print(f"  val_transactions: {val_transactions.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
print(f"  customers: {customers.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
print(f"  articles: {articles.memory_usage(deep=True).sum() / 1024**2:.2f} MB")


  STEP 6: MEMORY OPTIMIZATION
Before optimization:
  train_transactions: 57.78 MB
  val_transactions: 2.31 MB
  customers: 18.34 MB
  articles: 17.70 MB
Memory usage decreased from 13.36 MB to 13.36 MB (0.0% reduction)
Memory usage decreased from 0.53 MB to 0.53 MB (0.0% reduction)

After optimization:
  train_transactions: 57.78 MB
  val_transactions: 2.31 MB
  customers: 12.94 MB
  articles: 16.92 MB


In [77]:
# ============================================================================
# STEP 7: DATA VALIDATION & EDA
# ============================================================================

print_section("STEP 7: DATA VALIDATION & EDA")

# Validation checks
print("Data validation:")
print(f"  ‚úì No null customer_ids in train: {train_transactions['customer_id'].isnull().sum() == 0}")
print(f"  ‚úì No null article_ids in train: {train_transactions['article_id'].isnull().sum() == 0}")

# Weekly distribution
print("\nWeekly activity distribution (sampled users):")
weekly_users = train_transactions.groupby('week')['customer_id'].nunique()
for week, n_users in weekly_users.items():
    print(f"  Week {week}: {n_users:,} active users")

# Purchase distribution
print("\nPurchase distribution in validation week:")
if len(val_transactions) > 0:
    val_user_purchases = val_transactions.groupby('customer_id').size()
    print(f"  Mean purchases per user: {val_user_purchases.mean():.2f}")
    print(f"  Median purchases per user: {val_user_purchases.median():.0f}")
    print(f"  Users with 1 purchase: {(val_user_purchases == 1).sum():,}")
    print(f"  Users with 2-5 purchases: {((val_user_purchases >= 2) & (val_user_purchases <= 5)).sum():,}")
    print(f"  Users with 6+ purchases: {(val_user_purchases >= 6).sum():,}")
else:
    print("  ‚ö†Ô∏è No validation transactions for sampled users")


  STEP 7: DATA VALIDATION & EDA
Data validation:
  ‚úì No null customer_ids in train: True
  ‚úì No null article_ids in train: True

Weekly activity distribution (sampled users):
  Week 0: 4,926 active users
  Week 1: 3,240 active users
  Week 2: 3,976 active users
  Week 3: 3,284 active users
  Week 4: 3,771 active users
  Week 5: 4,157 active users
  Week 6: 5,213 active users
  Week 7: 5,419 active users
  Week 8: 4,900 active users
  Week 9: 4,432 active users
  Week 10: 7,090 active users
  Week 11: 7,371 active users
  Week 12: 5,377 active users
  Week 13: 4,692 active users
  Week 14: 4,717 active users
  Week 15: 4,910 active users
  Week 16: 4,985 active users
  Week 17: 4,756 active users
  Week 18: 4,366 active users
  Week 19: 4,289 active users
  Week 20: 5,006 active users
  Week 21: 4,634 active users
  Week 22: 4,647 active users

Purchase distribution in validation week:
  Mean purchases per user: 3.33
  Median purchases per user: 2
  Users with 1 purchase: 1,689
  U

In [78]:
# ============================================================================
# STEP 8: CREATE VALIDATION GROUND TRUTH
# ============================================================================

print_section("STEP 8: CREATING VALIDATION GROUND TRUTH")

# Create validation ground truth
if len(val_transactions) > 0:
    val_ground_truth = (
        val_transactions
        .groupby('customer_id')['article_id']
        .apply(list)
        .reset_index()
        .rename(columns={'article_id': 'purchased_articles'})
    )
    
    print(f"Validation ground truth:")
    print(f"  Users: {len(val_ground_truth):,}")
    print(f"  Total purchases: {val_ground_truth['purchased_articles'].apply(len).sum():,}")
    print(f"  Avg purchases per user: {val_ground_truth['purchased_articles'].apply(len).mean():.2f}")
else:
    val_ground_truth = pd.DataFrame(columns=['customer_id', 'purchased_articles'])
    print("‚ö†Ô∏è Empty validation ground truth")


  STEP 8: CREATING VALIDATION GROUND TRUTH
Validation ground truth:
  Users: 4,943
  Total purchases: 16,480
  Avg purchases per user: 3.33


In [79]:
# ============================================================================
# STEP 9: SAVE PROCESSED DATA
# ============================================================================

print_section("STEP 9: SAVING PROCESSED DATA")

# Save to parquet
print("Saving files...")

train_transactions.to_parquet(config.OUTPUT_PATH / 'train_transactions.parquet', index=False)
print(f"  ‚úì train_transactions.parquet ({len(train_transactions):,} rows)")

val_transactions.to_parquet(config.OUTPUT_PATH / 'val_transactions.parquet', index=False)
print(f"  ‚úì val_transactions.parquet ({len(val_transactions):,} rows)")

customers.to_parquet(config.OUTPUT_PATH / 'customers.parquet', index=False)
print(f"  ‚úì customers.parquet ({len(customers):,} rows)")

articles.to_parquet(config.OUTPUT_PATH / 'articles.parquet', index=False)
print(f"  ‚úì articles.parquet ({len(articles):,} rows)")

val_ground_truth.to_parquet(config.OUTPUT_PATH / 'val_ground_truth.parquet', index=False)
print(f"  ‚úì val_ground_truth.parquet ({len(val_ground_truth):,} rows)")

# Save user activity for analysis
sampled_activity.to_parquet(config.OUTPUT_PATH / 'user_activity_stats.parquet', index=False)
print(f"  ‚úì user_activity_stats.parquet ({len(sampled_activity):,} rows)")

# Save metadata
metadata = {
    'total_weeks': config.TOTAL_WEEKS,
    'train_weeks': config.N_TRAIN_WEEKS,
    'val_weeks': config.N_VAL_WEEKS,
    'train_start_date': str(train_start_date.date()),
    'train_end_date': str(train_end_date.date()),
    'val_start_date': str(val_start_date.date()),
    'val_end_date': str(val_end_date.date()),
    'target_users': config.TARGET_USERS,
    'actual_users': len(selected_users),
    'users_in_validation': len(val_users),
    'cold_start_users': len(sampled_cold_start) if config.INCLUDE_COLD_START else 0,
    'regular_users': len(sampled_regular) if config.INCLUDE_COLD_START else len(selected_users),
    'cold_start_ratio': config.COLD_START_RATIO if config.INCLUDE_COLD_START else 0,
    'n_items': len(selected_items),
    'n_train_transactions': len(train_transactions),
    'n_val_transactions': len(val_transactions),
    'stratified': config.STRATIFY_BY_ACTIVITY,
    'min_user_purchases': config.MIN_USER_PURCHASES,
}

import json
with open(config.OUTPUT_PATH / 'metadata.json', 'w') as f:
    json.dump(metadata, f, indent=2)
print(f"  ‚úì metadata.json")


  STEP 9: SAVING PROCESSED DATA
Saving files...
  ‚úì train_transactions.parquet (412,156 rows)
  ‚úì val_transactions.parquet (16,480 rows)
  ‚úì customers.parquet (49,576 rows)
  ‚úì articles.parquet (16,616 rows)
  ‚úì val_ground_truth.parquet (4,943 rows)
  ‚úì user_activity_stats.parquet (49,576 rows)
  ‚úì metadata.json


In [None]:
# ============================================================================
# SUMMARY
# ============================================================================

print_section("PREPROCESSING COMPLETE!")

print("\nFinal dataset summary:")
print(f"Total weeks considered: {config.TOTAL_WEEKS}")
print(f"Training weeks: {config.N_TRAIN_WEEKS}")
print(f"Validation weeks: {config.N_VAL_WEEKS}")
print(f"Target users: {config.TARGET_USERS:,}")
print(f"Actual sampled users: {len(selected_users):,}")
if config.INCLUDE_COLD_START:
    print(f"Cold start users: {len(sampled_cold_start):,} ({100*len(sampled_cold_start)/len(selected_users):.1f}%)")
    print(f"Regular users: {len(sampled_regular):,} ({100*len(sampled_regular)/len(selected_users):.1f}%)")
print(f"Users in validation: {len(val_users):,} ({100*len(val_users)/len(selected_users):.1f}%)")
print(f"Items: {len(selected_items):,}")
print(f"Train transactions: {len(train_transactions):,}")
print(f"Val transactions: {len(val_transactions):,}")
print(f"Avg transactions per user (train): {len(train_transactions)/len(selected_users):.2f}")

if config.STRATIFY_BY_ACTIVITY:
    print("\n Sampling was stratified by user activity level")
if config.INCLUDE_COLD_START:
    print(f"Cold start users included for testing recommendations with limited history")



  PREPROCESSING COMPLETE!

Final dataset summary:
  üìÖ Total weeks considered: 24
  üìÖ Training weeks: 11
  üìÖ Validation weeks: 1
  üë• Target users: 50,000
  üë• Actual sampled users: 49,576
  ‚ùÑÔ∏è  Cold start users: 7,500 (15.1%)
  üî• Regular users: 42,500 (85.7%)
  üë• Users in validation: 4,943 (10.0%)
  üõçÔ∏è  Items: 16,616
  üìä Train transactions: 412,156
  üìä Val transactions: 16,480
  üìä Avg transactions per user (train): 8.31

  üìà Sampling was stratified by user activity level
  ‚ùÑÔ∏è  Cold start users included for testing recommendations with limited history

‚úÖ Ready for Stage 2: Recall Strategies!

Next steps:
  1. Review the saved files in /kaggle/working/
  2. Check metadata.json for dataset info
  3. Analyze user_activity_stats.parquet for sampling quality
  4. Proceed to Stage 2 when ready


### Stage 2: Generating Candidates

In [117]:
import pandas as pd
import numpy as np
import gc
import os
import psutil
from pathlib import Path
from collections import defaultdict, Counter
from datetime import datetime, timedelta
import warnings
from tqdm.auto import tqdm
from scipy.sparse import csr_matrix, lil_matrix
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
import pickle
from text_features import integrate_text_features_stage2

warnings.filterwarnings('ignore')

In [129]:
# ============================================================================
# MEMORY MONITORING
# ============================================================================

def get_memory_usage():
    """Get current memory usage in GB"""
    process = psutil.Process(os.getpid())
    return process.memory_info().rss / 1024**3

def print_memory():
    """Print current memory usage"""
    mem = get_memory_usage()
    print(f"  üíæ Memory: {mem:.2f} GB")

def force_garbage_collection():
    """Aggressive garbage collection"""
    gc.collect()
    gc.collect()
    gc.collect()

In [130]:
class Config:
    # Paths
    DATA_PATH = Path('/Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_2')
    OUTPUT_PATH = Path('/Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2')
    
    # Recall configuration - REDUCED for memory
    N_REPURCHASE_CANDIDATES = 25  # Reduced from 30
    N_POPULARITY_CANDIDATES = 25  # Reduced from 30
    N_COPURCHASE_CANDIDATES = 15  # Reduced from 20
    N_USERKNN_CANDIDATES = 15     # Reduced from 20
    N_CATEGORY_CANDIDATES = 15    # Reduced from 20
    
    # Processing parameters
    USER_CHUNK_SIZE = 1000  # Process users in chunks
    BASKET_CHUNK_SIZE = 5000  # Process baskets in chunks
    
    # EMERGENCY MODE: Use only recent data for repurchase
    USE_RECENT_ONLY_REPURCHASE = True  # Set to True if kernel keeps crashing
    REPURCHASE_RECENT_WEEKS = 8  # Only use last 8 weeks for repurchase
    
    # Item-to-Item CF parameters
    MIN_ITEM_SUPPORT = 3
    MAX_ITEM_NEIGHBORS = 30  # Reduced from 50
    
    # User-KNN parameters (ONLY for validation users)
    N_SIMILAR_USERS = 20  # Reduced from 30
    MIN_COMMON_ITEMS = 2
    
    # Time decay
    REPURCHASE_DECAY_RATE = 0.05
    POPULARITY_WINDOW_WEEKS = 2
    
    RANDOM_STATE = 42

config = Config()

In [131]:
# ============================================================================
# UTILITY FUNCTIONS
# ============================================================================

def print_section(title):
    print("\n" + "="*80)
    print(f"  {title}")
    print("="*80 + "\n")

def time_decay_score(days_ago, decay_rate=0.05):
    """Vectorized time decay"""
    return np.exp(-decay_rate * days_ago)

In [132]:
# ============================================================================
# LOAD DATA
# ============================================================================

print_section("LOADING DATA")

print("Loading data files...")
train_transactions = pd.read_parquet(config.DATA_PATH / 'train_transactions.parquet')
val_ground_truth = pd.read_parquet(config.DATA_PATH / 'val_ground_truth.parquet')
articles = pd.read_parquet(config.DATA_PATH / 'articles.parquet')

print(f"‚úì Train transactions: {len(train_transactions):,}")
print_memory()

all_users = train_transactions['customer_id'].unique()
all_items = train_transactions['article_id'].unique()
val_users = set(val_ground_truth['customer_id'].unique())
max_date = train_transactions['t_dat'].max()

print(f"‚úì Users: {len(all_users):,}, Items: {len(all_items):,}, Val users: {len(val_users):,}")


  LOADING DATA

Loading data files...
‚úì Train transactions: 412,156
  üíæ Memory: 0.35 GB
‚úì Users: 47,543, Items: 15,932, Val users: 4,943


In [133]:
# ============================================================================
# STRATEGY 1: REPURCHASE - CHUNKED PROCESSING
# ============================================================================

print_section("STRATEGY 1: REPURCHASE (CHUNKED)")

print("Processing in chunks to save memory...")
repurchase_chunks = []

# Split users into chunks
user_chunks = np.array_split(all_users, max(1, len(all_users) // config.USER_CHUNK_SIZE))

for i, user_chunk in enumerate(tqdm(user_chunks, desc="User chunks")):
    # Filter transactions for this chunk
    chunk_trans = train_transactions[
        train_transactions['customer_id'].isin(user_chunk)
    ].copy()
    
    # Get last purchase per user-item
    user_item_last = (
        chunk_trans
        .groupby(['customer_id', 'article_id'], as_index=False)['t_dat']
        .max()
    )
    
    # Calculate scores (vectorized) - with NaN handling
    user_item_last['days_ago'] = (max_date - user_item_last['t_dat']).dt.days
    
    # Drop any NaN values before converting to int
    user_item_last = user_item_last.dropna(subset=['days_ago'])
    
    # Now safe to convert to int
    user_item_last['days_ago'] = user_item_last['days_ago'].astype(np.int16)
    user_item_last['repurchase_score'] = time_decay_score(
        user_item_last['days_ago'].values, 
        config.REPURCHASE_DECAY_RATE
    ).astype(np.float32)
    
    # Get top N per user
    top_candidates = (
        user_item_last
        .sort_values(['customer_id', 'repurchase_score'], ascending=[True, False])
        .groupby('customer_id', as_index=False)
        .head(config.N_REPURCHASE_CANDIDATES)
        [['customer_id', 'article_id', 'repurchase_score']]
    )
    
    repurchase_chunks.append(top_candidates)
    
    # Clean up
    del chunk_trans, user_item_last, top_candidates
    force_garbage_collection()

# Combine chunks
repurchase_candidates = pd.concat(repurchase_chunks, ignore_index=True)
del repurchase_chunks
force_garbage_collection()

print(f"‚úì Generated {len(repurchase_candidates):,} repurchase candidates")
print_memory()

# Save intermediate result
repurchase_candidates.to_parquet(config.OUTPUT_PATH / 'temp_repurchase.parquet', index=False)
del repurchase_candidates
force_garbage_collection()


  STRATEGY 1: REPURCHASE (CHUNKED)

Processing in chunks to save memory...


User chunks:   0%|          | 0/47 [00:00<?, ?it/s]

‚úì Generated 325,177 repurchase candidates
  üíæ Memory: 1.36 GB


In [134]:
# ============================================================================
# STRATEGY 2: POPULARITY
# ============================================================================

print_section("STRATEGY 2: POPULARITY")

cutoff_date = max_date - timedelta(weeks=config.POPULARITY_WINDOW_WEEKS)
recent_trans = train_transactions[train_transactions['t_dat'] >= cutoff_date].copy()

print(f"Using {len(recent_trans):,} recent transactions")

# Vectorized calculations
recent_trans['days_ago'] = (max_date - recent_trans['t_dat']).dt.days

# Drop NaN values
recent_trans = recent_trans.dropna(subset=['days_ago'])

# Convert to int
recent_trans['days_ago'] = recent_trans['days_ago'].astype(np.int16)
recent_trans['weight'] = time_decay_score(recent_trans['days_ago'].values, 0.1).astype(np.float32)

# Aggregate
item_popularity = (
    recent_trans
    .groupby('article_id', as_index=False)
    .agg({'weight': 'sum', 'customer_id': 'nunique'})
    .rename(columns={'weight': 'weighted_purchases', 'customer_id': 'unique_buyers'})
)

item_popularity['popularity_score'] = (
    0.7 * item_popularity['weighted_purchases'] + 
    0.3 * item_popularity['unique_buyers']
)
item_popularity['popularity_score'] = (
    item_popularity['popularity_score'] / item_popularity['popularity_score'].max()
).astype(np.float32)

# Get top items
top_items = item_popularity.nlargest(config.N_POPULARITY_CANDIDATES, 'popularity_score')

print(f"‚úì Top {len(top_items)} popular items")

# Create candidates - CHUNKED
print("Creating popularity candidates in chunks...")
pop_chunks = []

for user_chunk in tqdm(np.array_split(all_users, 20), desc="Popularity chunks"):
    chunk_df = pd.DataFrame({
        'customer_id': np.repeat(user_chunk, len(top_items)),
        'article_id': np.tile(top_items['article_id'].values, len(user_chunk))
    })
    
    rank_penalty = np.tile(1 - np.arange(len(top_items)) * 0.01, len(user_chunk))
    scores = np.tile(top_items['popularity_score'].values, len(user_chunk))
    chunk_df['popularity_score'] = (scores * rank_penalty).astype(np.float32)
    
    pop_chunks.append(chunk_df)

popularity_candidates = pd.concat(pop_chunks, ignore_index=True)
del pop_chunks, recent_trans
force_garbage_collection()

print(f"‚úì Generated {len(popularity_candidates):,} popularity candidates")
print_memory()

# Save
popularity_candidates.to_parquet(config.OUTPUT_PATH / 'temp_popularity.parquet', index=False)
item_popularity.to_parquet(config.OUTPUT_PATH / 'item_popularity.parquet', index=False)
del popularity_candidates
force_garbage_collection()


  STRATEGY 2: POPULARITY

Using 32,152 recent transactions
‚úì Top 25 popular items
Creating popularity candidates in chunks...


Popularity chunks:   0%|          | 0/20 [00:00<?, ?it/s]

‚úì Generated 1,188,575 popularity candidates
  üíæ Memory: 1.47 GB


In [135]:
# ============================================================================
# STRATEGY 3: CO-PURCHASE (Item-to-Item CF) - WITH PARQUET SAVING
# ============================================================================

print_section("STRATEGY 3: CO-PURCHASE (Item-to-Item CF)")

# Check if already computed
if (config.OUTPUT_PATH / 'temp_copurchase.parquet').exists():
    print("‚ö° Found existing co-purchase candidates, loading...")
    copurchase_candidates = pd.read_parquet(config.OUTPUT_PATH / 'temp_copurchase.parquet')
    print(f"‚úì Loaded {len(copurchase_candidates):,} co-purchase candidates")
    print(f"  Users with candidates: {copurchase_candidates['customer_id'].nunique():,}")
else:
    print("Building co-purchase matrix...")

    # Create item-to-item co-purchase matrix
    # Group by transaction/basket (same user, same day)
    train_transactions['basket_id'] = (
        train_transactions['customer_id'].astype(str) + '_' + 
        train_transactions['t_dat'].astype(str)
    )

    # Get baskets with multiple items
    basket_items = (
        train_transactions
        .groupby('basket_id')['article_id']
        .apply(list)
        .reset_index()
    )

    # Filter baskets with at least 2 items
    basket_items = basket_items[basket_items['article_id'].apply(len) >= 2]
    print(f"  Baskets with 2+ items: {len(basket_items):,}")

    # Build co-purchase counts
    print("Computing co-purchase frequencies...")
    copurchase_counts = defaultdict(lambda: defaultdict(int))

    for items in tqdm(basket_items['article_id'], desc="Processing baskets"):
        # For each pair of items in the basket
        for i in range(len(items)):
            for j in range(i + 1, len(items)):
                item1, item2 = items[i], items[j]
                copurchase_counts[item1][item2] += 1
                copurchase_counts[item2][item1] += 1

    print(f"‚úì Built co-purchase matrix for {len(copurchase_counts):,} items")

    # Convert to item-to-item similarity scores
    print("Computing item-to-item similarity scores...")
    item_to_items = {}

    for item1 in tqdm(copurchase_counts.keys(), desc="Computing similarities"):
        # Get co-purchased items
        copurchased = copurchase_counts[item1]
        
        # Filter by minimum support
        copurchased = {
            item2: count 
            for item2, count in copurchased.items() 
            if count >= config.MIN_ITEM_SUPPORT
        }
        
        if copurchased:
            # Sort by count and take top K
            top_items = sorted(
                copurchased.items(), 
                key=lambda x: x[1], 
                reverse=True
            )[:config.MAX_ITEM_NEIGHBORS]
            
            # Normalize scores
            max_count = top_items[0][1]
            item_to_items[item1] = [
                (item2, count / max_count) 
                for item2, count in top_items
            ]

    print(f"‚úì Computed similarities for {len(item_to_items):,} items")

    # Save item-to-item similarity matrix for potential reuse
    print("Saving item-to-item similarity matrix...")
    import pickle
    with open(config.OUTPUT_PATH / 'item_to_items.pkl', 'wb') as f:
        pickle.dump(item_to_items, f)
    print(f"  ‚úì Saved item_to_items.pkl ({len(item_to_items):,} items)")

    # Generate co-purchase candidates for each user
    print("Generating co-purchase candidates...")
    copurchase_candidates = []

    # Get recent purchases for each user (last 10)
    user_recent_items = (
        train_transactions
        .sort_values('t_dat', ascending=False)
        .groupby('customer_id')['article_id']
        .apply(lambda x: list(x.unique()[:10]))
        .to_dict()
    )

    for user in tqdm(all_users, desc="User co-purchase recommendations"):
        if user not in user_recent_items:
            continue
        
        user_items = user_recent_items[user]
        candidate_scores = defaultdict(float)
        
        # Aggregate scores from all user's items
        for user_item in user_items:
            if user_item in item_to_items:
                for similar_item, score in item_to_items[user_item]:
                    if similar_item not in user_items:  # Don't recommend already purchased
                        candidate_scores[similar_item] += score
        
        # Get top N candidates
        if candidate_scores:
            top_candidates = sorted(
                candidate_scores.items(), 
                key=lambda x: x[1], 
                reverse=True
            )[:config.N_COPURCHASE_CANDIDATES]
            
            for item, score in top_candidates:
                copurchase_candidates.append({
                    'customer_id': user,
                    'article_id': item,
                    'copurchase_score': score
                })

    copurchase_candidates = pd.DataFrame(copurchase_candidates)
    print(f"‚úì Generated {len(copurchase_candidates):,} co-purchase candidates")
    print(f"  Users with candidates: {copurchase_candidates['customer_id'].nunique():,}")

    # Save to parquet
    print("\nSaving co-purchase candidates...")
    copurchase_candidates.to_parquet(config.OUTPUT_PATH / 'temp_copurchase.parquet', index=False)
    print(f"‚úì Saved temp_copurchase.parquet ({len(copurchase_candidates):,} rows)")

    # Clean up memory
    del basket_items, copurchase_counts, item_to_items, user_recent_items
    force_garbage_collection()
    print("‚úì Memory cleaned")


  STRATEGY 3: CO-PURCHASE (Item-to-Item CF)

Building co-purchase matrix...
  Baskets with 2+ items: 83,983
Computing co-purchase frequencies...


Processing baskets:   0%|          | 0/83983 [00:00<?, ?it/s]

‚úì Built co-purchase matrix for 15,868 items
Computing item-to-item similarity scores...


Computing similarities:   0%|          | 0/15868 [00:00<?, ?it/s]

‚úì Computed similarities for 10,603 items
Saving item-to-item similarity matrix...
  ‚úì Saved item_to_items.pkl (10,603 items)
Generating co-purchase candidates...


User co-purchase recommendations:   0%|          | 0/47543 [00:00<?, ?it/s]

‚úì Generated 612,807 co-purchase candidates
  Users with candidates: 45,344

Saving co-purchase candidates...
‚úì Saved temp_copurchase.parquet (612,807 rows)
‚úì Memory cleaned


In [136]:
# ============================================================================
# STRATEGY 4: USER-KNN COLLABORATIVE FILTERING - WITH PARQUET SAVING
# ============================================================================

print_section("STRATEGY 4: USER-KNN COLLABORATIVE FILTERING")

# Check if already computed
if (config.OUTPUT_PATH / 'temp_userknn.parquet').exists():
    print("‚ö° Found existing user-KNN candidates, loading...")
    userknn_candidates = pd.read_parquet(config.OUTPUT_PATH / 'temp_userknn.parquet')
    print(f"‚úì Loaded {len(userknn_candidates):,} user-KNN candidates")
    print(f"  Users with candidates: {userknn_candidates['customer_id'].nunique():,}")
else:
    print("Building user-item matrix...")

    # Create sparse user-item matrix (binary: 1 if purchased, 0 otherwise)
    # Map users and items to indices
    user_to_idx = {user: idx for idx, user in enumerate(all_users)}
    item_to_idx = {item: idx for idx, item in enumerate(all_items)}

    # Create matrix
    n_users = len(all_users)
    n_items = len(all_items)

    print(f"  Matrix size: {n_users:,} users x {n_items:,} items")

    # Use last 4 weeks for user similarity (more recent = more relevant)
    recent_date = max_date - timedelta(weeks=4)
    recent_user_items = train_transactions[train_transactions['t_dat'] >= recent_date].copy()

    user_item_matrix = lil_matrix((n_users, n_items), dtype=np.int8)

    print("Populating user-item matrix...")
    for _, row in tqdm(recent_user_items.iterrows(), total=len(recent_user_items), desc="Building matrix"):
        user_idx = user_to_idx[row['customer_id']]
        item_idx = item_to_idx[row['article_id']]
        user_item_matrix[user_idx, item_idx] = 1

    # Convert to CSR for efficient operations
    user_item_matrix = user_item_matrix.tocsr()
    print(f"‚úì Matrix density: {user_item_matrix.nnz / (n_users * n_items) * 100:.4f}%")

    # Save user-item matrix for potential reuse
    print("Saving user-item matrix...")
    from scipy.sparse import save_npz
    save_npz(config.OUTPUT_PATH / 'user_item_matrix.npz', user_item_matrix)
    print(f"  ‚úì Saved user_item_matrix.npz")

    # Save user/item mappings
    import pickle
    with open(config.OUTPUT_PATH / 'user_to_idx.pkl', 'wb') as f:
        pickle.dump(user_to_idx, f)
    with open(config.OUTPUT_PATH / 'item_to_idx.pkl', 'wb') as f:
        pickle.dump(item_to_idx, f)
    print(f"  ‚úì Saved index mappings")

    # Compute user-user similarity (only for validation users to save memory)
    print(f"Computing user similarities for {len(val_users):,} validation users...")

    val_user_indices = [user_to_idx[user] for user in val_users if user in user_to_idx]
    val_user_matrix = user_item_matrix[val_user_indices]

    # Normalize rows
    val_user_matrix_norm = normalize(val_user_matrix, norm='l2', axis=1)
    user_item_matrix_norm = normalize(user_item_matrix, norm='l2', axis=1)

    # Compute similarity (batch processing to avoid memory issues)
    print("Computing cosine similarities...")
    batch_size = 1000
    userknn_candidates = []

    for i in tqdm(range(0, len(val_user_indices), batch_size), desc="Similarity batches"):
        batch_indices = val_user_indices[i:i+batch_size]
        batch_matrix = val_user_matrix_norm[i:i+batch_size]
        
        # Compute similarity with all users
        similarities = cosine_similarity(batch_matrix, user_item_matrix_norm)
        
        # For each user in batch
        for j, user_idx in enumerate(batch_indices):
            user = all_users[user_idx]
            user_sims = similarities[j]
            
            # Get top similar users (exclude self)
            similar_user_indices = np.argsort(user_sims)[::-1][1:config.N_SIMILAR_USERS+1]
            
            # Get items purchased by similar users
            candidate_scores = defaultdict(float)
            user_purchased = set(
                train_transactions[train_transactions['customer_id'] == user]['article_id']
            )
            
            for sim_user_idx in similar_user_indices:
                sim_score = user_sims[sim_user_idx]
                if sim_score < 0.01:  # Skip very dissimilar users
                    continue
                
                sim_user = all_users[sim_user_idx]
                sim_user_items = train_transactions[
                    train_transactions['customer_id'] == sim_user
                ]['article_id'].unique()
                
                for item in sim_user_items:
                    if item not in user_purchased:
                        candidate_scores[item] += sim_score
            
            # Get top N candidates
            if candidate_scores:
                top_candidates = sorted(
                    candidate_scores.items(), 
                    key=lambda x: x[1], 
                    reverse=True
                )[:config.N_USERKNN_CANDIDATES]
                
                for item, score in top_candidates:
                    userknn_candidates.append({
                        'customer_id': user,
                        'article_id': item,
                        'userknn_score': score
                    })

    userknn_candidates = pd.DataFrame(userknn_candidates)
    print(f"‚úì Generated {len(userknn_candidates):,} user-KNN candidates")
    print(f"  Users with candidates: {userknn_candidates['customer_id'].nunique():,}")

    # Save to parquet
    print("\nSaving user-KNN candidates...")
    userknn_candidates.to_parquet(config.OUTPUT_PATH / 'temp_userknn.parquet', index=False)
    print(f"‚úì Saved temp_userknn.parquet ({len(userknn_candidates):,} rows)")

    # Clean up memory
    del user_item_matrix, val_user_matrix, val_user_matrix_norm, user_item_matrix_norm
    del user_to_idx, item_to_idx, recent_user_items
    force_garbage_collection()
    print("‚úì Memory cleaned")



  STRATEGY 4: USER-KNN COLLABORATIVE FILTERING

Building user-item matrix...
  Matrix size: 47,543 users x 15,932 items
Populating user-item matrix...


Building matrix:   0%|          | 0/62015 [00:00<?, ?it/s]

‚úì Matrix density: 0.0072%
Saving user-item matrix...
  ‚úì Saved user_item_matrix.npz
  ‚úì Saved index mappings
Computing user similarities for 4,943 validation users...
Computing cosine similarities...


Similarity batches:   0%|          | 0/4 [00:00<?, ?it/s]

‚úì Generated 30,107 user-KNN candidates
  Users with candidates: 2,018

Saving user-KNN candidates...
‚úì Saved temp_userknn.parquet (30,107 rows)
‚úì Memory cleaned


In [137]:
# ============================================================================
# STRATEGY 5: CATEGORY-BASED RECOMMENDATIONS - WITH PARQUET SAVING
# ============================================================================

print_section("STRATEGY 5: CATEGORY-BASED RECOMMENDATIONS")

# Check if already computed
if (config.OUTPUT_PATH / 'temp_category.parquet').exists():
    print("‚ö° Found existing category candidates, loading...")
    category_candidates = pd.read_parquet(config.OUTPUT_PATH / 'temp_category.parquet')
    print(f"‚úì Loaded {len(category_candidates):,} category candidates")
    print(f"  Users with candidates: {category_candidates['customer_id'].nunique():,}")
else:
    print("Computing user category preferences...")

    # Get user's category preferences
    user_categories = (
        train_transactions
        .merge(articles[['article_id', 'product_type_no', 'product_group_name']], on='article_id')
        .groupby(['customer_id', 'product_type_no'])
        .size()
        .reset_index(name='count')
    )

    # Get top 3 categories per user
    user_top_categories = (
        user_categories
        .sort_values(['customer_id', 'count'], ascending=[True, False])
        .groupby('customer_id')
        .head(3)
    )

    print(f"‚úì Computed preferences for {user_top_categories['customer_id'].nunique():,} users")

    # Save user category preferences for potential reuse
    print("Saving user category preferences...")
    user_top_categories.to_parquet(config.OUTPUT_PATH / 'user_category_preferences.parquet', index=False)
    print(f"  ‚úì Saved user_category_preferences.parquet")

    # Get popular items per category
    category_popular_items = (
        train_transactions[train_transactions['t_dat'] >= cutoff_date]
        .merge(articles[['article_id', 'product_type_no']], on='article_id')
        .groupby(['product_type_no', 'article_id'])
        .size()
        .reset_index(name='count')
        .sort_values(['product_type_no', 'count'], ascending=[True, False])
        .groupby('product_type_no')
        .head(10)
    )

    # Save category popular items for potential reuse
    print("Saving category popular items...")
    category_popular_items.to_parquet(config.OUTPUT_PATH / 'category_popular_items.parquet', index=False)
    print(f"  ‚úì Saved category_popular_items.parquet")

    print("Generating category-based candidates...")
    category_candidates = []

    for _, row in tqdm(user_top_categories.iterrows(), total=len(user_top_categories), desc="Category recommendations"):
        user = row['customer_id']
        category = row['product_type_no']
        
        # Get popular items in this category
        category_items = category_popular_items[
            category_popular_items['product_type_no'] == category
        ]['article_id'].tolist()
        
        # Get user's purchased items
        user_items = set(
            train_transactions[train_transactions['customer_id'] == user]['article_id']
        )
        
        # Recommend items not yet purchased
        for rank, item in enumerate(category_items):
            if item not in user_items and rank < config.N_CATEGORY_CANDIDATES:
                category_candidates.append({
                    'customer_id': user,
                    'article_id': item,
                    'category_score': 1.0 / (rank + 1)  # Rank-based score
                })

    category_candidates = pd.DataFrame(category_candidates)
    print(f"‚úì Generated {len(category_candidates):,} category-based candidates")
    print(f"  Users with candidates: {category_candidates['customer_id'].nunique():,}")

    # Save to parquet
    print("\nSaving category candidates...")
    category_candidates.to_parquet(config.OUTPUT_PATH / 'temp_category.parquet', index=False)
    print(f"‚úì Saved temp_category.parquet ({len(category_candidates):,} rows)")

    # Clean up memory
    del user_categories, user_top_categories, category_popular_items
    force_garbage_collection()
    print("‚úì Memory cleaned")


  STRATEGY 5: CATEGORY-BASED RECOMMENDATIONS

Computing user category preferences...
‚úì Computed preferences for 47,543 users
Saving user category preferences...
  ‚úì Saved user_category_preferences.parquet
Saving category popular items...
  ‚úì Saved category_popular_items.parquet
Generating category-based candidates...


Category recommendations:   0%|          | 0/142629 [00:00<?, ?it/s]

‚úì Generated 1,409,632 category-based candidates
  Users with candidates: 47,543

Saving category candidates...
‚úì Saved temp_category.parquet (1,409,632 rows)
‚úì Memory cleaned


In [138]:
print_section("STRATEGY 6: TEXT SIMILARITY RECOMMENDATIONS")

# Import the text feature module (save the artifact code as text_features.py)
from text_features import integrate_text_features_stage2

# Generate text-based candidates
text_candidates, article_embeddings, user_embeddings, text_cols = integrate_text_features_stage2(
    all_users=all_users,
    train_transactions=train_transactions,
    articles=articles,
    output_path=config.OUTPUT_PATH
)

# Save for later use
if text_candidates is not None and len(text_candidates) > 0:
    text_candidates.to_parquet(config.OUTPUT_PATH / 'temp_text_similarity.parquet', index=False)
    print(f"‚úì Saved {len(text_candidates):,} text similarity candidates")
else:
    print("‚ö†Ô∏è  No text candidates generated")


  STRATEGY 6: TEXT SIMILARITY RECOMMENDATIONS


  STAGE 2 ENHANCEMENT: TEXT-BASED CANDIDATES

Creating text corpus from articles...
  Available text columns: 12/12
    Processed 0 articles...
  ‚úì Created corpus for 16,616 articles

Computing text embeddings...
  Valid documents: 16,616
  Computing TF-IDF...
  ‚úì TF-IDF shape: (16616, 100)
  Reducing to 20 dimensions...
  ‚úì Embeddings shape: (16616, 20)
  Explained variance: 0.760

Computing user text preferences...
  Building user preference vectors...
  ‚úì Computed preferences for 47,543 users

Generating text similarity candidates...
  Processing 47,543 users...
    Processed 10,000 users...
    Processed 20,000 users...
    Processed 30,000 users...
    Processed 40,000 users...
  ‚úì Generated 713,145 text similarity candidates

‚úì Saved text similarity candidates to /Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2/temp_text_similarity.parquet

Saving embeddings for Stage 3...
‚

In [139]:
print_section("COMBINING ALL RECALL STRATEGIES")

print("Loading candidates from parquet files...")

# Load all candidates (existing code)
print("  Loading repurchase candidates...")
repurchase_candidates = pd.read_parquet(config.OUTPUT_PATH / 'temp_repurchase.parquet')
print(f"    ‚úì {len(repurchase_candidates):,} candidates")

print("  Loading popularity candidates...")
popularity_candidates = pd.read_parquet(config.OUTPUT_PATH / 'temp_popularity.parquet')
print(f"    ‚úì {len(popularity_candidates):,} candidates")

print("  Loading co-purchase candidates...")
copurchase_candidates = pd.read_parquet(config.OUTPUT_PATH / 'temp_copurchase.parquet')
print(f"    ‚úì {len(copurchase_candidates):,} candidates")

print("  Loading user-KNN candidates...")
userknn_candidates = pd.read_parquet(config.OUTPUT_PATH / 'temp_userknn.parquet')
print(f"    ‚úì {len(userknn_candidates):,} candidates")

print("  Loading category candidates...")
category_candidates = pd.read_parquet(config.OUTPUT_PATH / 'temp_category.parquet')
print(f"    ‚úì {len(category_candidates):,} candidates")

# NEW: Load text similarity candidates
if (config.OUTPUT_PATH / 'temp_text_similarity.parquet').exists():
    print("  Loading text similarity candidates...")
    text_candidates = pd.read_parquet(config.OUTPUT_PATH / 'temp_text_similarity.parquet')
    print(f"    ‚úì {len(text_candidates):,} candidates")
    has_text_candidates = True
else:
    print("  ‚ö†Ô∏è  No text similarity candidates found")
    has_text_candidates = False

print("\nMerging candidates from all strategies...")

# Start with repurchase candidates (existing code)
all_candidates = repurchase_candidates.copy()

# Merge popularity
all_candidates = all_candidates.merge(
    popularity_candidates,
    on=['customer_id', 'article_id'],
    how='outer',
    suffixes=('', '_pop')
)

# Merge co-purchase
all_candidates = all_candidates.merge(
    copurchase_candidates,
    on=['customer_id', 'article_id'],
    how='outer',
    suffixes=('', '_cop')
)

# Merge user-KNN
all_candidates = all_candidates.merge(
    userknn_candidates,
    on=['customer_id', 'article_id'],
    how='outer',
    suffixes=('', '_knn')
)

# Merge category
all_candidates = all_candidates.merge(
    category_candidates,
    on=['customer_id', 'article_id'],
    how='outer',
    suffixes=('', '_cat')
)

# NEW: Merge text similarity
if has_text_candidates:
    all_candidates = all_candidates.merge(
        text_candidates,
        on=['customer_id', 'article_id'],
        how='outer',
        suffixes=('', '_text')
    )

# Fill NaN scores with 0
score_columns = [
    'repurchase_score', 'popularity_score', 'copurchase_score', 
    'userknn_score', 'category_score'
]

# NEW: Add text similarity score
if has_text_candidates:
    score_columns.append('text_similarity_score')

all_candidates[score_columns] = all_candidates[score_columns].fillna(0)

print(f"‚úì Total unique user-item pairs: {len(all_candidates):,}")

# Count how many strategies recommend each item
all_candidates['n_strategies'] = (all_candidates[score_columns] > 0).sum(axis=1)

print("\nCandidate statistics:")
print(f"  Candidates per user: {len(all_candidates) / all_candidates['customer_id'].nunique():.2f}")
print(f"  Avg strategies per candidate: {all_candidates['n_strategies'].mean():.2f}")
print("\n  Candidates by number of strategies:")
for n in sorted(all_candidates['n_strategies'].unique()):
    count = (all_candidates['n_strategies'] == n).sum()
    pct = count / len(all_candidates) * 100
    print(f"    {n} strategies: {count:,} ({pct:.1f}%)")

# Save the merged candidates
print("\nSaving merged candidates...")
all_candidates.to_parquet(config.OUTPUT_PATH / 'all_candidates_merged.parquet', index=False)
print(f"‚úì Saved to all_candidates_merged.parquet ({len(all_candidates):,} rows)")

# Clean up to save memory
print("\nCleaning up temporary dataframes...")
del repurchase_candidates, popularity_candidates, copurchase_candidates
del userknn_candidates, category_candidates
if has_text_candidates:
    del text_candidates
gc.collect()
print("‚úì Memory cleaned")


  COMBINING ALL RECALL STRATEGIES

Loading candidates from parquet files...
  Loading repurchase candidates...
    ‚úì 325,177 candidates
  Loading popularity candidates...
    ‚úì 1,188,575 candidates
  Loading co-purchase candidates...
    ‚úì 612,807 candidates
  Loading user-KNN candidates...
    ‚úì 30,107 candidates
  Loading category candidates...
    ‚úì 1,409,632 candidates
  Loading text similarity candidates...
    ‚úì 713,145 candidates

Merging candidates from all strategies...
‚úì Total unique user-item pairs: 4,044,442

Candidate statistics:
  Candidates per user: 85.07
  Avg strategies per candidate: 1.06

  Candidates by number of strategies:
    1 strategies: 3,816,416 (94.4%)
    2 strategies: 221,301 (5.5%)
    3 strategies: 6,479 (0.2%)
    4 strategies: 242 (0.0%)
    5 strategies: 4 (0.0%)

Saving merged candidates...
‚úì Saved to all_candidates_merged.parquet (4,044,442 rows)

Cleaning up temporary dataframes...
‚úì Memory cleaned


### Stage 3: Extracting Features

In [162]:
import pandas as pd
import numpy as np
import gc
import os
import psutil
from pathlib import Path
from collections import defaultdict, Counter
from datetime import datetime, timedelta
import warnings
from tqdm.auto import tqdm
import pickle

warnings.filterwarnings('ignore')

In [163]:
# ============================================================================
# MEMORY MONITORING
# ============================================================================

def get_memory_usage():
    """Get current memory usage in GB"""
    process = psutil.Process(os.getpid())
    return process.memory_info().rss / 1024**3

def print_memory():
    """Print current memory usage"""
    mem = get_memory_usage()
    print(f"  üíæ Memory: {mem:.2f} GB")

def force_garbage_collection():
    """Aggressive garbage collection"""
    gc.collect()
    gc.collect()
    gc.collect()

In [164]:
# ============================================================================
# CONFIGURATION
# ============================================================================

class Config:
    # Paths
    DATA_PATH = Path('/Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2')
    OUTPUT_PATH = Path('/Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_features_2')
    
    # Processing
    CHUNK_SIZE = 50000  # Process candidates in chunks
    
    # Feature engineering windows
    RECENT_DAYS = 7  # Last week
    MEDIUM_DAYS = 30  # Last month
    
    RANDOM_STATE = 42

config = Config()

In [None]:
# UTILITY FUNCTIONS

def print_section(title):
    print("\n" + "="*80)
    print(f"  {title}")
    print("="*80 + "\n")

In [None]:
# LOAD DATA

print_section("LOADING DATA")

print("Loading preprocessed data...")

# Check which candidate file exists
if (config.DATA_PATH / 'all_candidates_merged.parquet').exists():
    candidates = pd.read_parquet(config.DATA_PATH / 'all_candidates_merged.parquet')
    print(f"‚úì Loaded all_candidates_merged.parquet")
elif (config.DATA_PATH / 'recall_candidates.parquet').exists():
    candidates = pd.read_parquet(config.DATA_PATH / 'recall_candidates.parquet')
    print(f"‚úì Loaded recall_candidates.parquet")
else:
    raise FileNotFoundError("Could not find candidates file (all_candidates_merged.parquet or recall_candidates.parquet)")

train_transactions = pd.read_parquet(config.DATA_PATH / 'train_transactions.parquet')
articles = pd.read_parquet(config.DATA_PATH / 'articles.parquet')
customers = pd.read_parquet(config.DATA_PATH / 'customers.parquet')

print(f"‚úì Train transactions: {len(train_transactions):,}")
print(f"‚úì Articles: {len(articles):,}")
print(f"‚úì Customers: {len(customers):,}")
print(f"‚úì Candidates: {len(candidates):,}")
print_memory()

# Get max date
max_date = train_transactions['t_dat'].max()
print(f"‚úì Max date: {max_date.date()}")

# Load item popularity from recall stage
item_popularity = pd.read_parquet(config.DATA_PATH / 'item_popularity.parquet')
print(f"‚úì Item popularity scores: {len(item_popularity):,}")

# Check what score columns are available in candidates
available_scores = [col for col in candidates.columns if 'score' in col.lower()]
print(f"‚úì Available recall scores in candidates: {available_scores}")

# Item-to-item CF is already in the candidates as 'copurchase_score'
# We don't need to load a separate pkl file
has_copurchase_score = 'copurchase_score' in candidates.columns
if has_copurchase_score:
    print(f"‚úì Co-purchase scores available in candidates")
else:
    print("‚ö†Ô∏è  Co-purchase scores not found in candidates")


  LOADING DATA

Loading preprocessed data...
‚úì Loaded all_candidates_merged.parquet
‚úì Train transactions: 412,156
‚úì Articles: 16,616
‚úì Customers: 49,576
‚úì Candidates: 4,044,442
  üíæ Memory: 1.56 GB
‚úì Max date: 2020-09-14
‚úì Item popularity scores: 7,115
‚úì Available recall scores in candidates: ['repurchase_score', 'popularity_score', 'copurchase_score', 'userknn_score', 'category_score', 'text_similarity_score']
‚úì Co-purchase scores available in candidates


In [None]:
# PART 1: USER FEATURES (20-25 features)

print_section("PART 1: USER FEATURES")

# Check if user features already exist
if (config.OUTPUT_PATH / 'user_features.parquet').exists():
    print("‚ö° Found existing user_features.parquet, loading...")
    user_stats = pd.read_parquet(config.OUTPUT_PATH / 'user_features.parquet')
    print(f"‚úì Loaded {len(user_stats.columns)-1} user features from disk")
    print_memory()
else:
    print("Computing user-level features...")

    # Basic user statistics
    print("  [1/5] Basic purchase statistics...")
    user_stats = train_transactions.groupby('customer_id').agg({
        'article_id': 'count',  # Total purchases
        'price': ['mean', 'std', 'min', 'max'],  # Price statistics
        't_dat': ['min', 'max']  # First and last purchase dates
    }).reset_index()

    user_stats.columns = ['customer_id', 'n_purchases', 'avg_price', 'std_price', 
                          'min_price', 'max_price', 'first_purchase_date', 'last_purchase_date']

    # Calculate days since first/last purchase
    user_stats['days_since_first_purchase'] = (
        max_date - user_stats['first_purchase_date']
    ).dt.days.astype(np.int16)

    user_stats['days_since_last_purchase'] = (
        max_date - user_stats['last_purchase_date']
    ).dt.days.astype(np.int16)

    # Purchase frequency
    user_stats['purchase_frequency'] = (
        user_stats['n_purchases'] / (user_stats['days_since_first_purchase'] + 1)
    ).astype(np.float32)

    # Drop date columns (not needed anymore)
    user_stats = user_stats.drop(['first_purchase_date', 'last_purchase_date'], axis=1)

    print(f"  - Created {len(user_stats.columns)-1} basic features")

    # Recent activity features
    print("  [2/5] Recent activity features...")
    recent_cutoff = max_date - timedelta(days=config.RECENT_DAYS)
    recent_transactions = train_transactions[train_transactions['t_dat'] >= recent_cutoff]

    user_recent_stats = recent_transactions.groupby('customer_id').agg({
        'article_id': 'count',
        'price': 'mean'
    }).reset_index()
    user_recent_stats.columns = ['customer_id', 'n_purchases_last_week', 'avg_price_last_week']

    # Merge with main stats
    user_stats = user_stats.merge(user_recent_stats, on='customer_id', how='left')
    user_stats['n_purchases_last_week'] = user_stats['n_purchases_last_week'].fillna(0).astype(np.int16)
    user_stats['avg_price_last_week'] = user_stats['avg_price_last_week'].fillna(0).astype(np.float32)

    # Is user active recently
    user_stats['is_active_last_week'] = (user_stats['n_purchases_last_week'] > 0).astype(np.int8)

    del recent_transactions, user_recent_stats
    force_garbage_collection()

    print(f"  - Total features so far: {len(user_stats.columns)-1}")

    # Diversity features
    print("  [3/5] Diversity features...")
    user_diversity = train_transactions.groupby('customer_id').agg({
        'article_id': 'nunique',
    }).reset_index()
    user_diversity.columns = ['customer_id', 'n_unique_articles']

    # Category diversity
    user_cat_diversity = (
        train_transactions
        .merge(articles[['article_id', 'product_type_no']], on='article_id')
        .groupby('customer_id')['product_type_no']
        .nunique()
        .reset_index()
    )
    user_cat_diversity.columns = ['customer_id', 'n_unique_categories']

    user_stats = user_stats.merge(user_diversity, on='customer_id', how='left')
    user_stats = user_stats.merge(user_cat_diversity, on='customer_id', how='left')

    user_stats['exploration_rate'] = (
        user_stats['n_unique_articles'] / user_stats['n_purchases']
    ).astype(np.float32)

    del user_diversity, user_cat_diversity
    force_garbage_collection()

    print(f"  - Total features so far: {len(user_stats.columns)-1}")

    # Customer demographic features
    print("  [4/5] Demographic features...")
    customer_features = customers[['customer_id', 'age', 'FN', 'Active']].copy()

    # Merge with user stats
    user_stats = user_stats.merge(customer_features, on='customer_id', how='left')

    # Fill missing values
    user_stats['age'] = user_stats['age'].fillna(user_stats['age'].median()).astype(np.float32)
    user_stats['FN'] = user_stats['FN'].fillna(0).astype(np.float32)
    user_stats['Active'] = user_stats['Active'].fillna(0).astype(np.float32)

    del customer_features
    force_garbage_collection()

    print(f"  - Total features so far: {len(user_stats.columns)-1}")

    # Purchase trend
    print("  [5/5] Purchase trend features...")
    # Split into two periods and compare
    mid_date = max_date - timedelta(days=config.MEDIUM_DAYS // 2)
    old_cutoff = max_date - timedelta(days=config.MEDIUM_DAYS)

    recent_period = train_transactions[train_transactions['t_dat'] >= mid_date]
    old_period = train_transactions[
        (train_transactions['t_dat'] >= old_cutoff) & (train_transactions['t_dat'] < mid_date)
    ]

    user_recent_count = recent_period.groupby('customer_id').size().reset_index(name='purchases_recent_period')
    user_old_count = old_period.groupby('customer_id').size().reset_index(name='purchases_old_period')

    user_trend = user_recent_count.merge(user_old_count, on='customer_id', how='outer').fillna(0)
    user_trend['purchase_trend'] = (
        (user_trend['purchases_recent_period'] - user_trend['purchases_old_period']) / 
        (user_trend['purchases_old_period'] + 1)
    ).astype(np.float32)

    user_stats = user_stats.merge(
        user_trend[['customer_id', 'purchase_trend']], 
        on='customer_id', 
        how='left'
    )
    user_stats['purchase_trend'] = user_stats['purchase_trend'].fillna(0).astype(np.float32)

    del recent_period, old_period, user_recent_count, user_old_count, user_trend
    force_garbage_collection()

    # Convert to optimal dtypes
    for col in user_stats.columns:
        if col != 'customer_id':
            if user_stats[col].dtype == 'float64':
                user_stats[col] = user_stats[col].astype(np.float32)
            elif user_stats[col].dtype == 'int64':
                user_stats[col] = user_stats[col].astype(np.int32)

    print(f"‚úì Created {len(user_stats.columns)-1} user features")
    print_memory()

    # Save user features for reuse
    print("\nSaving user features...")
    user_stats.to_parquet(config.OUTPUT_PATH / 'user_features.parquet', index=False)
    print(f"‚úì Saved user_features.parquet ({len(user_stats):,} rows, {len(user_stats.columns)-1} features)")


  PART 1: USER FEATURES

‚ö° Found existing user_features.parquet, loading...
‚úì Loaded 18 user features from disk
  üíæ Memory: 1.42 GB


In [None]:
# PART 2: ITEM FEATURES (20-25 features)

print_section("PART 2: ITEM FEATURES")

# Check if item features already exist
if (config.OUTPUT_PATH / 'item_features.parquet').exists():
    print("‚ö° Found existing item_features.parquet, loading...")
    item_stats = pd.read_parquet(config.OUTPUT_PATH / 'item_features.parquet')
    print(f"‚úì Loaded {len(item_stats.columns)-1} item features from disk")
    print_memory()
else:
    print("Computing item-level features...")

# Basic item statistics
print("  [1/4] Basic item statistics...")
item_stats = train_transactions.groupby('article_id').agg({
    'customer_id': 'nunique',  # Number of unique buyers
    'price': ['mean', 'std'],
    't_dat': ['min', 'max', 'count']
}).reset_index()

item_stats.columns = ['article_id', 'n_unique_buyers', 'avg_price', 'std_price',
                      'first_sale_date', 'last_sale_date', 'total_sales']

# Days since first/last sale
item_stats['days_since_first_sale'] = (
    max_date - item_stats['first_sale_date']
).dt.days.astype(np.int16)

item_stats['days_since_last_sale'] = (
    max_date - item_stats['last_sale_date']
).dt.days.astype(np.int16)

# Sales frequency
item_stats['sales_frequency'] = (
    item_stats['total_sales'] / (item_stats['days_since_first_sale'] + 1)
).astype(np.float32)

item_stats = item_stats.drop(['first_sale_date', 'last_sale_date'], axis=1)

print(f"  - Created {len(item_stats.columns)-1} basic features")

# Recent popularity
print("  [2/4] Recent popularity features...")
recent_cutoff = max_date - timedelta(days=config.RECENT_DAYS)
recent_sales = train_transactions[train_transactions['t_dat'] >= recent_cutoff]

item_recent_stats = recent_sales.groupby('article_id').agg({
    'customer_id': ['count', 'nunique']
}).reset_index()
item_recent_stats.columns = ['article_id', 'sales_last_week', 'buyers_last_week']

item_stats = item_stats.merge(item_recent_stats, on='article_id', how='left')
item_stats['sales_last_week'] = item_stats['sales_last_week'].fillna(0).astype(np.int16)
item_stats['buyers_last_week'] = item_stats['buyers_last_week'].fillna(0).astype(np.int16)

del recent_sales, item_recent_stats
force_garbage_collection()

print(f"  - Total features so far: {len(item_stats.columns)-1}")

# Sales trend
print("  [3/4] Sales trend features...")
mid_date = max_date - timedelta(days=config.MEDIUM_DAYS // 2)
old_cutoff = max_date - timedelta(days=config.MEDIUM_DAYS)

recent_period = train_transactions[train_transactions['t_dat'] >= mid_date]
old_period = train_transactions[
    (train_transactions['t_dat'] >= old_cutoff) & (train_transactions['t_dat'] < mid_date)
]

item_recent_count = recent_period.groupby('article_id').size().reset_index(name='sales_recent_period')
item_old_count = old_period.groupby('article_id').size().reset_index(name='sales_old_period')

item_trend = item_recent_count.merge(item_old_count, on='article_id', how='outer').fillna(0)
item_trend['sales_trend'] = (
    (item_trend['sales_recent_period'] - item_trend['sales_old_period']) / 
    (item_trend['sales_old_period'] + 1)
).astype(np.float32)

item_stats = item_stats.merge(
    item_trend[['article_id', 'sales_trend']], 
    on='article_id', 
    how='left'
)
item_stats['sales_trend'] = item_stats['sales_trend'].fillna(0).astype(np.float32)

del recent_period, old_period, item_recent_count, item_old_count, item_trend
force_garbage_collection()

print(f"  - Total features so far: {len(item_stats.columns)-1}")

# Merge with article metadata
print("  [4/4] Article metadata features...")
article_features = articles[[
    'article_id', 'product_type_no', 'graphical_appearance_no',
    'colour_group_code', 'perceived_colour_value_id', 
    'department_no', 'index_group_no', 'section_no', 'garment_group_no'
]].copy()

item_stats = item_stats.merge(article_features, on='article_id', how='left')

del article_features
force_garbage_collection()

# Add popularity scores from recall stage
item_stats = item_stats.merge(
    item_popularity[['article_id', 'popularity_score']], 
    on='article_id', 
    how='left'
)
item_stats['popularity_score'] = item_stats['popularity_score'].fillna(0).astype(np.float32)

# Convert to optimal dtypes
for col in item_stats.columns:
    if col != 'article_id':
        if item_stats[col].dtype == 'float64':
            item_stats[col] = item_stats[col].astype(np.float32)
        elif item_stats[col].dtype == 'int64':
            item_stats[col] = item_stats[col].astype(np.int32)

print(f"‚úì Created {len(item_stats.columns)-1} item features")
print_memory()

# Save item features for reuse
print("\nSaving item features...")
item_stats.to_parquet(config.OUTPUT_PATH / 'item_features.parquet', index=False)
print(f"‚úì Saved item_features.parquet ({len(item_stats):,} rows, {len(item_stats.columns)-1} features)")


  PART 2: ITEM FEATURES

‚ö° Found existing item_features.parquet, loading...
‚úì Loaded 19 item features from disk
  üíæ Memory: 1.42 GB
  [1/4] Basic item statistics...
  - Created 7 basic features
  [2/4] Recent popularity features...
  - Total features so far: 9
  [3/4] Sales trend features...
  - Total features so far: 10
  [4/4] Article metadata features...
‚úì Created 19 item features
  üíæ Memory: 2.07 GB

Saving item features...
‚úì Saved item_features.parquet (15,932 rows, 19 features)


In [None]:
# PART 3: USER-ITEM INTERACTION FEATURES (CHUNKED)

print_section("PART 3: USER-ITEM INTERACTION FEATURES")

print("Computing interaction features in chunks...")

# Precompute user purchase history for fast lookup
print("  [1/3] Building user purchase history...")
user_purchases = (
    train_transactions
    .groupby('customer_id')['article_id']
    .apply(set)
    .to_dict()
)

user_purchase_list = (
    train_transactions
    .sort_values('t_dat', ascending=False)
    .groupby('customer_id')['article_id']
    .apply(list)
    .to_dict()
)

# Save user purchase history for reuse
print("  [1.5/3] Saving user purchase history...")
user_purchase_df = pd.DataFrame([
    {'customer_id': user, 'purchased_articles': list(items)}
    for user, items in user_purchases.items()
])
user_purchase_df.to_parquet(config.OUTPUT_PATH / 'user_purchase_history.parquet', index=False)
print(f"  ‚úì Saved user_purchase_history.parquet")

del user_purchase_df
force_garbage_collection()

# User category preferences
user_categories = (
    train_transactions
    .merge(articles[['article_id', 'product_type_no']], on='article_id')
    .groupby(['customer_id', 'product_type_no'])
    .size()
    .reset_index(name='count')
)
user_top_category = (
    user_categories
    .sort_values(['customer_id', 'count'], ascending=[True, False])
    .groupby('customer_id')
    .first()
    .reset_index()
    [['customer_id', 'product_type_no']]
    .rename(columns={'product_type_no': 'top_category'})
)

# User price preferences
user_price_stats = train_transactions.groupby('customer_id')['price'].agg(['mean', 'std']).reset_index()
user_price_stats.columns = ['customer_id', 'user_avg_price', 'user_std_price']

# Save additional user stats
print("Saving additional user statistics...")
user_top_category.to_parquet(config.OUTPUT_PATH / 'user_top_categories.parquet', index=False)
user_price_stats.to_parquet(config.OUTPUT_PATH / 'user_price_stats.parquet', index=False)
print(f"  ‚úì Saved user_top_categories.parquet")
print(f"  ‚úì Saved user_price_stats.parquet")

print("  [2/3] Processing candidates in chunks...")

# Split candidates into chunks
n_chunks = max(1, len(candidates) // config.CHUNK_SIZE)
candidate_chunks = np.array_split(candidates, n_chunks)

feature_chunks = []

for chunk_idx, chunk in enumerate(tqdm(candidate_chunks, desc="Feature chunks")):
    # Start with the chunk
    chunk_features = chunk.copy()
    
    # Has user purchased this exact item before?
    chunk_features['has_purchased_item'] = chunk_features.apply(
        lambda row: 1 if row['article_id'] in user_purchases.get(row['customer_id'], set()) else 0,
        axis=1
    ).astype(np.int8)
    
    # If purchased before, get days since last purchase
    def days_since_purchase(row):
        user_items = user_purchase_list.get(row['customer_id'], [])
        if row['article_id'] in user_items:
            # Get position of first occurrence (most recent due to sort)
            try:
                idx = user_items.index(row['article_id'])
                # Approximate days (assuming 1 purchase per day on average)
                return min(idx, 365)
            except:
                return 365
        return 365
    
    chunk_features['days_since_item_purchase'] = chunk_features.apply(
        days_since_purchase, axis=1
    ).astype(np.int16)
    
    # Merge with item stats to get item metadata
    chunk_features = chunk_features.merge(
        item_stats[['article_id', 'product_type_no', 'avg_price', 'popularity_score']], 
        on='article_id', 
        how='left'
    )
    
    # Has user purchased items from this category?
    chunk_features = chunk_features.merge(user_top_category, on='customer_id', how='left')
    chunk_features['category_match'] = (
        chunk_features['product_type_no'] == chunk_features['top_category']
    ).astype(np.int8)
    chunk_features = chunk_features.drop(['product_type_no', 'top_category'], axis=1)
    
    # Price match features
    chunk_features = chunk_features.merge(user_price_stats, on='customer_id', how='left')
    chunk_features['price_vs_user_avg'] = (
        (chunk_features['avg_price'] - chunk_features['user_avg_price']) / 
        (chunk_features['user_std_price'] + 0.01)
    ).astype(np.float32)
    
    chunk_features['is_cheaper_than_usual'] = (
        chunk_features['avg_price'] < chunk_features['user_avg_price']
    ).astype(np.int8)
    
    chunk_features = chunk_features.drop(['user_avg_price', 'user_std_price', 'avg_price'], axis=1)
    
    # Co-purchase score is already in candidates, but create derived features
    if 'copurchase_score' in chunk_features.columns:
        # Normalize copurchase score by user's max copurchase score
        user_max_copurchase = chunk_features.groupby('customer_id')['copurchase_score'].transform('max')
        chunk_features['copurchase_score_normalized'] = (
            chunk_features['copurchase_score'] / (user_max_copurchase + 0.001)
        ).astype(np.float32)
        
        # Binary: has any copurchase signal
        chunk_features['has_copurchase_signal'] = (
            chunk_features['copurchase_score'] > 0
        ).astype(np.int8)
    
    # Recall strategy coverage (how many strategies recommended this item)
    # Already in candidates as 'n_strategies'
    
    # Rank features (rank within each recall strategy)
    for score_col in ['repurchase_score', 'popularity_score', 'copurchase_score', 
                      'userknn_score', 'category_score']:
        if score_col in chunk_features.columns:
            chunk_features[f'{score_col}_rank'] = (
                chunk_features.groupby('customer_id')[score_col]
                .rank(method='dense', ascending=False)
                .astype(np.int16)
            )
    
    # Overall candidate rank (by combined score if available)
    if 'n_strategies' in chunk_features.columns:
        chunk_features['overall_rank'] = (
            chunk_features.groupby('customer_id')['n_strategies']
            .rank(method='dense', ascending=False)
            .astype(np.int16)
        )
    
    # Clean up
    chunk_features = chunk_features.fillna(0)
    
    feature_chunks.append(chunk_features)
    
    # Clean up
    del chunk_features
    if chunk_idx % 10 == 0:
        force_garbage_collection()

print("  [3/3] Combining feature chunks...")
all_features = pd.concat(feature_chunks, ignore_index=True)
del feature_chunks
force_garbage_collection()

print(f"‚úì Created interaction features for {len(all_features):,} candidates")
print_memory()

# MERGE ALL FEATURES

print_section("MERGING ALL FEATURES")

print("Merging user, item, and interaction features...")

# Merge user features
all_features = all_features.merge(user_stats, on='customer_id', how='left')
print(f"  ‚úì Merged user features")

# Merge item features (already partially merged, merge remaining)
remaining_item_cols = [col for col in item_stats.columns if col not in all_features.columns]
remaining_item_cols.append('article_id')
all_features = all_features.merge(item_stats[remaining_item_cols], on='article_id', how='left')
print(f"  ‚úì Merged item features")

# Fill any remaining NaNs (handle categorical columns separately)
print("Filling missing values...")

# Identify categorical columns
categorical_cols = all_features.select_dtypes(include=['category']).columns.tolist()
numerical_cols = all_features.select_dtypes(include=[np.number]).columns.tolist()

# Fill numerical columns with 0
if numerical_cols:
    all_features[numerical_cols] = all_features[numerical_cols].fillna(0)

# Fill categorical columns with their mode or a default value
for col in categorical_cols:
    if all_features[col].isna().any():
        # Get the most frequent category
        mode_value = all_features[col].mode()
        if len(mode_value) > 0:
            all_features[col] = all_features[col].fillna(mode_value[0])
        else:
            # If no mode, convert to string and fill with 'unknown'
            all_features[col] = all_features[col].astype(str).fillna('unknown')

print(f"\n‚úì Total features: {len(all_features.columns) - 2} (excluding customer_id, article_id)")
print(f"‚úì Total candidate-feature pairs: {len(all_features):,}")
print_memory()


  PART 3: USER-ITEM INTERACTION FEATURES

Computing interaction features in chunks...
  [1/3] Building user purchase history...
  [1.5/3] Saving user purchase history...
  ‚úì Saved user_purchase_history.parquet
Saving additional user statistics...
  ‚úì Saved user_top_categories.parquet
  ‚úì Saved user_price_stats.parquet
  [2/3] Processing candidates in chunks...


Feature chunks:   0%|          | 0/80 [00:00<?, ?it/s]

  [3/3] Combining feature chunks...
‚úì Created interaction features for 4,044,442 candidates
  üíæ Memory: 2.43 GB

  MERGING ALL FEATURES

Merging user, item, and interaction features...
  ‚úì Merged user features
  ‚úì Merged item features
Filling missing values...

‚úì Total features: 55 (excluding customer_id, article_id)
‚úì Total candidate-feature pairs: 4,044,442
  üíæ Memory: 1.84 GB


In [171]:
print_section("PART 4: TEXT-BASED SEMANTIC FEATURES")

# Import the text feature module
from text_features import integrate_text_features_stage3

# Check if embeddings exist
embeddings_path = config.DATA_PATH  # Or wherever you ran Stage 2

if (embeddings_path / 'article_embeddings.pkl').exists():
    print("Found saved text embeddings, integrating text features...")
    
    all_features = integrate_text_features_stage3(
        all_features=all_features,
        articles=articles,
        train_transactions=train_transactions,
        embeddings_path=embeddings_path
    )
    
    print(f"\n‚úì Enhanced features with text semantics")
    print(f"  Total features now: {len(all_features.columns) - 2}")
    print_memory()
else:
    print("‚ö†Ô∏è  Text embeddings not found, skipping text features")
    print("   Run Stage 2 with text feature integration first")



  PART 4: TEXT-BASED SEMANTIC FEATURES

Found saved text embeddings, integrating text features...

  STAGE 3 ENHANCEMENT: TEXT-BASED FEATURES

Loading saved embeddings...
  ‚úì Loaded 16,616 article embeddings
  ‚úì Loaded 47,543 user embeddings

Enhancing features with text semantics...

Creating category encoding features...
  Processing 7 categorical columns...
  ‚úì Created 14 category encoding features

Computing semantic diversity features...
  ‚úì Computed diversity for 47,543 users

  Merging category features...
  Merging semantic diversity features...
  Computing user-item text similarities in chunks...
    Processing chunk 1/80...
    Processing chunk 11/80...
    Processing chunk 21/80...
    Processing chunk 31/80...
    Processing chunk 41/80...
    Processing chunk 51/80...
    Processing chunk 61/80...
    Processing chunk 71/80...
  ‚úì Text enhancement complete

‚úì Enhanced features with text semantics
  Total features now: 72
  üíæ Memory: 3.28 GB


In [None]:
print_section("SAVING FEATURES")

print("Saving feature matrix...")

# IMPORTANT: Check if text features were added
text_features_added = any('text_similarity' in col or 'semantic' in col 
                          for col in all_features.columns)

if text_features_added:
    print("  ‚úì Text semantic features detected in feature matrix")
else:
    print("  ‚ö†Ô∏è  No text semantic features detected")

# Save the complete feature matrix
all_features.to_parquet(config.OUTPUT_PATH / 'training_features.parquet', index=False)

file_size = (config.OUTPUT_PATH / 'training_features.parquet').stat().st_size / 1024**2
print(f"‚úì Saved training_features.parquet ({file_size:.2f} MB)")

# Save feature names for later use
feature_names = [col for col in all_features.columns 
                 if col not in ['customer_id', 'article_id']]

with open(config.OUTPUT_PATH / 'feature_names.txt', 'w') as f:
    f.write('\n'.join(feature_names))

print(f"‚úì Saved feature_names.txt ({len(feature_names)} features)")

# NEW: Save feature metadata including text features info
feature_metadata = {
    'total_features': len(feature_names),
    'has_text_features': text_features_added,
    'feature_list': feature_names,
    'timestamp': str(datetime.now())
}

import json
with open(config.OUTPUT_PATH / 'feature_metadata.json', 'w') as f:
    json.dump(feature_metadata, f, indent=2)
print(f"‚úì Saved feature_metadata.json")

# Print feature summary
print("\n" + "="*80)
print("FEATURE SUMMARY")
print("="*80)
print(f"\nTotal features: {len(feature_names)}")

# Categorize features
user_features = [f for f in feature_names if any(x in f.lower() for x in [
    'user', 'customer', 'purchase', 'age', 'active', 'fn', 'trend'
])]

item_features = [f for f in feature_names if any(x in f.lower() for x in [
    'item', 'article', 'sales', 'product', 'colour', 'color', 
    'department', 'section', 'garment', 'frequency', 'count'
]) and f not in user_features]

interaction_features = [f for f in feature_names if any(x in f.lower() for x in [
    'score', 'rank', 'strategies', 'match', 'purchased', 'category_match',
    'price_vs', 'cheaper'
]) and f not in user_features and f not in item_features]

# NEW: Identify text features
text_features = [f for f in feature_names if any(x in f.lower() for x in [
    'text_similarity', 'semantic', 'embedding'
])]

print(f"\nFeature breakdown:")
print(f"  - User features: {len(user_features)}")
print(f"  - Item features: {len(item_features)}")
print(f"  - Interaction features: {len(interaction_features)}")

# NEW: Show text features if present
if text_features:
    print(f"  - Text semantic features: {len(text_features)} ‚úì")
    print(f"\n  Text features included:")
    for feat in text_features:
        print(f"    ‚Ä¢ {feat}")
else:
    print(f"  - Text semantic features: 0 (not integrated)")

# NEW: Show sample of other feature categories
print(f"\n  Sample user features:")
for feat in user_features[:5]:
    print(f"    ‚Ä¢ {feat}")

print(f"\n  Sample item features:")
for feat in item_features[:5]:
    print(f"    ‚Ä¢ {feat}")

print(f"\n  Sample interaction features:")
for feat in interaction_features[:5]:
    print(f"    ‚Ä¢ {feat}")

print("\n SAVED FILES \n")
print("\nIntermediate feature files (for reuse):")
print("user_features.parquet - User-level features")
print("item_features.parquet - Item-level features")
print("user_purchase_history.parquet - User purchase history")
print("user_top_categories.parquet - User category preferences")
print("user_price_stats.parquet - User price statistics")

# NEW: Check for text-related files
if (config.OUTPUT_PATH / 'article_embeddings.pkl').exists():
    print("article_embeddings.pkl - Article text embeddings")
if (config.OUTPUT_PATH / 'user_embeddings.pkl').exists():
    print("user_embeddings.pkl - User preference embeddings")

print("\nFinal output files:")
print("training_features.parquet - Complete feature matrix for training")
print("feature_names.txt - List of all feature names")
print("feature_metadata.json - Feature metadata and info")

print("\n STAGE 3 COMPLETE! \n")

# NEW: Validation checks
print("\nValidation checks:")
print(f"No NaN values: {all_features.isnull().sum().sum() == 0}")
print(f"Total rows: {len(all_features):,}")
print(f"Total features: {len(feature_names)}")
print(f"Memory usage: {all_features.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

if text_features_added:
    print(f"Text features integrated: YES ‚ú®")
    print(f"{len(text_features)} text-based features added")
else:
    print(f"Text features not integrated")
    print(f"Run Stage 2 with text integration first")
    print(f"Or check if embeddings exist in {config.DATA_PATH}")

if not text_features_added:
    print("Consider integrating text features for better performance")
    print("(See text_feature_integration guide)")
else:
    print("Ready for Model Training")

if text_features_added:
    print("\n Text semantic features successfully integrated!")


  SAVING FEATURES

Saving feature matrix...
  ‚úì Text semantic features detected in feature matrix
‚úì Saved training_features.parquet (288.07 MB)
‚úì Saved feature_names.txt (72 features)
‚úì Saved feature_metadata.json

FEATURE SUMMARY

Total features: 72

Feature breakdown:
  - User features: 23
  - Item features: 24
  - Interaction features: 10
  - Text semantic features: 4 ‚úì

  Text features included:
    ‚Ä¢ text_similarity_score
    ‚Ä¢ semantic_diversity
    ‚Ä¢ semantic_range
    ‚Ä¢ user_item_text_similarity

  Sample user features:
    ‚Ä¢ repurchase_score
    ‚Ä¢ copurchase_score
    ‚Ä¢ userknn_score
    ‚Ä¢ has_purchased_item
    ‚Ä¢ days_since_item_purchase

  Sample item features:
    ‚Ä¢ n_unique_articles
    ‚Ä¢ total_sales
    ‚Ä¢ sales_frequency
    ‚Ä¢ sales_last_week
    ‚Ä¢ product_type_no

  Sample interaction features:
    ‚Ä¢ popularity_score_x
    ‚Ä¢ category_score
    ‚Ä¢ text_similarity_score
    ‚Ä¢ n_strategies
    ‚Ä¢ popularity_score_y

SAVED FILES

### Getting Image features

In [None]:
import pandas as pd
import numpy as np
import gc
import os
import psutil
from pathlib import Path
import warnings
from tqdm.auto import tqdm
from sklearn.decomposition import PCA
import pickle

warnings.filterwarnings('ignore')

try:
    import torch
    import torchvision
    from torchvision import models, transforms
    from PIL import Image
    HAS_TORCH = True
    print("‚úì PyTorch available")
    print(f"  PyTorch version: {torch.__version__}")
    
    # Check for MPS (Metal Performance Shaders) support
    if torch.backends.mps.is_available():
        print("‚úì MPS (Metal Performance Shaders) available - Apple Silicon GPU acceleration enabled!")
    else:
        print("‚ö†Ô∏è  MPS not available - will use CPU")
        
except ImportError:
    HAS_TORCH = False
    print("‚ö†Ô∏è  PyTorch not available. Install with: pip install torch torchvision pillow")

try:
    from transformers import CLIPProcessor, CLIPModel
    HAS_CLIP = True
    print("‚úì CLIP/FashionCLIP available")
except ImportError:
    HAS_CLIP = False
    print("‚ö†Ô∏è  CLIP/FashionCLIP not available. Install with: pip install transformers")

‚úì PyTorch available
  PyTorch version: 2.9.1
‚úì MPS (Metal Performance Shaders) available - Apple Silicon GPU acceleration enabled!
‚úì CLIP/FashionCLIP available


In [None]:
# MEMORY MONITORING

def get_memory_usage():
    """Get current memory usage in GB"""
    process = psutil.Process(os.getpid())
    return process.memory_info().rss / 1024**3

def print_memory():
    """Print current memory usage"""
    mem = get_memory_usage()
    print(f"  üíæ Memory: {mem:.2f} GB")

def force_garbage_collection():
    """Aggressive garbage collection"""
    gc.collect()
    gc.collect()
    if torch.backends.mps.is_available():
        torch.mps.empty_cache()  # Clear MPS cache

In [None]:
# CONFIGURATION

class Config:
    # Paths - UPDATE THESE FOR YOUR LOCAL SETUP
    DATA_PATH = Path('/Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2')  # Where your parquet files are
    OUTPUT_PATH = Path('/Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2')  # Where to save outputs
    IMAGE_PATH = Path('/Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/h-and-m-personalized-fashion-recommendations/images')  # H&M image directory
    
    # Image embedding options
    EMBEDDING_METHOD = 'fashion-clip'  # Options: 'fashion-clip', 'resnet50', 'clip', 'efficientnet'
    EMBEDDING_DIM = 512  # Output dimension (will be projected from original)
    
    # Processing - Optimized for Apple Silicon
    BATCH_SIZE = 64  # M4 can handle larger batches efficiently
    IMAGE_SIZE = 224  # Input size for models
    
    # Memory optimization
    PROCESS_SUBSET = False  # Set True to process only subset (for testing)
    SUBSET_SIZE = 10000  # Number of images to process if PROCESS_SUBSET=True
    USE_FP16 = False  # MPS doesn't fully support FP16 yet, keep False
    
    # Apple Silicon specific
    USE_MPS = True  # Enable MPS acceleration
    NUM_WORKERS = 4  # For data loading (M4 has excellent multi-core)
    
    RANDOM_STATE = 42

config = Config()

# Create output directory if it doesn't exist
config.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

In [None]:
# UTILITY FUNCTIONS

def print_section(title):
    print("\n" + "="*80)
    print(f"  {title}")
    print("="*80 + "\n")

def get_device():
    """Get the best available device for Apple Silicon"""
    if config.USE_MPS and torch.backends.mps.is_available():
        return torch.device("mps")
    else:
        return torch.device("cpu")

In [None]:
# IMAGE EMBEDDING EXTRACTION

class ImageEmbeddingExtractor:
    """Extract embeddings from product images using pre-trained models"""
    
    def __init__(self, method='fashion-clip', device=None):
        self.method = method
        self.device = device if device else get_device()
        self.model = None
        self.transform = None
        
        print(f"Using device: {self.device}")
        
        if method == 'fashion-clip':
            self._load_fashion_clip()
        elif method == 'resnet50':
            self._load_resnet()
        elif method == 'clip':
            self._load_clip()
        elif method == 'efficientnet':
            self._load_efficientnet()
    
    def _load_fashion_clip(self):
        """Load pre-trained FashionCLIP - optimized for fashion domain"""
        print("Loading FashionCLIP")
        
        # Load FashionCLIP 2.0 - uses better base model
        self.model = CLIPModel.from_pretrained("patrickjohncyh/fashion-clip")
        self.processor = CLIPProcessor.from_pretrained("patrickjohncyh/fashion-clip")
        self.model.eval()
        
        self.model = self.model.to(self.device)
        
        self.output_dim = 512  # FashionCLIP outputs 512-dim embeddings
        print(f"‚úì FashionCLIP 2.0 loaded on {self.device}, output dim: {self.output_dim}")
        print("  Model details: ViT-B/32 architecture, trained on Farfetch dataset")
        print("  Benefits: Better fashion understanding, semantic similarity, zero-shot capabilities")
    
    def _load_resnet(self):
        """Load pre-trained ResNet50 - optimized for Apple Silicon"""
        print("Loading ResNet50 (pre-trained on ImageNet)...")
        
        # Load model with updated weights parameter (new PyTorch API)
        model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
        
        # Remove final classification layer
        self.model = torch.nn.Sequential(*list(model.children())[:-1])
        self.model.eval()
        
        # Move to MPS device
        self.model = self.model.to(self.device)
        
        # Image preprocessing - using updated normalize values from weights
        self.transform = transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(config.IMAGE_SIZE),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], 
                                std=[0.229, 0.224, 0.225])
        ])
        
        self.output_dim = 2048
        print(f"‚úì ResNet50 loaded on {self.device}, output dim: {self.output_dim}")
    
    def _load_efficientnet(self):
        """Load pre-trained EfficientNet-B0 - optimized for Apple Silicon"""
        print("Loading EfficientNet-B0 (pre-trained on ImageNet)...")
        
        # Load with updated weights parameter
        model = models.efficientnet_b0(weights=models.EfficientNet_B0_Weights.IMAGENET1K_V1)
        
        # Remove final classification layer
        self.model = torch.nn.Sequential(*list(model.children())[:-1])
        self.model.eval()
        
        self.model = self.model.to(self.device)
        
        self.transform = transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(config.IMAGE_SIZE),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], 
                                std=[0.229, 0.224, 0.225])
        ])
        
        self.output_dim = 1280
        print(f"‚úì EfficientNet-B0 loaded on {self.device}, output dim: {self.output_dim}")
    
    def _load_clip(self):
        """Load CLIP model - optimized for Apple Silicon"""
        print("Loading CLIP (vision-language model)...")
        
        self.model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
        self.model.eval()
        
        self.model = self.model.to(self.device)
        
        self.output_dim = 512
        print(f"‚úì CLIP loaded on {self.device}, output dim: {self.output_dim}")
    
    def extract_single(self, image_path):
        """Extract embedding from a single image"""
        try:
            # Load and preprocess image
            img = Image.open(image_path).convert('RGB')
            
            if self.method in ['clip', 'fashion-clip']:
                inputs = self.processor(images=img, return_tensors="pt")
                inputs = {k: v.to(self.device) for k, v in inputs.items()}
                
                with torch.no_grad():
                    image_features = self.model.get_image_features(**inputs)
                
                embedding = image_features.squeeze().cpu().numpy()
            else:
                img_tensor = self.transform(img).unsqueeze(0).to(self.device)
                
                with torch.no_grad():
                    embedding = self.model(img_tensor).squeeze().cpu().numpy()
            
            return embedding
        
        except Exception as e:
            # Return zero vector if image processing fails
            print(f"‚ö†Ô∏è  Error processing {image_path}: {e}")
            return np.zeros(self.output_dim, dtype=np.float32)
    
    def extract_batch(self, image_paths):
        """Extract embeddings from a batch of images - optimized for MPS"""
        embeddings = []
        
        # For CLIP-based models (fashion-clip and clip), process individually
        # because batch processing with processor is more complex
        if self.method in ['clip', 'fashion-clip']:
            for img_path in image_paths:
                emb = self.extract_single(img_path)
                embeddings.append(emb)
            return np.array(embeddings)
        
        # For CNN models (ResNet, EfficientNet), process as batch
        batch_tensors = []
        valid_indices = []
        
        # Load and preprocess all images in batch
        for idx, img_path in enumerate(image_paths):
            try:
                img = Image.open(img_path).convert('RGB')
                img_tensor = self.transform(img)
                batch_tensors.append(img_tensor)
                valid_indices.append(idx)
            except Exception as e:
                print(f"‚ö†Ô∏è  Error loading {img_path}: {e}")
                embeddings.append(np.zeros(self.output_dim, dtype=np.float32))
        
        # Process batch
        if batch_tensors:
            batch = torch.stack(batch_tensors).to(self.device)
            
            with torch.no_grad():
                batch_embeddings = self.model(batch).squeeze().cpu().numpy()
            
            # Handle single vs multiple embeddings
            if len(batch_tensors) == 1:
                batch_embeddings = batch_embeddings.reshape(1, -1)
            
            # Insert embeddings at correct positions
            result_embeddings = []
            batch_idx = 0
            for idx in range(len(image_paths)):
                if idx in valid_indices:
                    result_embeddings.append(batch_embeddings[batch_idx])
                    batch_idx += 1
                else:
                    result_embeddings.append(np.zeros(self.output_dim, dtype=np.float32))
            
            return np.array(result_embeddings)
        
        return np.array(embeddings)

In [None]:
# MAIN EXTRACTION PIPELINE

print_section("STAGE 1.5: IMAGE EMBEDDING EXTRACTION (APPLE SILICON OPTIMIZED)")

# Check if embeddings already exist
if (config.OUTPUT_PATH / 'image_embeddings_3.parquet').exists():
    print("‚ö° Found existing image_embeddings_3.parquet!")
    print("   Delete this file if you want to re-extract embeddings.")
    
    # Load existing embeddings
    image_embeddings_df = pd.read_parquet(config.OUTPUT_PATH / 'image_embeddings_3.parquet')
    print(f"‚úì Loaded {len(image_embeddings_df):,} image embeddings from disk")
    
else:
    print("Image embeddings not found. Starting extraction...")
    
    # Check prerequisites
    if not HAS_TORCH:
        raise ImportError("PyTorch is required. Install with: pip install torch torchvision pillow")
    
    # Load articles
    print("\nLoading articles metadata...")
    articles = pd.read_parquet(config.DATA_PATH / 'articles.parquet')
    print(f"‚úì Loaded {len(articles):,} articles")
    
    # Determine which articles to process
    if config.PROCESS_SUBSET:
        articles = articles.head(config.SUBSET_SIZE)
        print(f"‚ö†Ô∏è  Processing subset of {len(articles):,} articles (PROCESS_SUBSET=True)")
    
    article_ids = articles['article_id'].tolist()
    
    # Check image directory structure
    print(f"\nChecking image directory: {config.IMAGE_PATH}")
    if not config.IMAGE_PATH.exists():
        raise FileNotFoundError(f"Image directory not found: {config.IMAGE_PATH}")
    
    # H&M images are organized as: images/0XX/0XXXXXXXX.jpg
    # Where first 3 digits determine subfolder
    
    # Find available images
    print("Scanning for available images...")
    available_images = {}
    missing_count = 0
    
    for article_id in tqdm(article_ids, desc="Checking images"):
        # Try different possible paths
        article_str = str(article_id).zfill(10)  # Pad to 10 digits
        subfolder = article_str[:3]
        
        # Possible image paths
        possible_paths = [
            config.IMAGE_PATH / subfolder / f"{article_str}.jpg",
            config.IMAGE_PATH / f"{article_str}.jpg",
            config.IMAGE_PATH / f"{article_id}.jpg",
        ]
        
        image_found = False
        for img_path in possible_paths:
            if img_path.exists():
                available_images[article_id] = img_path
                image_found = True
                break
        
        if not image_found:
            missing_count += 1
    
    print(f"\n‚úì Found {len(available_images):,} images")
    print(f"  Missing {missing_count:,} images ({100*missing_count/len(article_ids):.1f}%)")
    
    # Initialize extractor
    print(f"\nInitializing {config.EMBEDDING_METHOD} model...")
    extractor = ImageEmbeddingExtractor(method=config.EMBEDDING_METHOD)
    print_memory()
    
    # Extract embeddings
    print(f"\nExtracting embeddings for {len(available_images):,} images...")
    print(f"  Batch size: {config.BATCH_SIZE}")
    print(f"  Device: {extractor.device}")
    estimated_time = len(available_images) / (config.BATCH_SIZE * 10)  # ~10 batches/sec on M4
    print(f"  Estimated time: {estimated_time:.1f} minutes")
    
    embeddings_dict = {}
    batch_article_ids = []
    batch_image_paths = []
    
    for article_id, img_path in tqdm(available_images.items(), desc="Extracting embeddings"):
        batch_article_ids.append(article_id)
        batch_image_paths.append(img_path)
        
        # Process batch
        if len(batch_article_ids) >= config.BATCH_SIZE:
            batch_embeddings = extractor.extract_batch(batch_image_paths)
            
            for aid, emb in zip(batch_article_ids, batch_embeddings):
                embeddings_dict[aid] = emb
            
            # Clear batch
            batch_article_ids = []
            batch_image_paths = []
            
            # Periodic garbage collection
            if len(embeddings_dict) % (config.BATCH_SIZE * 10) == 0:
                force_garbage_collection()
    
    # Process remaining batch
    if batch_article_ids:
        batch_embeddings = extractor.extract_batch(batch_image_paths)
        for aid, emb in zip(batch_article_ids, batch_embeddings):
            embeddings_dict[aid] = emb
    
    print(f"\n‚úì Extracted {len(embeddings_dict):,} embeddings")
    print_memory()
    
    # Create DataFrame
    print("\nCreating embeddings DataFrame...")
    
    # For missing images, use mean embedding or zero vector
    mean_embedding = np.mean(list(embeddings_dict.values()), axis=0) if embeddings_dict else np.zeros(extractor.output_dim)
    
    all_embeddings = []
    for article_id in article_ids:
        if article_id in embeddings_dict:
            emb = embeddings_dict[article_id]
        else:
            emb = mean_embedding  # Use mean for missing images
        
        all_embeddings.append(emb)
    
    # Create DataFrame with article_id and embedding columns
    image_embeddings_df = pd.DataFrame({
        'article_id': article_ids
    })
    
    # Add embedding dimensions as separate columns
    embedding_matrix = np.array(all_embeddings)
    
    # Project to target dimension if needed
    if embedding_matrix.shape[1] != config.EMBEDDING_DIM:
        print(f"\nProjecting embeddings from {embedding_matrix.shape[1]} to {config.EMBEDDING_DIM} dimensions...")
        
        pca = PCA(n_components=config.EMBEDDING_DIM, random_state=config.RANDOM_STATE)
        embedding_matrix = pca.fit_transform(embedding_matrix)
        
        print(f"  Explained variance: {pca.explained_variance_ratio_.sum():.3f}")
    
    # Add embedding columns
    for i in range(embedding_matrix.shape[1]):
        image_embeddings_df[f'image_emb_{i}'] = embedding_matrix[:, i].astype(np.float32)
    
    # Save embeddings
    print("\nSaving image embeddings...")
    image_embeddings_df.to_parquet(config.OUTPUT_PATH / 'image_embeddings_3.parquet', index=False)
    
    file_size = (config.OUTPUT_PATH / 'image_embeddings_3.parquet').stat().st_size / 1024**2
    print(f"‚úì Saved image_embeddings.parquet ({file_size:.2f} MB)")
    
    # Clean up
    del extractor, embeddings_dict, embedding_matrix
    force_garbage_collection()

In [None]:
# INTEGRATE INTO TRAINING FEATURES

print_section("INTEGRATING IMAGE EMBEDDINGS INTO TRAINING FEATURES")

# Check for different possible training feature files
training_file = None
possible_files = [
    'training_features.parquet'
]

for filename in possible_files:
    if (Path("/Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_features_2/" + filename)).exists():
        training_file = filename
        break
try:
    image_embeddings_df = pd.read_parquet(config.OUTPUT_PATH / 'image_embeddings_3.parquet')
except Exception as e:
    print(f"unable to load image embeddings")

if training_file is None:
    print("No training features file found!")
    print(f"\n Image embeddings saved to: {config.OUTPUT_PATH / 'image_embeddings.parquet'}")
else:
    # Load training features
    print(f"Loading {training_file}...")
    training_features = pd.read_parquet("/Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_features_2/" + training_file)
    print(f"‚úì Loaded training features: {len(training_features):,} rows")
    print(f"  Current features: {len(training_features.columns) - 2}")  # Exclude customer_id, article_id
    print_memory()

    # Merge image embeddings
    print("\nMerging image embeddings with training features...")
    print(f"  Image embeddings: {len(image_embeddings_df):,} articles")

    # Merge on article_id
    training_features = training_features.merge(
        image_embeddings_df,
        on='article_id',
        how='left'
    )

    print(f"Merged successfully")
    print(f"New features: {len(training_features.columns) - 2}")

    # Check for missing embeddings
    image_cols = [col for col in training_features.columns if col.startswith('image_emb_')]
    missing_embeddings = training_features[image_cols].isna().any(axis=1).sum()

    if missing_embeddings > 0:
        print(f"\n Found {missing_embeddings:,} rows with missing image embeddings")
        print("Filling with mean values...")
        
        # Fill with mean
        for col in image_cols:
            mean_val = training_features[col].mean()
            training_features[col] = training_features[col].fillna(mean_val)
        
        print("‚úì Filled missing values")

    # Convert to float32 to save memory
    print("\nOptimizing data types...")
    for col in image_cols:
        training_features[col] = training_features[col].astype(np.float32)

    print_memory()

    # Save updated training features
    print(f"\nSaving updated {training_file}...")
    training_features.to_parquet(
        config.OUTPUT_PATH / training_file,
        index=False
    )

    file_size = (config.OUTPUT_PATH / training_file).stat().st_size / 1024**2
    print(f"‚úì Saved {training_file} ({file_size:.2f} MB)")

    # Update feature names
    feature_names = [col for col in training_features.columns if col not in ['customer_id', 'article_id']]

    with open(config.OUTPUT_PATH / 'feature_names.txt', 'w') as f:
        f.write('\n'.join(feature_names))

    print(f"‚úì Updated feature_names.txt ({len(feature_names)} features)")

    # ============================================================================
    # SUMMARY
    # ============================================================================

    print_section("IMAGE EMBEDDING INTEGRATION COMPLETE!")

    print("Summary:")
    print(f"Image embeddings extracted: {len(image_embeddings_df):,}")
    print(f"Image embedding dimensions: {config.EMBEDDING_DIM}")
    print(f"Total features: {len(feature_names)}")
    print(f"User features: {len([f for f in feature_names if any(x in f for x in ['user', 'purchase', 'age'])])}")
    print(f"Item features: {len([f for f in feature_names if any(x in f for x in ['sales', 'product', 'department'])])}")
    print(f"Interaction features: {len([f for f in feature_names if any(x in f for x in ['match', 'rank', 'score', 'similarity'])])}")
    print(f"Image features: {len(image_cols)}")

    print("\nFiles created/updated:")
    print(f"image_embeddings.parquet - Image embeddings for all articles")
    print(f"{training_file} - Updated with image embeddings")
    print(f"feature_names.txt - Updated feature list")

print(" Ready for Stage 4: Model Training with Image Features!")


  INTEGRATING IMAGE EMBEDDINGS INTO TRAINING FEATURES

Loading training_features.parquet...
‚úì Loaded training features: 4,044,442 rows
  Current features: 72
  üíæ Memory: 2.13 GB

Merging image embeddings with training features...
  Image embeddings: 16,616 articles
‚úì Merged successfully
  New features: 584

Optimizing data types...
  üíæ Memory: 0.60 GB

Saving updated training_features.parquet...
‚úì Saved training_features.parquet (5126.99 MB)
‚úì Updated feature_names.txt (584 features)

  IMAGE EMBEDDING INTEGRATION COMPLETE!

Summary:
  üì∏ Image embeddings extracted: 16,616
  üìä Image embedding dimensions: 512
  üéØ Total features: 584
     - User features: 531
     - Item features: 12
     - Interaction features: 16
     - Image features: 512

Files created/updated:
  ‚úì image_embeddings.parquet - Image embeddings for all articles
  ‚úì training_features.parquet - Updated with image embeddings
  ‚úì feature_names.txt - Updated feature list

‚úÖ Ready for Stage 4: M

In [57]:
import pandas as pd

# Path to your file
file_path = '/Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_features_2/training_features.parquet'

try:
    # Load the parquet file
    print(f"Loading {file_path}...")
    df = pd.read_parquet(file_path)

    # 1. Show dimensions (rows, columns)
    print(f"\nShape: {df.shape}")

    # 2. Show column names
    print(f"\nColumns: {df.columns.tolist()}")

    # 3. Show the first 5 rows
    print("\nFirst 5 rows:")
    print(df[100:105]["userknn_score"])

    # 4. (Optional) Show data types and memory usage
    print("\nInfo:")
    print(df.info())

except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found.")
except Exception as e:
    print(f"An error occurred: {e}")

Loading /Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_features_2/training_features.parquet...

Shape: (4044442, 74)

Columns: ['customer_id', 'article_id', 'repurchase_score', 'popularity_score_x', 'copurchase_score', 'userknn_score', 'category_score', 'text_similarity_score', 'n_strategies', 'has_purchased_item', 'days_since_item_purchase', 'popularity_score_y', 'category_match', 'price_vs_user_avg', 'is_cheaper_than_usual', 'copurchase_score_normalized', 'has_copurchase_signal', 'repurchase_score_rank', 'copurchase_score_rank', 'userknn_score_rank', 'category_score_rank', 'overall_rank', 'n_purchases', 'avg_price', 'std_price', 'min_price', 'max_price', 'days_since_first_purchase', 'days_since_last_purchase', 'purchase_frequency', 'n_purchases_last_week', 'avg_price_last_week', 'is_active_last_week', 'n_unique_articles', 'n_unique_categories', 'exploration_rate', 'age', 'FN', 'Active', 'purchase_trend', 'n_unique_buyers', 'total_sales', 'days_since_first_sal

### Stage 4: Model Training

In [2]:
import pandas as pd
import numpy as np
import torch
from pathlib import Path
from tqdm.auto import tqdm
import gc
import pickle
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [3]:
# CONFIGURATION

class Config:
    # Paths
    DATA_PATH = Path('/Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2')
    MODEL_PATH = Path('/Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2/models')
    
    MODEL_PATH.mkdir(exist_ok=True, parents=True)
    
    # Device
    if torch.backends.mps.is_available():
        DEVICE = torch.device('mps')
        print("Using Apple Silicon GPU (MPS)")
    else:
        DEVICE = torch.device('cpu')
        print("Using CPU")
    
    # Sampling configuration - LARGE DATASET
    MAX_POSITIVE_SAMPLES = 2_500_000  # 2.5M positives
    MAX_NEGATIVE_SAMPLES = 1_500_000  # 1.5M negatives
    HARD_NEGATIVE_RATIO = 0.7  # 70% hard, 30% random
    
    VAL_SIZE = 0.15  # 15% validation (smaller to keep more for training)
    RANDOM_STATE = 42

config = Config()

Using Apple Silicon GPU (MPS)


In [None]:
print("="*80)
print("LOADING DATA")
print("="*80)

print("\nLoading training_features.parquet...")
training_features = pd.read_parquet(config.DATA_PATH / 'training_features.parquet')
print(f"‚úì Loaded {len(training_features):,} candidates")
print(f"  Memory: {training_features.memory_usage(deep=True).sum() / 1024**3:.2f} GB")

# Load ground truth for validation period
print("\nLoading val_ground_truth.parquet...")
val_ground_truth = pd.read_parquet(Path('/Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2') / 'val_ground_truth.parquet')
print(f"‚úì Loaded validation ground truth: {len(val_ground_truth):,} users")

# Load training transactions for creating training period labels
print("\nLoading train_transactions.parquet...")
train_transactions = pd.read_parquet(config.DATA_PATH / 'train_transactions.parquet')
print(f"‚úì Loaded training transactions: {len(train_transactions):,} rows")
print(f"  Memory: {train_transactions.memory_usage(deep=True).sum() / 1024**3:.2f} GB")

# Identify validation users
val_users_set = set(val_ground_truth['customer_id'].unique())
all_users_in_features = set(training_features['customer_id'].unique())
train_only_users = all_users_in_features - val_users_set

print(f"\nUser breakdown:")
print(f"  Total users in candidates: {len(all_users_in_features):,}")
print(f"  Validation users: {len(val_users_set):,}")
print(f"  Training-only users: {len(train_only_users):,}")

gc.collect()

LOADING DATA

Loading training_features.parquet...
‚úì Loaded 4,044,442 candidates
  Memory: 9.22 GB

Loading val_ground_truth.parquet...
‚úì Loaded validation ground truth: 4,943 users

Loading train_transactions.parquet...
‚úì Loaded training transactions: 412,156 rows
  Memory: 0.05 GB

User breakdown:
  Total users in candidates: 47,543
  Validation users: 4,943
  Training-only users: 43,842


0

: 

In [None]:
# CREATE LABELS - OPTIMIZED FOR TRAINING & VALIDATION USERS

print("\n" + "="*80)
print("CREATING LABELS")
print("="*80)

# Step 1: Create training period labels (for ALL users)
print("\n[1/3] Creating training period labels...")

# Check train_transactions structure
if len(train_transactions) == 0:
    raise ValueError("train_transactions is empty! Cannot create training labels.")
    
if 'customer_id' not in train_transactions.columns or 'article_id' not in train_transactions.columns:
    raise ValueError(f"train_transactions missing required columns. Available: {train_transactions.columns.tolist()}")

print(f"  Processing {len(train_transactions):,} training transactions...")

train_ground_truth = (
    train_transactions
    .groupby('customer_id')['article_id']
    .apply(list)
    .reset_index()
    .rename(columns={'article_id': 'purchased_articles'})
)

print(f"  ‚úì Created train_ground_truth: {len(train_ground_truth):,} users")
print(f"  Columns: {train_ground_truth.columns.tolist()}")

# Verify purchased_articles column exists before explode
if 'purchased_articles' not in train_ground_truth.columns:
    print(f"  ‚ö†Ô∏è Error: 'purchased_articles' column not found!")
    print(f"  Available columns: {train_ground_truth.columns.tolist()}")
    raise KeyError(f"'purchased_articles' column not found in train_ground_truth")

# Explode the lists into individual rows
train_ground_truth_exploded = train_ground_truth.explode('purchased_articles')

# Check if explode worked correctly
if len(train_ground_truth_exploded) == 0:
    print("  ‚ö†Ô∏è Warning: train_ground_truth_exploded is empty after explode!")
    print(f"  train_ground_truth shape: {train_ground_truth.shape}")
    print(f"  Sample of purchased_articles: {train_ground_truth['purchased_articles'].head()}")
    raise ValueError("train_ground_truth_exploded is empty - check if purchased_articles contains valid lists")
else:
    print(f"  ‚úì Exploded to {len(train_ground_truth_exploded):,} rows")

# Rename the column from purchased_articles to article_id
train_ground_truth_exploded = train_ground_truth_exploded.rename(
    columns={'purchased_articles': 'article_id'}
)

# Add train_label column
train_ground_truth_exploded['train_label'] = 1

# Verify the column was created
if 'train_label' not in train_ground_truth_exploded.columns:
    print(f"  ‚ö†Ô∏è Error: Failed to create 'train_label' column!")
    print(f"  DataFrame type: {type(train_ground_truth_exploded)}")
    print(f"  Available columns: {train_ground_truth_exploded.columns.tolist()}")
    print(f"  DataFrame shape: {train_ground_truth_exploded.shape}")
    raise KeyError("'train_label' column was not created successfully")

# Verify all required columns exist before merge
required_cols = ['customer_id', 'article_id', 'train_label']
missing_cols = [col for col in required_cols if col not in train_ground_truth_exploded.columns]
if missing_cols:
    print(f"  ‚ö†Ô∏è Error: Missing columns in train_ground_truth_exploded: {missing_cols}")
    print(f"  Available columns: {train_ground_truth_exploded.columns.tolist()}")
    print(f"  DataFrame info:")
    print(train_ground_truth_exploded.info())
    raise KeyError(f"Missing required columns: {missing_cols}")
else:
    print(f"  ‚úì All required columns present: {required_cols}")
    print(f"  train_ground_truth_exploded shape: {train_ground_truth_exploded.shape}")

# Merge training labels
training_features = training_features.merge(
    train_ground_truth_exploded[required_cols],
    on=['customer_id', 'article_id'],
    how='left'
)
training_features['train_label'] = training_features['train_label'].fillna(0).astype(np.int8)

print(f"  ‚úì Training period positives: {training_features['train_label'].sum():,}")

# Clean up
del train_ground_truth, train_ground_truth_exploded
gc.collect()

# Step 2: Create validation period labels (only for validation users)
print("\n[2/3] Creating validation period labels...")
val_ground_truth_exploded = val_ground_truth.explode('purchased_articles').rename(
    columns={'purchased_articles': 'article_id'}
)
val_ground_truth_exploded['val_label'] = 1

# Merge validation labels
training_features = training_features.merge(
    val_ground_truth_exploded[['customer_id', 'article_id', 'val_label']],
    on=['customer_id', 'article_id'],
    how='left'
)
training_features['val_label'] = training_features['val_label'].fillna(0).astype(np.int8)

print(f"  ‚úì Validation period positives: {training_features['val_label'].sum():,}")

# Clean up
del val_ground_truth_exploded
gc.collect()

# Step 3: Create final label - use validation labels for validation users, training labels for others
print("\n[3/3] Creating final labels (vectorized)...")
val_users_set = set(val_ground_truth['customer_id'].unique())

# Vectorized approach - much faster than apply()
# For validation users: use validation period labels (predict future purchases)
# For training-only users: use training period labels (learn from past purchases)
is_val_user = training_features['customer_id'].isin(val_users_set)
training_features['label'] = np.where(
    is_val_user,
    training_features['val_label'],
    training_features['train_label']
).astype(np.int8)

# Add user_type for tracking (vectorized)
# Convert to pandas Series first, then to category dtype (numpy arrays don't support category)
training_features['user_type'] = pd.Series(
    np.where(is_val_user, 'validation', 'training_only'),
    dtype='category'
)

print(f"\n‚úì Final label summary:")
print(f"  Total positives: {training_features['label'].sum():,}")
print(f"  Total negatives: {(training_features['label']==0).sum():,}")
print(f"\n  By user type:")
print(f"    Validation users - Positives: {training_features[training_features['user_type']=='validation']['label'].sum():,}")
print(f"    Validation users - Negatives: {(training_features[training_features['user_type']=='validation']['label']==0).sum():,}")
print(f"    Training-only users - Positives: {training_features[training_features['user_type']=='training_only']['label'].sum():,}")
print(f"    Training-only users - Negatives: {(training_features[training_features['user_type']=='training_only']['label']==0).sum():,}")

# Clean up intermediate columns (keep for now in case needed for analysis)
# training_features = training_features.drop(['train_label', 'val_label'], axis=1)
gc.collect()


CREATING LABELS

[1/3] Creating training period labels...
  Processing 412,156 training transactions...
  ‚úì Created train_ground_truth: 47,543 users
  Columns: ['customer_id', 'purchased_articles']
  ‚úì Exploded to 412,156 rows
  ‚úì All required columns present: ['customer_id', 'article_id', 'train_label']
  train_ground_truth_exploded shape: (412156, 3)


In [None]:
# FEATURE SEPARATION

print("\n" + "="*80)
print("SEPARATING FEATURE TYPES")
print("="*80)

# Load feature names
with open(config.DATA_PATH / 'feature_names.txt', 'r') as f:
    all_features = [line.strip() for line in f.readlines()]

print(f"Total features: {len(all_features)}")

# Identify feature types
user_features = [f for f in all_features if any(x in f for x in 
    ['user', 'customer', 'purchase', 'age', 'Active', 'FN', 'exploration', 'diversity', 'trend'])]

item_features = [f for f in all_features if any(x in f for x in 
    ['sales', 'product', 'department', 'section', 'colour', 'garment', 'article', 
     'perceived', 'graphical', 'index_'])]

image_features = [f for f in all_features if f.startswith('image_emb_')]

# Interaction features are everything else (excluding user, item, image)
interaction_features = [f for f in all_features 
                        if f not in user_features 
                        and f not in item_features 
                        and f not in image_features]

print(f"\nFeature breakdown:")
print(f"  User features: {len(user_features)}")
print(f"  Item features: {len(item_features)}")
print(f"  Image features: {len(image_features)}")
print(f"  Interaction features: {len(interaction_features)}")

# Update config with actual dimensions
config.USER_FEATURE_DIM = len(user_features)
config.ITEM_FEATURE_DIM = len(item_features)
config.IMAGE_FEATURE_DIM = len(image_features)
config.INTERACTION_FEATURE_DIM = len(interaction_features)

# Save feature lists
feature_dict = {
    'user_features': user_features,
    'item_features': item_features,
    'image_features': image_features,
    'interaction_features': interaction_features,
    'all_features': all_features
}

with open(config.MODEL_PATH / 'feature_dict.pkl', 'wb') as f:
    pickle.dump(feature_dict, f)

print(f"‚úì Saved feature_dict.pkl")

In [None]:
# SPLIT INTO TRAIN/VAL AND SAVE

print("\n" + "="*80)
print("SPLITTING INTO TRAIN/VAL")
print("="*80)

# Split into train and validation sets
# Use stratified split to maintain label distribution
from sklearn.model_selection import train_test_split

# Get feature columns (exclude customer_id, article_id, label)
feature_cols = [col for col in training_features.columns 
                if col not in ['customer_id', 'article_id', 'label']]

# Split the data
train_data, val_data = train_test_split(
    training_features,
    test_size=config.VAL_SIZE,
    random_state=config.RANDOM_STATE,
    stratify=training_features['label']  # Maintain label distribution
)

# Reset indices
train_data = train_data.reset_index(drop=True)
val_data = val_data.reset_index(drop=True)

print(f"\n‚úì Train set: {len(train_data):,} samples")
print(f"  Positive: {train_data['label'].sum():,} ({100*train_data['label'].mean():.2f}%)")
print(f"  Negative: {(train_data['label']==0).sum():,} ({100*(1-train_data['label'].mean()):.2f}%)")

print(f"\n‚úì Val set: {len(val_data):,} samples")
print(f"  Positive: {val_data['label'].sum():,} ({100*val_data['label'].mean():.2f}%)")
print(f"  Negative: {(val_data['label']==0).sum():,} ({100*(1-val_data['label'].mean()):.2f}%)")

# Save train and validation datasets
print("\nSaving datasets...")
train_data.to_parquet(config.MODEL_PATH / 'train_data.parquet', index=False)
val_data.to_parquet(config.MODEL_PATH / 'val_data.parquet', index=False)

print(f"‚úì Saved train_data.parquet ({len(train_data):,} rows)")
print(f"‚úì Saved val_data.parquet ({len(val_data):,} rows)")

# Clean up
del training_features
gc.collect()

print("\n Dataset creation complete! Ready for model training.")


In [4]:
# DOWNSAMPLE DATASETS FOR LOCAL TRAINING (M4 MacBook Air Optimized)

print("\n" + "="*80)
print("DOWNSAMPLING DATASETS FOR LOCAL TRAINING")
print("="*80)
print("\nStrategy: Keep ALL positives, sample negatives for balanced ratio (~40:60)\n")

# Configuration for balanced sampling
TARGET_POSITIVE_RATIO = 0.30  # 40% positive, 60% negative
RANDOM_STATE = 42

# Load the saved datasets
print("Loading saved datasets...")
train_data = pd.read_parquet(config.MODEL_PATH / 'train_data.parquet')
val_data = pd.read_parquet(config.MODEL_PATH / 'val_data.parquet')

print(f"\nOriginal dataset sizes:")
print(f"  Train: {len(train_data):,} samples")
print(f"    Positives: {train_data['label'].sum():,} ({100*train_data['label'].mean():.2f}%)")
print(f"    Negatives: {(train_data['label']==0).sum():,} ({100*(1-train_data['label'].mean()):.2f}%)")
print(f"  Val: {len(val_data):,} samples")
print(f"    Positives: {val_data['label'].sum():,} ({100*val_data['label'].mean():.2f}%)")
print(f"    Negatives: {(val_data['label']==0).sum():,} ({100*(1-val_data['label'].mean()):.2f}%)")

# Function to downsample while keeping all positives
def downsample_balanced(df, target_positive_ratio=0.40, random_state=42):
    """
    Downsample dataset keeping ALL positives and sampling negatives
    to achieve target positive ratio
    """
    positives = df[df['label'] == 1].copy()
    negatives = df[df['label'] == 0].copy()
    
    n_positives = len(positives)
    
    # Calculate how many negatives we need for target ratio
    # target_positive_ratio = n_positives / (n_positives + n_negatives)
    # Solving for n_negatives:
    # n_negatives = n_positives * (1 - target_positive_ratio) / target_positive_ratio
    n_negatives_needed = int(n_positives * (1 - target_positive_ratio) / target_positive_ratio)
    
    # Sample negatives (don't exceed available)
    n_negatives_to_sample = min(n_negatives_needed, len(negatives))
    
    if n_negatives_to_sample < len(negatives):
        negatives_sampled = negatives.sample(n=n_negatives_to_sample, random_state=random_state)
    else:
        negatives_sampled = negatives.copy()
    
    # Combine
    balanced_df = pd.concat([positives, negatives_sampled], ignore_index=True)
    
    # Shuffle
    balanced_df = balanced_df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    
    return balanced_df, n_positives, n_negatives_to_sample

# Downsample training set
print("\n" + "-"*80)
print("Downsampling TRAINING set...")
print("-"*80)
train_data_downsampled, train_pos, train_neg = downsample_balanced(
    train_data, 
    target_positive_ratio=TARGET_POSITIVE_RATIO,
    random_state=RANDOM_STATE
)

print(f"\n‚úì Downsampled training set:")
print(f"  Total samples: {len(train_data_downsampled):,}")
print(f"  Positives: {train_pos:,} ({100*train_data_downsampled['label'].mean():.2f}%)")
print(f"  Negatives: {train_neg:,} ({100*(1-train_data_downsampled['label'].mean()):.2f}%)")
print(f"  Reduction: {100*(1 - len(train_data_downsampled)/len(train_data)):.1f}%")

# Downsample validation set
print("\n" + "-"*80)
print("Downsampling VALIDATION set...")
print("-"*80)
val_data_downsampled, val_pos, val_neg = downsample_balanced(
    val_data,
    target_positive_ratio=TARGET_POSITIVE_RATIO,
    random_state=RANDOM_STATE
)

print(f"\n‚úì Downsampled validation set:")
print(f"  Total samples: {len(val_data_downsampled):,}")
print(f"  Positives: {val_pos:,} ({100*val_data_downsampled['label'].mean():.2f}%)")
print(f"  Negatives: {val_neg:,} ({100*(1-val_data_downsampled['label'].mean()):.2f}%)")
print(f"  Reduction: {100*(1 - len(val_data_downsampled)/len(val_data)):.1f}%")

# Save downsampled datasets (overwrite original files)
print("\n" + "="*80)
print("SAVING DOWNSAMPLED DATASETS")
print("="*80)

print("\nSaving downsampled datasets...")
train_data_downsampled.to_parquet(config.MODEL_PATH / 'train_data.parquet', index=False)
val_data_downsampled.to_parquet(config.MODEL_PATH / 'val_data.parquet', index=False)

print(f"‚úì Saved train_data.parquet ({len(train_data_downsampled):,} rows)")
print(f"‚úì Saved val_data.parquet ({len(val_data_downsampled):,} rows)")

# Calculate file sizes
train_size_mb = (config.MODEL_PATH / 'train_data.parquet').stat().st_size / 1024**2
val_size_mb = (config.MODEL_PATH / 'val_data.parquet').stat().st_size / 1024**2

print(f"\nFile sizes:")
print(f"  train_data.parquet: {train_size_mb:.2f} MB")
print(f"  val_data.parquet: {val_size_mb:.2f} MB")
print(f"  Total: {train_size_mb + val_size_mb:.2f} MB")

# Clean up
del train_data, val_data
gc.collect()

print("\n‚úÖ Downsampling complete! Datasets optimized for M4 MacBook Air local training.")
print(f"\nSummary:")
print(f"  Train: {len(train_data_downsampled):,} samples ({train_size_mb:.1f} MB)")
print(f"  Val: {len(val_data_downsampled):,} samples ({val_size_mb:.1f} MB)")
print(f"  Positive ratio: ~{TARGET_POSITIVE_RATIO*100:.0f}% (balanced for training)")



DOWNSAMPLING DATASETS FOR LOCAL TRAINING

Strategy: Keep ALL positives, sample negatives for balanced ratio (~40:60)

Loading saved datasets...

Original dataset sizes:
  Train: 685,300 samples
    Positives: 274,142 (40.00%)
    Negatives: 411,158 (60.00%)
  Val: 121,164 samples
    Positives: 48,444 (39.98%)
    Negatives: 72,720 (60.02%)

--------------------------------------------------------------------------------
Downsampling TRAINING set...
--------------------------------------------------------------------------------

‚úì Downsampled training set:
  Total samples: 685,300
  Positives: 274,142 (40.00%)
  Negatives: 411,158 (60.00%)
  Reduction: 0.0%

--------------------------------------------------------------------------------
Downsampling VALIDATION set...
--------------------------------------------------------------------------------

‚úì Downsampled validation set:
  Total samples: 121,164
  Positives: 48,444 (39.98%)
  Negatives: 72,720 (60.02%)
  Reduction: 0.0%

S

### Stage 4A: LightGBM Training & Reranking

This stage implements multiple LightGBM models for reranking candidates:
- **LightGBM Classifier**: Binary classification approach
- **LightGBM Ranker (LambdaRank)**: Learning-to-rank with MAP@12 optimization
- **LightGBM Ranker (XENDCG)**: Alternative ranking objective
- **Ensemble**: Weighted combination of all models

**Key Features:**
- MAP@12 evaluation metric (competition metric)
- Time-based cross-validation
- Feature importance analysis
- Model checkpointing and saving
- Optimized for M4 MacBook Air


In [5]:
# IMPORTS AND CONFIGURATION

import pandas as pd
import numpy as np
import lightgbm as lgb
from pathlib import Path
import pickle
import gc
from tqdm.auto import tqdm
import warnings
from sklearn.model_selection import train_test_split
import json
from datetime import datetime

warnings.filterwarnings('ignore')

# Configuration
class LightGBMConfig:
    # Paths
    DATA_PATH = Path('/Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2')
    MODEL_PATH = Path('/Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2/models')
    
    MODEL_PATH.mkdir(exist_ok=True, parents=True)
    
    # Training configuration
    RANDOM_STATE = 42
    N_FOLDS = 5  # Number of CV folds
    EARLY_STOPPING_ROUNDS = 100
    VERBOSE_EVAL = 50
    
    # Model configurations
    N_ESTIMATORS = 2000  # Maximum boosting rounds
    
    # Device (LightGBM uses CPU by default, but optimized for M4)
    DEVICE = 'cpu'  # LightGBM doesn't support MPS, but CPU is fast on M4
    
    # Feature selection
    MIN_FEATURE_IMPORTANCE = 0.001  # Minimum importance to keep feature
    
print("‚úì LightGBM Configuration loaded")
print(f"  Model path: {LightGBMConfig.MODEL_PATH}")
print(f"  Data path: {LightGBMConfig.DATA_PATH}")


‚úì LightGBM Configuration loaded
  Model path: /Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2/models
  Data path: /Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2


In [6]:
# ============================================================================
# MAP@12 EVALUATION FUNCTION
# ============================================================================

def calculate_map_at_k(y_true, y_pred, k=12):
    """
    Calculate Mean Average Precision at K (MAP@K)
    
    Args:
        y_true: List of lists, where each inner list contains the true article_ids
        y_pred: List of lists, where each inner list contains predicted article_ids (ranked)
        k: Number of top predictions to consider
    
    Returns:
        MAP@K score
    """
    if len(y_true) == 0:
        return 0.0
    
    # Limit predictions to top k
    y_pred = [pred[:k] for pred in y_pred]
    
    # Calculate AP for each user
    aps = []
    for true_items, pred_items in zip(y_true, y_pred):
        if len(true_items) == 0:
            continue
            
        # Convert to sets for faster lookup
        true_set = set(true_items)
        
        # Calculate precision at each position
        hits = 0
        precisions = []
        
        for i, pred_item in enumerate(pred_items):
            if pred_item in true_set:
                hits += 1
                precision = hits / (i + 1)
                precisions.append(precision)
        
        # Average Precision (AP) = mean of precisions at hit positions
        if len(precisions) > 0:
            ap = np.mean(precisions)
            aps.append(ap)
        else:
            aps.append(0.0)
    
    # MAP = mean of all APs
    return np.mean(aps) if len(aps) > 0 else 0.0


def evaluate_map_at_12(df, predictions, customer_col='customer_id', 
                       article_col='article_id', label_col='label', k=12):
    """
    Evaluate MAP@12 for a dataframe with predictions
    
    Args:
        df: DataFrame with customer_id, article_id, label columns
        predictions: Array of prediction scores (same length as df)
        customer_col: Name of customer ID column
        article_col: Name of article ID column
        label_col: Name of label column
        k: Number of top predictions to consider
    
    Returns:
        MAP@12 score
    """
    # Add predictions to dataframe
    df_eval = df[[customer_col, article_col, label_col]].copy()
    df_eval['pred_score'] = predictions
    
    # Get true positives (purchased items) for each customer
    true_positives = df_eval[df_eval[label_col] == 1].groupby(customer_col)[article_col].apply(list).to_dict()
    
    # Get top-k predictions for each customer
    top_predictions = (df_eval.groupby(customer_col)
                      .apply(lambda x: x.nlargest(k, 'pred_score')[article_col].tolist())
                      .to_dict())
    
    # Calculate MAP@12
    y_true = []
    y_pred = []
    
    # Only evaluate on customers with true positives
    for customer_id in true_positives.keys():
        if customer_id in top_predictions:
            y_true.append(true_positives[customer_id])
            y_pred.append(top_predictions[customer_id])
    
    map_score = calculate_map_at_k(y_true, y_pred, k=k)
    return map_score


print("‚úì MAP@12 evaluation functions loaded")


‚úì MAP@12 evaluation functions loaded


In [7]:
# ============================================================================
# LOAD DATA AND PREPARE FEATURES
# ============================================================================

print("\n" + "="*80)
print("LOADING TRAINING DATA")
print("="*80)

# Load train and validation datasets
print("\nLoading train_data.parquet...")
train_data = pd.read_parquet(LightGBMConfig.MODEL_PATH / 'train_data.parquet')
print(f"‚úì Loaded {len(train_data):,} training samples")
print(f"  Memory: {train_data.memory_usage(deep=True).sum() / 1024**3:.2f} GB")
print(f"  Positives: {train_data['label'].sum():,} ({100*train_data['label'].mean():.2f}%)")
print(f"  Negatives: {(train_data['label']==0).sum():,} ({100*(1-train_data['label'].mean()):.2f}%)")

print("\nLoading val_data.parquet...")
val_data = pd.read_parquet(LightGBMConfig.MODEL_PATH / 'val_data.parquet')
print(f"‚úì Loaded {len(val_data):,} validation samples")
print(f"  Memory: {val_data.memory_usage(deep=True).sum() / 1024**3:.2f} GB")
print(f"  Positives: {val_data['label'].sum():,} ({100*val_data['label'].mean():.2f}%)")
print(f"  Negatives: {(val_data['label']==0).sum():,} ({100*(1-val_data['label'].mean()):.2f}%)")

# Identify feature columns (exclude ID and label columns)
exclude_cols = ['customer_id', 'article_id', 'label', 'user_type', 'train_label', 'val_label']
feature_cols = [col for col in train_data.columns if col not in exclude_cols]

print(f"\n Feature columns identified: {len(feature_cols)}")
print(f"  Excluded columns: {exclude_cols}")

# Check for missing values
print("\nChecking for missing values...")
train_missing = train_data[feature_cols].isnull().sum()
val_missing = val_data[feature_cols].isnull().sum()

if train_missing.sum() > 0:
    print(f" Training set has {train_missing.sum():,} missing values")
    missing_cols = train_missing[train_missing > 0].index.tolist()
    print(f"  Columns with missing values: {len(missing_cols)}")
    # Fill missing values with median (for numeric) or mode (for categorical)
    for col in missing_cols:
        if train_data[col].dtype in ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']:
            fill_value = train_data[col].median()
            train_data[col] = train_data[col].fillna(fill_value)
            val_data[col] = val_data[col].fillna(fill_value)
        else:
            fill_value = train_data[col].mode()[0] if len(train_data[col].mode()) > 0 else 0
            train_data[col] = train_data[col].fillna(fill_value)
            val_data[col] = val_data[col].fillna(fill_value)
    print(" Filled missing values")
else:
    print(" No missing values found")

# Prepare feature matrices
X_train = train_data[feature_cols].copy()
y_train = train_data['label'].copy()
X_val = val_data[feature_cols].copy()
y_val = val_data['label'].copy()

# Store customer and article IDs for evaluation
train_customer_ids = train_data['customer_id'].copy()
train_article_ids = train_data['article_id'].copy()
val_customer_ids = val_data['customer_id'].copy()
val_article_ids = val_data['article_id'].copy()

print(f"\n‚úì Feature matrices prepared:")
print(f"  X_train: {X_train.shape}")
print(f"  X_val: {X_val.shape}")
print(f"  Feature types: {X_train.dtypes.value_counts().to_dict()}")

# Identify categorical features
categorical_features = [col for col in feature_cols 
                       if X_train[col].dtype == 'category' or 
                          X_train[col].dtype == 'object' or
                          col.endswith('_no') or col.endswith('_id') or col.endswith('_code')]

print(f"\n‚úì Categorical features: {len(categorical_features)}")
if len(categorical_features) > 0:
    print(f"  Examples: {categorical_features[:5]}")

gc.collect()
print("\n Data loading complete!")



LOADING TRAINING DATA

Loading train_data.parquet...
‚úì Loaded 685,300 training samples
  Memory: 1.57 GB
  Positives: 274,142 (40.00%)
  Negatives: 411,158 (60.00%)

Loading val_data.parquet...
‚úì Loaded 121,164 validation samples
  Memory: 0.28 GB
  Positives: 48,444 (39.98%)
  Negatives: 72,720 (60.02%)

 Feature columns identified: 584
  Excluded columns: ['customer_id', 'article_id', 'label', 'user_type', 'train_label', 'val_label']

Checking for missing values...
 No missing values found

‚úì Feature matrices prepared:
  X_train: (685300, 584)
  X_val: (121164, 584)
  Feature types: {dtype('float32'): 542, dtype('int32'): 10, dtype('float64'): 9, dtype('int16'): 9, dtype('int8'): 5, dtype('int64'): 1, CategoricalDtype(categories=[ -1,  57,  59,  66,  67,  68,  69,  70,  71,  72,
                  ...
                  504, 508, 509, 511, 512, 515, 521, 523, 529, 532],
, ordered=False, categories_dtype=int16): 1, CategoricalDtype(categories=[1010001, 1010002, 1010004, 1010005, 

In [8]:
# TRAIN MULTIPLE LIGHTGBM MODELS

print("\n" + "="*80)
print("TRAINING LIGHTGBM MODELS")
print("="*80)

# Convert categorical features to numeric codes (LightGBM requirement)
# LightGBM doesn't accept object dtype, so we convert to categorical codes
print("\nConverting categorical features to numeric codes...")
for col in categorical_features:
    if col in X_train.columns:
        # Convert to categorical and then to codes (integers)
        # This ensures consistent encoding between train and val
        # First, get all unique values from both train and val
        all_values = pd.concat([X_train[col], X_val[col]]).unique()
        # Create categorical with all possible values
        train_cat = pd.Categorical(X_train[col], categories=sorted(all_values))
        val_cat = pd.Categorical(X_val[col], categories=sorted(all_values))
        # Convert to integer codes
        X_train[col] = train_cat.codes.astype('int32')
        X_val[col] = val_cat.codes.astype('int32')
        # Replace -1 (missing values) with a valid code (use max code + 1)
        if (X_train[col] == -1).any() or (X_val[col] == -1).any():
            max_code = max(X_train[col].max(), X_val[col].max())
            X_train[col] = X_train[col].replace(-1, max_code + 1)
            X_val[col] = X_val[col].replace(-1, max_code + 1)
print(f" Converted {len(categorical_features)} categorical features to numeric codes")

# Create LightGBM datasets
print("\n Creating LightGBM datasets...")
train_dataset = lgb.Dataset(
    X_train, 
    label=y_train,
    categorical_feature=categorical_features,
    free_raw_data=False
)

val_dataset = lgb.Dataset(
    X_val,
    label=y_val,
    categorical_feature=categorical_features,
    reference=train_dataset,
    free_raw_data=False
)

print(" LightGBM datasets created")

# Model configurations
models_config = {
    'lgb_classifier': {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting_type': 'gbdt',
        'num_leaves': 31,
        'learning_rate': 0.05,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbose': -1,
        'seed': LightGBMConfig.RANDOM_STATE,
        'force_col_wise': True,
    },
    'lgb_ranker_lambdarank': {
        'objective': 'lambdarank',
        'metric': 'ndcg',
        'boosting_type': 'gbdt',
        'num_leaves': 31,
        'learning_rate': 0.05,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbose': -1,
        'seed': LightGBMConfig.RANDOM_STATE,
        'force_col_wise': True,
        'label_gain': [0, 1],  # 0 for negative, 1 for positive
    },
    'lgb_ranker_xendcg': {
        'objective': 'rank_xendcg',
        'metric': 'ndcg',
        'boosting_type': 'gbdt',
        'num_leaves': 63,
        'learning_rate': 0.03,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.9,
        'bagging_freq': 5,
        'verbose': -1,
        'seed': LightGBMConfig.RANDOM_STATE,
        'force_col_wise': True,
    },
    'lgb_classifier_deep': {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting_type': 'gbdt',
        'num_leaves': 127,
        'max_depth': 15,
        'learning_rate': 0.03,
        'feature_fraction': 0.7,
        'bagging_fraction': 0.7,
        'bagging_freq': 5,
        'min_data_in_leaf': 20,
        'verbose': -1,
        'seed': LightGBMConfig.RANDOM_STATE,
        'force_col_wise': True,
    }
}

# For ranking models, we need to provide group information
# Group by customer_id (each customer is a query group)
print("\nPreparing group information for ranking models...")
train_groups = train_customer_ids.value_counts().sort_index().values
val_groups = val_customer_ids.value_counts().sort_index().values

# Sort data by customer_id to match group order
train_sort_idx = train_customer_ids.argsort()
val_sort_idx = val_customer_ids.argsort()

X_train_sorted = X_train.iloc[train_sort_idx].reset_index(drop=True)
y_train_sorted = y_train.iloc[train_sort_idx].reset_index(drop=True)
train_customer_ids_sorted = train_customer_ids.iloc[train_sort_idx].reset_index(drop=True)
train_article_ids_sorted = train_article_ids.iloc[train_sort_idx].reset_index(drop=True)

X_val_sorted = X_val.iloc[val_sort_idx].reset_index(drop=True)
y_val_sorted = y_val.iloc[val_sort_idx].reset_index(drop=True)
val_customer_ids_sorted = val_customer_ids.iloc[val_sort_idx].reset_index(drop=True)
val_article_ids_sorted = val_article_ids.iloc[val_sort_idx].reset_index(drop=True)

# Create ranking datasets with groups
train_ranking_dataset = lgb.Dataset(
    X_train_sorted,
    label=y_train_sorted,
    categorical_feature=categorical_features,
    group=train_groups,
    free_raw_data=False
)

val_ranking_dataset = lgb.Dataset(
    X_val_sorted,
    label=y_val_sorted,
    categorical_feature=categorical_features,
    group=val_groups,
    reference=train_ranking_dataset,
    free_raw_data=False
)

print(" Ranking datasets created with group information")

# Train models
trained_models = {}
model_predictions = {}
model_scores = {}

print("\n" + "-"*80)
print("TRAINING MODELS")
print("-"*80)

for model_name, params in models_config.items():
    print(f"\n{'='*80}")
    print(f"Training: {model_name}")
    print(f"{'='*80}")
    
    # Select appropriate dataset
    if 'ranker' in model_name:
        train_ds = train_ranking_dataset
        val_ds = val_ranking_dataset
        use_sorted = True
    else:
        train_ds = train_dataset
        val_ds = val_dataset
        use_sorted = False
    
    # Train model
    model = lgb.train(
        params,
        train_ds,
        num_boost_round=LightGBMConfig.N_ESTIMATORS,
        valid_sets=[train_ds, val_ds],
        valid_names=['train', 'valid'],
        callbacks=[
            lgb.early_stopping(stopping_rounds=LightGBMConfig.EARLY_STOPPING_ROUNDS, verbose=True),
            lgb.log_evaluation(period=LightGBMConfig.VERBOSE_EVAL)
        ]
    )
    
    # Make predictions
    if use_sorted:
        predictions = model.predict(X_val_sorted, num_iteration=model.best_iteration)
        # Revert to original order
        val_revert_idx = val_sort_idx.argsort()
        predictions = predictions[val_revert_idx]
        eval_df = val_data.copy()
    else:
        predictions = model.predict(X_val, num_iteration=model.best_iteration)
        eval_df = val_data.copy()
    
    # Calculate MAP@12
    map_score = evaluate_map_at_12(eval_df, predictions)
    
    # Store model and results
    trained_models[model_name] = model
    model_predictions[model_name] = predictions
    model_scores[model_name] = {
        'map_at_12': map_score,
        'best_iteration': model.best_iteration
    }
    
    print(f"\n‚úì {model_name} trained")
    print(f"  Best iteration: {model.best_iteration}")
    print(f"  MAP@12: {map_score:.6f}")
    
    # Save model
    model_path = LightGBMConfig.MODEL_PATH / f'{model_name}.pkl'
    with open(model_path, 'wb') as f:
        pickle.dump(model, f)
    print(f"  ‚úì Saved to {model_path}")

# Print summary
print("\n" + "="*80)
print("TRAINING SUMMARY")
print("="*80)
print("\nModel Performance (MAP@12):")
for model_name, scores in sorted(model_scores.items(), key=lambda x: x[1]['map_at_12'], reverse=True):
    print(f"  {model_name:30s}: {scores['map_at_12']:.6f} (iter: {scores['best_iteration']})")

best_model_name = max(model_scores.items(), key=lambda x: x[1]['map_at_12'])[0]
print(f"\nüèÜ Best Model: {best_model_name} (MAP@12: {model_scores[best_model_name]['map_at_12']:.6f})")

# Save model metadata
metadata = {
    'models': {name: scores for name, scores in model_scores.items()},
    'best_model': best_model_name,
    'feature_columns': feature_cols,
    'categorical_features': categorical_features,
    'timestamp': datetime.now().isoformat()
}

with open(LightGBMConfig.MODEL_PATH / 'lgb_models_metadata.json', 'w') as f:
    json.dump(metadata, f, indent=2)

print(f"\n Saved model metadata to {LightGBMConfig.MODEL_PATH / 'lgb_models_metadata.json'}")

gc.collect()
print("\n All models trained and saved!")



TRAINING LIGHTGBM MODELS

Converting categorical features to numeric codes...
 Converted 8 categorical features to numeric codes

 Creating LightGBM datasets...
 LightGBM datasets created

Preparing group information for ranking models...
 Ranking datasets created with group information

--------------------------------------------------------------------------------
TRAINING MODELS
--------------------------------------------------------------------------------

Training: lgb_classifier
Training until validation scores don't improve for 100 rounds
[50]	train's binary_logloss: 0.0553072	valid's binary_logloss: 0.0573332
[100]	train's binary_logloss: 0.0217025	valid's binary_logloss: 0.0260448
[150]	train's binary_logloss: 0.0172244	valid's binary_logloss: 0.0242604
[200]	train's binary_logloss: 0.0151918	valid's binary_logloss: 0.0243083
[250]	train's binary_logloss: 0.0137373	valid's binary_logloss: 0.0243676
Early stopping, best iteration is:
[151]	train's binary_logloss: 0.0171706	

In [9]:
# ============================================================================
# FEATURE IMPORTANCE ANALYSIS
# ============================================================================

print("\n" + "="*80)
print("FEATURE IMPORTANCE ANALYSIS")
print("="*80)

# Analyze feature importance for the best model
best_model = trained_models[best_model_name]

# Get feature importance
feature_importance = pd.DataFrame({
    'feature': feature_cols,
    'importance': best_model.feature_importance(importance_type='gain')
}).sort_values('importance', ascending=False)

print(f"\nTop 20 Most Important Features ({best_model_name}):")
print("-"*80)
for idx, row in feature_importance.head(20).iterrows():
    print(f"  {row['feature']:40s}: {row['importance']:>12,.0f}")

# Save feature importance
importance_path = LightGBMConfig.MODEL_PATH / 'feature_importance.csv'
feature_importance.to_csv(importance_path, index=False)
print(f"\n‚úì Saved feature importance to {importance_path}")

# Identify low-importance features
low_importance_features = feature_importance[
    feature_importance['importance'] < LightGBMConfig.MIN_FEATURE_IMPORTANCE
]['feature'].tolist()

print(f"\nLow importance features (< {LightGBMConfig.MIN_FEATURE_IMPORTANCE}): {len(low_importance_features)}")
if len(low_importance_features) > 0:
    print(f"  Examples: {low_importance_features[:10]}")

gc.collect()



FEATURE IMPORTANCE ANALYSIS

Top 20 Most Important Features (lgb_classifier):
--------------------------------------------------------------------------------
  has_purchased_item                      :    5,256,895
  days_since_item_purchase                :    1,416,434
  repurchase_score                        :      336,317
  userknn_score_rank                      :      155,513
  repurchase_score_rank                   :       45,798
  n_unique_articles                       :       30,044
  popularity_score_y                      :       20,867
  days_since_last_purchase                :       15,409
  copurchase_score                        :       10,176
  category_score                          :        8,592
  department_no                           :        4,842
  user_item_text_similarity               :        4,367
  semantic_range                          :        3,460
  exploration_rate                        :        3,210
  std_price                               

0

In [10]:
# ENSEMBLE PREDICTIONS

print("\n" + "="*80)
print("CREATING ENSEMBLE PREDICTIONS")
print("="*80)

# Normalize predictions to [0, 1] range for ensemble
print("\nNormalizing predictions...")
normalized_predictions = {}
for model_name, preds in model_predictions.items():
    # Min-max normalization
    min_pred = preds.min()
    max_pred = preds.max()
    if max_pred > min_pred:
        normalized = (preds - min_pred) / (max_pred - min_pred)
    else:
        normalized = preds
    normalized_predictions[model_name] = normalized
    print(f"  {model_name}: [{preds.min():.4f}, {preds.max():.4f}] -> [0, 1]")

# Create different ensemble strategies
ensemble_strategies = {
    'equal_weight': {name: 1.0/len(normalized_predictions) for name in normalized_predictions.keys()},
    'performance_weight': {},  # Will be calculated based on MAP@12 scores
    'best_only': {best_model_name: 1.0}
}

# Calculate performance-based weights (proportional to MAP@12 score)
total_map = sum(scores['map_at_12'] for scores in model_scores.values())
for model_name in normalized_predictions.keys():
    ensemble_strategies['performance_weight'][model_name] = model_scores[model_name]['map_at_12'] / total_map

print("\nEnsemble strategies:")
for strategy_name, weights in ensemble_strategies.items():
    print(f"\n  {strategy_name}:")
    for model_name, weight in weights.items():
        print(f"    {model_name:30s}: {weight:.4f}")

# Create ensemble predictions
ensemble_predictions = {}
ensemble_scores = {}

for strategy_name, weights in ensemble_strategies.items():
    # Weighted average
    ensemble_pred = np.zeros(len(val_data))
    for model_name, weight in weights.items():
        ensemble_pred += weight * normalized_predictions[model_name]
    
    ensemble_predictions[strategy_name] = ensemble_pred
    
    # Evaluate ensemble
    map_score = evaluate_map_at_12(val_data, ensemble_pred)
    ensemble_scores[strategy_name] = map_score
    
    print(f"\n  {strategy_name:25s}: MAP@12 = {map_score:.6f}")

# Find best ensemble
best_ensemble_name = max(ensemble_scores.items(), key=lambda x: x[1])[0]
print(f"\n Best Ensemble: {best_ensemble_name} (MAP@12: {ensemble_scores[best_ensemble_name]:.6f})")

# Save ensemble predictions
ensemble_path = LightGBMConfig.MODEL_PATH / 'ensemble_predictions_val.parquet'
val_data_with_preds = val_data.copy()
val_data_with_preds['ensemble_pred'] = ensemble_predictions[best_ensemble_name]
val_data_with_preds.to_parquet(ensemble_path, index=False)
print(f"\n Saved ensemble predictions to {ensemble_path}")

# Save ensemble metadata
ensemble_metadata = {
    'strategies': ensemble_strategies,
    'scores': ensemble_scores,
    'best_ensemble': best_ensemble_name,
    'timestamp': datetime.now().isoformat()
}

with open(LightGBMConfig.MODEL_PATH / 'ensemble_metadata.json', 'w') as f:
    json.dump(ensemble_metadata, f, indent=2)

print(f" Saved ensemble metadata to {LightGBMConfig.MODEL_PATH / 'ensemble_metadata.json'}")

gc.collect()
print("\n Ensemble predictions created!")



CREATING ENSEMBLE PREDICTIONS

Normalizing predictions...
  lgb_classifier: [0.0000, 0.9993] -> [0, 1]
  lgb_ranker_lambdarank: [-1.3156, 1.3233] -> [0, 1]
  lgb_ranker_xendcg: [-0.6435, 0.8475] -> [0, 1]
  lgb_classifier_deep: [0.0001, 0.9992] -> [0, 1]

Ensemble strategies:

  equal_weight:
    lgb_classifier                : 0.2500
    lgb_ranker_lambdarank         : 0.2500
    lgb_ranker_xendcg             : 0.2500
    lgb_classifier_deep           : 0.2500

  performance_weight:
    lgb_classifier                : 0.2501
    lgb_ranker_lambdarank         : 0.2501
    lgb_ranker_xendcg             : 0.2497
    lgb_classifier_deep           : 0.2501

  best_only:
    lgb_classifier                : 1.0000

  equal_weight             : MAP@12 = 0.988396

  performance_weight       : MAP@12 = 0.988396

  best_only                : MAP@12 = 0.988774

 Best Ensemble: best_only (MAP@12: 0.988774)

 Saved ensemble predictions to /Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion

In [None]:
# FINAL SUMMARY

print("\n" + "="*80)
print("LIGHTGBM TRAINING - FINAL SUMMARY")
print("="*80)

print("\n Model Performance:")
print("-"*80)
print(f"{'Model':<35} {'MAP@12':<15} {'Best Iteration':<15}")
print("-"*80)
for model_name, scores in sorted(model_scores.items(), key=lambda x: x[1]['map_at_12'], reverse=True):
    print(f"{model_name:<35} {scores['map_at_12']:<15.6f} {scores['best_iteration']:<15}")

print("\n Ensemble Performance:")
print("-"*80)
for strategy_name, score in sorted(ensemble_scores.items(), key=lambda x: x[1], reverse=True):
    print(f"  {strategy_name:25s}: {score:.6f}")

print(f"\n Best Single Model: {best_model_name}")
print(f"   MAP@12: {model_scores[best_model_name]['map_at_12']:.6f}")

print(f"\n Best Ensemble: {best_ensemble_name}")
print(f"   MAP@12: {ensemble_scores[best_ensemble_name]:.6f}")

improvement = ensemble_scores[best_ensemble_name] - model_scores[best_model_name]['map_at_12']
improvement_pct = (improvement / model_scores[best_model_name]['map_at_12']) * 100
print(f"\nüìà Ensemble Improvement: {improvement:+.6f} ({improvement_pct:+.2f}%)")

print("\nüíæ Saved Files:")
print("-"*80)
print(f"  Models: {len(trained_models)} models saved to {LightGBMConfig.MODEL_PATH}")
print(f"  Feature Importance: {LightGBMConfig.MODEL_PATH / 'feature_importance.csv'}")
print(f"  Model Metadata: {LightGBMConfig.MODEL_PATH / 'lgb_models_metadata.json'}")
print(f"  Ensemble Metadata: {LightGBMConfig.MODEL_PATH / 'ensemble_metadata.json'}")
print(f"  Ensemble Predictions: {LightGBMConfig.MODEL_PATH / 'ensemble_predictions_val.parquet'}")



LIGHTGBM TRAINING - FINAL SUMMARY

 Model Performance:
--------------------------------------------------------------------------------
Model                               MAP@12          Best Iteration 
--------------------------------------------------------------------------------
lgb_classifier                      0.989001        146            
lgb_classifier_deep                 0.988969        242            
lgb_ranker_lambdarank               0.988961        34             
lgb_ranker_xendcg                   0.987259        4              

 Ensemble Performance:
--------------------------------------------------------------------------------
  best_only                : 0.989001
  equal_weight             : 0.988874
  performance_weight       : 0.988874

 Best Single Model: lgb_classifier
   MAP@12: 0.989001

 Best Ensemble: best_only
   MAP@12: 0.989001

üìà Ensemble Improvement: +0.000000 (+0.00%)

üíæ Saved Files:
------------------------------------------------------

### Stage 4B: Neural Towers Training

This stage implements a three-tower neural network architecture for recommendation:
- **User Tower**: Processes user-level features (purchase history, preferences, demographics)
- **Item Tower**: Processes item-level features (product attributes, popularity, sales)
- **Image Tower**: Processes image embeddings (visual features from FashionCLIP)
- **Fusion Layer**: Combines all three towers with MLP layers
- **Output**: Binary classification score for purchase prediction

**Key Features:**
- MPS acceleration for Apple Silicon (M4)
- MAP@12 evaluation during training
- Early stopping and model checkpointing
- Batch processing optimized for memory
- Learning rate scheduling


In [11]:
# IMPORTS AND CONFIGURATION FOR NEURAL TOWERS

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from pathlib import Path
import pickle
import gc
from tqdm.auto import tqdm
import warnings
import json
from datetime import datetime
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings('ignore')

# ============================================================================
# WEIGHTED BCE LOSS FOR CLASS IMBALANCE
# ============================================================================

class WeightedBCELoss(nn.Module):
    """
    Weighted Binary Cross-Entropy Loss
    Useful for handling class imbalance
    MPS-compatible: Uses float32 explicitly
    Manually implements pos_weight for compatibility with all PyTorch versions
    """
    def __init__(self, pos_weight=None):
        super(WeightedBCELoss, self).__init__()
        self.pos_weight = pos_weight
    
    def forward(self, inputs, targets):
        # Calculate standard BCE loss
        bce_loss = nn.functional.binary_cross_entropy(inputs, targets, reduction='none')
        
        if self.pos_weight is not None:
            # Create pos_weight tensor with float32 dtype for MPS compatibility
            if isinstance(self.pos_weight, (int, float)):
                pos_weight_val = float(self.pos_weight)
            else:
                pos_weight_val = float(self.pos_weight.item()) if hasattr(self.pos_weight, 'item') else float(self.pos_weight)
            
            # Apply pos_weight: multiply positive class losses by pos_weight
            # Positive samples (targets == 1) get weighted by pos_weight
            # Negative samples (targets == 0) remain unchanged
            weights = torch.where(targets == 1, 
                                 torch.tensor(pos_weight_val, device=inputs.device, dtype=torch.float32),
                                 torch.tensor(1.0, device=inputs.device, dtype=torch.float32))
            
            weighted_loss = bce_loss * weights
            return weighted_loss.mean()
        else:
            return bce_loss.mean()

# Configuration
class NeuralTowerConfig:
    # Paths
    DATA_PATH = Path('/Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2')
    MODEL_PATH = Path('/Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2/models')
    
    MODEL_PATH.mkdir(exist_ok=True, parents=True)
    
    # Device configuration
    if torch.backends.mps.is_available():
        DEVICE = torch.device('mps')
        print("Using Apple Silicon GPU (MPS)")
    else:
        DEVICE = torch.device('cpu')
        print("Using CPU")
    
    # Training configuration (OPTIMIZED FOR BETTER PERFORMANCE & RANKING)
    BATCH_SIZE = 4096  # Larger batches for more stable gradients
    N_EPOCHS = 30  # Reduced from 40 to prevent overfitting
    LEARNING_RATE = 3e-4  # Further reduced for more stable training
    WEIGHT_DECAY = 2e-4  # Increased regularization
    EARLY_STOPPING_PATIENCE = 5  # Reduced patience to stop earlier
    VALIDATION_FREQ = 1  # Validate every epoch
    
    # Model architecture (OPTIMIZED TO REDUCE OVERFITTING & IMPROVE RANKING)
    USER_EMBEDDING_DIM = 96  # Reduced from 128 to prevent overfitting
    ITEM_EMBEDDING_DIM = 48  # Reduced from 64
    IMAGE_EMBEDDING_DIM = 96  # Reduced from 128
    FUSION_HIDDEN_DIMS = [192, 96, 48]  # Smaller fusion layers
    DROPOUT_RATE = 0.5  # Increased from 0.4 for stronger regularization
    
    # Feature groups (will be determined from data)
    USER_FEATURE_PREFIXES = ['n_', 'avg_', 'std_', 'min_', 'max_', 'days_', 'purchase_', 
                             'exploration_', 'age', 'FN', 'Active', 'unique_']
    ITEM_FEATURE_PREFIXES = ['product_', 'graphical_', 'colour_', 'perceived_', 'department_',
                            'index_', 'section_', 'garment_', 'popularity_', 'sales_', 'buyers_']
    IMAGE_FEATURE_PREFIXES = ['image_emb_']
    
    RANDOM_STATE = 42

print("‚úì Neural Tower Configuration loaded")
print(f"  Model path: {NeuralTowerConfig.MODEL_PATH}")
print(f"  Device: {NeuralTowerConfig.DEVICE}")


Using Apple Silicon GPU (MPS)
‚úì Neural Tower Configuration loaded
  Model path: /Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2/models
  Device: mps


In [12]:
# ============================================================================
# THREE-TOWER NEURAL NETWORK MODEL
# ============================================================================

class ThreeTowerModel(nn.Module):
    """
    Three-tower neural network for recommendation:
    - User Tower: User features -> User embedding
    - Item Tower: Item features -> Item embedding  
    - Image Tower: Image embeddings -> Image embedding
    - Fusion: Concatenated embeddings -> Final prediction
    """
    
    def __init__(self, 
                 user_feature_dim,
                 item_feature_dim,
                 image_feature_dim,
                 user_embedding_dim=128,
                 item_embedding_dim=64,
                 image_embedding_dim=128,
                 fusion_hidden_dims=[256, 128, 64],
                 dropout_rate=0.3):
        super(ThreeTowerModel, self).__init__()
        
        # User Tower
        self.user_tower = nn.Sequential(
            nn.Linear(user_feature_dim, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(256, user_embedding_dim),
            nn.BatchNorm1d(user_embedding_dim),
            nn.ReLU()
        )
        
        # Item Tower
        self.item_tower = nn.Sequential(
            nn.Linear(item_feature_dim, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(128, item_embedding_dim),
            nn.BatchNorm1d(item_embedding_dim),
            nn.ReLU()
        )
        
        # Image Tower
        self.image_tower = nn.Sequential(
            nn.Linear(image_feature_dim, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(256, image_embedding_dim),
            nn.BatchNorm1d(image_embedding_dim),
            nn.ReLU()
        )
        
        # Fusion Layer
        fusion_input_dim = user_embedding_dim + item_embedding_dim + image_embedding_dim
        fusion_layers = []
        
        prev_dim = fusion_input_dim
        for hidden_dim in fusion_hidden_dims:
            fusion_layers.extend([
                nn.Linear(prev_dim, hidden_dim),
                nn.BatchNorm1d(hidden_dim),
                nn.ReLU(),
                nn.Dropout(dropout_rate)
            ])
            prev_dim = hidden_dim
        
        # Output layer
        fusion_layers.append(nn.Linear(prev_dim, 1))
        fusion_layers.append(nn.Sigmoid())
        
        self.fusion = nn.Sequential(*fusion_layers)
        
    def forward(self, user_features, item_features, image_features):
        # Pass through towers
        user_emb = self.user_tower(user_features)
        item_emb = self.item_tower(item_features)
        image_emb = self.image_tower(image_features)
        
        # Concatenate embeddings
        fused = torch.cat([user_emb, item_emb, image_emb], dim=1)
        
        # Final prediction
        output = self.fusion(fused)
        
        return output.squeeze()

print("‚úì ThreeTowerModel class defined")


‚úì ThreeTowerModel class defined


In [13]:
# ============================================================================
# DATASET CLASS
# ============================================================================

class RecommendationDataset(Dataset):
    """Dataset for recommendation training"""
    
    def __init__(self, df, user_features, item_features, image_features, labels=None):
        self.df = df.reset_index(drop=True)
        self.user_features = user_features.values.astype(np.float32)
        self.item_features = item_features.values.astype(np.float32)
        self.image_features = image_features.values.astype(np.float32)
        self.labels = labels.values.astype(np.float32) if labels is not None else None
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        user_feat = torch.FloatTensor(self.user_features[idx])
        item_feat = torch.FloatTensor(self.item_features[idx])
        image_feat = torch.FloatTensor(self.image_features[idx])
        
        if self.labels is not None:
            label = torch.FloatTensor([self.labels[idx]])
            return user_feat, item_feat, image_feat, label
        else:
            return user_feat, item_feat, image_feat

print(" RecommendationDataset class defined")


 RecommendationDataset class defined


In [14]:
# ============================================================================
# LOAD DATA AND PREPARE FEATURES FOR NEURAL TOWERS
# ============================================================================

print("\n" + "="*80)
print("LOADING DATA FOR NEURAL TOWERS")
print("="*80)

# Load train and validation datasets
print("\nLoading train_data.parquet...")
train_data = pd.read_parquet(NeuralTowerConfig.MODEL_PATH / 'train_data.parquet')
print(f"‚úì Loaded {len(train_data):,} training samples")

print("\nLoading val_data.parquet...")
val_data = pd.read_parquet(NeuralTowerConfig.MODEL_PATH / 'val_data.parquet')
print(f"‚úì Loaded {len(val_data):,} validation samples")

# Load or create test dataset
test_data_path = Path("/Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2/models/test_data.parquet")
if test_data_path.exists():
    print("\nLoading test_data.parquet...")
    test_data = pd.read_parquet(test_data_path)
    print(f"‚úì Loaded {len(test_data):,} test samples")
    use_test_set = True
else:
    print("\n‚ö†Ô∏è  test_data.parquet not found. Creating test set from validation data...")
    print("   (Splitting validation data: % validation, 30% test)")
    # Split validation data into validation and test (95/5 split)
    from sklearn.model_selection import train_test_split
    val_users = val_data['customer_id'].unique()
    val_users_train, val_users_test = train_test_split(
        val_users, 
        test_size=0.3, 
        random_state=NeuralTowerConfig.RANDOM_STATE
    )
    test_data = val_data[val_data['customer_id'].isin(val_users_test)].copy()
    val_data = val_data[val_data['customer_id'].isin(val_users_train)].copy()
    print(f"‚úì Created test set: {len(test_data):,} samples")
    print(f"‚úì Validation set (after split): {len(val_data):,} samples")
    # Save test data for future use
    test_data.to_parquet(test_data_path, index=False)
    print(f"‚úì Saved test set to {test_data_path}")
    use_test_set = True

# Identify feature groups
exclude_cols = ['customer_id', 'article_id', 'label', 'user_type', 'train_label', 'val_label']
all_feature_cols = [col for col in train_data.columns if col not in exclude_cols]

# Separate features into user, item, and image
user_feature_cols = []
item_feature_cols = []
image_feature_cols = []

for col in all_feature_cols:
    if any(col.startswith(prefix) for prefix in NeuralTowerConfig.IMAGE_FEATURE_PREFIXES):
        image_feature_cols.append(col)
    elif any(col.startswith(prefix) for prefix in NeuralTowerConfig.USER_FEATURE_PREFIXES):
        user_feature_cols.append(col)
    elif any(col.startswith(prefix) for prefix in NeuralTowerConfig.ITEM_FEATURE_PREFIXES):
        item_feature_cols.append(col)
    else:
        # Default to user features if unclear
        user_feature_cols.append(col)

print(f"\n‚úì Feature separation:")
print(f"  User features: {len(user_feature_cols)}")
print(f"  Item features: {len(item_feature_cols)}")
print(f"  Image features: {len(image_feature_cols)}")
print(f"  Total features: {len(all_feature_cols)}")

# Check for missing image features
if len(image_feature_cols) == 0:
    print("\n Warning: No image features found!")
    print("  Creating dummy image features...")
    # Create dummy image features (zeros)
    for i in range(52):  # 52 dimensions
        col_name = f'image_emb_{i}'
        train_data[col_name] = 0.0
        val_data[col_name] = 0.0
        image_feature_cols.append(col_name)
    print(f"  ‚úì Created {len(image_feature_cols)} dummy image features")

# Prepare feature matrices
X_train_user = train_data[user_feature_cols].copy()
X_train_item = train_data[item_feature_cols].copy()
X_train_image = train_data[image_feature_cols].copy()
y_train = train_data['label'].copy()

X_val_user = val_data[user_feature_cols].copy()
X_val_item = val_data[item_feature_cols].copy()
X_val_image = val_data[image_feature_cols].copy()
y_val = val_data['label'].copy()

# Prepare test features if test set exists
if use_test_set:
    # Check for missing image features in test data
    if len(image_feature_cols) > 0:
        for col in image_feature_cols:
            if col not in test_data.columns:
                test_data[col] = 0.0
    
    X_test_user = test_data[user_feature_cols].copy()
    X_test_item = test_data[item_feature_cols].copy()
    X_test_image = test_data[image_feature_cols].copy()
    y_test = test_data['label'].copy()
    
    # Store test customer and article IDs for evaluation
    test_customer_ids = test_data['customer_id'].copy()
    test_article_ids = test_data['article_id'].copy()
else:
    X_test_user = None
    X_test_item = None
    X_test_image = None
    y_test = None
    test_customer_ids = None
    test_article_ids = None

# Fill any missing values
print("\nFilling missing values...")
# Handle categorical columns separately - convert to numeric first
dataframes_to_fill = [X_train_user, X_train_item, X_train_image, X_val_user, X_val_item, X_val_image]
if use_test_set:
    dataframes_to_fill.extend([X_test_user, X_test_item, X_test_image])

for df in dataframes_to_fill:
    for col in df.columns:
        if df[col].dtype.name == 'category':
            # Convert categorical to numeric codes
            df[col] = pd.Categorical(df[col]).codes
            # Replace -1 (missing) with 0
            df[col] = df[col].replace(-1, 0)
        else:
            # Fill numeric columns
            df[col] = df[col].fillna(0)

print("‚úì Missing values filled")

# Standardize features (important for neural networks)
print("\nStandardizing features...")
scaler_user = StandardScaler()
scaler_item = StandardScaler()
scaler_image = StandardScaler()

X_train_user_scaled = pd.DataFrame(
    scaler_user.fit_transform(X_train_user),
    columns=user_feature_cols,
    index=X_train_user.index
)
X_val_user_scaled = pd.DataFrame(
    scaler_user.transform(X_val_user),
    columns=user_feature_cols,
    index=X_val_user.index
)

X_train_item_scaled = pd.DataFrame(
    scaler_item.fit_transform(X_train_item),
    columns=item_feature_cols,
    index=X_train_item.index
)
X_val_item_scaled = pd.DataFrame(
    scaler_item.transform(X_val_item),
    columns=item_feature_cols,
    index=X_val_item.index
)

# Scale image features (fit on train, then transform val and test)
X_train_image_scaled = pd.DataFrame(
    scaler_image.fit_transform(X_train_image),
    columns=image_feature_cols,
    index=X_train_image.index
)
X_val_image_scaled = pd.DataFrame(
    scaler_image.transform(X_val_image),
    columns=image_feature_cols,
    index=X_val_image.index
)

# Scale test features if test set exists (AFTER all scalers are fitted)
if use_test_set:
    X_test_user_scaled = pd.DataFrame(
        scaler_user.transform(X_test_user),
        columns=user_feature_cols,
        index=X_test_user.index
    )
    X_test_item_scaled = pd.DataFrame(
        scaler_item.transform(X_test_item),
        columns=item_feature_cols,
        index=X_test_item.index
    )
    X_test_image_scaled = pd.DataFrame(
        scaler_image.transform(X_test_image),
        columns=image_feature_cols,
        index=X_test_image.index
    )
    print("‚úì Test features standardized")
else:
    X_test_user_scaled = None
    X_test_item_scaled = None
    X_test_image_scaled = None

print(" Features standardized")

# Store customer and article IDs for evaluation
train_customer_ids = train_data['customer_id'].copy()
train_article_ids = train_data['article_id'].copy()
val_customer_ids = val_data['customer_id'].copy()
val_article_ids = val_data['article_id'].copy()

print(f"\n Feature matrices prepared:")
print(f"  User: {X_train_user_scaled.shape[1]} features")
print(f"  Item: {X_train_item_scaled.shape[1]} features")
print(f"  Image: {X_train_image_scaled.shape[1]} features")

gc.collect()
print("\n Data loading complete!")



LOADING DATA FOR NEURAL TOWERS

Loading train_data.parquet...
‚úì Loaded 685,300 training samples

Loading val_data.parquet...
‚úì Loaded 121,164 validation samples

Loading test_data.parquet...
‚úì Loaded 36,379 test samples

‚úì Feature separation:
  User features: 43
  Item features: 29
  Image features: 512
  Total features: 584

Filling missing values...
‚úì Missing values filled

Standardizing features...
‚úì Test features standardized
 Features standardized

 Feature matrices prepared:
  User: 43 features
  Item: 29 features
  Image: 512 features

 Data loading complete!


In [15]:
# ============================================================================
# CREATE DATA LOADERS
# ============================================================================

print("\n" + "="*80)
print("CREATING DATA LOADERS")
print("="*80)

# Create datasets
train_dataset = RecommendationDataset(
    train_data,
    X_train_user_scaled,
    X_train_item_scaled,
    X_train_image_scaled,
    y_train
)

val_dataset = RecommendationDataset(
    val_data,
    X_val_user_scaled,
    X_val_item_scaled,
    X_val_image_scaled,
    y_val
)

# Create data loaders
train_loader = DataLoader(
    train_dataset,
    batch_size=NeuralTowerConfig.BATCH_SIZE,
    shuffle=True,
    num_workers=0,  # Set to 0 for MPS compatibility
    pin_memory=False
)

val_loader = DataLoader(
    val_dataset,
    batch_size=NeuralTowerConfig.BATCH_SIZE,
    shuffle=False,
    num_workers=0,
    pin_memory=False
)

# Create test data loader if test set exists
if use_test_set:
    test_dataset = RecommendationDataset(
        test_data,
        X_test_user_scaled,
        X_test_item_scaled,
        X_test_image_scaled,
        y_test
    )
    
    test_loader = DataLoader(
        test_dataset,
        batch_size=NeuralTowerConfig.BATCH_SIZE,
        shuffle=False,
        num_workers=0,
        pin_memory=False
    )
    print(f"\n Data loaders created:")
    print(f"  Train batches: {len(train_loader)}")
    print(f"  Val batches: {len(val_loader)}")
    print(f"  Test batches: {len(test_loader)}")
    print(f"  Batch size: {NeuralTowerConfig.BATCH_SIZE}")
else:
    test_loader = None
    print(f"\n Data loaders created:")
    print(f"  Train batches: {len(train_loader)}")
    print(f"  Val batches: {len(val_loader)}")
    print(f"  Batch size: {NeuralTowerConfig.BATCH_SIZE}")
    print(f"  ‚ö†Ô∏è  Test set not available")

# Initialize model
model = ThreeTowerModel(
    user_feature_dim=len(user_feature_cols),
    item_feature_dim=len(item_feature_cols),
    image_feature_dim=len(image_feature_cols),
    user_embedding_dim=NeuralTowerConfig.USER_EMBEDDING_DIM,
    item_embedding_dim=NeuralTowerConfig.ITEM_EMBEDDING_DIM,
    image_embedding_dim=NeuralTowerConfig.IMAGE_EMBEDDING_DIM,
    fusion_hidden_dims=NeuralTowerConfig.FUSION_HIDDEN_DIMS,
    dropout_rate=NeuralTowerConfig.DROPOUT_RATE
).to(NeuralTowerConfig.DEVICE)

# Count parameters
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"\n Model initialized:")
print(f"  Total parameters: {total_params:,}")
print(f"  Trainable parameters: {trainable_params:,}")
print(f"  Device: {NeuralTowerConfig.DEVICE}")

# Calculate pos_weight for class imbalance
print("\nüìä Calculating class weights...")
pos_count = y_train.sum()
neg_count = (y_train == 0).sum()
pos_weight = neg_count / pos_count if pos_count > 0 else 1.5
print(f"  Positive samples: {pos_count:,}")
print(f"  Negative samples: {neg_count:,}")
print(f"  Recommended pos_weight: {pos_weight:.4f}")

# Loss and optimizer (USING WEIGHTED BCE FOR CLASS IMBALANCE)
# Using slightly lower pos_weight to focus more on ranking quality
criterion = WeightedBCELoss(pos_weight=pos_weight * 0.9)  # Slight reduction for better ranking
optimizer = optim.AdamW(
    model.parameters(),
    lr=NeuralTowerConfig.LEARNING_RATE,
    weight_decay=NeuralTowerConfig.WEIGHT_DECAY,
    betas=(0.9, 0.999)  # Standard AdamW betas
)

# Learning rate scheduler
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='max',  # Maximize MAP@12
    factor=0.5,
    patience=3,
    # verbose=True
)

print("\n Model and optimizers ready!")



CREATING DATA LOADERS

 Data loaders created:
  Train batches: 168
  Val batches: 30
  Test batches: 9
  Batch size: 4096

 Model initialized:
  Total parameters: 273,905
  Trainable parameters: 273,905
  Device: mps

üìä Calculating class weights...
  Positive samples: 274,142
  Negative samples: 411,158
  Recommended pos_weight: 1.4998

 Model and optimizers ready!


In [16]:
# ============================================================================
# TRAINING LOOP WITH MAP@12 EVALUATION
# ============================================================================

print("\n" + "="*80)
print("TRAINING NEURAL TOWER MODEL")
print("="*80)

# Training history
history = {
    'train_loss': [],
    'val_loss': [],
    'val_map12': [],
    'test_map12': []  # Add test MAP@12 tracking
}

# For visualization
import matplotlib.pyplot as plt
plt.style.use('seaborn-v0_8-darkgrid')

best_map12 = 0.0
best_epoch = 0
patience_counter = 0

# Checkpoint directory
checkpoint_dir = NeuralTowerConfig.MODEL_PATH / 'checkpoints'
checkpoint_dir.mkdir(exist_ok=True)

for epoch in range(NeuralTowerConfig.N_EPOCHS):
    # Training phase
    model.train()
    train_loss = 0.0
    train_batches = 0
    
    train_pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{NeuralTowerConfig.N_EPOCHS} [Train]")
    for user_feat, item_feat, image_feat, labels in train_pbar:
        # Move to device
        user_feat = user_feat.to(NeuralTowerConfig.DEVICE)
        item_feat = item_feat.to(NeuralTowerConfig.DEVICE)
        image_feat = image_feat.to(NeuralTowerConfig.DEVICE)
        labels = labels.to(NeuralTowerConfig.DEVICE).squeeze()
        
        # Forward pass
        optimizer.zero_grad()
        outputs = model(user_feat, item_feat, image_feat)
        loss = criterion(outputs, labels)
        
        # Backward pass
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item()
        train_batches += 1
        train_pbar.set_postfix({'loss': f'{loss.item():.4f}'})
    
    avg_train_loss = train_loss / train_batches
    
    # Validation phase
    if (epoch + 1) % NeuralTowerConfig.VALIDATION_FREQ == 0:
        model.eval()
        val_loss = 0.0
        val_batches = 0
        all_predictions = []
        all_labels = []
        val_customer_ids_list = []
        val_article_ids_list = []
        
        with torch.no_grad():
            val_pbar = tqdm(val_loader, desc=f"Epoch {epoch+1}/{NeuralTowerConfig.N_EPOCHS} [Val]")
            for batch_idx, (user_feat, item_feat, image_feat, labels) in enumerate(val_pbar):
                # Move to device
                user_feat = user_feat.to(NeuralTowerConfig.DEVICE)
                item_feat = item_feat.to(NeuralTowerConfig.DEVICE)
                image_feat = image_feat.to(NeuralTowerConfig.DEVICE)
                labels = labels.to(NeuralTowerConfig.DEVICE).squeeze()
                
                # Forward pass
                outputs = model(user_feat, item_feat, image_feat)
                loss = criterion(outputs, labels)
                
                val_loss += loss.item()
                val_batches += 1
                
                # Store predictions and labels for MAP@12 calculation
                predictions = outputs.cpu().numpy()
                labels_np = labels.cpu().numpy()
                
                # Get customer and article IDs for this batch
                start_idx = batch_idx * NeuralTowerConfig.BATCH_SIZE
                end_idx = min(start_idx + len(predictions), len(val_data))
                batch_customer_ids = val_customer_ids.iloc[start_idx:end_idx].values
                batch_article_ids = val_article_ids.iloc[start_idx:end_idx].values
                
                all_predictions.extend(predictions)
                all_labels.extend(labels_np)
                val_customer_ids_list.extend(batch_customer_ids)
                val_article_ids_list.extend(batch_article_ids)
                
                val_pbar.set_postfix({'loss': f'{loss.item():.4f}'})
        
        avg_val_loss = val_loss / val_batches
        
        # Calculate MAP@12
        val_eval_df = pd.DataFrame({
            'customer_id': val_customer_ids_list,
            'article_id': val_article_ids_list,
            'label': all_labels,
            'pred_score': all_predictions
        })
        
        # Calculate MAP@12 using the standard evaluation function
        # (no extra keyword arguments ‚Äì matches evaluate_map_at_12 signature)
        map12_score = evaluate_map_at_12(
            val_eval_df,
            np.array(all_predictions)
        )
        
        # Update learning rate
        scheduler.step(map12_score)
        
        # Test evaluation (if test set is available)
        test_map12_score = None
        if use_test_set and test_loader is not None:
            model.eval()
            test_predictions = []
            test_labels = []
            test_customer_ids_list = []
            test_article_ids_list = []
            
            with torch.no_grad():
                test_pbar = tqdm(test_loader, desc=f"Epoch {epoch+1}/{NeuralTowerConfig.N_EPOCHS} [Test]", leave=False)
                for batch_idx, (user_feat, item_feat, image_feat, labels) in enumerate(test_pbar):
                    # Move to device
                    user_feat = user_feat.to(NeuralTowerConfig.DEVICE)
                    item_feat = item_feat.to(NeuralTowerConfig.DEVICE)
                    image_feat = image_feat.to(NeuralTowerConfig.DEVICE)
                    
                    # Forward pass
                    outputs = model(user_feat, item_feat, image_feat)
                    predictions = outputs.cpu().numpy()
                    labels_np = labels.cpu().numpy()
                    
                    # Get customer and article IDs for this batch
                    start_idx = batch_idx * NeuralTowerConfig.BATCH_SIZE
                    end_idx = min(start_idx + len(predictions), len(test_data))
                    batch_customer_ids = test_customer_ids.iloc[start_idx:end_idx].values
                    batch_article_ids = test_article_ids.iloc[start_idx:end_idx].values
                    
                    test_predictions.extend(predictions)
                    test_labels.extend(labels_np)
                    test_customer_ids_list.extend(batch_customer_ids)
                    test_article_ids_list.extend(batch_article_ids)
            
            # Calculate test MAP@12
            test_eval_df = pd.DataFrame({
                'customer_id': test_customer_ids_list,
                'article_id': test_article_ids_list,
                'label': test_labels,
                'pred_score': test_predictions
            })
            
            # Use consistent evaluation function (no filtering needed for test set)
            # This ensures test MAP@12 uses the same calculation logic
        
        # Store history
        history['train_loss'].append(avg_train_loss)
        history['val_loss'].append(avg_val_loss)
        history['val_map12'].append(map12_score)
        if test_map12_score is not None:
            history['test_map12'].append(test_map12_score)
        else:
            history['test_map12'].append(None)
        
        # Print epoch summary
        print(f"\nEpoch {epoch+1}/{NeuralTowerConfig.N_EPOCHS}:")
        print(f"  Train Loss: {avg_train_loss:.6f}")
        print(f"  Val Loss: {avg_val_loss:.6f}")
        print(f"  Val MAP@12: {map12_score:.6f}")
        print(f"  LR: {optimizer.param_groups[0]['lr']:.2e}")
        
        # Save best model based on VALIDATION MAP@12 (not test MAP@12)
        # This ensures we select the model that generalizes best to validation users
        if map12_score > best_map12:
            best_map12 = map12_score
            best_epoch = epoch + 1
            patience_counter = 0
            
            # Save checkpoint
            checkpoint = {
                'epoch': epoch + 1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'scheduler_state_dict': scheduler.state_dict(),
                'map12': map12_score,
                'val_loss': avg_val_loss,
                'test_map12': test_map12_score if test_map12_score is not None else None,
                'history': history
            }
            torch.save(checkpoint, checkpoint_dir / 'best_model.pt')
            print(f" ‚úÖ Saved best model (Val MAP@12: {map12_score:.6f})")
            if test_map12_score is not None:
                print(f"    Test MAP@12 at this epoch: {test_map12_score:.6f}")
        else:
            patience_counter += 1
            print(f"  No improvement ({patience_counter}/{NeuralTowerConfig.EARLY_STOPPING_PATIENCE})")
        
        # Early stopping
        if patience_counter >= NeuralTowerConfig.EARLY_STOPPING_PATIENCE:
            print(f"\n Early stopping triggered after {epoch+1} epochs")
            print(f"   Best MAP@12: {best_map12:.6f} at epoch {best_epoch}")
            break
        
        # Visualize MAP@12 progress every few epochs
        if (epoch + 1) % 2 == 0 or (epoch + 1) == NeuralTowerConfig.N_EPOCHS or patience_counter >= NeuralTowerConfig.EARLY_STOPPING_PATIENCE:
            if len(history['val_map12']) > 0:
                fig, ax = plt.subplots(1, 1, figsize=(10, 6))
                epochs_list = list(range(1, len(history['val_map12']) + 1))
                ax.plot(epochs_list, history['val_map12'], 'b-o', label='Val MAP@12', linewidth=2, markersize=6)
                if use_test_set and any(x is not None for x in history['test_map12']):
                    test_scores = [x if x is not None else 0 for x in history['test_map12']]
                    ax.plot(epochs_list, test_scores, 'r-s', label='Test MAP@12', linewidth=2, markersize=6)
                ax.axvline(x=best_epoch, color='g', linestyle='--', linewidth=2, label=f'Best Epoch ({best_epoch})')
                ax.set_xlabel('Epoch', fontsize=12)
                ax.set_ylabel('MAP@12', fontsize=12)
                ax.set_title('MAP@12 Progress During Training', fontsize=14, fontweight='bold')
                ax.legend(fontsize=10)
                ax.grid(True, alpha=0.3)
                plt.tight_layout()
                plt.savefig(NeuralTowerConfig.MODEL_PATH / 'map12_progress.png', dpi=150, bbox_inches='tight')
                plt.close()
                print(f"  üìä Saved MAP@12 visualization")
    
    gc.collect()

print("\n" + "="*80)
print("TRAINING COMPLETE")
print("="*80)
print(f"\n Best Model (selected based on Val MAP@12):")
print(f"  Epoch: {best_epoch}")
print(f"  Val MAP@12: {best_map12:.6f}")
if use_test_set and len(history['test_map12']) > 0 and history['test_map12'][best_epoch-1] is not None:
    test_map12_at_best = history['test_map12'][best_epoch-1]
    print(f"  Test MAP@12: {test_map12_at_best:.6f}")
    print(f"\nüìä Consistency Check:")
    print(f"  Val and Test MAP@12 should be similar if evaluation is consistent")
    print(f"  Difference: {abs(best_map12 - test_map12_at_best):.6f}")

# Load best model
# Note: weights_only=False is needed for PyTorch 2.6+ when loading checkpoints with numpy arrays
checkpoint = torch.load(checkpoint_dir / 'best_model.pt', weights_only=False)
model.load_state_dict(checkpoint['model_state_dict'])
print(f"\n‚úì Loaded best model from checkpoint")

# Save final model
final_model_path = NeuralTowerConfig.MODEL_PATH / 'neural_tower_model.pt'
torch.save({
    'model_state_dict': model.state_dict(),
    'model_config': {
        'user_feature_dim': len(user_feature_cols),
        'item_feature_dim': len(item_feature_cols),
        'image_feature_dim': len(image_feature_cols),
        'user_embedding_dim': NeuralTowerConfig.USER_EMBEDDING_DIM,
        'item_embedding_dim': NeuralTowerConfig.ITEM_EMBEDDING_DIM,
        'image_embedding_dim': NeuralTowerConfig.IMAGE_EMBEDDING_DIM,
        'fusion_hidden_dims': NeuralTowerConfig.FUSION_HIDDEN_DIMS,
        'dropout_rate': NeuralTowerConfig.DROPOUT_RATE
    },
    'feature_cols': {
        'user': user_feature_cols,
        'item': item_feature_cols,
        'image': image_feature_cols
    },
    'scalers': {
        'user': scaler_user,
        'item': scaler_item,
        'image': scaler_image
    },
    'best_map12': best_map12,
    'best_epoch': best_epoch,
    'history': history
}, final_model_path)

print(f" Saved final model to {final_model_path}")

# Save training history
history_path = NeuralTowerConfig.MODEL_PATH / 'neural_tower_history.json'
with open(history_path, 'w') as f:
    json.dump(history, f, indent=2)
print(f"‚úì Saved training history to {history_path}")

gc.collect()
print("\n Neural Tower training complete!")



TRAINING NEURAL TOWER MODEL


Epoch 1/30 [Train]:   0%|          | 0/168 [00:00<?, ?it/s]

Epoch 1/30 [Val]:   0%|          | 0/30 [00:00<?, ?it/s]

Epoch 1/30 [Test]:   0%|          | 0/9 [00:00<?, ?it/s]


Epoch 1/30:
  Train Loss: 0.275254
  Val Loss: 0.085832
  Val MAP@12: 0.985713
  LR: 3.00e-04
 ‚úÖ Saved best model (Val MAP@12: 0.985713)


Epoch 2/30 [Train]:   0%|          | 0/168 [00:00<?, ?it/s]

Epoch 2/30 [Val]:   0%|          | 0/30 [00:00<?, ?it/s]

Epoch 2/30 [Test]:   0%|          | 0/9 [00:00<?, ?it/s]


Epoch 2/30:
  Train Loss: 0.080026
  Val Loss: 0.048183
  Val MAP@12: 0.986084
  LR: 3.00e-04
 ‚úÖ Saved best model (Val MAP@12: 0.986084)
  üìä Saved MAP@12 visualization


Epoch 3/30 [Train]:   0%|          | 0/168 [00:00<?, ?it/s]

Epoch 3/30 [Val]:   0%|          | 0/30 [00:00<?, ?it/s]

Epoch 3/30 [Test]:   0%|          | 0/9 [00:00<?, ?it/s]


Epoch 3/30:
  Train Loss: 0.053084
  Val Loss: 0.037525
  Val MAP@12: 0.986626
  LR: 3.00e-04
 ‚úÖ Saved best model (Val MAP@12: 0.986626)


Epoch 4/30 [Train]:   0%|          | 0/168 [00:00<?, ?it/s]

Epoch 4/30 [Val]:   0%|          | 0/30 [00:00<?, ?it/s]

Epoch 4/30 [Test]:   0%|          | 0/9 [00:00<?, ?it/s]


Epoch 4/30:
  Train Loss: 0.042876
  Val Loss: 0.033127
  Val MAP@12: 0.987063
  LR: 3.00e-04
 ‚úÖ Saved best model (Val MAP@12: 0.987063)
  üìä Saved MAP@12 visualization


Epoch 5/30 [Train]:   0%|          | 0/168 [00:00<?, ?it/s]

Epoch 5/30 [Val]:   0%|          | 0/30 [00:00<?, ?it/s]

Epoch 5/30 [Test]:   0%|          | 0/9 [00:00<?, ?it/s]


Epoch 5/30:
  Train Loss: 0.038068
  Val Loss: 0.031579
  Val MAP@12: 0.987626
  LR: 3.00e-04
 ‚úÖ Saved best model (Val MAP@12: 0.987626)


Epoch 6/30 [Train]:   0%|          | 0/168 [00:00<?, ?it/s]

Epoch 6/30 [Val]:   0%|          | 0/30 [00:00<?, ?it/s]

Epoch 6/30 [Test]:   0%|          | 0/9 [00:00<?, ?it/s]


Epoch 6/30:
  Train Loss: 0.035547
  Val Loss: 0.030922
  Val MAP@12: 0.988066
  LR: 3.00e-04
 ‚úÖ Saved best model (Val MAP@12: 0.988066)
  üìä Saved MAP@12 visualization


Epoch 7/30 [Train]:   0%|          | 0/168 [00:00<?, ?it/s]

Epoch 7/30 [Val]:   0%|          | 0/30 [00:00<?, ?it/s]

Epoch 7/30 [Test]:   0%|          | 0/9 [00:00<?, ?it/s]


Epoch 7/30:
  Train Loss: 0.034076
  Val Loss: 0.030517
  Val MAP@12: 0.988243
  LR: 3.00e-04
 ‚úÖ Saved best model (Val MAP@12: 0.988243)


Epoch 8/30 [Train]:   0%|          | 0/168 [00:00<?, ?it/s]

Epoch 8/30 [Val]:   0%|          | 0/30 [00:00<?, ?it/s]

Epoch 8/30 [Test]:   0%|          | 0/9 [00:00<?, ?it/s]


Epoch 8/30:
  Train Loss: 0.033293
  Val Loss: 0.030302
  Val MAP@12: 0.988609
  LR: 3.00e-04
 ‚úÖ Saved best model (Val MAP@12: 0.988609)
  üìä Saved MAP@12 visualization


Epoch 9/30 [Train]:   0%|          | 0/168 [00:00<?, ?it/s]

Epoch 9/30 [Val]:   0%|          | 0/30 [00:00<?, ?it/s]

Epoch 9/30 [Test]:   0%|          | 0/9 [00:00<?, ?it/s]


Epoch 9/30:
  Train Loss: 0.032128
  Val Loss: 0.029948
  Val MAP@12: 0.988788
  LR: 3.00e-04
 ‚úÖ Saved best model (Val MAP@12: 0.988788)


Epoch 10/30 [Train]:   0%|          | 0/168 [00:00<?, ?it/s]

Epoch 10/30 [Val]:   0%|          | 0/30 [00:00<?, ?it/s]

Epoch 10/30 [Test]:   0%|          | 0/9 [00:00<?, ?it/s]


Epoch 10/30:
  Train Loss: 0.031864
  Val Loss: 0.029741
  Val MAP@12: 0.988897
  LR: 3.00e-04
 ‚úÖ Saved best model (Val MAP@12: 0.988897)
  üìä Saved MAP@12 visualization


Epoch 11/30 [Train]:   0%|          | 0/168 [00:00<?, ?it/s]

Epoch 11/30 [Val]:   0%|          | 0/30 [00:00<?, ?it/s]

Epoch 11/30 [Test]:   0%|          | 0/9 [00:00<?, ?it/s]


Epoch 11/30:
  Train Loss: 0.031236
  Val Loss: 0.029562
  Val MAP@12: 0.989039
  LR: 3.00e-04
 ‚úÖ Saved best model (Val MAP@12: 0.989039)


Epoch 12/30 [Train]:   0%|          | 0/168 [00:00<?, ?it/s]

Epoch 12/30 [Val]:   0%|          | 0/30 [00:00<?, ?it/s]

Epoch 12/30 [Test]:   0%|          | 0/9 [00:00<?, ?it/s]


Epoch 12/30:
  Train Loss: 0.030986
  Val Loss: 0.029504
  Val MAP@12: 0.989071
  LR: 3.00e-04
 ‚úÖ Saved best model (Val MAP@12: 0.989071)
  üìä Saved MAP@12 visualization


Epoch 13/30 [Train]:   0%|          | 0/168 [00:00<?, ?it/s]

Epoch 13/30 [Val]:   0%|          | 0/30 [00:00<?, ?it/s]

Epoch 13/30 [Test]:   0%|          | 0/9 [00:00<?, ?it/s]


Epoch 13/30:
  Train Loss: 0.030535
  Val Loss: 0.029422
  Val MAP@12: 0.988977
  LR: 3.00e-04
  No improvement (1/5)


Epoch 14/30 [Train]:   0%|          | 0/168 [00:00<?, ?it/s]

Epoch 14/30 [Val]:   0%|          | 0/30 [00:00<?, ?it/s]

Epoch 14/30 [Test]:   0%|          | 0/9 [00:00<?, ?it/s]


Epoch 14/30:
  Train Loss: 0.030037
  Val Loss: 0.029399
  Val MAP@12: 0.989005
  LR: 3.00e-04
  No improvement (2/5)
  üìä Saved MAP@12 visualization


Epoch 15/30 [Train]:   0%|          | 0/168 [00:00<?, ?it/s]

Epoch 15/30 [Val]:   0%|          | 0/30 [00:00<?, ?it/s]

Epoch 15/30 [Test]:   0%|          | 0/9 [00:00<?, ?it/s]


Epoch 15/30:
  Train Loss: 0.029793
  Val Loss: 0.029247
  Val MAP@12: 0.988942
  LR: 1.50e-04
  No improvement (3/5)


Epoch 16/30 [Train]:   0%|          | 0/168 [00:00<?, ?it/s]

Epoch 16/30 [Val]:   0%|          | 0/30 [00:00<?, ?it/s]

Epoch 16/30 [Test]:   0%|          | 0/9 [00:00<?, ?it/s]


Epoch 16/30:
  Train Loss: 0.029846
  Val Loss: 0.029194
  Val MAP@12: 0.989135
  LR: 1.50e-04
 ‚úÖ Saved best model (Val MAP@12: 0.989135)
  üìä Saved MAP@12 visualization


Epoch 17/30 [Train]:   0%|          | 0/168 [00:00<?, ?it/s]

Epoch 17/30 [Val]:   0%|          | 0/30 [00:00<?, ?it/s]

Epoch 17/30 [Test]:   0%|          | 0/9 [00:00<?, ?it/s]


Epoch 17/30:
  Train Loss: 0.029387
  Val Loss: 0.029155
  Val MAP@12: 0.988792
  LR: 1.50e-04
  No improvement (1/5)


Epoch 18/30 [Train]:   0%|          | 0/168 [00:00<?, ?it/s]

Epoch 18/30 [Val]:   0%|          | 0/30 [00:00<?, ?it/s]

Epoch 18/30 [Test]:   0%|          | 0/9 [00:00<?, ?it/s]


Epoch 18/30:
  Train Loss: 0.029218
  Val Loss: 0.028988
  Val MAP@12: 0.988701
  LR: 1.50e-04
  No improvement (2/5)
  üìä Saved MAP@12 visualization


Epoch 19/30 [Train]:   0%|          | 0/168 [00:00<?, ?it/s]

Epoch 19/30 [Val]:   0%|          | 0/30 [00:00<?, ?it/s]

Epoch 19/30 [Test]:   0%|          | 0/9 [00:00<?, ?it/s]


Epoch 19/30:
  Train Loss: 0.028712
  Val Loss: 0.029210
  Val MAP@12: 0.988896
  LR: 7.50e-05
  No improvement (3/5)


Epoch 20/30 [Train]:   0%|          | 0/168 [00:00<?, ?it/s]

Epoch 20/30 [Val]:   0%|          | 0/30 [00:00<?, ?it/s]

Epoch 20/30 [Test]:   0%|          | 0/9 [00:00<?, ?it/s]


Epoch 20/30:
  Train Loss: 0.028772
  Val Loss: 0.029154
  Val MAP@12: 0.988956
  LR: 7.50e-05
  No improvement (4/5)
  üìä Saved MAP@12 visualization


Epoch 21/30 [Train]:   0%|          | 0/168 [00:00<?, ?it/s]

Epoch 21/30 [Val]:   0%|          | 0/30 [00:00<?, ?it/s]

Epoch 21/30 [Test]:   0%|          | 0/9 [00:00<?, ?it/s]


Epoch 21/30:
  Train Loss: 0.028570
  Val Loss: 0.029266
  Val MAP@12: 0.988891
  LR: 7.50e-05
  No improvement (5/5)

 Early stopping triggered after 21 epochs
   Best MAP@12: 0.989135 at epoch 16

TRAINING COMPLETE

 Best Model (selected based on Val MAP@12):
  Epoch: 16
  Val MAP@12: 0.989135

‚úì Loaded best model from checkpoint
 Saved final model to /Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2/models/neural_tower_model.pt
‚úì Saved training history to /Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2/models/neural_tower_history.json

 Neural Tower training complete!


In [17]:
# ============================================================================
# VISUALIZE MAP@12 PROGRESS
# ============================================================================

print("\n" + "="*80)
print("MAP@12 TRAINING PROGRESS VISUALIZATION")
print("="*80)

if len(history['val_map12']) > 0:
    fig, axes = plt.subplots(2, 1, figsize=(12, 10))
    
    epochs_list = list(range(1, len(history['val_map12']) + 1))
    
    # Plot 1: MAP@12 Progress
    ax1 = axes[0]
    ax1.plot(epochs_list, history['val_map12'], 'b-o', label='Val MAP@12', linewidth=2.5, markersize=8)
    if use_test_set and any(x is not None for x in history['test_map12']):
        test_scores = [x if x is not None else 0 for x in history['test_map12']]
        ax1.plot(epochs_list, test_scores, 'r-s', label='Test MAP@12', linewidth=2.5, markersize=8)
    ax1.axvline(x=best_epoch, color='g', linestyle='--', linewidth=2, label=f'Best Epoch ({best_epoch})')
    ax1.set_xlabel('Epoch', fontsize=12, fontweight='bold')
    ax1.set_ylabel('MAP@12', fontsize=12, fontweight='bold')
    ax1.set_title('MAP@12 Progress During Training', fontsize=14, fontweight='bold')
    ax1.legend(fontsize=11, loc='best')
    ax1.grid(True, alpha=0.3)
    ax1.set_ylim([0, 1.0])
    
    # Plot 2: Loss Progress
    ax2 = axes[1]
    ax2.plot(epochs_list, history['train_loss'], 'b-o', label='Train Loss', linewidth=2, markersize=6)
    ax2.plot(epochs_list, history['val_loss'], 'r-s', label='Val Loss', linewidth=2, markersize=6)
    ax2.axvline(x=best_epoch, color='g', linestyle='--', linewidth=2, label=f'Best Epoch ({best_epoch})')
    ax2.set_xlabel('Epoch', fontsize=12, fontweight='bold')
    ax2.set_ylabel('Loss', fontsize=12, fontweight='bold')
    ax2.set_title('Loss Progress During Training', fontsize=14, fontweight='bold')
    ax2.legend(fontsize=11, loc='best')
    ax2.grid(True, alpha=0.3)
    
    plt.tight_layout()
    viz_path = NeuralTowerConfig.MODEL_PATH / 'training_progress.png'
    plt.savefig(viz_path, dpi=150, bbox_inches='tight')
    plt.close()
    
    print(f"\n‚úÖ Saved training progress visualization to {viz_path}")
    print(f"   - Val MAP@12: {history['val_map12'][-1]:.6f} (final)")
    print(f"   - Best Val MAP@12: {best_map12:.6f} (epoch {best_epoch})")
else:
    print("‚ö†Ô∏è  No training history available for visualization")



MAP@12 TRAINING PROGRESS VISUALIZATION

‚úÖ Saved training progress visualization to /Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2/models/training_progress.png
   - Val MAP@12: 0.988891 (final)
   - Best Val MAP@12: 0.989135 (epoch 16)


In [23]:
# ============================================================================
# DIAGNOSTIC: CHECK CANDIDATE QUALITY FOR MAP@12
# ============================================================================

print("\n" + "="*80)
print("CANDIDATE QUALITY DIAGNOSTIC")
print("="*80)

# Load validation data
val_data_diag = pd.read_parquet(NeuralTowerConfig.MODEL_PATH / 'val_data.parquet')

# Check if purchased items are in candidate pool
print("\nüìä Analyzing candidate pool quality...")

# For validation users, check if their purchased items are in candidates
if 'user_type' in val_data_diag.columns:
    val_users_data = val_data_diag[val_data_diag['user_type'] == 'validation'].copy()
    
    # Load ground truth
    try:
        val_ground_truth = pd.read_parquet(NeuralTowerConfig.DATA_PATH / 'val_ground_truth.parquet')
        
        # Check coverage: Are purchased items in candidate pool?
        coverage_stats = []
        for _, row in val_ground_truth.head(100).iterrows():  # Sample 100 users
            user_id = row['customer_id']
            purchased_items = set(row['purchased_articles'])
            
            user_candidates = set(val_users_data[val_users_data['customer_id'] == user_id]['article_id'].values)
            
            # Calculate coverage
            items_in_candidates = purchased_items.intersection(user_candidates)
            coverage = len(items_in_candidates) / len(purchased_items) if len(purchased_items) > 0 else 0
            
            coverage_stats.append({
                'user_id': user_id,
                'purchased_count': len(purchased_items),
                'candidates_count': len(user_candidates),
                'items_in_candidates': len(items_in_candidates),
                'coverage': coverage
            })
        
        coverage_df = pd.DataFrame(coverage_stats)
        
        print(f"\n‚úì Candidate Coverage Analysis (sample of 100 validation users):")
        print(f"  Average purchased items per user: {coverage_df['purchased_count'].mean():.2f}")
        print(f"  Average candidates per user: {coverage_df['candidates_count'].mean():.2f}")
        print(f"  Average coverage: {coverage_df['coverage'].mean():.2%}")
        print(f"  Users with 100% coverage: {(coverage_df['coverage'] == 1.0).sum()}/{len(coverage_df)}")
        print(f"  Users with <50% coverage: {(coverage_df['coverage'] < 0.5).sum()}/{len(coverage_df)}")
        
        if coverage_df['coverage'].mean() < 0.8:
            print(f"\n‚ö†Ô∏è  WARNING: Low candidate coverage!")
            print(f"   Only {coverage_df['coverage'].mean():.1%} of purchased items are in candidate pool")
            print(f"   This severely limits MAP@12 - improve candidate generation!")
        else:
            print(f"\n‚úÖ Good candidate coverage: {coverage_df['coverage'].mean():.1%}")
            print(f"   Most purchased items are in the candidate pool")
            
    except Exception as e:
        print(f"‚ö†Ô∏è  Could not load ground truth: {e}")
else:
    print("‚ö†Ô∏è  user_type column not found in validation data")

# Check positive ratio in validation set
print("\nüìä Validation Set Label Distribution:")
val_label_dist = val_data_diag['label'].value_counts()
print(f"  Positive samples: {val_label_dist.get(1, 0):,} ({val_label_dist.get(1, 0)/len(val_data_diag)*100:.2f}%)")
print(f"  Negative samples: {val_label_dist.get(0, 0):,} ({val_label_dist.get(0, 0)/len(val_data_diag)*100:.2f}%)")

# Check per-user statistics
val_user_stats = val_data_diag.groupby('customer_id').agg({
    'article_id': 'nunique',
    'label': 'sum'
}).reset_index()
val_user_stats.columns = ['customer_id', 'candidates', 'positives']

print(f"\nüìä Per-User Statistics (Validation Users):")
print(f"  Average candidates per user: {val_user_stats['candidates'].mean():.2f}")
print(f"  Average positives per user: {val_user_stats['positives'].mean():.2f}")
print(f"  Users with 0 positives: {(val_user_stats['positives'] == 0).sum()}")
print(f"  Users with 1+ positives: {(val_user_stats['positives'] >= 1).sum()}")

print("\nüí° Key Insight for MAP@12:")
print("   MAP@12 requires ranking positives in top 12")
print("   If users have many candidates but few positives, ranking is harder")
print("   Focus on: Better candidate quality > More candidates")

gc.collect()



CANDIDATE QUALITY DIAGNOSTIC

üìä Analyzing candidate pool quality...

‚úì Candidate Coverage Analysis (sample of 100 validation users):
  Average purchased items per user: 3.09
  Average candidates per user: 1.24
  Average coverage: 1.33%
  Users with 100% coverage: 1/100
  Users with <50% coverage: 99/100

   Only 1.3% of purchased items are in candidate pool
   This severely limits MAP@12 - improve candidate generation!

üìä Validation Set Label Distribution:
  Positive samples: 48,444 (39.98%)
  Negative samples: 72,720 (60.02%)

üìä Per-User Statistics (Validation Users):
  Average candidates per user: 16.12
  Average positives per user: 6.79
  Users with 0 positives: 418
  Users with 1+ positives: 6714

üí° Key Insight for MAP@12:
   MAP@12 requires ranking positives in top 12
   If users have many candidates but few positives, ranking is harder
   Focus on: Better candidate quality > More candidates


32228

In [26]:
# FINAL EVALUATION AND SUMMARY

print("\n" + "="*80)
print("NEURAL TOWER - FINAL EVALUATION")
print("="*80)

# Evaluate on validation set
model.eval()
all_predictions = []
all_labels = []
val_customer_ids_list = []
val_article_ids_list = []

with torch.no_grad():
    for batch_idx, (user_feat, item_feat, image_feat, labels) in enumerate(tqdm(val_loader, desc="Evaluating")):
        user_feat = user_feat.to(NeuralTowerConfig.DEVICE)
        item_feat = item_feat.to(NeuralTowerConfig.DEVICE)
        image_feat = image_feat.to(NeuralTowerConfig.DEVICE)
        
        outputs = model(user_feat, item_feat, image_feat)
        predictions = outputs.cpu().numpy().flatten()  # Flatten to 1D
        labels_np = labels.numpy().flatten()  # Flatten to 1D
        
        # Get customer and article IDs for this batch
        start_idx = batch_idx * NeuralTowerConfig.BATCH_SIZE
        end_idx = min(start_idx + len(predictions), len(val_data))
        batch_customer_ids = val_customer_ids.iloc[start_idx:end_idx].values
        batch_article_ids = val_article_ids.iloc[start_idx:end_idx].values
        
        all_predictions.extend(predictions.tolist())  # Convert to list
        all_labels.extend(labels_np.tolist())  # Convert to list
        val_customer_ids_list.extend(batch_customer_ids)
        val_article_ids_list.extend(batch_article_ids)

# Create evaluation dataframe
val_eval_df = pd.DataFrame({
    'customer_id': val_customer_ids_list[:len(all_predictions)],
    'article_id': val_article_ids_list[:len(all_predictions)],
    'label': all_labels[:len(all_predictions)],
    'pred_score': all_predictions
})

# Ensure label column is numeric (not object/list)
val_eval_df['label'] = pd.to_numeric(val_eval_df['label'], errors='coerce')
val_eval_df['pred_score'] = pd.to_numeric(val_eval_df['pred_score'], errors='coerce')

# Calculate final MAP@12 using consistent evaluation function
# Load val_data for filtering (same as training)
val_data_for_eval = pd.read_parquet(NeuralTowerConfig.MODEL_PATH / 'val_data.parquet')

print(f"\nüìä Final Performance:")
print(f"  Best MAP@12 (during training): {best_map12:.6f}")

# Save predictions
predictions_path = NeuralTowerConfig.MODEL_PATH / 'neural_tower_predictions_val.parquet'
val_eval_df.to_parquet(predictions_path, index=False)
print(f"\n‚úì Saved predictions to {predictions_path}")

# Print model summary
print(f"\n Training Summary:")
print(f"  Total epochs: {len(history['train_loss'])}")
print(f"  Best epoch: {best_epoch}")
print(f"  Final train loss: {history['train_loss'][-1]:.6f}")
print(f"  Final val loss: {history['val_loss'][-1]:.6f}")

print("\n Saved Files:")
print(f"  Model: {final_model_path}")
print(f"  Checkpoint: {checkpoint_dir / 'best_model.pt'}")
print(f"  History: {history_path}")
print(f"  Predictions: {predictions_path}")


NEURAL TOWER - FINAL EVALUATION


Evaluating:   0%|          | 0/30 [00:00<?, ?it/s]


üìä Final Performance:
  Best MAP@12 (during training): 0.989036

‚úì Saved predictions to /Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2/models/neural_tower_predictions_val.parquet

 Training Summary:
  Total epochs: 19
  Best epoch: 14
  Final train loss: 0.028723
  Final val loss: 0.029836

 Saved Files:
  Model: /Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2/models/neural_tower_model.pt
  Checkpoint: /Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2/models/checkpoints/best_model.pt
  History: /Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2/models/neural_tower_history.json
  Predictions: /Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2/models/neural_tower_predictions_val.parquet


### Stage 7: Evaluation & Metrics

This stage provides comprehensive evaluation and comparison of all models:
- **Model Comparison**: LightGBM vs Neural Towers performance
- **Ensemble Evaluation**: Weighted combination of best models
- **Detailed Metrics**: MAP@12, Precision@K, Recall@K, NDCG@K
- **Feature Analysis**: Importance analysis and ablation studies
- **Final Ranking**: Generate top-12 predictions for each user
- **Submission Preparation**: Format predictions for Kaggle submission

**Key Features:**
- Comprehensive metric suite
- Model ensemble strategies
- Performance visualization
- Submission file generation


In [18]:
# IMPORTS AND CONFIGURATION FOR EVALUATION

import pandas as pd
import numpy as np
from pathlib import Path
import pickle
import json
import gc
from tqdm.auto import tqdm
import warnings
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns

warnings.filterwarnings('ignore')

# Configuration
class EvaluationConfig:
    # Paths
    DATA_PATH = Path('/Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2')
    MODEL_PATH = Path('/Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2/models')
    
    # Evaluation metrics
    K_VALUES = [1, 3, 5, 10, 12]  # Different K values for evaluation
    
    # Ensemble weights (can be tuned)
    ENSEMBLE_WEIGHTS = {
        'lgb_classifier': 0.2,
        'lgb_ranker_lambdarank': 0.3,
        'lgb_ranker_xendcg': 0.2,
        'neural_tower': 0.3
    }
    
    RANDOM_STATE = 42

print("‚úì Evaluation Configuration loaded")
print(f"  Model path: {EvaluationConfig.MODEL_PATH}")


‚úì Evaluation Configuration loaded
  Model path: /Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2/models


In [19]:
# COMPREHENSIVE EVALUATION METRICS

def calculate_map_at_k(y_true, y_pred, k=12):
    """Calculate Mean Average Precision at K (MAP@K)"""
    if len(y_true) == 0:
        return 0.0
    
    y_pred = [pred[:k] for pred in y_pred]
    
    aps = []
    for true_items, pred_items in zip(y_true, y_pred):
        if len(true_items) == 0:
            continue
        
        # Calculate AP for this user
        hits = 0
        precision_sum = 0.0
        
        for i, pred_item in enumerate(pred_items):
            if pred_item in true_items:
                hits += 1
                precision_sum += hits / (i + 1)
        
        if hits > 0:
            ap = precision_sum / len(true_items)
            aps.append(ap)
    
    return np.mean(aps) if len(aps) > 0 else 0.0


def calculate_precision_at_k(y_true, y_pred, k=12):
    """Calculate Precision@K"""
    if len(y_true) == 0:
        return 0.0
    
    y_pred = [pred[:k] for pred in y_pred]
    
    precisions = []
    for true_items, pred_items in zip(y_true, y_pred):
        if len(pred_items) == 0:
            continue
        
        hits = sum(1 for item in pred_items if item in true_items)
        precision = hits / len(pred_items)
        precisions.append(precision)
    
    return np.mean(precisions) if len(precisions) > 0 else 0.0


def calculate_recall_at_k(y_true, y_pred, k=12):
    """Calculate Recall@K"""
    if len(y_true) == 0:
        return 0.0
    
    y_pred = [pred[:k] for pred in y_pred]
    
    recalls = []
    for true_items, pred_items in zip(y_true, y_pred):
        if len(true_items) == 0:
            continue
        
        hits = sum(1 for item in pred_items if item in true_items)
        recall = hits / len(true_items)
        recalls.append(recall)
    
    return np.mean(recalls) if len(recalls) > 0 else 0.0


def calculate_ndcg_at_k(y_true, y_pred, k=12):
    """Calculate Normalized Discounted Cumulative Gain at K (NDCG@K)"""
    if len(y_true) == 0:
        return 0.0
    
    y_pred = [pred[:k] for pred in y_pred]
    
    ndcgs = []
    for true_items, pred_items in zip(y_true, y_pred):
        if len(true_items) == 0:
            continue
        
        # Calculate DCG
        dcg = 0.0
        for i, pred_item in enumerate(pred_items):
            if pred_item in true_items:
                dcg += 1.0 / np.log2(i + 2)  # i+2 because log2(1) = 0
        
        # Calculate IDCG (ideal DCG)
        idcg = 0.0
        num_relevant = min(len(true_items), len(pred_items))
        for i in range(num_relevant):
            idcg += 1.0 / np.log2(i + 2)
        
        if idcg > 0:
            ndcg = dcg / idcg
            ndcgs.append(ndcg)
    
    return np.mean(ndcgs) if len(ndcgs) > 0 else 0.0


def evaluate_all_metrics(df, predictions, k_values=[1, 3, 5, 10, 12]):
    """
    Evaluate all metrics for different K values
    
    Args:
        df: DataFrame with columns ['customer_id', 'article_id', 'label']
        predictions: Array of prediction scores
        k_values: List of K values to evaluate
    
    Returns:
        Dictionary of metrics
    """
    # Group by customer
    grouped = df.groupby('customer_id')
    
    # Prepare true and predicted items for each user
    y_true = []
    y_pred = []
    
    for customer_id, group in grouped:
        # True items (purchased articles)
        true_items = set(group[group['label'] == 1]['article_id'].values)
        y_true.append(true_items)
        
        # Predicted items (sorted by score)
        customer_df = group.copy()
        customer_df['pred_score'] = predictions[:len(customer_df)]
        customer_df = customer_df.sort_values('pred_score', ascending=False)
        pred_items = customer_df['article_id'].values.tolist()
        y_pred.append(pred_items)
        
        # Remove used predictions
        predictions = predictions[len(customer_df):]
    
    # Calculate metrics for each K
    results = {}
    for k in k_values:
        results[f'MAP@{k}'] = calculate_map_at_k(y_true, y_pred, k)
        results[f'Precision@{k}'] = calculate_precision_at_k(y_true, y_pred, k)
        results[f'Recall@{k}'] = calculate_recall_at_k(y_true, y_pred, k)
        results[f'NDCG@{k}'] = calculate_ndcg_at_k(y_true, y_pred, k)
    
    return results


print("‚úì Evaluation metrics functions defined")

‚úì Evaluation metrics functions defined


In [20]:
# LOAD ALL MODEL PREDICTIONS

print("\n" + "="*80)
print("LOADING MODEL PREDICTIONS")
print("="*80)

# Load validation data
print("\nLoading validation data...")
val_data = pd.read_parquet(EvaluationConfig.MODEL_PATH / 'val_data.parquet')
print(f"‚úì Loaded {len(val_data):,} validation samples")

# Load LightGBM predictions
lgb_predictions = {}
print("\nLoading LightGBM predictions...")
try:
    ensemble_preds = pd.read_parquet(EvaluationConfig.MODEL_PATH / 'ensemble_predictions_val.parquet')
    if 'ensemble_weighted' in ensemble_preds.columns:
        lgb_predictions['ensemble_weighted'] = ensemble_preds['ensemble_weighted'].values
    if 'ensemble_average' in ensemble_preds.columns:
        lgb_predictions['ensemble_average'] = ensemble_preds['ensemble_average'].values
    print(f"‚úì Loaded ensemble predictions")
except Exception as e:
    print(f"‚ö†Ô∏è  Could not load ensemble predictions: {e}")

# Load individual LightGBM model predictions if available
lgb_models = ['lgb_classifier', 'lgb_ranker_lambdarank', 'lgb_ranker_xendcg', 'lgb_classifier_deep']
for model_name in lgb_models:
    try:
        pred_file = EvaluationConfig.MODEL_PATH / f'{model_name}_predictions_val.parquet'
        if pred_file.exists():
            preds = pd.read_parquet(pred_file)
            if 'pred_score' in preds.columns:
                lgb_predictions[model_name] = preds['pred_score'].values
                print(f"‚úì Loaded {model_name} predictions")
    except Exception as e:
        print(f"‚ö†Ô∏è  Could not load {model_name}: {e}")

# Load Neural Tower predictions
neural_predictions = {}
print("\nLoading Neural Tower predictions...")
try:
    neural_preds = pd.read_parquet(EvaluationConfig.MODEL_PATH / 'neural_tower_predictions_val.parquet')
    if 'pred_score' in neural_preds.columns:
        neural_predictions['neural_tower'] = neural_preds['pred_score'].values
        print(f"‚úì Loaded Neural Tower predictions")
except Exception as e:
    print(f"‚ö†Ô∏è  Could not load Neural Tower predictions: {e}")

# Combine all predictions
all_predictions = {**lgb_predictions, **neural_predictions}

print(f"\n‚úì Total models loaded: {len(all_predictions)}")
print(f"  Models: {list(all_predictions.keys())}")

gc.collect()



LOADING MODEL PREDICTIONS

Loading validation data...
‚úì Loaded 121,164 validation samples

Loading LightGBM predictions...
‚úì Loaded ensemble predictions

Loading Neural Tower predictions...
‚úì Loaded Neural Tower predictions

‚úì Total models loaded: 1
  Models: ['neural_tower']


36706

In [21]:
# EVALUATE ALL MODELS

print("\n" + "="*80)
print("EVALUATING ALL MODELS")
print("="*80)

# Store all evaluation results
evaluation_results = {}

for model_name, predictions in tqdm(all_predictions.items(), desc="Evaluating models"):
    print(f"\n Evaluating {model_name}...")
    
    # Ensure predictions match validation data length
    if len(predictions) != len(val_data):
        print(f" Prediction length mismatch: {len(predictions)} vs {len(val_data)}")
        min_len = min(len(predictions), len(val_data))
        predictions = predictions[:min_len]
        val_data_eval = val_data.iloc[:min_len].copy()
    else:
        val_data_eval = val_data.copy()
    
    # Evaluate all metrics
    metrics = evaluate_all_metrics(val_data_eval, predictions.copy(), k_values=EvaluationConfig.K_VALUES)
    evaluation_results[model_name] = metrics
    
    # Print key metrics
    print(f"  MAP@12: {metrics['MAP@12']:.6f}")
    print(f"  Precision@12: {metrics['Precision@12']:.6f}")
    print(f"  Recall@12: {metrics['Recall@12']:.6f}")
    print(f"  NDCG@12: {metrics['NDCG@12']:.6f}")

# Create comparison DataFrame
comparison_df = pd.DataFrame(evaluation_results).T
comparison_df = comparison_df.sort_values('MAP@12', ascending=False)

print("\n" + "="*80)
print("MODEL COMPARISON SUMMARY")
print("="*80)
print("\n" + comparison_df.to_string())

# Save comparison results
comparison_path = EvaluationConfig.MODEL_PATH / 'model_comparison.csv'
comparison_df.to_csv(comparison_path)
print(f"\n‚úì Saved comparison results to {comparison_path}")

gc.collect()



EVALUATING ALL MODELS


Evaluating models:   0%|          | 0/1 [00:00<?, ?it/s]


 Evaluating neural_tower...
  MAP@12: 0.389415
  Precision@12: 0.319977
  Recall@12: 0.846748
  NDCG@12: 0.581958

MODEL COMPARISON SUMMARY

                MAP@1  Precision@1  Recall@1    NDCG@1     MAP@3  Precision@3  Recall@3    NDCG@3     MAP@5  Precision@5  Recall@5    NDCG@5    MAP@10  Precision@10  Recall@10   NDCG@10    MAP@12  Precision@12  Recall@12   NDCG@12
neural_tower  0.23623     0.318704  0.079975  0.338546  0.255288     0.319242  0.240225  0.381411  0.282609     0.319233  0.396367  0.429862  0.359953      0.319653   0.745062  0.546265  0.389415      0.319977   0.846748  0.581958

‚úì Saved comparison results to /Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2/models/model_comparison.csv


21

In [36]:
# CREATE FINAL ENSEMBLE

print("\n" + "="*80)
print("CREATING FINAL ENSEMBLE")
print("="*80)

# Select best models for ensemble
best_models = ['neural_tower', 'lgb_ranker_lambdarank', 'lgb_classifier']

# Filter to available models
available_models = [m for m in best_models if m in all_predictions]
print(f"\nüì¶ Available models for ensemble: {available_models}")

if len(available_models) == 0:
    print("‚ö†Ô∏è  No models available for ensemble!")
else:
    # Normalize predictions to [0, 1] range
    normalized_preds = {}
    for model_name in available_models:
        preds = all_predictions[model_name].copy()
        min_pred = preds.min()
        max_pred = preds.max()
        if max_pred > min_pred:
            normalized = (preds - min_pred) / (max_pred - min_pred)
        else:
            normalized = preds
        normalized_preds[model_name] = normalized
        print(f"  {model_name}: [{preds.min():.4f}, {preds.max():.4f}] -> [0, 1]")
    
    # Create ensemble with equal weights (can be tuned)
    ensemble_weights = {m: 1.0 / len(available_models) for m in available_models}
    print(f"\n‚öñÔ∏è  Ensemble weights: {ensemble_weights}")
    
    # Calculate weighted ensemble
    ensemble_pred = np.zeros(len(normalized_preds[available_models[0]]))
    for model_name, weight in ensemble_weights.items():
        ensemble_pred += weight * normalized_preds[model_name]
    
    # Evaluate ensemble
    print("\nüìä Evaluating final ensemble...")
    ensemble_metrics = evaluate_all_metrics(val_data, ensemble_pred.copy(), k_values=EvaluationConfig.K_VALUES)
    evaluation_results['final_ensemble'] = ensemble_metrics
    
    print(f"  MAP@12: {ensemble_metrics['MAP@12']:.6f}")
    print(f"  Precision@12: {ensemble_metrics['Precision@12']:.6f}")
    print(f"  Recall@12: {ensemble_metrics['Recall@12']:.6f}")
    print(f"  NDCG@12: {ensemble_metrics['NDCG@12']:.6f}")
    
    # Save ensemble predictions
    ensemble_df = val_data[['customer_id', 'article_id', 'label']].copy()
    ensemble_df['pred_score'] = ensemble_pred
    ensemble_path = EvaluationConfig.MODEL_PATH / 'final_ensemble_predictions_val.parquet'
    ensemble_df.to_parquet(ensemble_path, index=False)
    print(f"\n‚úì Saved ensemble predictions to {ensemble_path}")
    
    # Update comparison
    comparison_df = pd.DataFrame(evaluation_results).T
    comparison_df = comparison_df.sort_values('MAP@12', ascending=False)
    print("\n" + "="*80)
    print("UPDATED MODEL COMPARISON (with ensemble)")
    print("="*80)
    print("\n" + comparison_df.to_string())

gc.collect()



CREATING FINAL ENSEMBLE

üì¶ Available models for ensemble: ['neural_tower']
  neural_tower: [0.0005, 0.9999] -> [0, 1]

‚öñÔ∏è  Ensemble weights: {'neural_tower': 1.0}

üìä Evaluating final ensemble...
  MAP@12: 0.776194
  Precision@12: 0.332567
  Recall@12: 1.022509
  NDCG@12: 0.842597

‚úì Saved ensemble predictions to /Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2/models/final_ensemble_predictions_val.parquet

UPDATED MODEL COMPARISON (with ensemble)

                  MAP@1  Precision@1  Recall@1    NDCG@1     MAP@3  Precision@3  Recall@3    NDCG@3     MAP@5  Precision@5  Recall@5    NDCG@5    MAP@10  Precision@10  Recall@10   NDCG@10    MAP@12  Precision@12  Recall@12   NDCG@12
neural_tower    0.63528     0.332004  0.385652  0.607058  0.693509     0.331751  0.835445  0.767134  0.747797     0.332384  0.976887  0.822544  0.775606      0.332574   1.021713  0.842191  0.776194      0.332567   1.022509  0.842597
final_ensemble  0.63528

24

In [37]:
# GENERATE FINAL RANKINGS FOR SUBMISSION

print("\n" + "="*80)
print("GENERATING FINAL RANKINGS")
print("="*80)

# Use best model (or ensemble if available)
if 'final_ensemble' in evaluation_results:
    best_model_name = 'final_ensemble'
    best_predictions = ensemble_pred
    print(f"\n‚úÖ Using final ensemble for submission")
elif 'neural_tower' in all_predictions:
    best_model_name = 'neural_tower'
    best_predictions = all_predictions['neural_tower']
    print(f"\n‚úÖ Using Neural Tower for submission")
elif len(all_predictions) > 0:
    # Use model with best MAP@12
    best_model_name = comparison_df.index[0]
    best_predictions = all_predictions[best_model_name]
    print(f"\n‚úÖ Using {best_model_name} for submission")
else:
    raise ValueError("No predictions available!")

# Create predictions DataFrame
pred_df = val_data[['customer_id', 'article_id']].copy()
pred_df['pred_score'] = best_predictions[:len(pred_df)]

# Generate top-12 predictions for each user
print("\nüìä Generating top-12 rankings per user...")
rankings = []
for customer_id, group in tqdm(pred_df.groupby('customer_id'), desc="Ranking users"):
    # Sort by prediction score (descending)
    group_sorted = group.sort_values('pred_score', ascending=False)
    
    # Get top 12 article IDs
    top_articles = group_sorted.head(12)['article_id'].values
    
    # Format as space-separated string
    predictions_str = ' '.join([str(art) for art in top_articles])
    
    rankings.append({
        'customer_id': customer_id,
        'prediction': predictions_str
    })

# Create submission DataFrame
submission_df = pd.DataFrame(rankings)
submission_df = submission_df.sort_values('customer_id')

print(f"\n‚úì Generated rankings for {len(submission_df):,} users")
print(f"  Average articles per user: {submission_df['prediction'].str.split().str.len().mean():.2f}")

# Save submission file
submission_path = EvaluationConfig.MODEL_PATH / 'submission.csv'
submission_df.to_csv(submission_path, index=False)
print(f"\n‚úì Saved submission file to {submission_path}")

# Display sample
print("\nüìÑ Sample submission (first 5 rows):")
print(submission_df.head().to_string(index=False))

gc.collect()



GENERATING FINAL RANKINGS

‚úÖ Using final ensemble for submission

üìä Generating top-12 rankings per user...


Ranking users:   0%|          | 0/42126 [00:00<?, ?it/s]


‚úì Generated rankings for 42,126 users
  Average articles per user: 2.87

‚úì Saved submission file to /Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2/models/submission.csv

üìÑ Sample submission (first 5 rows):
                                                     customer_id                              prediction
0000945f66de1a11d9447609b8b41b1bc987ba185a5496ae8831e8493afa24ff                               811899002
00012315fd38859ff2c446876ca507abbcbcf582d0e266b1b696941c16e777a2                               872600009
00061a04f030bdf3665b09829192ca8c13c4de6dd9ae9d38d0d0b5ce3a1cfc6f                     799365013 883724001
00089f13f465ec902e5c49a3bb408c5e31205096d6f267543f1893303e456016                               858052005
000e3f587242eb077685a487ad27dad632a4801576dfd16967280f0da3a78c2e 706016001 684209004 620425012 857713001


210688

In [38]:
# FINAL SUMMARY AND RESULTS

print("\n" + "="*80)
print("FINAL EVALUATION SUMMARY")
print("="*80)

# Best model
best_model = comparison_df.index[0]
best_map12 = comparison_df.loc[best_model, 'MAP@12']

print(f"\nüèÜ Best Model: {best_model}")
print(f"   MAP@12: {best_map12:.6f}")

# Model rankings
print(f"\nüìä Model Rankings (by MAP@12):")
print("-" * 80)
for idx, (model_name, row) in enumerate(comparison_df.iterrows(), 1):
    marker = "ü•á" if idx == 1 else "ü•à" if idx == 2 else "ü•â" if idx == 3 else "  "
    print(f"{marker} {idx}. {model_name:30s} MAP@12: {row['MAP@12']:.6f}")

# Key metrics for best model
print(f"\nüìà Detailed Metrics for Best Model ({best_model}):")
print("-" * 80)
best_metrics = comparison_df.loc[best_model]
for metric_name in ['MAP@12', 'Precision@12', 'Recall@12', 'NDCG@12']:
    print(f"  {metric_name:20s}: {best_metrics[metric_name]:.6f}")

# Files saved
print(f"\nüíæ Generated Files:")
print("-" * 80)
print(f"  Model Comparison: {EvaluationConfig.MODEL_PATH / 'model_comparison.csv'}")
if 'final_ensemble' in evaluation_results:
    print(f"  Ensemble Predictions: {EvaluationConfig.MODEL_PATH / 'final_ensemble_predictions_val.parquet'}")
print(f"  Submission File: {EvaluationConfig.MODEL_PATH / 'submission.csv'}")

# Performance summary
print(f"\nüìä Performance Summary:")
print("-" * 80)
print(f"  Total Models Evaluated: {len(comparison_df)}")
print(f"  Best MAP@12: {best_map12:.6f}")
print(f"  Improvement over baseline: {((best_map12 - comparison_df['MAP@12'].min()) / comparison_df['MAP@12'].min() * 100):.2f}%")

print("\n" + "="*80)
print("‚úÖ Step 3 Complete: Evaluation & Metrics")
print("   Ready for final submission!")
print("="*80)



FINAL EVALUATION SUMMARY

üèÜ Best Model: neural_tower
   MAP@12: 0.776194

üìä Model Rankings (by MAP@12):
--------------------------------------------------------------------------------
ü•á 1. neural_tower                   MAP@12: 0.776194
ü•à 2. final_ensemble                 MAP@12: 0.776194

üìà Detailed Metrics for Best Model (neural_tower):
--------------------------------------------------------------------------------
  MAP@12              : 0.776194
  Precision@12        : 0.332567
  Recall@12           : 1.022509
  NDCG@12             : 0.842597

üíæ Generated Files:
--------------------------------------------------------------------------------
  Model Comparison: /Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2/models/model_comparison.csv
  Ensemble Predictions: /Users/raghu/Desktop/Quarter_1/CSE_258R/assignment2/fashion_recommender_candidate_generation_2/models/final_ensemble_predictions_val.parquet
  Submission File