## 1. Import Libraries and Setup

In [14]:
# Standard library
import json
import os
import warnings
from datetime import datetime, timedelta

# Core libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Silence library warnings for the rest of the notebook
warnings.filterwarnings('ignore')

# Machine Learning libraries
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV,RandomizedSearchCV
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer
import lightgbm as lgb
import joblib

# Define custom RMSLE function to handle zero values
def rmsle_score(y_true, y_pred):
    """Root mean squared logarithmic error between targets and predictions.

    Any value below a tiny positive floor (zeros and negatives included) is
    raised to that floor before ``log1p`` is applied, so the logarithm is
    always defined.
    """
    floor = 1e-15
    log_true, log_pred = (
        np.log1p(np.maximum(values, floor)) for values in (y_true, y_pred)
    )
    return np.sqrt(mean_squared_error(log_true, log_pred))

# Scorer for the search utilities; greater_is_better=False marks RMSLE as a
# loss, which is why the reported best_score_ is negated again further down
rmsle_scorer = make_scorer(rmsle_score, greater_is_better=False)

# Seed NumPy's global random state for reproducibility
np.random.seed(42)

# Plot defaults: stock matplotlib style, 'husl' palette, 12x6 figures, 10pt text
plt.style.use('default')
sns.set_palette('husl')
plt.rcParams.update({'figure.figsize': (12, 6), 'font.size': 10})

## 2. Data Loading and Initial Preprocessing

In [3]:
# Both interim CSVs are read the same way: the 'date' column is parsed to datetimes
read_options = dict(parse_dates=['date'])
train_df = pd.read_csv('../data/interim/traditional_final_train.csv', **read_options)
test_df = pd.read_csv('../data/interim/traditional_final_test.csv', **read_options)

In [4]:
# Identify the categorical (object-dtype) columns once and list them
categorical_cols = train_df.select_dtypes(include=['object']).columns
print("Categorical columns in the dataset:")
for name in categorical_cols:
    print(f"- {name}")

# Fit one OrdinalEncoder per categorical column on the training data and keep
# it, so the very same mapping can be applied to the test set afterwards.
# NOTE(review): OrdinalEncoder is used with its default settings; presumably
# the test set contains no categories unseen in training -- confirm.
encoders = {name: OrdinalEncoder() for name in categorical_cols}
for name, fitted in encoders.items():
    train_df[name + "_encoded"] = fitted.fit_transform(train_df[[name]])

Categorical columns in the dataset:
- family
- city
- state
- type


In [5]:
# Apply the encoders fitted on the training data to the test set (transform only, no refit)
for name, fitted_encoder in encoders.items():
    test_df[name + "_encoded"] = fitted_encoder.transform(test_df[[name]])

In [6]:
# Inspect the available columns, including the newly added *_encoded ones
train_df.columns

Index(['id', 'date', 'store_nbr', 'family', 'sales', 'onpromotion', 'year',
       'month', 'day', 'dayofweek', 'weekofyear', 'day_of_year', 'is_weekend',
       'is_month_start', 'is_month_end', 'is_quarter_start', 'is_quarter_end',
       'is_payday', 'days_since_payday', 'days_until_payday',
       'sales_rolling_mean_7', 'sales_rolling_std_7', 'sales_rolling_max_7',
       'sales_rolling_min_7', 'sales_rolling_mean_14', 'sales_rolling_std_14',
       'sales_rolling_max_14', 'sales_rolling_min_14', 'sales_rolling_mean_30',
       'sales_rolling_std_30', 'sales_rolling_max_30', 'sales_rolling_min_30',
       'sales_lag_1', 'sales_lag_7', 'sales_lag_14', 'sales_lag_30',
       'is_national_holiday', 'is_regional_holiday', 'is_local_holiday',
       'is_additional_holiday', 'is_working_day', 'is_event', 'is_bridge_day',
       'is_transferred_day', 'dcoilwtico', 'city', 'state', 'type', 'cluster',
       'transactions', 'family_encoded', 'city_encoded', 'state_encoded',
       'type_en

In [7]:
# Columns carried into the modelling frames. 'date' and 'family' are kept only
# so rows can be filtered per family; they are dropped again before fitting.
feature_cols = [
    # keys and promotion
    'date', 'store_nbr', 'family', 'onpromotion',
    # calendar
    'year', 'month', 'day', 'dayofweek', 'weekofyear', 'day_of_year',
    'is_weekend', 'is_month_start', 'is_month_end',
    'is_quarter_start', 'is_quarter_end',
    # payday
    'is_payday', 'days_since_payday', 'days_until_payday',
    # holidays and events
    'is_national_holiday', 'is_regional_holiday', 'is_local_holiday',
    'is_additional_holiday', 'is_working_day', 'is_event', 'is_bridge_day',
    'is_transferred_day',
    # external and store-level signals
    'dcoilwtico', 'cluster', 'transactions',
    # ordinal-encoded categoricals
    'family_encoded', 'city_encoded', 'state_encoded', 'type_encoded',
    # rolling sales statistics (7 / 14 / 30 day windows)
    'sales_rolling_mean_7', 'sales_rolling_std_7',
    'sales_rolling_max_7', 'sales_rolling_min_7',
    'sales_rolling_mean_14', 'sales_rolling_std_14',
    'sales_rolling_max_14', 'sales_rolling_min_14',
    'sales_rolling_mean_30', 'sales_rolling_std_30',
    'sales_rolling_max_30', 'sales_rolling_min_30',
    # lagged sales
    'sales_lag_1', 'sales_lag_7', 'sales_lag_14', 'sales_lag_30',
]

In [8]:
# Restrict both frames to the columns listed in feature_cols
train, test = (frame[feature_cols] for frame in (train_df, test_df))

## 3. Hyperparameter Tuning Setup

In [9]:
# Full LightGBM search space: three options per hyperparameter.
# NOTE(review): per the LightGBM docs, `subsample` only takes effect when
# `subsample_freq` is non-zero; the estimators built in this notebook do not
# set it -- confirm the `subsample` options actually change anything.
param_grid = dict(
    n_estimators=[500, 1000, 1500],
    learning_rate=[0.01, 0.05, 0.1],
    num_leaves=[31, 50, 100],
    max_depth=[-1, 10, 20],
    subsample=[0.8, 0.9, 1.0],
    colsample_bytree=[0.8, 0.9, 1.0],
    reg_alpha=[0, 0.1, 0.5],
    reg_lambda=[0, 0.1, 0.5],
)

# Reduced search space (two options per hyperparameter) for faster tuning;
# widen it for a more thorough search.
param_grid_small = dict(
    n_estimators=[500, 1000],
    learning_rate=[0.01, 0.05],
    num_leaves=[31, 50],
    max_depth=[-1, 10],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9],
    reg_alpha=[0, 0.1],
    reg_lambda=[0, 0.1],
)

print("Hyperparameter grid defined for LightGBM tuning")
small_grid_sizes = [len(options) for options in param_grid_small.values()]
print(f"Total combinations in small grid: {np.prod(small_grid_sizes)}")

Hyperparameter grid defined for LightGBM tuning
Total combinations in small grid: 256


In [None]:
# Exhaustive grid-search tuning for a single product family
def tune_hyperparameters(X_train, y_train, family_name, param_grid=param_grid_small, cv_folds=3):
    """
    Tune a LightGBM regressor with an exhaustive grid search.

    Every combination in ``param_grid`` is scored with ``rmsle_scorer`` on
    ``cv_folds`` TimeSeriesSplit folds.

    Parameters:
        X_train: feature matrix used for the search.
        y_train: target values matching ``X_train``.
        family_name: label used only in the progress messages.
        param_grid: mapping of parameter name to the list of values to try.
        cv_folds: number of TimeSeriesSplit splits.

    Returns:
        Tuple ``(best_estimator, best_params, best_rmsle)`` where
        ``best_rmsle`` is the sign-flipped ``best_score_`` of the search.
    """
    print(f"\n=== Tuning hyperparameters for {family_name} ===")

    search = GridSearchCV(
        estimator=lgb.LGBMRegressor(
            objective='regression',
            metric='rmse',
            random_state=42,
            force_row_wise=True,
            verbose=-1,  # keep LightGBM quiet
        ),
        param_grid=param_grid,
        scoring=rmsle_scorer,
        cv=TimeSeriesSplit(n_splits=cv_folds),
        n_jobs=-1,
        verbose=1,
    )

    print(f"Starting grid search with {np.prod([len(options) for options in param_grid.values()])} parameter combinations...")
    search.fit(X_train, y_train)

    # The scorer reports negated RMSLE, so flip the sign back for reporting
    best_rmsle = -search.best_score_
    print(f"Best RMSLE score: {best_rmsle:.4f}")
    print(f"Best parameters: {search.best_params_}")

    return search.best_estimator_, search.best_params_, best_rmsle

In [20]:
# Function to perform randomized hyperparameter search for a specific family
def Randomized_tune_hyperparameters(X_train, y_train, family_name, param_grid=param_grid_small, cv_folds=3, n_iter=20):
    """
    Perform hyperparameter tuning for LightGBM model using RandomizedSearchCV

    Parameters:
        X_train: feature matrix used for the search.
        y_train: target values matching ``X_train``.
        family_name: label used only in the progress messages.
        param_grid: mapping of parameter name to candidate values (lists) or
            distributions to sample from.
        cv_folds: number of TimeSeriesSplit splits.
        n_iter: number of parameter settings to sample (default 20, the value
            that used to be hard-coded).

    Returns:
        Tuple ``(best_estimator, best_params, best_rmsle)`` where
        ``best_rmsle`` is the sign-flipped ``best_score_`` of the search.
    """
    print(f"\n=== Randomized Tuning hyperparameters for {family_name} ===")

    # Create base model
    lgb_model = lgb.LGBMRegressor(
        objective='regression',
        metric='rmse',
        random_state=42,
        force_row_wise=True,
        verbose=-1  # Reduce verbosity
    )

    # Create time series split for cross-validation
    tscv = TimeSeriesSplit(n_splits=cv_folds)

    # Perform randomized search
    random_search = RandomizedSearchCV(
        estimator=lgb_model,
        param_distributions=param_grid,
        scoring=rmsle_scorer,
        cv=tscv,
        n_iter=n_iter,  # Number of sampled parameter settings
        n_jobs=-1,
        verbose=1,
        random_state=42
    )

    # Report the real size of the search. The old message printed
    # len(param_grid), i.e. the number of parameter *names*, as if it were the
    # number of combinations.
    try:
        total_combinations = int(np.prod([len(values) for values in param_grid.values()]))
    except TypeError:
        # Continuous distributions have no length, so no finite total exists
        total_combinations = None

    if total_combinations is None:
        print(f"Starting randomized search: sampling {n_iter} candidates over {len(param_grid)} parameters...")
    else:
        print(f"Starting randomized search: sampling {min(n_iter, total_combinations)} of {total_combinations} parameter combinations...")
    random_search.fit(X_train, y_train)

    # Print results (the scorer is negated, so flip the sign back)
    print(f"Best RMSLE score: {-random_search.best_score_:.4f}")
    print(f"Best parameters: {random_search.best_params_}")

    return random_search.best_estimator_, random_search.best_params_, -random_search.best_score_

## 4. Hyperparameter Tuning and Model Training

In [21]:
# Train LightGBM models with hyperparameter tuning for each family
families = train_df['family'].unique()
models = {}
best_params_dict = {}
tuning_results = {}
validation_results = {}  # hold-out RMSE / MAE / RMSLE per family (None if evaluation failed)

print(f"Starting hyperparameter tuning for {len(families)} families...")
print("This may take a while depending on the parameter grid size.\n")

for i, family in enumerate(families, 1):
    print(f"\n{'='*60}")
    print(f"Progress: {i}/{len(families)} - Training model for family: {family}")
    print(f"{'='*60}")

    # One boolean mask selects both the features and the target, so the two
    # selections cannot drift apart (train shares train_df's index).
    family_mask = train_df['family'] == family

    # Prepare features (remove date and family columns)
    X_family = train.loc[family_mask].drop(columns=['date', 'family'])
    y_family = train_df.loc[family_mask, 'sales']

    print(f"Family {family} data shape: {X_family.shape}")

    # Positional hold-out: first 80% of rows for tuning, last 20% for validation.
    # NOTE(review): this is only a chronological split if the rows are in date
    # order within each family -- confirm against the upstream preprocessing.
    split_index = int(len(X_family) * 0.8)
    X_tune, X_val = X_family.iloc[:split_index], X_family.iloc[split_index:]
    y_tune, y_val = y_family.iloc[:split_index], y_family.iloc[split_index:]

    print(f"Tuning set: {X_tune.shape}, Validation set: {X_val.shape}")

    # Perform hyperparameter tuning
    try:
        best_model, best_params, best_score = Randomized_tune_hyperparameters(
            X_tune, y_tune, family, param_grid_small, cv_folds=3
        )

        # Store results
        best_params_dict[family] = best_params
        tuning_results[family] = best_score
        print(f"✓ Successfully tuned model for {family}")

    except Exception as e:
        print(f"✗ Error tuning model for {family}: {str(e)}")
        # Fallback to default parameters
        best_model = lgb.LGBMRegressor(
            objective='regression',
            metric='rmse',
            n_estimators=1000,
            learning_rate=0.01,
            num_leaves=31,
            max_depth=-1,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
            force_row_wise=True
        )
        best_model.fit(X_tune, y_tune)
        best_params_dict[family] = 'default_params'
        tuning_results[family] = None

    # Evaluate whichever model was kept (tuned or fallback) on the hold-out
    # rows. This runs outside the tuning try-block so that an evaluation
    # problem can no longer throw away a successfully tuned model.
    try:
        y_val_pred = np.maximum(best_model.predict(X_val), 0)  # Ensure no negative predictions

        rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
        mae = mean_absolute_error(y_val, y_val_pred)
        rmsle = rmsle_score(y_val, y_val_pred)

        print(f"Validation - RMSE: {rmse:.4f}, MAE: {mae:.4f}, RMSLE: {rmsle:.4f}")
        validation_results[family] = {'rmse': rmse, 'mae': mae, 'rmsle': rmsle}
    except Exception as e:
        print(f"✗ Could not evaluate model for {family}: {str(e)}")
        validation_results[family] = None

    # Store the model (we'll retrain on full data later)
    models[family] = best_model

print(f"\n{'='*60}")
print("HYPERPARAMETER TUNING COMPLETED")
print(f"{'='*60}")

Starting hyperparameter tuning for 33 families...
This may take a while depending on the parameter grid size.


Progress: 1/33 - Training model for family: AUTOMOTIVE
Family AUTOMOTIVE data shape: (89316, 47)
Tuning set: (71452, 47), Validation set: (17864, 47)

=== Randomized Tuning hyperparameters for AUTOMOTIVE ===
Starting randomized search with 8 parameter combinations...
Fitting 3 folds for each of 20 candidates, totalling 60 fits


KeyboardInterrupt: 

In [None]:
# Tabulate the tuning outcome: one row per family with its best RMSLE and the
# three most informative parameters
print("\n=== HYPERPARAMETER TUNING RESULTS SUMMARY ===")
print("-" * 80)
print(f"{'Family':<25} {'Best RMSLE':<15} {'Key Parameters':<40}")
print("-" * 80)

for family in families:
    best_score = tuning_results.get(family, 'N/A')
    best_params = best_params_dict.get(family, 'N/A')

    # Numeric scores get four decimals; anything else (None, 'N/A') is shown as-is
    score_str = f"{best_score:.4f}" if isinstance(best_score, float) else str(best_score)

    if not isinstance(best_params, dict):
        params_str = str(best_params)
    else:
        # Show only key parameters for readability, truncated to 35 characters
        key_params = {
            name: value
            for name, value in best_params.items()
            if name in ('n_estimators', 'learning_rate', 'num_leaves')
        }
        params_text = str(key_params)
        params_str = params_text if len(params_text) <= 35 else params_text[:35] + '...'

    print(f"{family[:24]:<25} {score_str:<15} {params_str:<40}")

print("-" * 80)

# Aggregate over the families that have a numeric score
valid_scores = [value for value in tuning_results.values() if isinstance(value, float)]
if valid_scores:
    avg_rmsle = np.mean(valid_scores)
    print(f"\nAverage RMSLE across all families: {avg_rmsle:.4f}")

    def _score_or(default):
        """Build a sort key that replaces non-numeric scores with ``default``."""
        return lambda item: item[1] if isinstance(item[1], float) else default

    best_family = min(tuning_results.items(), key=_score_or(float('inf')))[0]
    worst_family = max(tuning_results.items(), key=_score_or(0))[0]
    print(f"Best performing family: {best_family}")
    print(f"Worst performing family: {worst_family}")

## 5. Final Model Training on Full Dataset

In [None]:
# Retrain models with best parameters on the full training dataset
print("\n=== RETRAINING MODELS WITH BEST PARAMETERS ON FULL DATASET ===")
print("This will train the final models using the entire training and validation data...\n")

final_models = {}
final_training_results = {}

# Configuration used whenever a family has no tuned parameters, either because
# tuning failed ('default_params') or because it never ran for that family
# (e.g. the tuning loop was interrupted).
fallback_params = dict(
    n_estimators=1000,
    learning_rate=0.01,
    num_leaves=31,
    max_depth=-1,
    subsample=0.8,
    colsample_bytree=0.8,
)

for i, family in enumerate(families, 1):
    print(f"Progress: {i}/{len(families)} - Final training for family: {family}")

    # One mask for features and target keeps the two selections aligned
    family_mask = train_df['family'] == family

    # Prepare features (use full dataset)
    X_family_full = train.loc[family_mask].drop(columns=['date', 'family'])
    y_family_full = train_df.loc[family_mask, 'sales']

    # Get best parameters for this family. A missing entry yields None rather
    # than {}; an empty dict used to slip through the isinstance check and
    # train with LightGBM's library defaults instead of the fallback settings.
    best_params = best_params_dict.get(family)
    has_tuned_params = isinstance(best_params, dict)
    if not has_tuned_params:
        print(f"  ! No tuned parameters for {family}; using default parameters")
    model_params = best_params if has_tuned_params else fallback_params

    final_model = lgb.LGBMRegressor(
        objective='regression',
        metric='rmse',
        random_state=42,
        force_row_wise=True,
        **model_params
    )

    # Train the final model on the full dataset
    final_model.fit(X_family_full, y_family_full)

    # Store the final model
    final_models[family] = final_model

    # Store training info
    final_training_results[family] = {
        'training_samples': len(X_family_full),
        'features': X_family_full.shape[1],
        'best_params': best_params if has_tuned_params else 'default_params'
    }

    print(f"  ✓ Trained on {len(X_family_full)} samples with {X_family_full.shape[1]} features")

print(f"\n✓ Final training completed for all {len(families)} families!")

# Update the models dictionary to use final models
models = final_models

In [None]:
# Save the trained models and metadata
# (json, os and datetime are imported in the notebook's first cell)
print("\n=== SAVING TRAINED MODELS ===")

# Create models directory if it doesn't exist
models_dir = '../models'
os.makedirs(models_dir, exist_ok=True)

# Create timestamp for model versioning (reused for the submission file name)
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

# Save individual models and metadata
saved_models_info = {}

for family in families:
    # Clean family name for filename
    clean_family_name = family.replace('/', '_').replace(' ', '_').replace('&', 'and')

    # Save model
    model_filename = f'lgb_model_{clean_family_name}_{timestamp}.pkl'
    model_path = os.path.join(models_dir, model_filename)
    joblib.dump(models[family], model_path)

    saved_models_info[family] = {
        'model_path': model_path,
        'best_params': best_params_dict.get(family, 'default'),
        'tuning_score': tuning_results.get(family, 'N/A'),
        'training_samples': final_training_results[family]['training_samples']
    }

    print(f"  ✓ Saved model for {family[:30]}: {model_filename}")

# Save models metadata
metadata_filename = f'models_metadata_{timestamp}.pkl'
metadata_path = os.path.join(models_dir, metadata_filename)
joblib.dump(saved_models_info, metadata_path)

print(f"\n✓ Models metadata saved: {metadata_filename}")
print(f"✓ All models saved in: {models_dir}")


def _to_native(value):
    """Convert NumPy scalars to plain Python numbers so json can serialise them."""
    if isinstance(value, np.integer):
        return int(value)
    if isinstance(value, np.floating):
        return float(value)
    return value


# Also save best parameters as JSON for easy inspection. The payload is built
# before the file is opened, so a conversion error cannot leave an empty file.
json_params = {
    family: (
        {name: _to_native(value) for name, value in params.items()}
        if isinstance(params, dict)
        else str(params)
    )
    for family, params in best_params_dict.items()
}

params_filename = f'best_parameters_{timestamp}.json'
params_path = os.path.join(models_dir, params_filename)
with open(params_path, 'w') as f:
    json.dump(json_params, f, indent=2)

print(f"✓ Best parameters saved as JSON: {params_filename}")

## 6. Generate Predictions for Test Set

In [None]:
# Generate predictions for test set
print("Generating predictions for test set...")

# Predictions and their ids are collected in step, family by family
test_predictions = []
test_ids = []

# Generate predictions for each family
for family in families:
    print(f"Predicting for family: {family}")

    # One mask picks both the feature rows and their ids, so the two lists
    # cannot get out of step (test shares test_df's index).
    family_mask = test_df['family'] == family
    if not family_mask.any():
        continue

    # Prepare features (remove date and family columns)
    X_test_family = test.loc[family_mask].drop(columns=['date', 'family'])

    # Predict with this family's model and ensure no negative predictions
    family_predictions = np.maximum(models[family].predict(X_test_family), 0)

    # Store predictions and corresponding IDs
    test_predictions.extend(family_predictions)
    test_ids.extend(test_df.loc[family_mask, 'id'].values)

    print(f"Generated {len(family_predictions)} predictions for {family}")

print(f"Total predictions generated: {len(test_predictions)}")

# Families that occur only in the test set have no model and are skipped by
# the loop above; report that instead of silently writing a short submission.
if len(test_predictions) != len(test_df):
    known_families = set(families)
    unmodelled = [name for name in test_df['family'].unique() if name not in known_families]
    print(
        f"WARNING: {len(test_df) - len(test_predictions)} test rows have no prediction "
        f"(families without a model: {unmodelled})"
    )

In [None]:
# Assemble the submission table and order it by id
submission_df = (
    pd.DataFrame({'id': test_ids, 'sales': test_predictions})
    .sort_values('id')
    .reset_index(drop=True)
)

print(f"Submission shape: {submission_df.shape}")

# Peek at both ends of the table
print("\nFirst few predictions:")
print(submission_df.head(10))

print("\nLast few predictions:")
print(submission_df.tail(10))

# Summary statistics of the predicted sales
predicted_sales = submission_df['sales']
print("\nPrediction statistics:")
print(f"Min: {predicted_sales.min():.4f}")
print(f"Max: {predicted_sales.max():.4f}")
print(f"Mean: {predicted_sales.mean():.4f}")
print(f"Median: {predicted_sales.median():.4f}")

In [None]:
# Save submission file (reuses the timestamp created when the models were saved)
submission_filename = f'../data/submission/traditional_submission_tuned_{timestamp}.csv'

# to_csv does not create missing folders, so make sure the target directory
# exists, mirroring what is done for the models directory.
os.makedirs(os.path.dirname(submission_filename), exist_ok=True)
submission_df.to_csv(submission_filename, index=False)

print(f"Submission file saved as: {submission_filename}")
print(f"File contains {len(submission_df)} predictions")

# Verify the submission file by reading it back from disk
print("\nVerifying submission file...")
verify_df = pd.read_csv(submission_filename)
print(f"Loaded file shape: {verify_df.shape}")
print(f"Columns: {list(verify_df.columns)}")
print(f"No missing values: {verify_df.isnull().sum().sum() == 0}")
print(f"All IDs unique: {verify_df['id'].is_unique}")
# A short file means some test rows never received a prediction
print(f"Row count matches test set: {len(verify_df) == len(test_df)}")