## 1. Import Libraries and Setup

In [12]:
# Core libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import timedelta
import warnings
warnings.filterwarnings('ignore')

# Machine Learning libraries
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error
import lightgbm as lgb

# Define custom RMSLE function to handle zero values
def rmsle_score(y_true, y_pred):
    """Calculate RMSLE with handling for zero and negative values"""
    # Add small epsilon to avoid log(0) and ensure positive values
    epsilon = 1e-15
    y_true_log = np.log1p(np.maximum(y_true, epsilon))
    y_pred_log = np.log1p(np.maximum(y_pred, epsilon))
    return np.sqrt(mean_squared_error(y_true_log, y_pred_log))

# Set random seed for reproducibility
np.random.seed(42)

# Visualization settings
plt.style.use('default')
sns.set_palette('husl')
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['font.size'] = 10

## 2. Data Loading and Initial Preprocessing

In [2]:
train_df = pd.read_csv('../data/interim/traditional_final_train.csv', parse_dates=['date'])
test_df = pd.read_csv('../data/interim/traditional_final_test.csv', parse_dates=['date'])

In [3]:
# print categorial columns
print("Categorical columns in the dataset:")
for col in train_df.select_dtypes(include=['object']).columns:
    print(f"- {col}")
    
# Encode categorical features
categorical_cols = train_df.select_dtypes(include=['object']).columns
encoders = {}
for col in categorical_cols:
    encoder = OrdinalEncoder()
    train_df[col+"_encoded"] = encoder.fit_transform(train_df[[col]])
    encoders[col] = encoder

Categorical columns in the dataset:
- family
- city
- state
- type


In [4]:
for col in categorical_cols:
    test_df[col+"_encoded"] = encoders[col].transform(test_df[[col]])

In [5]:
train_df.columns

Index(['id', 'date', 'store_nbr', 'family', 'sales', 'onpromotion', 'year',
       'month', 'day', 'dayofweek', 'weekofyear', 'day_of_year', 'is_weekend',
       'is_month_start', 'is_month_end', 'is_quarter_start', 'is_quarter_end',
       'is_payday', 'days_since_payday', 'days_until_payday',
       'sales_rolling_mean_7', 'sales_rolling_std_7', 'sales_rolling_max_7',
       'sales_rolling_min_7', 'sales_rolling_mean_14', 'sales_rolling_std_14',
       'sales_rolling_max_14', 'sales_rolling_min_14', 'sales_rolling_mean_30',
       'sales_rolling_std_30', 'sales_rolling_max_30', 'sales_rolling_min_30',
       'sales_lag_1', 'sales_lag_7', 'sales_lag_14', 'sales_lag_30',
       'is_national_holiday', 'is_regional_holiday', 'is_local_holiday',
       'is_additional_holiday', 'is_working_day', 'is_event', 'is_bridge_day',
       'is_transferred_day', 'dcoilwtico', 'city', 'state', 'type', 'cluster',
       'transactions', 'family_encoded', 'city_encoded', 'state_encoded',
       'type_en

In [17]:
feature_cols = ['date', 'store_nbr',"family",'onpromotion', 'year',
       'month', 'day', 'dayofweek', 'weekofyear', 'day_of_year', 'is_weekend',
       'is_month_start', 'is_month_end', 'is_quarter_start', 'is_quarter_end',
       'is_payday', 'days_since_payday', 'days_until_payday',
       'is_national_holiday', 'is_regional_holiday', 'is_local_holiday',
       'is_additional_holiday', 'is_working_day', 'is_event', 'is_bridge_day',
       'is_transferred_day', 'dcoilwtico', 'cluster',
       'transactions', 'family_encoded', 'city_encoded', 'state_encoded',
       'type_encoded','sales_rolling_mean_7', 'sales_rolling_std_7', 'sales_rolling_max_7',
       'sales_rolling_min_7', 'sales_rolling_mean_14', 'sales_rolling_std_14',
       'sales_rolling_max_14', 'sales_rolling_min_14', 'sales_rolling_mean_30',
       'sales_rolling_std_30', 'sales_rolling_max_30', 'sales_rolling_min_30',
       'sales_lag_1', 'sales_lag_7', 'sales_lag_14', 'sales_lag_30',
]

In [18]:
train = train_df[feature_cols]
test = test_df[feature_cols]

## 6. Model Training and Evaluation

In [19]:
# train a LightGBM model for each family
families = train_df['family'].unique()
models = {}
for family in families:
    print(f"Training model for family: {family}")
    # Filter the data for the current family
    family_data = train[train['family'] == family]
    # Define features and target
    X_family_train = family_data.drop(columns=['family'])
    y_family_train = train_df[train_df['family'] == family]['sales']
    # split the data into training and validation sets by 0.8 ratio
    split_index = int(len(X_family_train) * 0.8)
    X_family_train, X_family_val = X_family_train[:split_index], X_family_train[split_index:]
    y_family_train, y_family_val = y_family_train[:split_index], y_family_train[split_index:]
    # Print the shape of the training and validation sets
    X_family_train.drop(columns=['date'], inplace=True)
    X_family_val.drop(columns=['date'], inplace=True)
    # Define the model
    lgb_model = lgb.LGBMRegressor(
        objective='regression',
        metric='rmse',  # Changed from 'rmsle' to 'rmse' to avoid issues
        n_estimators=1000,
        learning_rate=0.01,
        num_leaves=31,
        max_depth=-1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        force_row_wise=True
    )
    
    # Fit the model
    lgb_model.fit(X_family_train, y_family_train)
    
    # Evaluate on validation set
    y_family_pred = lgb_model.predict(X_family_val)
    # Ensure no negative predictions
    y_family_pred = np.maximum(y_family_pred, 0)

    # Calculate evaluation metrics
    rmse = np.sqrt(mean_squared_error(y_family_val, y_family_pred))
    mae = mean_absolute_error(y_family_val, y_family_pred)
    rmsle = rmsle_score(y_family_val, y_family_pred)  # Use custom RMSLE function

    print(f"Family {family} - RMSE: {rmse:.4f}, MAE: {mae:.4f}, RMSLE: {rmsle:.4f}")
    
    # Store the model
    models[family] = lgb_model

Training model for family: AUTOMOTIVE
[LightGBM] [Info] Total Bins 2918
[LightGBM] [Info] Number of data points in the train set: 71452, number of used features: 46
[LightGBM] [Info] Start training from score 4.978027
Family AUTOMOTIVE - RMSE: 4.8568, MAE: 3.3605, RMSLE: 0.4110
Training model for family: BABY CARE
[LightGBM] [Info] Total Bins 2189
[LightGBM] [Info] Number of data points in the train set: 71452, number of used features: 45
[LightGBM] [Info] Start training from score 0.136119
Family BABY CARE - RMSE: 0.1705, MAE: 0.0191, RMSLE: 0.0752
Training model for family: BEAUTY
[LightGBM] [Info] Total Bins 2761
[LightGBM] [Info] Number of data points in the train set: 71452, number of used features: 46
[LightGBM] [Info] Start training from score 2.816716
Family BEAUTY - RMSE: 3.3204, MAE: 2.0660, RMSLE: 0.3728
Training model for family: BEVERAGES
[LightGBM] [Info] Total Bins 5214
[LightGBM] [Info] Number of data points in the train set: 71452, number of used features: 46
[LightGBM

## 10. Generate Predictions for Test Set

In [20]:
# Generate predictions for test set
print("Generating predictions for test set...")

# Initialize predictions array
test_predictions = []
test_ids = []

# Generate predictions for each family
for family in families:
    print(f"Predicting for family: {family}")
    
    # Filter test data for current family
    family_test_data = test[test['family'] == family].copy()
    
    if len(family_test_data) > 0:
        # Prepare features (remove date and family columns)
        X_test_family = family_test_data.drop(columns=['date', 'family'])
        
        # Get the trained model for this family
        model = models[family]
        
        # Generate predictions
        family_predictions = model.predict(X_test_family)
        
        # Ensure no negative predictions
        family_predictions = np.maximum(family_predictions, 0)
        
        # Store predictions and corresponding IDs
        test_predictions.extend(family_predictions)
        
        # Get corresponding IDs from test_df
        family_ids = test_df[test_df['family'] == family]['id'].values
        test_ids.extend(family_ids)
        
        print(f"Generated {len(family_predictions)} predictions for {family}")

print(f"Total predictions generated: {len(test_predictions)}")

Generating predictions for test set...
Predicting for family: AUTOMOTIVE
Generated 864 predictions for AUTOMOTIVE
Predicting for family: BABY CARE
Generated 864 predictions for BABY CARE
Predicting for family: BEAUTY
Generated 864 predictions for BEAUTY
Predicting for family: BEVERAGES
Generated 864 predictions for BEVERAGES
Predicting for family: BOOKS
Generated 864 predictions for BOOKS
Predicting for family: BREAD/BAKERY
Generated 864 predictions for BREAD/BAKERY
Predicting for family: CELEBRATION
Generated 864 predictions for CELEBRATION
Predicting for family: CLEANING
Generated 864 predictions for CLEANING
Predicting for family: DAIRY
Generated 864 predictions for DAIRY
Predicting for family: DELI
Generated 864 predictions for DELI
Predicting for family: EGGS
Generated 864 predictions for EGGS
Predicting for family: FROZEN FOODS
Generated 864 predictions for FROZEN FOODS
Predicting for family: GROCERY I
Generated 864 predictions for GROCERY I
Predicting for family: GROCERY II
Gene

In [21]:
# Create submission DataFrame
submission_df = pd.DataFrame({
    'id': test_ids,
    'sales': test_predictions
})

# Sort by id to ensure proper order
submission_df = submission_df.sort_values('id').reset_index(drop=True)

print(f"Submission shape: {submission_df.shape}")
print("\nFirst few predictions:")
print(submission_df.head(10))

print("\nLast few predictions:")
print(submission_df.tail(10))

print(f"\nPrediction statistics:")
print(f"Min: {submission_df['sales'].min():.4f}")
print(f"Max: {submission_df['sales'].max():.4f}")
print(f"Mean: {submission_df['sales'].mean():.4f}")
print(f"Median: {submission_df['sales'].median():.4f}")

Submission shape: (28512, 2)

First few predictions:
        id        sales
0  3000888     3.760310
1  3000889     0.001289
2  3000890     5.263258
3  3000891  1953.355253
4  3000892     0.002907
5  3000893   374.237450
6  3000894     8.156067
7  3000895   685.423848
8  3000896   755.716024
9  3000897   133.769381

Last few predictions:
            id        sales
28502  3029390    10.052362
28503  3029391   465.271125
28504  3029392   466.896927
28505  3029393     5.640648
28506  3029394     5.868682
28507  3029395   440.849554
28508  3029396   147.499217
28509  3029397  2362.269610
28510  3029398   116.053000
28511  3029399    14.159561

Prediction statistics:
Min: 0.0000
Max: 11710.2927
Mean: 428.7287
Median: 27.9316


In [22]:
# Save submission file
submission_filename = 'traditional_submission.csv'
submission_df.to_csv(submission_filename, index=False)

print(f"Submission file saved as: {submission_filename}")
print(f"File contains {len(submission_df)} predictions")

# Verify the submission file
print("\nVerifying submission file...")
verify_df = pd.read_csv(submission_filename)
print(f"Loaded file shape: {verify_df.shape}")
print(f"Columns: {list(verify_df.columns)}")
print(f"No missing values: {verify_df.isnull().sum().sum() == 0}")
print(f"All IDs unique: {len(verify_df['id'].unique()) == len(verify_df)}")

Submission file saved as: traditional_submission.csv
File contains 28512 predictions

Verifying submission file...
Loaded file shape: (28512, 2)
Columns: ['id', 'sales']
No missing values: True
All IDs unique: True
