<a href="https://colab.research.google.com/github/eghib22/Store-Sales-Forecasting/blob/main/model_experiment_LightGBM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install kaggle
from google.colab import drive
drive.mount('/content/drive')

!mkdir ~/.kaggle
from google.colab import files
files.upload()
!mv "kaggle.json" ~/.kaggle/kaggle.json
!chmod 600 ~/.kaggle/kaggle.json
!ls -l ~/.kaggle/

!kaggle competitions download -c walmart-recruiting-store-sales-forecasting
!unzip walmart-recruiting-store-sales-forecasting
!unzip '*.csv.zip'
!unzip '*.csv.zip'
!pip install mlflow dagshub lightgbm scikit-learn joblib


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
mkdir: cannot create directory ‘/root/.kaggle’: File exists


Saving kaggle.json to kaggle.json
total 4
-rw------- 1 root root 74 Jul 13 16:10 kaggle.json
walmart-recruiting-store-sales-forecasting.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  walmart-recruiting-store-sales-forecasting.zip
replace features.csv.zip? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: features.csv.zip        
replace sampleSubmission.csv.zip? [y]es, [n]o, [A]ll, [N]one, [r]ename: yyy
  inflating: sampleSubmission.csv.zip  
replace stores.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
y
  inflating: stores.csv              
replace test.csv.zip? [y]es, [n]o, [A]ll, [N]one, [r]ename:   inflating: test.csv.zip            
replace train.csv.zip? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: train.csv.zip           
Archive:  sampleSubmission.csv.zip
replace sampleSubmission.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: sampleSubmission.csv    

Archive:  train.csv.zip
replace train.csv? [y]es, [n]o, [

In [7]:
import dagshub
dagshub.init(repo_owner='eghib22', repo_name='Store-Sales-Forecasting', mlflow=True)

import mlflow
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
import joblib


In [8]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
features = pd.read_csv('features.csv')
stores = pd.read_csv('stores.csv')
sample_submission = pd.read_csv('sampleSubmission.csv')

train['Date'] = pd.to_datetime(train['Date'])
test['Date'] = pd.to_datetime(test['Date'])
features['Date'] = pd.to_datetime(features['Date'])

train_merged = pd.merge(train, features, on=['Store', 'Date'], how='left')
train_merged = pd.merge(train_merged, stores, on='Store', how='left')

test_merged = pd.merge(test, features, on=['Store', 'Date'], how='left')
test_merged = pd.merge(test_merged, stores, on='Store', how='left')

train_merged['Date'] = pd.to_datetime(train_merged['Date'])
train_data = train_merged[train_merged['Date'] < '2012-01-01']
val_data = train_merged[(train_merged['Date'] >= '2012-01-01') & (train_merged['Date'] < '2012-07-01')]


In [9]:
def preprocess(df):
    type_map = {'A': 0, 'B': 1, 'C': 2}
    df = df.copy()
    df['Type'] = df['Type'].map(type_map)
    if 'IsHoliday_x' in df.columns:
        df['IsHoliday'] = df['IsHoliday_x'].astype(int)
        df = df.drop(columns=['IsHoliday_x', 'IsHoliday_y'])
    elif 'IsHoliday' in df.columns:
        df['IsHoliday'] = df['IsHoliday'].astype(int)
    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
    df['Week'] = df['Date'].dt.isocalendar().week
    df['Day'] = df['Date'].dt.day
    markdown_cols = ['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']
    for col in markdown_cols:
        if col in df.columns:
            df[col] = df[col].fillna(0)
    df = df.drop(columns=['Date'])
    return df


In [13]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import joblib
import mlflow
import mlflow.sklearn
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

def enhanced_preprocess_simple(df):
    """Simple enhanced preprocessing that works with your existing data"""
    type_map = {'A': 0, 'B': 1, 'C': 2}
    df = df.copy()

    # Handle Type column
    if 'Type' in df.columns:
        df['Type'] = df['Type'].map(type_map)

    # Handle holiday columns - check what exists
    holiday_col = None
    if 'IsHoliday_x' in df.columns:
        df['IsHoliday'] = df['IsHoliday_x'].astype(int)
        df = df.drop(columns=['IsHoliday_x'], errors='ignore')
        if 'IsHoliday_y' in df.columns:
            df = df.drop(columns=['IsHoliday_y'], errors='ignore')
        holiday_col = 'IsHoliday'
    elif 'IsHoliday' in df.columns:
        df['IsHoliday'] = df['IsHoliday'].astype(int)
        holiday_col = 'IsHoliday'
    else:
        df['IsHoliday'] = 0  # Default to no holiday
        holiday_col = 'IsHoliday'

    # Enhanced time features
    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
    df['Week'] = df['Date'].dt.isocalendar().week
    df['Day'] = df['Date'].dt.day
    df['Quarter'] = df['Date'].dt.quarter
    df['DayOfWeek'] = df['Date'].dt.dayofweek
    df['DayOfYear'] = df['Date'].dt.dayofyear

    # Cyclical encoding for better seasonal patterns
    df['Month_sin'] = np.sin(2 * np.pi * df['Month'] / 12)
    df['Month_cos'] = np.cos(2 * np.pi * df['Month'] / 12)
    df['Week_sin'] = np.sin(2 * np.pi * df['Week'] / 52)
    df['Week_cos'] = np.cos(2 * np.pi * df['Week'] / 52)
    df['DayOfWeek_sin'] = np.sin(2 * np.pi * df['DayOfWeek'] / 7)
    df['DayOfWeek_cos'] = np.cos(2 * np.pi * df['DayOfWeek'] / 7)

    # Handle markdowns with better features
    markdown_cols = ['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']
    for col in markdown_cols:
        if col in df.columns:
            df[col] = df[col].fillna(0)
            # Create binary indicators
            df[f'{col}_Present'] = (df[col] > 0).astype(int)

    # Create markdown summary features
    if any(col in df.columns for col in markdown_cols):
        existing_markdown_cols = [col for col in markdown_cols if col in df.columns]
        df['Total_MarkDown'] = df[existing_markdown_cols].sum(axis=1)
        df['MarkDown_Count'] = (df[existing_markdown_cols] > 0).sum(axis=1)
        df['Has_MarkDown'] = (df['Total_MarkDown'] > 0).astype(int)

    # Economic indicators interactions
    if 'CPI' in df.columns and 'Unemployment' in df.columns:
        df['CPI_Unemployment_Ratio'] = df['CPI'] / (df['Unemployment'] + 0.01)
        df['CPI_Normalized'] = (df['CPI'] - df['CPI'].mean()) / df['CPI'].std()
        df['Unemployment_Normalized'] = (df['Unemployment'] - df['Unemployment'].mean()) / df['Unemployment'].std()

    # Store size features
    if 'Size' in df.columns:
        df['Size_log'] = np.log1p(df['Size'])
        df['Size_Normalized'] = (df['Size'] - df['Size'].mean()) / df['Size'].std()

    # Temperature features if available
    if 'Temperature' in df.columns:
        df['Temperature_squared'] = df['Temperature'] ** 2
        df['Temperature_Normalized'] = (df['Temperature'] - df['Temperature'].mean()) / df['Temperature'].std()

    # Fuel price features
    if 'Fuel_Price' in df.columns:
        df['Fuel_Price_log'] = np.log1p(df['Fuel_Price'])
        df['Fuel_Price_Normalized'] = (df['Fuel_Price'] - df['Fuel_Price'].mean()) / df['Fuel_Price'].std()

    # Store-Department interaction
    if 'Store' in df.columns and 'Dept' in df.columns:
        df['Store_Dept_Interaction'] = df['Store'] * 1000 + df['Dept']

    # Special dates (Christmas, Thanksgiving periods)
    df['Is_Christmas_Period'] = ((df['Month'] == 12) & (df['Day'] >= 15)).astype(int)
    df['Is_Thanksgiving_Period'] = ((df['Month'] == 11) & (df['Day'] >= 20)).astype(int)
    df['Is_Back_To_School'] = ((df['Month'] == 8) | ((df['Month'] == 9) & (df['Day'] <= 15))).astype(int)

    # Remove date column
    df = df.drop(columns=['Date'])

    # Fill any remaining NaN values
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

    return df

def create_lag_features(df, target_col='Weekly_Sales'):
    """Create lag features for time series"""
    if target_col not in df.columns:
        return df

    df = df.copy()
    df = df.sort_values(['Store', 'Dept', 'Date'])

    # Create lag features
    for lag in [1, 2, 4, 8]:
        df[f'Sales_Lag_{lag}'] = df.groupby(['Store', 'Dept'])[target_col].shift(lag)

    # Rolling statistics
    for window in [4, 8, 12]:
        df[f'Sales_Mean_{window}'] = df.groupby(['Store', 'Dept'])[target_col].transform(
            lambda x: x.rolling(window=window, min_periods=1).mean().shift(1)
        )
        df[f'Sales_Std_{window}'] = df.groupby(['Store', 'Dept'])[target_col].transform(
            lambda x: x.rolling(window=window, min_periods=1).std().shift(1)
        )

    # Fill NaN values created by lag features
    lag_cols = [col for col in df.columns if 'Lag_' in col or 'Mean_' in col or 'Std_' in col]
    for col in lag_cols:
        df[col] = df[col].fillna(df[col].median())

    return df

def weighted_mean_absolute_error(y_true, y_pred, weights):
    """Calculate WMAE"""
    return np.sum(weights * np.abs(y_true - y_pred)) / np.sum(weights)

def train_improved_model(X_train, y_train, X_val, y_val, weights_val):
    """Train improved model with better regularization"""

    # Model with improved regularization
    model = lgb.LGBMRegressor(
        random_state=42,
        n_estimators=2000,
        learning_rate=0.015,
        num_leaves=70,
        max_depth=14,
        min_child_samples=20,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0.1,
        reg_lambda=0.1,
        min_split_gain=0.1,
        min_child_weight=0.001,
        subsample_freq=1,
        objective='regression',
        metric='rmse',
        boosting_type='gbdt',
        verbose=-1
    )

    # Fit with early stopping - fallback version
    try:
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)]
        )
    except Exception as e:
        print(f"Early stopping failed: {e}")
        print("Training without early stopping...")
        model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_val)

    # Calculate metrics
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    wmae = weighted_mean_absolute_error(y_val, y_pred, weights_val)

    print(f"Validation RMSE: {rmse:.2f}")
    print(f"Validation WMAE: {wmae:.2f}")

    return model, y_pred, wmae

# Main execution - Simple version
def run_simple_optimization():
    """Run the simple optimization"""

    print("=== Simple Walmart Sales Optimization ===")

    # Use your existing data splits
    # Assuming train_data and val_data are already defined from your original code

    # Create lag features (this might take a moment)
    print("Creating lag features...")
    train_with_lags = create_lag_features(train_data)
    val_with_lags = create_lag_features(val_data)

    # Enhanced preprocessing
    print("Enhanced preprocessing...")
    X_train = enhanced_preprocess_simple(train_with_lags.drop(columns=['Weekly_Sales']))
    y_train = train_with_lags['Weekly_Sales']

    val_processed = enhanced_preprocess_simple(val_with_lags)
    X_val = val_processed.drop(columns=['Weekly_Sales'])
    y_val = val_processed['Weekly_Sales']

    # Create weights
    if 'IsHoliday' in val_processed.columns:
        weights_val = val_processed['IsHoliday'].apply(lambda x: 5 if x else 1)
    else:
        weights_val = pd.Series([1] * len(val_processed))

    print(f"Training set shape: {X_train.shape}")
    print(f"Validation set shape: {X_val.shape}")
    print(f"Number of features: {X_train.shape[1]}")

    # Train model
    print("Training improved model...")
    model, y_pred, wmae = train_improved_model(X_train, y_train, X_val, y_val, weights_val)

    # Feature importance
    feature_importance = pd.DataFrame({
        'feature': X_train.columns,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)

    print("\nTop 10 Most Important Features:")
    print(feature_importance.head(10))

    # MLflow logging
    mlflow.set_experiment("LightGBM_Training")

    with mlflow.start_run(run_name="optimized_lightGBM"):
        mlflow.log_param("approach", "enhanced_features_simple")
        mlflow.log_param("n_estimators", 2000)
        mlflow.log_param("learning_rate", 0.015)
        mlflow.log_param("num_leaves", 70)
        mlflow.log_param("max_depth", 14)
        mlflow.log_param("regularization", "alpha_0.1_lambda_0.1")
        mlflow.log_param("features_count", X_train.shape[1])
        mlflow.log_metric("Validation_WMAE", wmae)

        # Save model
        joblib.dump(model, "improved_lgbm_model.pkl")
        mlflow.log_artifact("improved_lgbm_model.pkl")

    print(f"\nFinal WMAE: {wmae:.2f}")

    if wmae < 2682:
        print("🎉 Improvement achieved!")
    else:
        print("Still need more optimization. Try ensemble methods or different model architectures.")

    return model, wmae, feature_importance

# Run the optimization
if __name__ == "__main__":
    # This assumes your original data loading and splitting code has been run
    model, wmae, feature_importance = run_simple_optimization()

=== Simple Walmart Sales Optimization ===
Creating lag features...
Enhanced preprocessing...
Training set shape: (294132, 58)
Validation set shape: (77110, 58)
Number of features: 58
Training improved model...
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1793]	valid_0's rmse: 4989.84
Validation RMSE: 4989.84
Validation WMAE: 1877.43

Top 10 Most Important Features:
         feature  importance
1           Dept       15252
13   Sales_Lag_1        7460
27           Day        6706
18   Sales_Std_4        5381
22  Sales_Std_12        5111
16   Sales_Lag_8        5071
2    Temperature        4780
14   Sales_Lag_2        4768
26          Week        4715
20   Sales_Std_8        4649
🏃 View run optimized_lightGBM at: https://dagshub.com/eghib22/Store-Sales-Forecasting.mlflow/#/experiments/3/runs/3703e8c4faad40a38962a866af316ff6
🧪 View experiment at: https://dagshub.com/eghib22/Store-Sales-Forecasting.mlflow/#/experiments/3

Final WMAE: 1877