<a href="https://colab.research.google.com/github/eghib22/Store-Sales-Forecasting/blob/main/model_experiment_LightGBM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install kaggle
from google.colab import drive
drive.mount('/content/drive')

!mkdir ~/.kaggle
from google.colab import files
files.upload()
!mv "kaggle.json" ~/.kaggle/kaggle.json
!chmod 600 ~/.kaggle/kaggle.json
!ls -l ~/.kaggle/

!kaggle competitions download -c walmart-recruiting-store-sales-forecasting
!unzip walmart-recruiting-store-sales-forecasting
!unzip '*.csv.zip'
!unzip '*.csv.zip'
!pip install mlflow dagshub lightgbm scikit-learn joblib


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
mkdir: cannot create directory ‘/root/.kaggle’: File exists


Saving kaggle.json to kaggle.json
total 4
-rw------- 1 root root 71 Jul  7 13:41 kaggle.json
walmart-recruiting-store-sales-forecasting.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  walmart-recruiting-store-sales-forecasting.zip
replace features.csv.zip? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: features.csv.zip        
replace sampleSubmission.csv.zip? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: sampleSubmission.csv.zip  
replace stores.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: stores.csv              
replace test.csv.zip? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: test.csv.zip            
replace train.csv.zip? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: train.csv.zip           
Archive:  test.csv.zip
replace test.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: test.csv                

Archive:  features.csv.zip
replace features.csv? [y]es, [n]o, [A]ll, [N]one, [r]ena

In [2]:
# Initialise the DagsHub integration for this repository with MLflow enabled.
# NOTE(review): presumably this points MLflow's tracking URI at the DagsHub
# repo (the run links printed after training are on dagshub.com) — confirm.
import dagshub
dagshub.init(repo_owner='eghib22', repo_name='Store-Sales-Forecasting', mlflow=True)

# Experiment tracking, data handling, modelling and persistence libraries
# used by the cells below.
import mlflow
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
import joblib


In [3]:
# Boundaries of the time-based split. Rows dated on or after VAL_END end up in
# neither the training nor the validation frame.
TRAIN_END = pd.Timestamp('2012-01-01')
VAL_END = pd.Timestamp('2012-07-01')


def merge_store_context(sales, features, stores):
    """Left-join per-(Store, Date) features and per-Store metadata onto `sales`.

    Parameters
    ----------
    sales : pd.DataFrame
        Frame with at least `Store` and `Date` columns (train or test rows).
    features : pd.DataFrame
        Frame keyed by `Store` and `Date`.
    stores : pd.DataFrame
        Frame keyed by `Store`.

    Returns
    -------
    pd.DataFrame
        One row per row of `sales`, with the feature and store columns added.
        Columns present in both `sales` and `features` (other than the join
        keys) receive pandas' default `_x` / `_y` suffixes.
    """
    with_features = pd.merge(sales, features, on=['Store', 'Date'], how='left')
    return pd.merge(with_features, stores, on='Store', how='left')


# Raw competition tables.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
features = pd.read_csv('features.csv')
stores = pd.read_csv('stores.csv')
sample_submission = pd.read_csv('sampleSubmission.csv')

# Parse the join key once, before merging, so every table shares the same
# datetime dtype; the merged frames inherit it and need no second conversion.
train['Date'] = pd.to_datetime(train['Date'])
test['Date'] = pd.to_datetime(test['Date'])
features['Date'] = pd.to_datetime(features['Date'])

train_merged = merge_store_context(train, features, stores)
test_merged = merge_store_context(test, features, stores)

# Chronological split: fit on everything before TRAIN_END, validate on the
# half-open window [TRAIN_END, VAL_END).
train_data = train_merged[train_merged['Date'] < TRAIN_END]
val_data = train_merged[(train_merged['Date'] >= TRAIN_END) & (train_merged['Date'] < VAL_END)]


In [4]:
def preprocess(df):
    """Turn a merged sales/features/stores frame into numeric model features.

    The input is copied, so the caller's frame is never modified.

    Steps:
      * `Type` is mapped from the letters A/B/C to 0/1/2 (any other value
        becomes NaN).
      * The holiday flag is normalised to a single integer `IsHoliday`
        column. When the merge-suffixed `IsHoliday_x` exists it is used and
        the suffixed columns are dropped; otherwise an existing `IsHoliday`
        column is cast to int.
      * `Date` is expanded into `Year`, `Month`, `Week` (ISO week) and `Day`,
        then dropped.
      * Missing values in any present `MarkDown1`..`MarkDown5` column are
        replaced with 0.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a `Type` column and a datetime `Date` column.

    Returns
    -------
    pd.DataFrame
        A new frame with the transformations above applied.
    """
    type_map = {'A': 0, 'B': 1, 'C': 2}
    df = df.copy()
    df['Type'] = df['Type'].map(type_map)
    if 'IsHoliday_x' in df.columns:
        df['IsHoliday'] = df['IsHoliday_x'].astype(int)
        # `IsHoliday_y` only exists when both merged tables carried the flag;
        # errors='ignore' avoids a KeyError when just `IsHoliday_x` is present.
        df = df.drop(columns=['IsHoliday_x', 'IsHoliday_y'], errors='ignore')
    elif 'IsHoliday' in df.columns:
        df['IsHoliday'] = df['IsHoliday'].astype(int)
    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
    # isocalendar() returns the nullable `UInt32` extension dtype. Cast it to a
    # plain NumPy dtype like the other date parts; fall back to float only if
    # missing dates produced <NA> weeks, which an integer column cannot hold.
    iso_week = df['Date'].dt.isocalendar().week
    df['Week'] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')
    df['Day'] = df['Date'].dt.day
    markdown_cols = ['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']
    for col in markdown_cols:
        if col in df.columns:
            df[col] = df[col].fillna(0)
    df = df.drop(columns=['Date'])
    return df


In [9]:
import joblib
import mlflow
import mlflow.sklearn

mlflow.set_experiment("LightGBM_Training")

# Single source of truth for the hyperparameters: the same dict configures the
# model and is logged to MLflow, so the two can never drift apart.
lgbm_params = {
    "n_estimators": 1500,
    "learning_rate": 0.02,
    "num_leaves": 50,
    "max_depth": 12,
}

# The `with` block ends the MLflow run on exit, so no explicit end_run() call
# is needed afterwards.
with mlflow.start_run(run_name="LightGBM_Final_Pipeline"):
    # Raw (un-preprocessed) feature frames: these are what the pipeline consumes.
    train_features = train_data.drop(columns=['Weekly_Sales'])
    y_train = train_data['Weekly_Sales']
    val_features = val_data.drop(columns=['Weekly_Sales'])

    # Preprocessed views, kept under their original names so any later cell
    # that reads them keeps working.
    X_train = preprocess(train_features)
    val_data_processed = preprocess(val_data)
    X_val = val_data_processed.drop(columns=['Weekly_Sales'])
    y_val = val_data_processed['Weekly_Sales']
    # WMAE weights: holiday weeks count five times as much as regular weeks.
    weights_val = np.where(val_data_processed['IsHoliday'].astype(bool), 5, 1)

    model = lgb.LGBMRegressor(random_state=42, **lgbm_params)
    pipeline = Pipeline([
        ('preprocess', FunctionTransformer(preprocess)),
        ('model', model)
    ])

    # Fit and evaluate the pipeline itself on the raw frames, rather than the
    # bare model on pre-processed ones. This way the object that is saved and
    # logged below is a fitted pipeline, and the reported metrics describe
    # exactly that artifact.
    pipeline.fit(train_features, y_train)
    y_pred = pipeline.predict(val_features)

    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    wmae = np.sum(weights_val * np.abs(y_val - y_pred)) / np.sum(weights_val)

    print("Validation RMSE:", rmse)
    print("Validation WMAE:", wmae)

    mlflow.log_params(lgbm_params)
    mlflow.log_metric("Validation_RMSE", rmse)
    mlflow.log_metric("Validation_WMAE", wmae)

    # NOTE: `preprocess` is pickled by reference, not by value. Whatever loads
    # this file must define or import a function with the same name before
    # calling joblib.load.
    joblib.dump(pipeline, "lgbm_pipeline.pkl")
    mlflow.log_artifact("lgbm_pipeline.pkl", artifact_path="LightGBM_Pipeline")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.116118 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2468
[LightGBM] [Info] Number of data points in the train set: 294132, number of used features: 18
[LightGBM] [Info] Start training from score 16105.306894
Validation RMSE: 5012.979184438132
Validation WMAE: 3006.6014773001953
🏃 View run LightGBM_Final_Pipeline at: https://dagshub.com/eghib22/Store-Sales-Forecasting.mlflow/#/experiments/3/runs/eefa7700736a49929d5f6c8a806119dc
🧪 View experiment at: https://dagshub.com/eghib22/Store-Sales-Forecasting.mlflow/#/experiments/3
