In [None]:
import pandas as pd


In [None]:
# Load files
# The CSVs live on Google Drive, so the drive must be mounted before they can
# be read. Mounting here (only when the folder is not visible yet) lets the
# notebook run top-to-bottom on a fresh Colab kernel instead of relying on a
# mount performed by a later cell or an earlier session.
import os

if not os.path.isdir("/content/drive/MyDrive"):
    from google.colab import drive
    drive.mount("/content/drive")

# Weekly sales per Store/Dept (the training target lives here).
train = pd.read_csv("/content/drive/MyDrive/walmart-recruiting-store-sales-forecasting/train.csv/train.csv")
# Weekly store-level covariates (temperature, fuel price, markdowns, CPI, ...).
features = pd.read_csv("/content/drive/MyDrive/walmart-recruiting-store-sales-forecasting/features.csv/features.csv")
# Static store attributes (Type, Size).
stores = pd.read_csv("/content/drive/MyDrive/walmart-recruiting-store-sales-forecasting/stores.csv")


In [None]:
# Preview the first rows of the training table to confirm it loaded as expected.
train.head()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday
0,1,1,2010-02-05,24924.5,False
1,1,1,2010-02-12,46039.49,True
2,1,1,2010-02-19,41595.55,False
3,1,1,2010-02-26,19403.54,False
4,1,1,2010-03-05,21827.9,False



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.



In [None]:
# Preview the first rows of the features table.
# NOTE(review): the saved output below already lists Total_MarkDown,
# Num_Active_MarkDowns and Holiday_Promo, which are only created in the next
# cell - looks like the cells were executed out of order (or the CSV on disk
# already contains them). Re-run top-to-bottom to confirm.
features.head()

Unnamed: 0,Store,Date,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday,Total_MarkDown,Num_Active_MarkDowns,Holiday_Promo
0,1,2010-02-05,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,False,0.0,0,0.0
1,1,2010-02-12,38.51,2.548,0.0,0.0,0.0,0.0,0.0,211.24217,8.106,True,0.0,0,0.0
2,1,2010-02-19,39.93,2.514,0.0,0.0,0.0,0.0,0.0,211.289143,8.106,False,0.0,0,0.0
3,1,2010-02-26,46.63,2.561,0.0,0.0,0.0,0.0,0.0,211.319643,8.106,False,0.0,0,0.0
4,1,2010-03-05,46.5,2.625,0.0,0.0,0.0,0.0,0.0,211.350143,8.106,False,0.0,0,0.0


In [None]:
# Step 1: Clean features.csv
markdown_cols = ["MarkDown1", "MarkDown2", "MarkDown3", "MarkDown4", "MarkDown5"]

# A missing markdown value is treated as "no markdown": replace NaN with 0
# in all five markdown columns at once.
features[markdown_cols] = features[markdown_cols].fillna(0)

# Add engineered features
markdowns = features[markdown_cols]
# Sum of the five markdown amounts for the row.
features["Total_MarkDown"] = markdowns.sum(axis=1)
# How many of the five markdown columns are strictly positive for the row.
features["Num_Active_MarkDowns"] = markdowns.gt(0).sum(axis=1)
# Total markdown on holiday rows, 0 on non-holiday rows.
features["Holiday_Promo"] = features["Total_MarkDown"] * features["IsHoliday"].astype(int)

In [None]:
# Preview the first rows of the stores table (one row per store: Type and Size).
stores.head()

Unnamed: 0,Store,Type,Size
0,1,A,151315
1,2,A,202307
2,3,B,37392
3,4,A,205863
4,5,B,34875


In [None]:
# Merge
# Attach the weekly store-level features to every sales row, then the static
# store attributes. `validate="many_to_one"` makes pandas raise a MergeError if
# the right-hand table has duplicate keys, instead of silently multiplying
# training rows in the left join.
#
# Both `train` and `features` carry an `IsHoliday` column that is not part of
# the join key, so after the first merge they show up as `IsHoliday_x`
# (from train) and `IsHoliday_y` (from features).
train_full = train.merge(
    features, on=["Store", "Date"], how="left", validate="many_to_one"
)
train_full = train_full.merge(
    stores, on="Store", how="left", validate="many_to_one"
)

In [None]:
# Convert Date
# Parse the date strings into timestamps and derive calendar parts from them.
train_full["Date"] = pd.to_datetime(train_full["Date"])
train_full["Year"] = train_full["Date"].dt.year
train_full["Month"] = train_full["Date"].dt.month
# isocalendar().week is returned with the nullable UInt32 dtype; cast it to a
# plain integer so the column is consistent with the date-features cell below
# and with the other numeric columns.
train_full["Week"] = train_full["Date"].dt.isocalendar().week.astype(int)

In [None]:
# Date Features (basically the seasonality)
# Make sure Date holds real timestamps before pulling calendar parts out of it.
train_full["Date"] = pd.to_datetime(train_full["Date"])

# Calendar components of each row's date, read through a single .dt accessor.
date_parts = train_full["Date"].dt
train_full["Year"] = date_parts.year
train_full["Month"] = date_parts.month
# ISO week number, cast to a plain integer column.
train_full["Week"] = date_parts.isocalendar().week.astype(int)
train_full["Day"] = date_parts.day

# Season feature (simple mapping)
def get_season(month):
    """Map a month number to a season label.

    Months 12, 1, 2 give "Winter"; 3-5 give "Spring"; 6-8 give "Summer".
    Every other value (including 9-11) gives "Fall".
    """
    season_months = (
        ("Winter", (12, 1, 2)),
        ("Spring", (3, 4, 5)),
        ("Summer", (6, 7, 8)),
    )
    for season, months in season_months:
        if month in months:
            return season
    # Anything not matched above falls through to the default label.
    return "Fall"

# Label every row with the season derived from its Month value.
train_full["Season"] = train_full["Month"].map(get_season)


In [None]:
# Lag Features (previous sales)
# Order rows by date inside each Store/Dept series so that positional shifts
# and rolling windows refer to the preceding rows of the same series.
train_full = train_full.sort_values(["Store", "Dept", "Date"])

# Build the grouped view of the target once and reuse it for every feature.
sales_by_series = train_full.groupby(["Store", "Dept"])["Weekly_Sales"]

# Sales one and two rows earlier within the same Store/Dept.
train_full["Sales_Lag1"] = sales_by_series.shift(1)
train_full["Sales_Lag2"] = sales_by_series.shift(2)

# Rolling averages of the *previous* rows (shift(1) keeps the current row's
# own sales out of its feature). The shift and the rolling window are both
# evaluated inside each Store/Dept group, so a window can never span two
# different series. Previously the rolling step ran on the ungrouped shifted
# column and only avoided mixing series because of the NaN that shift leaves
# at the start of each group together with rolling's default min_periods.
for window in (3, 7):
    train_full[f"Sales_MA{window}"] = sales_by_series.transform(
        lambda s, w=window: s.shift(1).rolling(window=w).mean()
    )


In [None]:
# Enriched Holiday Features
# Holiday lag/lead: was the previous / next row of the same Store/Dept series
# a holiday week?
#
# The shift is done per Store/Dept so the first row of one series never picks
# up the flag from the last row of the preceding series (and the last row never
# picks up the next series' first flag). Rows without a neighbour inside their
# own series get 0. Converting to int before shifting keeps the column numeric
# instead of going through an object-dtype column of booleans and NaN.
# Assumes train_full is already ordered by Date within each Store/Dept
# (done by the sort in the lag-features cell).
holiday_flag = train_full["IsHoliday_x"].astype(int)
holiday_by_series = holiday_flag.groupby([train_full["Store"], train_full["Dept"]])
train_full["Prev_Holiday"] = holiday_by_series.shift(1, fill_value=0).astype(int)
train_full["Next_Holiday"] = holiday_by_series.shift(-1, fill_value=0).astype(int)

# The holiday x promotion interaction (Holiday_Promo) was already computed on
# `features` before the merge, so it is not recomputed here.

In [None]:
# Normalization for stability
from sklearn.preprocessing import StandardScaler

# Continuous covariates that get standardised in place.
scale_cols = ["CPI", "Unemployment", "Fuel_Price", "Temperature"]

# NOTE(review): the scaler is fitted on every row of train_full, i.e. before
# the train/validation split performed later in the notebook - confirm this is
# intended.
scaler = StandardScaler()
scaler.fit(train_full[scale_cols])
train_full[scale_cols] = scaler.transform(train_full[scale_cols])


In [None]:
# One-hot encode BEFORE splitting
# `X` is only created in the next cell, so referencing it here raised a
# NameError on a fresh top-to-bottom run. Build the encoded frame directly from
# `train_full`, dropping the same columns that the next cell drops to form `X`.
X_encoded = pd.get_dummies(
    train_full.drop(columns=["Date", "Weekly_Sales"]),
    columns=["Type", "Season"],
    drop_first=True,
)


In [None]:
# Preparing Features and Target
from sklearn.model_selection import train_test_split

# Target: the weekly sales figure. Features: everything except the target and
# the raw Date column.
drop_cols = ["Date", "Weekly_Sales"]
y = train_full["Weekly_Sales"]
X = train_full.drop(columns=drop_cols)

# Hold out the last 20% of rows for validation. shuffle=False means the current
# row order decides which rows are held out.
# NOTE(review): train_full was sorted by Store, Dept, Date in the lag-features
# cell, so the tail of the frame is the last stores rather than the most recent
# weeks - confirm that is the intended validation scheme.
splits = train_test_split(X, y, test_size=0.2, shuffle=False)
X_train, X_val, y_train, y_val = splits


In [None]:
# Training the XGBoost Model
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd

# One-hot encode categorical columns.
# The training split defines the feature layout (first category of each
# column dropped as the baseline).
X_train = pd.get_dummies(X_train, columns=["Type", "Season"], drop_first=True)
# Encoding the validation split independently with drop_first=True can yield a
# different set of columns (or a different dropped baseline) whenever the two
# splits do not contain exactly the same categories. Encode every category and
# then align to the training columns: categories missing from validation become
# all-zero columns, and columns the model never saw are discarded.
X_val = pd.get_dummies(X_val, columns=["Type", "Season"], drop_first=False)
X_val = X_val.reindex(columns=X_train.columns, fill_value=0)

# Initialize model
model = XGBRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    tree_method="hist"   # faster training
)

# Train
model.fit(X_train, y_train)

# Validation predictions
y_pred = model.predict(X_val)

# RMSE evaluation
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print("Validation RMSE:", rmse)

Validation RMSE: 3542.78974585448


In [None]:
# Second configuration: more, shallower trees with a smaller learning rate.
model = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.03,
    max_depth=6,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42,
    tree_method="hist"
)

# Fit the new configuration and refresh the validation predictions. Without
# this step the model object is never trained and the following RMSE cell keeps
# scoring the previous model's `y_pred`, reporting the old number again.
model.fit(X_train, y_train)
y_pred = model.predict(X_val)


In [None]:
# RMSE evaluation: root of the mean squared error between the validation
# targets and the current `y_pred`.
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
print("Validation RMSE:", rmse)

Validation RMSE: 3542.78974585448


In [None]:
pip install optuna


Collecting optuna
  Downloading optuna-4.5.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.5.0-py3-none-any.whl (400 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.9/400.9 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.9.0 optuna-4.5.0


In [None]:
#Defining the Optuna Function
import optuna
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

def objective(trial):
    """Optuna objective: fit an XGBRegressor with sampled settings and score it.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        RMSE of the fitted model on the validation split (X_val, y_val);
        lower is better.

    Reads the notebook-level X_train, y_train, X_val and y_val.
    """
    # Suggest hyperparameters; random_state and tree_method are fixed so that
    # trials differ only in the sampled values.
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 300, 1500),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "random_state": 42,
        "tree_method": "hist"
    }

    # Train model. No eval_set is passed: nothing here consumes per-round
    # evaluation results (there is no early stopping), so evaluating the
    # validation set after every boosting round only added run time.
    candidate = XGBRegressor(**params)
    candidate.fit(X_train, y_train)

    # Score on the validation split.
    val_pred = candidate.predict(X_val)
    return np.sqrt(mean_squared_error(y_val, val_pred))


In [None]:
# Seed the sampler so the sequence of sampled hyperparameters (and therefore
# the selected best trial) is reproducible from run to run. TPE is Optuna's
# default sampler, so only the seeding changes.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)  # run 50 trials


[I 2025-09-08 14:57:17,961] A new study created in memory with name: no-name-3f286234-1ce6-44d5-ba81-b503bea00848
[I 2025-09-08 14:59:57,202] Trial 0 finished with value: 3321.5588324485684 and parameters: {'n_estimators': 1209, 'learning_rate': 0.011690113577729335, 'max_depth': 12, 'subsample': 0.5506724462273944, 'colsample_bytree': 0.8592035415812173, 'gamma': 0.22201634261700798, 'min_child_weight': 5}. Best is trial 0 with value: 3321.5588324485684.
[I 2025-09-08 15:01:02,297] Trial 1 finished with value: 3251.645292396743 and parameters: {'n_estimators': 999, 'learning_rate': 0.05730777471361701, 'max_depth': 7, 'subsample': 0.5338934744947206, 'colsample_bytree': 0.6963809893022952, 'gamma': 2.5266830957514723, 'min_child_weight': 7}. Best is trial 1 with value: 3251.645292396743.
[I 2025-09-08 15:02:01,512] Trial 2 finished with value: 3480.9047338465953 and parameters: {'n_estimators': 856, 'learning_rate': 0.019633836392598546, 'max_depth': 7, 'subsample': 0.9663485570785051

In [None]:
from xgboost import XGBRegressor
import joblib

# Best parameters from Optuna
best_params = study.best_params

# study.best_params only holds the values sampled through trial.suggest_*.
# The settings that `objective` fixed for every trial (tree_method,
# random_state) are not part of it, so they are repeated here to make the final
# model use the same configuration that was actually tuned.
best_model = XGBRegressor(
    **best_params,
    tree_method="hist",
    n_jobs=-1,
    random_state=42,
)

# Fit on the training split (X_train / y_train); the validation rows are not
# included in this fit.
best_model.fit(X_train, y_train)


In [None]:
# Persist the tuned model to a file in the current working directory.
model_filename = "xgb_demand_forecast_model.pkl"
joblib.dump(best_model, model_filename)
print("Model saved successfully!")


Model saved successfully!


In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so files under it can be read and written.
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import joblib

# Destination inside the mounted Google Drive, so the model file is written
# there rather than only to the runtime's working directory.
drive_path = '/content/drive/MyDrive/xgb_demand_forecast_model.pkl'

# Write the fitted model to that location and report where it went.
joblib.dump(best_model, filename=drive_path)
print(f"Model saved to {drive_path}")


Model saved to /content/drive/MyDrive/xgb_demand_forecast_model.pkl
