In [51]:
# Some Kaggle data sources are private, so authenticate before downloading.
# Run this cell first: kagglehub.login() presents a credential prompt, and the
# download cell further down relies on the `kagglehub` module imported here.
import kagglehub
kagglehub.login()


VBox(children=(HTML(value='<center> <img\nsrc=https://www.kaggle.com/static/images/site-logo.png\nalt=\'Kaggle…

Kaggle credentials set.
Kaggle credentials successfully validated.


In [52]:
# Run this cell to download the competition files through kagglehub.
# (The auto-generated header says the cell may be deleted afterwards.)
# Note: this notebook environment differs from Kaggle's own Python
# environment, so libraries used by the notebook may be missing here.

# NOTE(review): the returned path is not referenced by the cells visible in
# this notebook; the loading cell uses a hard-coded '/kaggle/input/...' path.
walmart_recruiting_store_sales_forecasting_path = kagglehub.competition_download('walmart-recruiting-store-sales-forecasting')

print('Data source import complete.')


Data source import complete.


In [53]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# The read-only input directory holds the data files; list every file found
# beneath it (one full path per line) so the available inputs are visible.

import os

input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"

# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [78]:
import matplotlib.pyplot as plt
import seaborn as sns

import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error



import warnings
warnings.filterwarnings("ignore")

# Load data
path = '/kaggle/input/walmart-recruiting-store-sales-forecasting/'
df_train = pd.read_csv(f'{path}train.csv', parse_dates=['Date'])
df_test = pd.read_csv(f'{path}test.csv', parse_dates=['Date'])
df_features = pd.read_csv(f'{path}features.csv', parse_dates=['Date'])
df_stores = pd.read_csv(f'{path}stores.csv')

# Merge store metadata onto the training rows.
df = df_train.merge(df_stores, on='Store', how='left')

# Merge the per-store, per-date features. Any non-key column that the training
# frame already has is dropped from the features table first: otherwise pandas
# keeps both copies under `_x` / `_y` suffixes, the plain column name vanishes,
# and the df['IsHoliday'] lookup below would raise KeyError.
# NOTE(review): features.csv is believed to ship its own IsHoliday column,
# duplicating the one in train.csv - confirm against the data files.
feature_keys = ['Store', 'Date']
already_present = [
    name for name in df_features.columns
    if name in df.columns and name not in feature_keys
]
df = df.merge(df_features.drop(columns=already_present), on=feature_keys, how='left')

# Handle missing values: fill NaNs in these numerical columns with 0.
# This is a deliberately simple baseline; a real project would use a more
# considered imputation (e.g. the column mean or interpolation).
numeric_fill_columns = [
    'Temperature', 'Fuel_Price', 'CPI', 'Unemployment',
    'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5',
]
df[numeric_fill_columns] = df[numeric_fill_columns].fillna(0)

# Convert IsHoliday to numerical (0/1)
df['IsHoliday'] = df['IsHoliday'].astype(int)

# Negative Weekly_Sales values are floored at 0. The vectorised clip replaces
# a per-row Python lambda; unlike max(0, x) it leaves a NaN as NaN instead of
# silently turning it into 0.
df['Weekly_Sales'] = df['Weekly_Sales'].clip(lower=0)

# Feature engineering: derive calendar columns from the 'Date' column
def create_features(df):
    """Add calendar columns derived from ``df['Date']`` and return ``df``.

    The frame is modified in place (the same object is returned) and gains,
    in this order: ``Year``, ``Month``, ``Week`` (ISO week number cast to
    int), ``Day``, ``DayOfWeek`` and ``DayOfYear``.

    ``df['Date']`` must be a datetime column, since the ``.dt`` accessor is
    used on it. ``IsHoliday`` is not touched here.
    """
    date_parts = df['Date'].dt
    calendar_columns = {
        'Year': date_parts.year,
        'Month': date_parts.month,
        'Week': date_parts.isocalendar().week.astype(int),
        'Day': date_parts.day,
        'DayOfWeek': date_parts.dayofweek,
        'DayOfYear': date_parts.dayofyear,
    }
    # Dicts keep insertion order, so the columns are appended in the order above.
    for column_name, column_values in calendar_columns.items():
        df[column_name] = column_values
    return df

# Add the calendar columns derived from 'Date'.
df = create_features(df)

# Target column and model inputs. The inputs are grouped by origin; the
# categorical 'Type' column from stores.csv is appended further down, after it
# has been one-hot encoded.
target = 'Weekly_Sales'
id_columns = ['Store', 'Dept']
calendar_columns = ['Year', 'Month', 'Week', 'Day', 'DayOfWeek', 'DayOfYear']
macro_columns = ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment']
markdown_columns = ['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']
features = [
    *id_columns,
    *calendar_columns,
    'IsHoliday',
    *macro_columns,
    'Size',  # from stores.csv
    *markdown_columns,
]

# One-hot encode 'Type' (first level dropped) and use the resulting dummy
# columns as features in place of the original column.
df = pd.get_dummies(df, columns=['Type'], prefix='Type', drop_first=True)
type_dummy_columns = [name for name in df.columns if 'Type_' in name]
features += type_dummy_columns

# Keep only the features that actually exist in the frame after preprocessing.
final_features = [name for name in features if name in df.columns]
print(f"Final features used for training: {final_features}")

# Split data into training and validation sets by date (time-series split):
# rows dated on or before split_date train the model, later rows validate it.
df = df.sort_values('Date')
split_date = pd.to_datetime('2011-12-31') # Example split date, adjust as needed

# Build each mask once instead of re-filtering the frame for every slice.
is_train_row = df['Date'] <= split_date
is_valid_row = df['Date'] > split_date

X_train = df.loc[is_train_row, final_features]
y_train = df.loc[is_train_row, target]
X_valid = df.loc[is_valid_row, final_features]
y_valid = df.loc[is_valid_row, target]

print(f"Train set size: {len(X_train)}")
print(f"Validation set size: {len(X_valid)}")

# Both matrices are sliced with the same final_features list, so their columns
# cannot differ. Assert that invariant (including order) rather than
# "re-aligning" through sets, which would scramble the column order.
assert list(X_train.columns) == list(X_valid.columns), \
    "Train and validation feature columns differ"


# Toggle for quick experiments: when True, work on a single store/department
# pair only; when False, work on a copy of the full frame.
USE_SMALL = True

selected_rows = df.query("Store == 1 and Dept == 1") if USE_SMALL else df
df_work = selected_rows.copy()

# Summary statistics of the target in the working frame.
weekly_sales = df_work['Weekly_Sales']
print(f"ave: {weekly_sales.mean():.2f}")
print(f"stdev: {weekly_sales.std():.2f}")
print(f"min: {weekly_sales.min():.2f}")
print(f"max: {weekly_sales.max():.2f}")


Final features used for training: ['Store', 'Dept', 'Year', 'Month', 'Week', 'Day', 'DayOfWeek', 'DayOfYear', 'IsHoliday', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'Size', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5', 'Type_B', 'Type_C']
Train set size: 382439
Validation set size: 48946
ave: 17968.43
stdev: 24524.41
min: -370.12
max: 630999.19


In [79]:
import os

import mlflow
import ngrok

# Port shared by the MLflow tracking URI, the MLflow UI process and the tunnel.
MLFLOW_PORT = 5000

# Never commit the ngrok authtoken to the notebook: read it from the
# environment instead. A token that was previously hard-coded here should be
# treated as leaked and revoked in the ngrok dashboard.
ngrok_auth_token = os.environ.get("NGROK_AUTHTOKEN")
if not ngrok_auth_token:
    raise RuntimeError(
        "NGROK_AUTHTOKEN is not set; export your ngrok authtoken before running this cell."
    )
ngrok.set_auth_token(ngrok_auth_token)

mlflow.set_tracking_uri(f"http://localhost:{MLFLOW_PORT}")

# Start the MLflow UI in the background so this cell does not block.
get_ipython().system_raw(f"mlflow ui --port {MLFLOW_PORT} &")

# Expose the local MLflow UI through an ngrok tunnel.
# NOTE(review): the `public_url` / `addr` attribute names depend on which ngrok
# client package is installed - confirm against the package actually in use.
ngrok_tunnel = ngrok.connect(addr=str(MLFLOW_PORT), proto="http")
print(f"MLflow UI: {ngrok_tunnel.public_url} -> {ngrok_tunnel.addr}")

ngrok  MLflow UI
MLflow UI: NgrokTunnel: "https://1bd1-34-61-217-101.ngrok-free.app" -> "http://localhost:5000"


In [79]:

# Summary statistics of Weekly_Sales in the working frame, one per line.
sales_column = df_work['Weekly_Sales']
sales_stats = (
    ("ave", sales_column.mean()),
    ("stdev", sales_column.std()),
    ("min", sales_column.min()),
    ("max", sales_column.max()),
)
for stat_label, stat_value in sales_stats:
    print(f"{stat_label}: {stat_value:.2f}")

ave: 17968.43
stdev: 24524.41
min: -370.12
max: 630999.19
