<a href="https://colab.research.google.com/github/ekvirika/WalmartRecruiting/blob/main/notebooks/model_experiment_xgboost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
# Install required packages
!pip install wandb torch torchvision pandas numpy matplotlib seaborn scikit-learn

# Set up Kaggle API
!pip install kaggle



In [13]:

# Upload your kaggle.json to Colab and run:
!mkdir -p ~/.kaggle
!cp /content/drive/MyDrive/ColabNotebooks/kaggle_API_credentials/kaggle.json ~/.kaggle/kaggle.json
! chmod 600 ~/.kaggle/kaggle.json

In [14]:

# Download the dataset
!kaggle competitions download -c walmart-recruiting-store-sales-forecasting
!unzip -q walmart-recruiting-store-sales-forecasting.zip

walmart-recruiting-store-sales-forecasting.zip: Skipping, found more recently modified local copy (use --force to force download)
replace features.csv.zip? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace sampleSubmission.csv.zip? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace stores.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace test.csv.zip? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace train.csv.zip? [y]es, [n]o, [A]ll, [N]one, [r]ename: n


In [15]:
!unzip -q train.csv.zip
!unzip -q stores.csv.zip
!unzip -q test.csv.zip
!unzip -q features.csv.zip

replace train.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
unzip:  cannot find or open stores.csv.zip, stores.csv.zip.zip or stores.csv.zip.ZIP.
replace test.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace features.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n


In [16]:
!pip install mlflow



# Experiment 1

In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder, RobustScaler, OneHotEncoder
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import xgboost as xgb
import mlflow
import mlflow.sklearn
import mlflow.xgboost
from mlflow.models.signature import infer_signature
import warnings
warnings.filterwarnings('ignore')


In [18]:
! pip install optuna



In [20]:
import pandas as pd

# Load each split CSV (adjust paths if needed)
df1 = pd.read_csv('https://raw.githubusercontent.com/ekvirika/WalmartRecruiting/refs/heads/main/notebooks/part_1.csv')
df2 = pd.read_csv('https://raw.githubusercontent.com/ekvirika/WalmartRecruiting/refs/heads/main/notebooks/part_2.csv')
df3 = pd.read_csv('https://raw.githubusercontent.com/ekvirika/WalmartRecruiting/refs/heads/main/notebooks/part_3.csv')

# Concatenate them back into one DataFrame
full_df = pd.concat([df1, df2, df3], ignore_index=True)

print(full_df.shape)  # should be the original size


(420212, 23)


In [21]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
stores = pd.read_csv('stores.csv')
features = pd.read_csv('features.csv')

In [68]:
def load_data():

  with mlflow.start_run(run_name="XGBoost_Data_Loading") as run:

      train_df = pd.read_csv('train.csv')
      test_df = pd.read_csv('test.csv')
      stores_df = pd.read_csv('stores.csv')
      features_df = pd.read_csv('features.csv')

      df = train_df.merge(features_df, on=['Store', 'Date'], how='inner').merge(stores_df, on=['Store'], how='inner')

      train_info = {
          "train_shape": train_df.shape,
          "test_shape": test_df.shape,
          "train_columns": list(train_df.columns),
          "missing_values_train": train_df.isnull().sum().sum(),
          "missing_values_test": test_df.isnull().sum().sum()
      }

      mlflow.log_params(train_info)

      print(f"Train shape: {train_df.shape}")
      print(f"Test shape: {test_df.shape}")
      print(f"Missing values in train: {train_df.isnull().sum().sum()}")

      return df


In [69]:
def preprocess(df):
  df = df.copy()

  if 'IsHoliday_y' in df.columns:
      df.drop(['IsHoliday_y'], axis=1, inplace=True)
  if 'IsHoliday_x' in df.columns:
      df.rename(columns={'IsHoliday_x': 'IsHoliday'}, inplace=True)

  # df.drop(['IsHoliday_y'], axis=1,inplace=True)
  # df.rename(columns={'IsHoliday_x':'IsHoliday'},inplace=True)

  df = df.loc[df['Weekly_Sales'] > 0]

  df["Date"] = pd.to_datetime(df["Date"])
  df = df.sort_values(by="Date")

  # Fill missing MarkDowns with 0
  markdown_cols = [col for col in df.columns if "MarkDown" in col]
  df[markdown_cols] = df[markdown_cols].fillna(0)

  # Feature engineering
  df["Year"] = df["Date"].dt.year
  df["Month"] = df["Date"].dt.month
  df["Week"] = df["Date"].dt.isocalendar().week
  df.drop(columns=["Date"], inplace=True)

  return df


In [70]:
def build_pipeline():
  numeric_cols = ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment',
                  'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5',
                  'Year', 'Month', 'Week']
  categorical_cols = ['Type', 'Store']

  # Preprocessor
  preprocessor = ColumnTransformer(transformers=[
      ('num', SimpleImputer(strategy='mean'), numeric_cols),
      ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_cols)
  ])

  # Full pipeline
  pipeline = Pipeline([
      ('preprocessor', preprocessor),
      ('model', XGBRegressor(
          n_estimators=100,
          max_depth=6,
          learning_rate=0.1,
          random_state=42,
          n_jobs=-1
      ))
  ])

  return pipeline

In [71]:
def run():
  df = load_data()
  preprocess(df)

  X = df.drop(columns=['Weekly_Sales'])
  y = df['Weekly_Sales']

  print("Available columns:", X.columns.tolist())

  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

  pipe = build_pipeline()
  pipe.fit(X_train, y_train)

  y_pred = pipe.predict(X_test)

  score = mae(y_test, y_pred)
  print(f"✅ XGBoost Pipeline MAE: {score:.2f}")

  return pipe

# Run pipeline
model_pipeline = run()

Train shape: (421570, 5)
Test shape: (115064, 4)
Missing values in train: 0
Available columns: ['Store', 'Dept', 'Date', 'IsHoliday_x', 'Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment', 'IsHoliday_y', 'Type', 'Size']


ValueError: A given column is not a column of the dataframe

In [72]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error as mae
from xgboost import XGBRegressor

def load_data():
    with mlflow.start_run(run_name="XGBoost_Data_Loading") as run:
        train_df = pd.read_csv('train.csv')
        test_df = pd.read_csv('test.csv')
        stores_df = pd.read_csv('stores.csv')
        features_df = pd.read_csv('features.csv')

        df = train_df.merge(features_df, on=['Store', 'Date'], how='inner').merge(stores_df, on=['Store'], how='inner')

        train_info = {
            "train_shape": train_df.shape,
            "test_shape": test_df.shape,
            "train_columns": list(train_df.columns),
            "missing_values_train": train_df.isnull().sum().sum(),
            "missing_values_test": test_df.isnull().sum().sum()
        }

        mlflow.log_params(train_info)

        print(f"Train shape: {train_df.shape}")
        print(f"Test shape: {test_df.shape}")
        print(f"Missing values in train: {train_df.isnull().sum().sum()}")

        return df

def preprocess(df):
    df = df.copy()

    # Remove duplicate IsHoliday column and rename
    if 'IsHoliday_y' in df.columns:
        df.drop(['IsHoliday_y'], axis=1, inplace=True)
    if 'IsHoliday_x' in df.columns:
        df.rename(columns={'IsHoliday_x': 'IsHoliday'}, inplace=True)

    # Filter positive sales
    df = df.loc[df['Weekly_Sales'] > 0]

    # Convert Date to datetime
    df["Date"] = pd.to_datetime(df["Date"])
    df = df.sort_values(by="Date")

    # Fill missing MarkDowns with 0
    markdown_cols = [col for col in df.columns if "MarkDown" in col]
    df[markdown_cols] = df[markdown_cols].fillna(0)

    # Feature engineering - UNCOMMENTED THIS PART
    df["Year"] = df["Date"].dt.year
    df["Month"] = df["Date"].dt.month
    df["Week"] = df["Date"].dt.isocalendar().week
    df.drop(columns=["Date"], inplace=True)

    return df

def build_pipeline():
    # Define columns based on what will exist after preprocessing
    numeric_cols = ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment',
                    'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5',
                    'Year', 'Month', 'Week']
    categorical_cols = ['Type', 'Store']

    # Add IsHoliday to categorical if it exists
    # We'll handle this dynamically in the pipeline

    # Preprocessor
    preprocessor = ColumnTransformer(transformers=[
        ('num', SimpleImputer(strategy='mean'), numeric_cols),
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_cols)
    ])

    # Full pipeline
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', XGBRegressor(
            n_estimators=100,
            max_depth=6,
            learning_rate=0.1,
            random_state=42,
            n_jobs=-1
        ))
    ])

    return pipeline

def run():
    # Load and preprocess data
    df = load_data()
    df = preprocess(df)  # Store the result back to df

    # Prepare features and target
    X = df.drop(columns=['Weekly_Sales'])
    y = df['Weekly_Sales']

    # Check what columns we actually have
    print("Available columns:", X.columns.tolist())

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Build and train pipeline
    pipe = build_pipeline()
    pipe.fit(X_train, y_train)

    # Make predictions
    y_pred = pipe.predict(X_test)

    # Calculate score
    score = mae(y_test, y_pred)
    print(f"✅ XGBoost Pipeline MAE: {score:.2f}")

    return pipe

# Alternative version that dynamically detects columns
def build_dynamic_pipeline(X_sample):
    """
    Build pipeline based on actual columns in the data
    """
    # Automatically detect numeric and categorical columns
    numeric_cols = X_sample.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = X_sample.select_dtypes(include=['object', 'category']).columns.tolist()

    print(f"Numeric columns detected: {numeric_cols}")
    print(f"Categorical columns detected: {categorical_cols}")

    transformers = []

    if numeric_cols:
        transformers.append(('num', SimpleImputer(strategy='mean'), numeric_cols))

    if categorical_cols:
        transformers.append(('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_cols))

    # Preprocessor
    preprocessor = ColumnTransformer(transformers=transformers)

    # Full pipeline
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', XGBRegressor(
            n_estimators=100,
            max_depth=6,
            learning_rate=0.1,
            random_state=42,
            n_jobs=-1
        ))
    ])

    return pipeline

def run_dynamic():
    """
    Version that automatically detects column types
    """
    # Load and preprocess data
    df = load_data()
    df = preprocess(df)

    # Prepare features and target
    X = df.drop(columns=['Weekly_Sales'])
    y = df['Weekly_Sales']

    # Check what columns we actually have
    print("Available columns:", X.columns.tolist())
    print("Data types:")
    print(X.dtypes)

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Build pipeline dynamically based on actual data
    pipe = build_dynamic_pipeline(X_train)
    pipe.fit(X_train, y_train)

    # Make predictions
    y_pred = pipe.predict(X_test)

    # Calculate score
    score = mae(y_test, y_pred)
    print(f"✅ XGBoost Pipeline MAE: {score:.2f}")

    return pipe

# Run the fixed pipeline
if __name__ == "__main__":
    print("Running fixed pipeline...")
    try:
        model_pipeline = run()
        print("Pipeline completed successfully!")
    except Exception as e:
        print(f"Error in fixed pipeline: {e}")
        print("\nTrying dynamic pipeline...")
        try:
            model_pipeline = run_dynamic()
            print("Dynamic pipeline completed successfully!")
        except Exception as e2:
            print(f"Error in dynamic pipeline: {e2}")

Running fixed pipeline...
Train shape: (421570, 5)
Test shape: (115064, 4)
Missing values in train: 0
Available columns: ['Store', 'Dept', 'IsHoliday', 'Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment', 'Type', 'Size', 'Year', 'Month', 'Week']
✅ XGBoost Pipeline MAE: 14245.00
Pipeline completed successfully!
