In [None]:
import os
import warnings
from datetime import datetime

import pandas as pd
import numpy as np

import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit

import mlflow
import mlflow.lightgbm

import matplotlib.pyplot as plt
import seaborn as sns

# Suppress every warning category so library chatter does not clutter the output.
warnings.filterwarnings("ignore")

# Directory expected to hold train.csv, test.csv, features.csv and stores.csv.
# One of the "Option" blocks below is meant to assign it.
data_base_path = None

# Option 1: Load data directly from Kaggle using kagglehub
# try:
#     import kagglehub
#     data_base_path = kagglehub.competition_download('walmart-recruiting-store-sales-forecasting')
#     print(f"Data will be loaded from KaggleHub path: {data_base_path}")
# except Exception as e:
#     print(f"Warning: KaggleHub download failed. Error: {e}")
#     print("Falling back to checking other data loading options.")

# Option 2: Load data from Google Drive
# from google.colab import drive
# drive.mount('/content/drive')
# data_base_path = '/content/drive/MyDrive/path/to/your/walmart_sales_data/'
# print(f"Data will be loaded from Google Drive path: {data_base_path}")

# Option 3: Load data from Colab session storage
data_base_path = '/content/'
print(f"Data will be loaded from Colab /content/ path: {data_base_path}")

# When the configured directory is unusable, fall back to the default Kaggle
# input directory; abort if that does not exist either.
if not (data_base_path is not None and os.path.exists(data_base_path)):
    print("\nNo valid data path set or path does not exist from chosen options.")
    print("Please uncomment and configure one of the 'Option' blocks above to specify your data location.")
    if not os.path.exists('/kaggle/input/walmart-recruiting-store-sales-forecasting/'):
        print("Critical: No accessible data path found. Please ensure your data files are in one of the specified locations.")
        raise FileNotFoundError("Data files not found. Please check your data loading configuration.")
    data_base_path = '/kaggle/input/walmart-recruiting-store-sales-forecasting/'
    print(f"Attempting to load from default Kaggle input path: {data_base_path}")


# Load the four competition CSVs from data_base_path into DataFrames.
print("\nLoading dataframes...")
try:
    # os.path.join produces a correct path whether or not data_base_path ends
    # with a separator (plain string concatenation silently built a wrong
    # file name for directories given without a trailing slash).
    df_train = pd.read_csv(os.path.join(data_base_path, 'train.csv'), parse_dates=['Date'])
    df_test = pd.read_csv(os.path.join(data_base_path, 'test.csv'), parse_dates=['Date'])
    df_features = pd.read_csv(os.path.join(data_base_path, 'features.csv'), parse_dates=['Date'])
    df_stores = pd.read_csv(os.path.join(data_base_path, 'stores.csv'))
    print("All dataframes loaded successfully.")
    print("\n--- df_features head (check 'Store', 'Date', 'IsHoliday') ---")
    print(df_features[['Store', 'Date', 'IsHoliday']].head())
    print(f"Columns in df_features after loading: {df_features.columns.tolist()}")
except FileNotFoundError as e:
    # Report which directory was searched, then re-raise so execution stops here.
    print(f"Critical Error: One or more data files not found in '{data_base_path}'. Error: {e}")
    print("Please ensure the CSV files (train.csv, test.csv, features.csv, stores.csv) are directly in the specified 'data_base_path'.")
    raise
except Exception as e:
    print(f"Critical Error: An unexpected error occurred during data loading. Error: {e}")
    raise


print("\nMerging dataframes...")
# Attach the stores.csv columns to every training row.
df = pd.merge(df_train, df_stores, on='Store', how='left')

print("\n--- Pre-merge Debugging ---")
print("df (train + stores) info:")
df.info()
print("\ndf_features info:")
df_features.info()

print("\ndf (train + stores) head for merge keys:")
print(df[['Store', 'Date']].head())
print("\ndf_features head for merge keys:")
print(df_features[['Store', 'Date']].head())

# Diagnostic: count the distinct (Store, Date) keys on each side and how many
# of them the two frames share, to spot a key mismatch before the real merge.
common_store_dates_df = df[['Store', 'Date']].drop_duplicates()
common_store_dates_features = df_features[['Store', 'Date']].drop_duplicates()
overlap_count = len(
    common_store_dates_df.merge(common_store_dates_features, on=['Store', 'Date'], how='inner')
)
print(f"\nNumber of unique Store-Date pairs in df (train + stores): {len(common_store_dates_df)}")
print(f"Number of unique Store-Date pairs in df_features: {len(common_store_dates_features)}")
print(f"Number of overlapping Store-Date pairs (inner join): {overlap_count}")
if overlap_count == 0:
    print("WARNING: No overlapping Store-Date pairs found between df and df_features. This will result in many NaNs after merge.")
    print("Please check the 'Store' and 'Date' columns for consistency (e.g., data types, ranges).")


# Columns that features.csv also provides; any copy already present in df is
# removed first so the merge below does not create suffixed duplicates.
common_cols_to_drop_from_df = (
    ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment']
    + [f'MarkDown{i}' for i in range(1, 6)]
    + ['IsHoliday']
)
cols_to_drop_existing = [col for col in common_cols_to_drop_from_df if col in df.columns]

if not cols_to_drop_existing:
    print("No common columns to drop from df before merging with features.csv (or they don't exist).")
else:
    print(f"Dropping common columns from df (from train.csv/stores.csv) before merging with features.csv: {cols_to_drop_existing}")
    df = df.drop(cols_to_drop_existing, axis=1)


df = pd.merge(df, df_features, on=['Store', 'Date'], how='left')
print("Dataframes merged.")
print("\n--- Merged DataFrame head (check 'Store', 'Date', 'IsHoliday') ---")
if 'IsHoliday' not in df.columns:
    print("Error: 'IsHoliday' column still not found after merge. This indicates a deeper issue with merge keys or data in features.csv.")
else:
    print(df[['Store', 'Date', 'IsHoliday']].head())

print(f"Columns in merged DataFrame after final merge: {df.columns.tolist()}")


# Replace NaNs in the numeric feature columns with 0 so the model receives a
# fully populated matrix.
# NOTE(review): 0 reads naturally as "no markdown", but it is not a realistic
# Temperature / CPI / Unemployment value - confirm these columns have no gaps.
print("\nHandling missing values...")
for col in ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']:
    if col in df.columns:
        df[col] = df[col].fillna(0)
    else:
        print(f"Warning: Column '{col}' not found in DataFrame for NaN handling.")
print("Missing values filled.")


# Store the holiday flag as an integer; if the column is absent, create it as 0.
if 'IsHoliday' in df.columns:
    df['IsHoliday'] = df['IsHoliday'].astype(int)
    print("'IsHoliday' column converted to integer type.")
else:
    print("Critical Warning: 'IsHoliday' column still not found in DataFrame. Adding it with default value 0.")
    df['IsHoliday'] = 0


# Floor negative sales at 0 with a vectorised clip instead of a per-row Python
# lambda. Unlike max(0, x), clip leaves a NaN target as NaN rather than
# silently turning it into 0.
df['Weekly_Sales'] = df['Weekly_Sales'].clip(lower=0)
print("Negative 'Weekly_Sales' values handled.")


def create_features(df_input):
    """Add calendar columns derived from ``Date`` and return the same frame.

    The columns ``Year``, ``Month``, ``Week`` (ISO week number as int),
    ``Day``, ``DayOfWeek`` and ``DayOfYear`` are written onto ``df_input``
    itself, in that order; pass a copy if the caller's frame must stay
    untouched.
    """
    # (target column, extractor applied to the ``.dt`` accessor of ``Date``)
    calendar_fields = (
        ('Year', lambda dates: dates.year),
        ('Month', lambda dates: dates.month),
        ('Week', lambda dates: dates.isocalendar().week.astype(int)),
        ('Day', lambda dates: dates.day),
        ('DayOfWeek', lambda dates: dates.dayofweek),
        ('DayOfYear', lambda dates: dates.dayofyear),
    )
    for column_name, extract in calendar_fields:
        df_input[column_name] = extract(df_input['Date'].dt)
    return df_input

# Derive calendar columns on a copy, because create_features writes onto the
# frame it is given.
print("\nCreating time-based features...")
df = create_features(df.copy())
print("Time-based features created.")


# Candidate model inputs; 'Type' is swapped for its dummy columns below.
features = [
    'Store', 'Dept', 'Year', 'Month', 'Week', 'Day', 'DayOfWeek', 'DayOfYear',
    'IsHoliday', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment',
    'Size', 'Type',
    'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5'
]
target = 'Weekly_Sales'


print("\nApplying One-Hot Encoding to 'Type' column...")
df = pd.get_dummies(df, columns=['Type'], prefix='Type', drop_first=True)
features.remove('Type')
# Prefix match rather than substring match, so only the dummy columns created
# by get_dummies above are added (a column merely containing 'Type_' is not).
features.extend([col for col in df.columns if col.startswith('Type_')])
print("'Type' column encoded.")


# Keep only the features that actually exist in the merged frame.
final_features = [f for f in features if f in df.columns]
print(f"\nFinal features used for training: {final_features}")


# Chronological hold-out: rows dated up to and including split_date train the
# model, later rows validate it.
df = df.sort_values('Date')
split_date = pd.to_datetime('2011-12-31')

# Evaluate each date filter once and reuse it for features, target and logs.
train_mask = df['Date'] <= split_date
valid_mask = df['Date'] > split_date

X_train = df.loc[train_mask, final_features]
y_train = df.loc[train_mask, target]
X_valid = df.loc[valid_mask, final_features]
y_valid = df.loc[valid_mask, target]

print(f"\nTrain set size: {len(X_train)} (Dates: {df['Date'].min().strftime('%Y-%m-%d')} to {df.loc[train_mask, 'Date'].max().strftime('%Y-%m-%d')})")
print(f"Validation set size: {len(X_valid)} (Dates: {df.loc[valid_mask, 'Date'].min().strftime('%Y-%m-%d')} to {df['Date'].max().strftime('%Y-%m-%d')})")

# X_train and X_valid are both selected with final_features, so their columns
# are identical by construction; no train/valid column alignment is needed.


print("\nTarget variable ('Weekly_Sales') statistics (full dataset):")
print(f"Mean: {df['Weekly_Sales'].mean():.2f}")
print(f"Standard Deviation: {df['Weekly_Sales'].std():.2f}")
print(f"Min: {df['Weekly_Sales'].min():.2f}")
print(f"Max: {df['Weekly_Sales'].max():.2f}")


In [None]:
# Record runs in a local file store under /content/mlruns.
mlflow.set_tracking_uri("file:/content/mlruns")

experiment_name = "LightGBM_Training"
mlflow.set_experiment(experiment_name)

print(f"MLflow Experiment '{experiment_name}' set up.")
print(f"MLflow Tracking URI: {mlflow.get_tracking_uri()}")

# One MLflow run covers training, validation metrics, model logging and the
# creation of the submission file.
with mlflow.start_run(run_name='LGBM_baseline_run'):
    # L1 objective; RMSE on the validation set is the metric that the
    # early-stopping callback below monitors.
    params = {
        'objective': 'regression_l1',
        'metric': 'rmse',
        'n_estimators': 1500,
        'learning_rate': 0.03,
        'num_leaves': 31,
        'max_depth': 8,
        'min_child_samples': 20,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'random_state': 42,
        'n_jobs': -1,
        'reg_alpha': 0.1,
        'reg_lambda': 0.1
    }

    mlflow.log_params(params)
    print("\nLightGBM model training in progress...")

    model = lgb.LGBMRegressor(**params)

    # Stop after 100 rounds without validation improvement; log every 200 rounds.
    callbacks = [
        lgb.early_stopping(100, verbose=False),
        lgb.log_evaluation(period=200)
    ]

    model.fit(X_train, y_train,
              eval_set=[(X_valid, y_valid)],
              eval_metric='rmse',
              callbacks=callbacks)

    print("LightGBM model training finished.")

    preds_val = model.predict(X_valid)

    val_rmse = np.sqrt(mean_squared_error(y_valid, preds_val))
    val_mae = mean_absolute_error(y_valid, preds_val)

    print(f"\nValidation RMSE: {val_rmse:.2f}")
    print(f"Validation MAE: {val_mae:.2f}")

    mlflow.log_metric("val_rmse", val_rmse)
    mlflow.log_metric("val_mae", val_mae)
    mlflow.log_metric("best_iteration", model.best_iteration_)

    mlflow.lightgbm.log_model(model, artifact_path='model', registered_model_name="LightGBMSalesForecaster")
    print("\nModel logged to MLflow.")
    print(f"MLflow Run ID: {mlflow.active_run().info.run_id}")

    # ---- Test-set preprocessing: mirrors the steps applied to the training frame ----
    print("\nPreparing test data for evaluation...")

    df_test_merged = df_test.merge(df_stores, on='Store', how='left')

    # Columns that features.csv also provides; drop any copy already present so
    # the merge below does not create suffixed duplicates.
    common_cols_to_drop_from_test = [
        'Temperature', 'Fuel_Price', 'CPI', 'Unemployment',
        'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5',
        'IsHoliday'
    ]
    cols_to_drop_existing_test = [col for col in common_cols_to_drop_from_test if col in df_test_merged.columns]

    if cols_to_drop_existing_test:
        print(f"Dropping common columns from df_test (from test.csv/stores.csv) before merging with features.csv: {cols_to_drop_existing_test}")
        df_test_merged = df_test_merged.drop(columns=cols_to_drop_existing_test)
    else:
        print("No common columns to drop from df_test_merged before merging with features.csv (or they don't exist).")

    df_test_final = df_test_merged.merge(df_features, on=['Store', 'Date'], how='left')

    print("Handling missing values in test data...")
    for col in ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']:
        if col in df_test_final.columns:
            df_test_final[col] = df_test_final[col].fillna(0)
        else:
            print(f"Warning: Column '{col}' not found in df_test_final for NaN handling.")
    print("Missing values in test data filled.")

    if 'IsHoliday' in df_test_final.columns:
        df_test_final['IsHoliday'] = df_test_final['IsHoliday'].astype(int)
    else:
        print("Warning: 'IsHoliday' column not found in df_test_final. Adding it with default value 0.")
        df_test_final['IsHoliday'] = 0

    print("Creating time-based features for test data...")
    df_test_final = create_features(df_test_final.copy())
    print("Time-based features for test data created.")

    print("Applying One-Hot Encoding to 'Type' column in test data...")
    if 'Type' in df_test_final.columns:
        df_test_final = pd.get_dummies(df_test_final, columns=['Type'], prefix='Type', drop_first=True)
    else:
        print("Warning: 'Type' column not found in df_test_final. Skipping One-Hot Encoding for 'Type'.")
    print("Test data preprocessing complete.")

    # Match the training column set and order; columns missing from the test
    # frame are created and filled with 0.
    X_test = df_test_final.reindex(columns=X_train.columns, fill_value=0)

    if X_test.isnull().sum().sum() > 0:
        print("Warning: NaNs found in X_test after alignment. Filling with 0.")
        X_test = X_test.fillna(0)

    print(f"Final Test set size for prediction: {len(X_test)}")
    print(f"Test set features (first 5 rows):\n{X_test.head()}")

    print("\nEvaluating on test set...")
    preds_test = model.predict(X_test)

    # Build each Id from the frame that was actually scored, so every Id is
    # paired with the prediction for its own row (the raw df_test is only
    # guaranteed to line up if the merges neither added nor reordered rows).
    submission_df = pd.DataFrame({
        'Id': (df_test_final['Store'].astype(str) + '_'
               + df_test_final['Dept'].astype(str) + '_'
               + df_test_final['Date'].dt.strftime('%Y-%m-%d')),
        'Weekly_Sales': preds_test,
    })

    # Floor negative predictions at 0 with a vectorised clip.
    submission_df['Weekly_Sales'] = submission_df['Weekly_Sales'].clip(lower=0)

    submission_file_path = 'submission_lightgbm.csv'
    submission_df.to_csv(submission_file_path, index=False)
    print(f"\nSubmission file '{submission_file_path}' created successfully.")

    mlflow.log_artifact(submission_file_path)
    print("Submission file logged as MLflow artifact.")

print("\nLightGBM experiment finished. Check MLflow UI for full results and submission file.")


In [None]:
import time
from pyngrok import ngrok, conf

# Close tunnels left over from an earlier run of this cell before opening a new one.
try:
    ngrok.kill()
    print("Terminated any existing ngrok tunnels.")
    time.sleep(2)
except Exception as e:
    print(f"Could not terminate existing ngrok tunnels (might not exist): {e}")

try:
    # SECURITY: the auth token is read from the environment and never printed.
    # A real token used to be hard-coded here and echoed to the cell output;
    # that token must be treated as leaked and revoked in the ngrok dashboard.
    NGROK_AUTH_TOKEN = os.environ.get("NGROK_AUTH_TOKEN", "").strip()

    if not NGROK_AUTH_TOKEN or NGROK_AUTH_TOKEN == "YOUR_NGROK_AUTH_TOKEN":
        print("CRITICAL: ngrok auth token is not configured. Set the NGROK_AUTH_TOKEN environment variable to your actual token.")
        raise ValueError("ngrok authentication token not set correctly.")

    conf.get_default().auth_token = NGROK_AUTH_TOKEN
    print("ngrok authentication token set.")
except Exception as e:
    # Best effort, as before: report the problem and let the tunnel attempt
    # below surface any resulting failure.
    print(f"Error setting ngrok auth token: {e}")
    print("Please ensure your ngrok token is correct and pasted without extra spaces or characters.")
    print("Also, check your ngrok dashboard (https://dashboard.ngrok.com/agents) to ensure no active sessions are running.")


# Launch the MLflow UI in the background, pointed explicitly at the file store
# the runs were logged to, so it does not depend on the working directory.
get_ipython().system_raw("mlflow ui --backend-store-uri file:/content/mlruns --port 5000 &")

print("Creating ngrok tunnel for MLflow UI...")
try:
    public_url = ngrok.connect(addr="5000", proto="http")
    print(f"MLflow UI is available at: {public_url}")
    print("Click the link above to access the MLflow UI in your browser.")
except Exception as e:
    print(f"Error creating ngrok tunnel: {e}")
    print("Please ensure ngrok is installed and your auth token is correct.")
    print("If the error persists, check your ngrok dashboard (https://dashboard.ngrok.com/agents) and manually kill any active sessions.")
