In [ ]:
# --- Cell 1: Imports and Data Preparation ---

# Standard library imports
import os
import warnings
from datetime import datetime

# Data handling and numerical operations
import pandas as pd
import numpy as np

# Machine learning libraries
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit # Useful for more advanced CV, but simple split used here

# MLflow for experiment tracking
import mlflow
import mlflow.lightgbm # Specific MLflow integration for LightGBM

# Visualization (optional, but good for understanding data)
import matplotlib.pyplot as plt
import seaborn as sns

# Suppress warnings for cleaner output
warnings.filterwarnings("ignore")

# --- Data Loading Configuration ---
# Pick exactly ONE of the options below: uncomment it and adjust it so that it
# matches where the CSV files live in your Colab session.

data_base_path = None  # Directory holding the CSV files; assigned by the chosen option

# Option 1: Download from Kaggle through kagglehub (needs a configured Kaggle API key).
# Requires `!pip install kagglehub` and `!kagglehub login` in an earlier cell.
# try:
#     import kagglehub
#     data_base_path = kagglehub.competition_download('walmart-recruiting-store-sales-forecasting')
#     print(f"Data will be loaded from KaggleHub path: {data_base_path}")
# except Exception as e:
#     print(f"Warning: KaggleHub download failed. Error: {e}")
#     print("Falling back to checking other data loading options.")

# Option 2: Read from Google Drive (survives runtime resets; the drive must be mounted).
# from google.colab import drive
# drive.mount('/content/drive')
# data_base_path = '/content/drive/MyDrive/path/to/your/walmart_sales_data/'  # <--- adjust this path
# print(f"Data will be loaded from Google Drive path: {data_base_path}")

# Option 3 (active): Colab session storage, i.e. train.csv, test.csv,
# features.csv and stores.csv uploaded by hand to /content/.
# Files stored there disappear whenever the Colab runtime resets.
data_base_path = '/content/'
print(f"Data will be loaded from Colab /content/ path: {data_base_path}")

# --- Fallback when the chosen option yields no usable directory ---
# Last resort: the standard Kaggle input directory, which exists when the
# notebook runs on Kaggle itself.
kaggle_input_dir = '/kaggle/input/walmart-recruiting-store-sales-forecasting/'
configured_path_ok = data_base_path is not None and os.path.exists(data_base_path)

if not configured_path_ok:
    print("\nNo valid data path set or path does not exist from chosen options.")
    print("Please uncomment and configure one of the 'Option' blocks above to specify your data location.")
    if not os.path.exists(kaggle_input_dir):
        # Nothing left to try: abort the cell instead of failing later on read_csv.
        print("Critical: No accessible data path found. Please ensure your data files are in one of the specified locations.")
        raise FileNotFoundError("Data files not found. Please check your data loading configuration.")
    data_base_path = kaggle_input_dir
    print(f"Attempting to load from default Kaggle input path: {data_base_path}")


# --- Load DataFrames ---
# Reads the four competition CSVs from `data_base_path`. File paths are built
# with os.path.join so the directory works with or without a trailing
# separator (a plain string concatenation silently produced paths such as
# '/some/dirtrain.csv' for directories that do not end in '/').
print("\nLoading dataframes...")
try:
    df_train = pd.read_csv(os.path.join(data_base_path, 'train.csv'), parse_dates=['Date'])
    df_test = pd.read_csv(os.path.join(data_base_path, 'test.csv'), parse_dates=['Date'])
    df_features = pd.read_csv(os.path.join(data_base_path, 'features.csv'), parse_dates=['Date'])
    df_stores = pd.read_csv(os.path.join(data_base_path, 'stores.csv'))
    print("All dataframes loaded successfully.")
    # Debug output: confirm the merge keys and the holiday flag were read as expected.
    print(f"\n--- df_features head (check 'Store', 'Date', 'IsHoliday') ---")
    print(df_features[['Store', 'Date', 'IsHoliday']].head())
    print(f"Columns in df_features after loading: {df_features.columns.tolist()}")
except FileNotFoundError as e:
    print(f"Critical Error: One or more data files not found in '{data_base_path}'. Error: {e}")
    print("Please ensure the CSV files (train.csv, test.csv, features.csv, stores.csv) are directly in the specified 'data_base_path'.")
    raise  # Data loading is critical: nothing downstream can run without it
except Exception as e:
    print(f"Critical Error: An unexpected error occurred during data loading. Error: {e}")
    raise  # Surface any other unexpected error after reporting it


# --- Merge DataFrames ---
print("\nMerging dataframes...")
# Step 1: attach the per-store attributes from stores.csv to every training row.
df = pd.merge(df_train, df_stores, on='Store', how='left')

# --- Debugging Merge ---
# Diagnostics only: show both sides of the upcoming Store/Date join.
merge_keys = ['Store', 'Date']

print("\n--- Pre-merge Debugging ---")
print("df (train + stores) info:")
df.info()
print("\ndf_features info:")
df_features.info()

print("\ndf (train + stores) head for merge keys:")
print(df[merge_keys].head())
print("\ndf_features head for merge keys:")
print(df_features[merge_keys].head())

# Count how many distinct Store-Date pairs the two frames have in common.
common_store_dates_df = df[merge_keys].drop_duplicates()
common_store_dates_features = df_features[merge_keys].drop_duplicates()
overlap_count = len(
    common_store_dates_df.merge(common_store_dates_features, on=merge_keys, how='inner')
)
print(f"\nNumber of unique Store-Date pairs in df (train + stores): {len(common_store_dates_df)}")
print(f"Number of unique Store-Date pairs in df_features: {len(common_store_dates_features)}")
print(f"Number of overlapping Store-Date pairs (inner join): {overlap_count}")
if overlap_count == 0:
    print("WARNING: No overlapping Store-Date pairs found between df and df_features. This will result in many NaNs after merge.")
    print("Please check the 'Store' and 'Date' columns for consistency (e.g., data types, ranges).")


# --- Drop columns shared with features.csv before the second merge ---
# Any of these columns already present in df would otherwise come out of the
# merge as *_x / *_y pairs. They are removed from df so that the features.csv
# version is the one that survives.
common_cols_to_drop_from_df = [
    'Temperature', 'Fuel_Price', 'CPI', 'Unemployment',
    'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5',
    'IsHoliday',  # also shared; the features.csv version is kept
]
# Restrict the list to columns df actually has.
cols_to_drop_existing = [name for name in common_cols_to_drop_from_df if name in df.columns]

if not cols_to_drop_existing:
    print("No common columns to drop from df before merging with features.csv (or they don't exist).")
else:
    print(f"Dropping common columns from df (from train.csv/stores.csv) before merging with features.csv: {cols_to_drop_existing}")
    df = df.drop(columns=cols_to_drop_existing)


# Step 2: bring in the per-store, per-date features.
df = pd.merge(df, df_features, on=['Store', 'Date'], how='left')
print("Dataframes merged.")
print(f"\n--- Merged DataFrame head (check 'Store', 'Date', 'IsHoliday') ---")
# 'IsHoliday' is expected to come straight from df_features, without a suffix.
if 'IsHoliday' not in df.columns:
    print("Error: 'IsHoliday' column still not found after merge. This indicates a deeper issue with merge keys or data in features.csv.")
else:
    print(df[['Store', 'Date', 'IsHoliday']].head())

# Full column listing, to verify everything expected is present.
print(f"Columns in merged DataFrame after final merge: {df.columns.tolist()}")


# --- Handle Missing Values ---
print("\nHandling missing values...")
# Zero-fill the numeric columns that can be NaN after the left merge with
# features.csv. This is a simple baseline; consider a more careful imputation
# strategy for production use.
for col in ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']:
    if col in df.columns:
        df[col] = df[col].fillna(0)
    else:
        print(f"Warning: Column '{col}' not found in DataFrame for NaN handling.")
print("Missing values filled.")


# --- Convert IsHoliday to numerical ---
if 'IsHoliday' in df.columns:
    # Rows without a match in features.csv carry NaN here after the left merge,
    # and a plain astype(int) raises on NaN. Comparing against True maps
    # True -> 1 and both False and missing -> 0.
    df['IsHoliday'] = df['IsHoliday'].eq(True).astype(int)
    print("'IsHoliday' column converted to integer type.")
else:
    # Not expected when the merge with features.csv succeeded.
    print("Critical Warning: 'IsHoliday' column still not found in DataFrame. Adding it with default value 0.")
    df['IsHoliday'] = 0  # Fallback so downstream code can rely on the column


# --- Handle Negative Weekly_Sales values ---
# Floor sales at 0: negative weekly sales are treated as invalid here.
# clip() is vectorized, unlike the per-row Python callback it replaces.
df['Weekly_Sales'] = df['Weekly_Sales'].clip(lower=0)
print("Negative 'Weekly_Sales' values handled.")


# --- Feature Engineering ---
# Function to create time-based features
def create_features(df_input):
    """Add calendar columns derived from ``Date`` to ``df_input`` in place.

    Args:
        df_input: DataFrame with a datetime-typed ``Date`` column. It is
            modified directly, so pass a copy to keep the original untouched.

    Returns:
        The same DataFrame object, extended with the columns ``Year``,
        ``Month``, ``Week`` (ISO week number), ``Day``, ``DayOfWeek`` and
        ``DayOfYear``, appended in that order.
    """
    stamps = df_input['Date'].dt
    derived_columns = (
        ('Year', stamps.year),
        ('Month', stamps.month),
        # isocalendar() yields an unsigned nullable dtype; cast to a plain int.
        ('Week', stamps.isocalendar().week.astype(int)),
        ('Day', stamps.day),
        ('DayOfWeek', stamps.dayofweek),
        ('DayOfYear', stamps.dayofyear),
    )
    for column_name, values in derived_columns:
        df_input[column_name] = values
    return df_input

print("\nCreating time-based features...")
# create_features mutates its argument, so hand it a copy of the merged frame.
df = create_features(df.copy())
print("Time-based features created.")


# --- Define Features and Target ---
# 'Type' is categorical; it is replaced below by its one-hot encoded columns,
# which are appended after the base features.
base_features = [
    'Store', 'Dept', 'Year', 'Month', 'Week', 'Day', 'DayOfWeek', 'DayOfYear',
    'IsHoliday', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment',
    'Size',
    'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5',
]
target = 'Weekly_Sales'


# --- Convert 'Type' to numerical using one-hot encoding ---
print("\nApplying One-Hot Encoding to 'Type' column...")
df = pd.get_dummies(df, columns=['Type'], prefix='Type', drop_first=True)
# Match on the 'Type_' prefix only: a substring test would also pick up any
# unrelated column that merely contains 'Type_' somewhere in its name.
type_dummy_cols = [col for col in df.columns if col.startswith('Type_')]
features = base_features + type_dummy_cols
print("'Type' column encoded.")


# --- Final Feature Filtering ---
# Keep only the features that exist after preprocessing, and report the ones
# that had to be skipped instead of dropping them silently.
final_features = [f for f in features if f in df.columns]
skipped_features = [f for f in features if f not in df.columns]
if skipped_features:
    print(f"Warning: features missing from DataFrame and skipped: {skipped_features}")
print(f"\nFinal features used for training: {final_features}")


# --- Split Data into Training and Validation Sets (Time-Series Split) ---
# Rows dated on or before `split_date` go to training, later rows to
# validation, so the model is always validated on data from its "future".
df = df.sort_values('Date')
split_date = pd.to_datetime('2011-12-31')  # Example split date; adjust to your validation strategy

# Build each boolean mask once instead of re-filtering df for every use.
train_rows = df[df['Date'] <= split_date]
valid_rows = df[df['Date'] > split_date]

if train_rows.empty or valid_rows.empty:
    # Fail with a clear message instead of an opaque NaT formatting error below.
    raise ValueError(
        f"Split date {split_date.strftime('%Y-%m-%d')} leaves an empty train or validation set "
        f"(train rows: {len(train_rows)}, validation rows: {len(valid_rows)})."
    )

X_train = train_rows[final_features]
y_train = train_rows[target]
X_valid = valid_rows[final_features]
y_valid = valid_rows[target]

train_start = df['Date'].min().strftime('%Y-%m-%d')
train_end = train_rows['Date'].max().strftime('%Y-%m-%d')
valid_start = valid_rows['Date'].min().strftime('%Y-%m-%d')
valid_end = df['Date'].max().strftime('%Y-%m-%d')
print(f"\nTrain set size: {len(X_train)} (Dates: {train_start} to {train_end})")
print(f"Validation set size: {len(X_valid)} (Dates: {valid_start} to {valid_end})")


# --- Check for missing columns in X_train/X_valid after splitting and one-hot encoding ---
# Both sets must expose the same columns for model training.
missing_in_valid = set(X_train.columns) - set(X_valid.columns)
missing_in_train = set(X_valid.columns) - set(X_train.columns)
if missing_in_valid or missing_in_train:
    print("Warning: Mismatch in columns between train and validation sets!")
    if missing_in_valid:
        print(f"Missing in valid: {missing_in_valid}")
    if missing_in_train:
        print(f"Missing in train: {missing_in_train}")
    # Align on the shared columns while keeping X_train's column order; going
    # through a set intersection would yield an arbitrary order that can
    # differ between runs.
    common_cols = [col for col in X_train.columns if col in X_valid.columns]
    X_train = X_train[common_cols]
    X_valid = X_valid[common_cols]
    print("Columns aligned for train and validation sets.")


# --- Display basic statistics for the target variable ---
# Gives a sense of the target's scale, which helps when reading RMSE/MAE values.
sales = df['Weekly_Sales']
print("\nTarget variable ('Weekly_Sales') statistics (full dataset):")
for label, stat_name in (
    ("Mean", "mean"),
    ("Standard Deviation", "std"),
    ("Min", "min"),
    ("Max", "max"),
):
    # Each statistic is computed right before it is printed.
    print(f"{label}: {getattr(sales, stat_name)():.2f}")


In [ ]:
# --- Cell 2: MLflow Setup and LightGBM Training and Test Evaluation ---

# --- MLflow Configuration ---
# Tracking store: a local directory inside the Colab session.
# Swap in the URI of a remote MLflow server here if you use one.
tracking_uri = "file:/content/mlruns"
mlflow.set_tracking_uri(tracking_uri)

# One experiment per model architecture, as required by the project instructions.
experiment_name = "LightGBM_Training"
mlflow.set_experiment(experiment_name)

print(f"MLflow Experiment '{experiment_name}' set up.")
# Read the URI back from MLflow to show the value that is actually in effect.
print(f"MLflow Tracking URI: {mlflow.get_tracking_uri()}")

# --- Start MLflow Run and Train LightGBM Model ---
# Start an MLflow run to log the training process.
# This run will now encompass training, validation, and test evaluation.
with mlflow.start_run(run_name='LGBM_baseline_run'):
    # A single MLflow run covers training, validation scoring and the test-set
    # predictions, so parameters, metrics, model and submission file all end up
    # under the same run ID.

    # LightGBM hyperparameters (tune these for better performance).
    # Note: the objective optimises absolute error (L1) while the tracked
    # metric, and therefore early stopping, is RMSE.
    params = {
        'objective': 'regression_l1', # MAE objective: 'regression_l1', RMSE objective: 'regression_l2'
        'metric': 'rmse',             # Metric to evaluate during training
        'n_estimators': 1500,         # Number of boosting rounds (trees)
        'learning_rate': 0.03,        # Step size shrinkage
        'num_leaves': 31,             # Max number of leaves in one tree
        'max_depth': 8,               # Max tree depth
        'min_child_samples': 20,      # Minimum number of data needed in a child (leaf)
        'subsample': 0.8,             # Subsample ratio of the training instance
        'colsample_bytree': 0.8,      # Subsample ratio of columns when constructing each tree
        'random_state': 42,           # Random seed for reproducibility
        'n_jobs': -1,                 # Use all available CPU cores
        'reg_alpha': 0.1,             # L1 regularization (alpha)
        'reg_lambda': 0.1             # L2 regularization (lambda)
    }

    # Log hyperparameters to MLflow
    mlflow.log_params(params)
    print("\nLightGBM model training in progress...")

    model = lgb.LGBMRegressor(**params)

    callbacks = [
        lgb.early_stopping(100, verbose=False), # Stop if validation metric doesn't improve for 100 rounds
        lgb.log_evaluation(period=200) # Log evaluation results every 200 boosting rounds
    ]

    model.fit(X_train, y_train,
              eval_set=[(X_valid, y_valid)],
              eval_metric='rmse', # Metric monitored by early stopping
              callbacks=callbacks)

    print("LightGBM model training finished.")

    # --- Evaluation on Validation Set ---
    preds_val = model.predict(X_valid)

    val_rmse = np.sqrt(mean_squared_error(y_valid, preds_val))
    val_mae = mean_absolute_error(y_valid, preds_val)

    print(f"\nValidation RMSE: {val_rmse:.2f}")
    print(f"Validation MAE: {val_mae:.2f}")

    mlflow.log_metric("val_rmse", val_rmse)
    mlflow.log_metric("val_mae", val_mae)
    mlflow.log_metric("best_iteration", model.best_iteration_) # Best iteration found by early stopping

    # --- Save Model to MLflow ---
    # Log the trained model and register it under a fixed name for later retrieval.
    mlflow.lightgbm.log_model(model, artifact_path='model', registered_model_name="LightGBMSalesForecaster")
    print("\nModel logged to MLflow.")
    print(f"MLflow Run ID: {mlflow.active_run().info.run_id}")

    # --- Test Set Predictions ---
    # test.csv goes through the same preprocessing steps as the training data.
    print("\nPreparing test data for evaluation...")

    # Merge df_test with df_stores
    df_test_merged = df_test.merge(df_stores, on='Store', how='left')

    # Columns shared with features.csv are dropped first so that the merge
    # below keeps the features.csv version without *_x / *_y suffixes.
    common_cols_to_drop_from_test = [
        'Temperature', 'Fuel_Price', 'CPI', 'Unemployment',
        'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5',
        'IsHoliday'
    ]
    cols_to_drop_existing_test = [col for col in common_cols_to_drop_from_test if col in df_test_merged.columns]

    if cols_to_drop_existing_test:
        print(f"Dropping common columns from df_test (from test.csv/stores.csv) before merging with features.csv: {cols_to_drop_existing_test}")
        df_test_merged = df_test_merged.drop(columns=cols_to_drop_existing_test)
    else:
        print("No common columns to drop from df_test_merged before merging with features.csv (or they don't exist).")

    # Merge df_test_merged with df_features
    df_test_final = df_test_merged.merge(df_features, on=['Store', 'Date'], how='left')

    # Zero-fill the numeric feature columns, consistent with training.
    print("Handling missing values in test data...")
    for col in ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']:
        if col in df_test_final.columns:
            df_test_final[col] = df_test_final[col].fillna(0)
        else:
            print(f"Warning: Column '{col}' not found in df_test_final for NaN handling.")
    print("Missing values in test data filled.")

    # Convert IsHoliday to numerical in df_test_final
    if 'IsHoliday' in df_test_final.columns:
        # Unmatched rows of the left merge hold NaN, on which astype(int)
        # raises; eq(True) maps True -> 1 and both False and missing -> 0.
        df_test_final['IsHoliday'] = df_test_final['IsHoliday'].eq(True).astype(int)
    else:
        print("Warning: 'IsHoliday' column not found in df_test_final. Adding it with default value 0.")
        df_test_final['IsHoliday'] = 0

    # Apply feature engineering to df_test_final
    print("Creating time-based features for test data...")
    df_test_final = create_features(df_test_final.copy())
    print("Time-based features for test data created.")

    # One-hot encode 'Type' in the test data.
    print("Applying One-Hot Encoding to 'Type' column in test data...")
    if 'Type' in df_test_final.columns:
        # Encode every category here (no drop_first). Which dummy gets dropped
        # must be decided by the training columns, not by whichever categories
        # happen to appear in the test data; the reindex below keeps exactly
        # the dummy columns the model was trained on.
        df_test_final = pd.get_dummies(df_test_final, columns=['Type'], prefix='Type')
    else:
        print("Warning: 'Type' column not found in df_test_final. Skipping One-Hot Encoding for 'Type'.")
    print("Test data preprocessing complete.")

    # Align test columns (set and order) with X_train; columns absent from the
    # test frame are created and filled with 0.
    X_test = df_test_final.reindex(columns=X_train.columns, fill_value=0)

    # Ensure no NaNs remain in X_test before prediction (after reindexing)
    if X_test.isnull().values.any():
        print("Warning: NaNs found in X_test after alignment. Filling with 0.")
        X_test = X_test.fillna(0)

    print(f"Final Test set size for prediction: {len(X_test)}")
    print(f"Test set features (first 5 rows):\n{X_test.head()}")

    print("\nEvaluating on test set...")
    preds_test = model.predict(X_test)

    # Build the submission: Id is "<Store>_<Dept>_<YYYY-MM-DD>".
    submission_df = pd.DataFrame({'Id': df_test['Store'].astype(str) + '_' + df_test['Dept'].astype(str) + '_' + df_test['Date'].dt.strftime('%Y-%m-%d'),
                                  'Weekly_Sales': preds_test})

    # Floor predictions at 0 so the submission contains no negative sales
    # (vectorized clip instead of a per-row Python callback).
    submission_df['Weekly_Sales'] = submission_df['Weekly_Sales'].clip(lower=0)

    # Save submission file
    submission_file_path = 'submission_lightgbm.csv'
    submission_df.to_csv(submission_file_path, index=False)
    print(f"\nSubmission file '{submission_file_path}' created successfully.")

    # Log submission file as an MLflow artifact
    mlflow.log_artifact(submission_file_path)
    print(f"Submission file logged as MLflow artifact.")

print("\nLightGBM experiment finished. Check MLflow UI for full results and submission file.")


In [ ]:
# --- Cell 3: MLflow UI with ngrok (Optional) ---
#
# The ngrok auth token is a secret and must never be stored in the notebook.
# Provide it through the NGROK_AUTH_TOKEN environment variable, or type it at
# the hidden prompt this cell shows when the variable is not set.
# Get your token from https://dashboard.ngrok.com/get-started/your-authtoken
#
# NOTE(review): earlier revisions of this cell contained a hard-coded token;
# treat that token as leaked and revoke it in the ngrok dashboard.

import os
import time  # used for the short pause after killing old tunnels
from getpass import getpass

from pyngrok import ngrok, conf

# Kill any existing ngrok tunnels to ensure a clean start (best effort).
try:
    ngrok.kill()
    print("Terminated any existing ngrok tunnels.")
    time.sleep(2)  # Small delay so the old processes are fully terminated
except Exception as e:
    print(f"Could not terminate existing ngrok tunnels (might not exist): {e}")

# Obtain the token without echoing or printing it.
NGROK_AUTH_TOKEN = os.environ.get("NGROK_AUTH_TOKEN", "").strip()
if not NGROK_AUTH_TOKEN:
    NGROK_AUTH_TOKEN = getpass("Enter your ngrok auth token: ").strip()

# Validate outside any broad try/except so a missing token really stops the
# cell instead of being swallowed and followed by a tunnel attempt.
if not NGROK_AUTH_TOKEN or NGROK_AUTH_TOKEN == "YOUR_NGROK_AUTH_TOKEN":
    print("CRITICAL: ngrok auth token is missing. Set the NGROK_AUTH_TOKEN environment variable or enter it at the prompt.")
    raise ValueError("ngrok authentication token not set correctly.")

conf.get_default().auth_token = NGROK_AUTH_TOKEN
print("ngrok authentication token set.")


# Run the MLflow UI in the background on port 5000.
# The backend store is passed explicitly so the UI reads the store the
# training cell wrote to (file:/content/mlruns), whatever the working directory.
get_ipython().system_raw("mlflow ui --backend-store-uri file:/content/mlruns --port 5000 &")

# Create an ngrok tunnel to expose the MLflow UI
print("Creating ngrok tunnel for MLflow UI...")
try:
    public_url = ngrok.connect(addr="5000", proto="http")
    print(f"MLflow UI is available at: {public_url}")
    print("Click the link above to access the MLflow UI in your browser.")
except Exception as e:
    print(f"Error creating ngrok tunnel: {e}")
    print("Please ensure ngrok is installed and your auth token is correct.")
    print("If the error persists, check your ngrok dashboard (https://dashboard.ngrok.com/agents) and manually kill any active sessions.")
