In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Install required packages
!pip install wandb torch torchvision pandas numpy matplotlib seaborn scikit-learn

# Set up Kaggle API
!pip install kaggle

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [3]:

# Upload your kaggle.json to Colab and run:
!mkdir -p ~/.kaggle
!cp /content/drive/MyDrive/ColabNotebooks/kaggle_API_credentials/kaggle.json ~/.kaggle/kaggle.json
! chmod 600 ~/.kaggle/kaggle.json

In [4]:

# Download the dataset
!kaggle competitions download -c walmart-recruiting-store-sales-forecasting
!unzip -q walmart-recruiting-store-sales-forecasting.zip

Downloading walmart-recruiting-store-sales-forecasting.zip to /content
  0% 0.00/2.70M [00:00<?, ?B/s]
100% 2.70M/2.70M [00:00<00:00, 670MB/s]


In [5]:
!unzip -q train.csv.zip
!unzip -q stores.csv.zip
!unzip -q test.csv.zip
!unzip -q features.csv.zip

unzip:  cannot find or open stores.csv.zip, stores.csv.zip.zip or stores.csv.zip.ZIP.


In [7]:
!pip install mlflow

Collecting mlflow
  Downloading mlflow-3.1.0-py3-none-any.whl.metadata (29 kB)
Collecting mlflow-skinny==3.1.0 (from mlflow)
  Downloading mlflow_skinny-3.1.0-py3-none-any.whl.metadata (30 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.16.2-py3-none-any.whl.metadata (7.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==3.1.0->mlflow)
  Downloading databricks_sdk-0.57.0-py3-none-any.whl.metadata (39 kB)
Collecting opentelemetry-api<3,>=1.9.0 (from mlflow-skinny==3.1.0->mlflow)
  Downloading opentelemetry_api-1.34.1-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-sdk<3,>=1.9.0 (from mlflow-skinny==3.1.0->mlflow)
  Downloading opentele

In [11]:
# model_experiment_xgboost.ipynb
# Walmart Sales Forecasting - XGBoost Model

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# ML Libraries
import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline

# Experiment Tracking
import mlflow
import mlflow.xgboost
import mlflow.sklearn

# Utilities
import pickle
import joblib
from datetime import datetime, timedelta
import gc

# Configuration
mlflow.set_experiment("XGBoost_Training")

class WalmartXGBoostExperiment:
    def __init__(self):
        self.data = None
        self.train_data = None
        self.test_data = None
        self.features = None
        self.target = 'Weekly_Sales'
        self.model = None
        self.pipeline = None

    def load_data(self):
        """მონაცემების ჩატვირთვა"""
        with mlflow.start_run(run_name="XGBoost_Data_Loading"):
            # Load datasets
            train_df = pd.read_csv('/content/train.csv')
            stores_df = pd.read_csv('/content/stores.csv')
            features_df = pd.read_csv('/content/features.csv')
            test_df = pd.read_csv('/content/test.csv')

            # Log data info
            mlflow.log_param("train_shape", train_df.shape)
            mlflow.log_param("test_shape", test_df.shape)
            mlflow.log_param("stores_count", stores_df.shape[0])

            # Merge datasets
            train_merged = train_df.merge(stores_df, on='Store')
            train_merged = train_merged.merge(features_df, on=['Store', 'Date'], how='left')

            test_merged = test_df.merge(stores_df, on='Store')
            test_merged = test_merged.merge(features_df, on=['Store', 'Date'], how='left')

            self.train_data = train_merged
            self.test_data = test_merged

            print(f"Training data shape: {self.train_data.shape}")
            print(f"Test data shape: {self.test_data.shape}")

    def data_cleaning(self):
        """მონაცემების გაწმენდა"""
        with mlflow.start_run(run_name="XGBoost_Cleaning"):
            # Handle missing values
            missing_before = self.train_data.isnull().sum().sum()

            # Fill missing values in features
            numeric_cols = self.train_data.select_dtypes(include=[np.number]).columns
            for col in numeric_cols:
                if col != self.target:
                    self.train_data[col] = self.train_data[col].fillna(
                        self.train_data.groupby(['Store', 'Dept'])[col].transform('median')
                    )
                    self.test_data[col] = self.test_data[col].fillna(
                        self.test_data.groupby(['Store', 'Dept'])[col].transform('median')
                    )

            # Handle categorical variables
            categorical_cols = ['Type']
            for col in categorical_cols:
                if col in self.train_data.columns:
                    le = LabelEncoder()
                    self.train_data[col] = le.fit_transform(self.train_data[col].astype(str))
                    # Handle unseen labels in test
                    self.test_data[col] = self.test_data[col].map(lambda s: le.transform([s])[0] if s in le.classes_ else -1)


            missing_after = self.train_data.isnull().sum().sum()

            mlflow.log_param("missing_values_before", missing_before)
            mlflow.log_param("missing_values_after", missing_after)

            print(f"Missing values before cleaning: {missing_before}")
            print(f"Missing values after cleaning: {missing_after}")

    def feature_engineering(self):
        """ფიჩერების შექმნა"""
        with mlflow.start_run(run_name="XGBoost_Feature_Engineering"):

            def create_features(df):
                # Convert Date to datetime
                df['Date'] = pd.to_datetime(df['Date'])

                # Time-based features
                df['Year'] = df['Date'].dt.year
                df['Month'] = df['Date'].dt.month
                df['Week'] = df['Date'].dt.isocalendar().week
                df['Day'] = df['Date'].dt.day
                df['DayOfWeek'] = df['Date'].dt.dayofweek
                df['Quarter'] = df['Date'].dt.quarter

                # Holiday indicator
                if 'IsHoliday' in df.columns:
                  df['IsHoliday'] = df['IsHoliday'].astype(int)


                # Sort by Store, Dept, Date for lag features
                df = df.sort_values(['Store', 'Dept', 'Date'])

                # Lag features (only for training data with target)
                if 'Weekly_Sales' in df.columns:
                    for lag in [1, 2, 4, 8, 12, 52]:
                        df[f'Sales_Lag_{lag}'] = df.groupby(['Store', 'Dept'])['Weekly_Sales'].shift(lag)

                    # Rolling statistics
                    for window in [4, 8, 12, 26]:
                        df[f'Sales_Mean_{window}'] = df.groupby(['Store', 'Dept'])['Weekly_Sales'].rolling(window=window).mean().reset_index(0, drop=True)
                        df[f'Sales_Std_{window}'] = df.groupby(['Store', 'Dept'])['Weekly_Sales'].rolling(window=window).std().reset_index(0, drop=True)

                # Store-Department statistics
                if 'Weekly_Sales' in df.columns:
                    store_dept_stats = df.groupby(['Store', 'Dept'])['Weekly_Sales'].agg([
                        'mean', 'std', 'min', 'max', 'median'
                    ]).reset_index()
                    store_dept_stats.columns = ['Store', 'Dept', 'StoreDept_Mean', 'StoreDept_Std',
                                              'StoreDept_Min', 'StoreDept_Max', 'StoreDept_Median']
                    df = df.merge(store_dept_stats, on=['Store', 'Dept'], how='left')

                # Price features
                if 'MarkDown1' in df.columns:
                    markdown_cols = [col for col in df.columns if 'MarkDown' in col]
                    df['Total_MarkDown'] = df[markdown_cols].sum(axis=1)
                    df['MarkDown_Count'] = (df[markdown_cols] > 0).sum(axis=1)

                return df

            # Apply feature engineering
            self.train_data = create_features(self.train_data)
            self.test_data = create_features(self.test_data)

            # Remove rows with NaN in lag features (first few weeks)
            self.train_data = self.train_data.dropna()

            # Feature selection
            exclude_cols = ['Date', 'Weekly_Sales', 'Store', 'Dept']
            self.features = [col for col in self.train_data.columns if col not in exclude_cols]

            mlflow.log_param("num_features", len(self.features))
            mlflow.log_param("features_list", self.features)

            print(f"Number of features: {len(self.features)}")
            print(f"Training data shape after feature engineering: {self.train_data.shape}")

    def create_pipeline(self):
        """Pipeline-ის შექმნა"""
        with mlflow.start_run(run_name="XGBoost_Pipeline_Creation"):

            # XGBoost model
            xgb_model = xgb.XGBRegressor(
                n_estimators=1000,
                max_depth=6,
                learning_rate=0.1,
                subsample=0.8,
                colsample_bytree=0.8,
                random_state=42,
                n_jobs=-1
            )

            # Create pipeline
            self.pipeline = Pipeline([
                ('scaler', StandardScaler()),
                ('model', xgb_model)
            ])

            mlflow.log_param("model_type", "XGBoost")
            mlflow.log_params(xgb_model.get_params())

    def wmae_score(self, y_true, y_pred, weights):
        """WMAE მეტრიკის გამოთვლა"""
        return np.sum(weights * np.abs(y_true - y_pred)) / np.sum(weights)

    def time_series_cv(self):
        """Time Series Cross Validation"""
        with mlflow.start_run(run_name="XGBoost_Cross_Validation"):

            X = self.train_data[self.features]
            y = self.train_data[self.target]

            # Create weights (5 for holiday weeks, 1 otherwise)
            weights = np.where(self.train_data['IsHoliday'] == 1, 5, 1)

            # Time series split
            tscv = TimeSeriesSplit(n_splits=5)

            cv_scores = []
            wmae_scores = []

            for fold, (train_idx, val_idx) in enumerate(tscv.split(X)):
                print(f"Fold {fold + 1}/5")

                X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
                y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]
                weights_train, weights_val = weights[train_idx], weights[val_idx]

                # Train model
                self.pipeline.fit(X_train_fold, y_train_fold)

                # Predict
                y_pred_val = self.pipeline.predict(X_val_fold)

                # Calculate metrics
                mae = mean_absolute_error(y_val_fold, y_pred_val)
                wmae = self.wmae_score(y_val_fold, y_pred_val, weights_val)

                cv_scores.append(mae)
                wmae_scores.append(wmae)

                mlflow.log_metric(f"fold_{fold+1}_mae", mae)
                mlflow.log_metric(f"fold_{fold+1}_wmae", wmae)

            avg_mae = np.mean(cv_scores)
            avg_wmae = np.mean(wmae_scores)

            mlflow.log_metric("cv_mae_mean", avg_mae)
            mlflow.log_metric("cv_wmae_mean", avg_wmae)
            mlflow.log_metric("cv_mae_std", np.std(cv_scores))
            mlflow.log_metric("cv_wmae_std", np.std(wmae_scores))

            print(f"Cross-validation MAE: {avg_mae:.4f} ± {np.std(cv_scores):.4f}")
            print(f"Cross-validation WMAE: {avg_wmae:.4f} ± {np.std(wmae_scores):.4f}")

            return avg_wmae

    def hyperparameter_tuning(self):
        """ჰიპერპარამეტრების ოპტიმიზაცია"""
        with mlflow.start_run(run_name="XGBoost_Hyperparameter_Tuning"):

            from sklearn.model_selection import RandomizedSearchCV

            # Parameter grid
            param_dist = {
                'model__n_estimators': [500, 1000, 1500],
                'model__max_depth': [4, 6, 8, 10],
                'model__learning_rate': [0.05, 0.1, 0.15, 0.2],
                'model__subsample': [0.7, 0.8, 0.9],
                'model__colsample_bytree': [0.7, 0.8, 0.9]
            }

            X = self.train_data[self.features]
            y = self.train_data[self.target]

            # Custom scorer for WMAE
            def wmae_scorer(estimator, X, y):
                y_pred = estimator.predict(X)
                # Assume all weights are 1 for simplicity in grid search
                weights = np.ones(len(y))
                return -self.wmae_score(y, y_pred, weights)  # Negative because sklearn maximizes

            # Randomized search
            random_search = RandomizedSearchCV(
                self.pipeline,
                param_distributions=param_dist,
                n_iter=20,
                cv=TimeSeriesSplit(n_splits=3),
                scoring=wmae_scorer,
                n_jobs=-1,
                random_state=42,
                verbose=1
            )

            random_search.fit(X, y)

            # Log best parameters
            best_params = random_search.best_params_
            mlflow.log_params(best_params)
            mlflow.log_metric("best_cv_score", -random_search.best_score_)

            # Update pipeline with best parameters
            self.pipeline = random_search.best_estimator_

            print(f"Best parameters: {best_params}")
            print(f"Best CV score: {-random_search.best_score_:.4f}")

    def final_training(self):
        """საბოლოო მოდელის ტრენინგი"""
        with mlflow.start_run(run_name="XGBoost_Final_Training"):

            X = self.train_data[self.features]
            y = self.train_data[self.target]

            # Train final model
            self.pipeline.fit(X, y)

            # Feature importance
            feature_importance = self.pipeline.named_steps['model'].feature_importances_
            feature_names = self.features

            # Log feature importance
            for name, importance in zip(feature_names, feature_importance):
                mlflow.log_metric(f"feature_importance_{name}", importance)

            # Save model
            mlflow.sklearn.log_model(
                self.pipeline,
                "xgboost_model",
                registered_model_name="WalmartSales_XGBoost"
            )

            # Save feature names
            with open("features.pkl", "wb") as f:
                pickle.dump(self.features, f)
            mlflow.log_artifact("features.pkl")


            print("Model training completed and saved to MLflow")

    def generate_predictions(self):
        """ტესტ სეტზე პროგნოზების გენერირება"""
        with mlflow.start_run(run_name="XGBoost_Test_Predictions"):

            # Make predictions
            X_test = self.test_data[self.features]
            predictions = self.pipeline.predict(X_test)

            # Create submission file
            submission = pd.DataFrame({
                'Id': self.test_data['Store'].astype(str) + '_' +
                     self.test_data['Dept'].astype(str) + '_' +
                     self.test_data['Date'].astype(str),
                'Weekly_Sales': predictions
            })

            submission.to_csv('xgboost_submission.csv', index=False)
            mlflow.log_artifact('xgboost_submission.csv')

            print(f"Predictions generated for {len(predictions)} test samples")
            print("Submission file saved as 'xgboost_submission.csv'")

    def run_full_experiment(self):
        """სრული ექსპერიმენტის გაშვება"""
        print("Starting Walmart XGBoost Experiment...")

        # Step 1: Load data
        print("\n1. Loading data...")
        self.load_data()

        # Step 2: Data cleaning
        print("\n2. Data cleaning...")
        self.data_cleaning()

        # Step 3: Feature engineering
        print("\n3. Feature engineering...")
        self.feature_engineering()

        # Step 4: Create pipeline
        print("\n4. Creating pipeline...")
        self.create_pipeline()

        # Step 5: Cross validation
        print("\n5. Cross validation...")
        cv_score = self.time_series_cv()

        # Step 6: Hyperparameter tuning
        print("\n6. Hyperparameter tuning...")
        self.hyperparameter_tuning()

        # Step 7: Final training
        print("\n7. Final training...")
        self.final_training()

        # Step 8: Generate predictions
        print("\n8. Generating predictions...")
        self.generate_predictions()

        print("\nXGBoost Experiment completed successfully!")
        print(f"Final CV WMAE score: {cv_score:.4f}")

# Main execution
if __name__ == "__main__":
    # Initialize experiment
    experiment = WalmartXGBoostExperiment()

    # Run full experiment
    experiment.run_full_experiment()

    # Additional analysis and visualizations
    print("\n" + "="*50)
    print("ADDITIONAL ANALYSIS")
    print("="*50)

    # Feature importance visualization
    if experiment.pipeline is not None:
        feature_importance = experiment.pipeline.named_steps['model'].feature_importances_
        feature_names = experiment.features

        # Create feature importance dataframe
        importance_df = pd.DataFrame({
            'feature': feature_names,
            'importance': feature_importance
        }).sort_values('importance', ascending=False)

        # Plot top 20 features
        plt.figure(figsize=(12, 8))
        top_features = importance_df.head(20)
        plt.barh(range(len(top_features)), top_features['importance'])
        plt.yticks(range(len(top_features)), top_features['feature'])
        plt.xlabel('Feature Importance')
        plt.title('Top 20 Most Important Features - XGBoost')
        plt.gca().invert_yaxis()
        plt.tight_layout()
        plt.savefig('xgboost_feature_importance.png', dpi=300, bbox_inches='tight')
        plt.show()

        print("Feature importance analysis completed!")

    # Memory cleanup
    gc.collect()
    print("\nMemory cleaned up. Experiment finished!")

# Sample configuration for different experiments
EXPERIMENT_CONFIGS = {
    'baseline': {
        'n_estimators': 500,
        'max_depth': 6,
        'learning_rate': 0.1,
        'subsample': 0.8,
        'colsample_bytree': 0.8
    },
    'deep_trees': {
        'n_estimators': 1000,
        'max_depth': 10,
        'learning_rate': 0.05,
        'subsample': 0.8,
        'colsample_bytree': 0.8
    },
    'regularized': {
        'n_estimators': 1500,
        'max_depth': 4,
        'learning_rate': 0.1,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'reg_alpha': 1,
        'reg_lambda': 1
    }
}



Starting Walmart XGBoost Experiment...

1. Loading data...
Training data shape: (421570, 17)
Test data shape: (115064, 16)

2. Data cleaning...
Missing values before cleaning: 1422431
Missing values after cleaning: 4272

3. Feature engineering...


TypeError: incompatible index of inserted column with frame index

In [None]:
# Usage example for different configurations:
# experiment = WalmartXGBoostExperiment()
# experiment.load_data()
# experiment.data_cleaning()
# experiment.feature_engineering()
#
# # Test different configurations
# for config_name, params in EXPERIMENT_CONFIGS.items():
#     print(f"\n\nTesting configuration: {config_name}")
#     experiment.pipeline = Pipeline([
#         ('scaler', StandardScaler()),
#         ('model', xgb.XGBRegressor(**params, random_state=42, n_jobs=-1))
#     ])
#     cv_score = experiment.time_series_cv()
#     print(f"{config_name} CV WMAE: {cv_score:.4f}")

print("XGBoost experiment template ready!")