In [44]:
import mlflow.xgboost
import warnings
import dagshub
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import xgboost as xgb
import mlflow
import mlflow.xgboost
from sklearn.metrics import mean_squared_error, r2_score
import lightgbm as lgb
import catboost as cb
from sklearn.ensemble import HistGradientBoostingRegressor, ExtraTreesRegressor, RandomForestRegressor

# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

# DagsHub repository that hosts the MLflow tracking server for this notebook.
DAGSHUB_OWNER = 'qetibakh'
DAGSHUB_REPO = 'Final'

dagshub.init(repo_owner=DAGSHUB_OWNER, repo_name=DAGSHUB_REPO, mlflow=True)
mlflow.set_tracking_uri(f'https://dagshub.com/{DAGSHUB_OWNER}/{DAGSHUB_REPO}.mlflow')

In [46]:
# Read the cleaned training table and convert its 'Date' column to datetimes.
DATA_PATH = './data/Clean_training.csv'
df = pd.read_csv(DATA_PATH).assign(Date=lambda frame: pd.to_datetime(frame['Date']))

In [47]:
# Every column except the target and the date is used as a model feature.
target_col = 'Target'
non_feature_cols = {target_col, 'Date'}
feature_cols = [column for column in df.columns if column not in non_feature_cols]

X = df.loc[:, feature_cols]
y = df.loc[:, target_col]

In [48]:
# Chronological split relative to the latest date in the data:
#   test       -> the final 6 months
#   validation -> the 6 months before that
#   train      -> everything up to 12 months before the latest date
latest_date = df['Date'].max()
val_cutoff = latest_date - pd.DateOffset(months=12)
test_cutoff = latest_date - pd.DateOffset(months=6)

dates = df['Date']
train_mask = dates.le(val_cutoff)
val_mask = dates.gt(val_cutoff) & dates.le(test_cutoff)
test_mask = dates.gt(test_cutoff)

X_train, X_val, X_test = X.loc[train_mask], X.loc[val_mask], X.loc[test_mask]
y_train, y_val, y_test = y.loc[train_mask], y.loc[val_mask], y.loc[test_mask]

# Full-row versions of each split, handed to WMAE as its `dataset` argument.
train_df, val_df, test_df = df.loc[train_mask], df.loc[val_mask], df.loc[test_mask]

print(f"Training samples: {len(X_train)}")
print(f"Validation samples: {len(X_val)}")
print(f"Test samples: {len(X_test)}")

Training samples: 264220
Validation samples: 77493
Test samples: 79857


In [49]:
def WMAE(dataset, real, predicted):
    """
    Weighted mean absolute error, rounded to two decimals.

    Rows whose ``dataset['IsHoliday']`` value is truthy get weight 5; every
    other row gets weight 1.

    The three inputs are matched **by position**, not by pandas index label.
    Mixing label-indexed Series with different indexes would otherwise make
    pandas align on labels, fill the gaps with NaN and silently drop them
    from the sum, yielding a wrong (often 0.0) score.

    Parameters
    ----------
    dataset : mapping / DataFrame with an 'IsHoliday' column, one entry per row.
    real : array-like of actual target values.
    predicted : array-like of model predictions.

    Returns
    -------
    float
        sum(w * |real - predicted|) / sum(w), rounded to 2 decimals;
        NaN when the inputs are empty.

    Raises
    ------
    ValueError
        If the three inputs do not have the same number of rows.
    """
    holiday_flags = np.asarray(dataset['IsHoliday']).astype(bool)
    weights = np.where(holiday_flags, 5, 1)

    # Strip any pandas index so the arithmetic below is purely positional.
    actual = np.asarray(real, dtype=float).ravel()
    forecast = np.asarray(predicted, dtype=float).ravel()

    if not (len(weights) == len(actual) == len(forecast)):
        raise ValueError(
            "WMAE inputs must have the same length: "
            f"dataset={len(weights)}, real={len(actual)}, predicted={len(forecast)}"
        )

    total_weight = weights.sum()
    if total_weight == 0:
        # Only reachable for empty input (every weight is at least 1).
        return np.nan

    return np.round(np.sum(weights * np.abs(actual - forecast)) / total_weight, 2)

In [58]:
def comprehensive_model_evaluation(name, model, X_train, y_train, X_val, y_val, X_test, y_test,
                                 train_df, val_df, test_df):
    """
    Fit ``model`` on the training split and record the result as one MLflow run.

    The model is fitted with the hyperparameters it was constructed with
    (no hyperparameter optimization happens here).  Inside a run named
    ``"<name>_run"`` the function logs:

    * RMSE, WMAE and R^2 for the train, validation and test splits,
    * ``model.get_params()`` when the model exposes ``get_params``,
    * the fitted model under the artifact path ``"<name>_model"``,
    * ``"<name>_predictions.csv"`` (actual, predicted, origin) and
      ``"<name>_prediction_plot.png"`` (actual-vs-predicted scatter).

    Both files are written to the current working directory before being
    logged as artifacts.  ``<name>`` is ``name`` with surrounding whitespace
    stripped.

    Parameters
    ----------
    name : str
        Display label for the model; used for the run name and file names.
    model : estimator
        Object providing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train, X_val, y_val, X_test, y_test
        Feature matrices and targets for the three splits.
    train_df, val_df, test_df
        Passed to ``WMAE`` as its ``dataset`` argument for the matching split.

    Returns
    -------
    dict
        Keys ``model_name``, ``val_rmse``, ``test_rmse``, ``val_wmae``,
        ``test_wmae``, ``val_r2`` and ``test_r2``.
    """
    run_label = name.strip()

    with mlflow.start_run(run_name=f"{run_label}_run"):
        # Fit model and make predictions
        model.fit(X_train, y_train)
        y_pred_train = model.predict(X_train)
        y_pred_val = model.predict(X_val)
        y_pred_test = model.predict(X_test)

        # Calculate metrics
        metrics = {
            'train_rmse': np.sqrt(mean_squared_error(y_train, y_pred_train)),
            'val_rmse': np.sqrt(mean_squared_error(y_val, y_pred_val)),
            'test_rmse': np.sqrt(mean_squared_error(y_test, y_pred_test)),
            'train_wmae': WMAE(train_df, y_train, y_pred_train),
            'val_wmae': WMAE(val_df, y_val, y_pred_val),
            'test_wmae': WMAE(test_df, y_test, y_pred_test),
            'train_r2': r2_score(y_train, y_pred_train),
            'val_r2': r2_score(y_val, y_pred_val),
            'test_r2': r2_score(y_test, y_pred_test)
        }
        mlflow.log_metrics(metrics)

        # Log model parameters
        if hasattr(model, 'get_params'):
            mlflow.log_params(model.get_params())

        # Log the model itself with the flavor that matches its library.
        # The flavor is derived from the estimator's class, not from `name`,
        # so a label such as "LightGBM" or "xgb_tuned" still gets the native flavor.
        model_artifact_path = f"{run_label}_model"
        flavor = _native_mlflow_flavor(model)
        if flavor == 'xgboost':
            mlflow.xgboost.log_model(model, model_artifact_path)
        elif flavor == 'lightgbm':
            mlflow.lightgbm.log_model(model, model_artifact_path)
        elif flavor == 'catboost':
            mlflow.catboost.log_model(model, model_artifact_path)
        else:
            mlflow.sklearn.log_model(model, model_artifact_path)

        # Long-format table of actual vs. predicted values, tagged by split.
        train_preds_df = pd.DataFrame({'actual': y_train, 'predicted': y_pred_train, 'origin': 'train'})
        val_preds_df = pd.DataFrame({'actual': y_val, 'predicted': y_pred_val, 'origin': 'validation'})
        test_preds_df = pd.DataFrame({'actual': y_test, 'predicted': y_pred_test, 'origin': 'test'})

        predictions_df = pd.concat([train_preds_df, val_preds_df, test_preds_df])
        predictions_path = f"{run_label}_predictions.csv"
        predictions_df.to_csv(predictions_path, index=False)
        mlflow.log_artifact(predictions_path)

        # Scatter plot of predictions vs. actuals, saved to disk and logged.
        plot_path = f"{run_label}_prediction_plot.png"
        fig, ax = plt.subplots(figsize=(10, 10))
        try:
            sns.scatterplot(data=predictions_df, x='actual', y='predicted', hue='origin', alpha=0.5, ax=ax)

            # The y = x reference line must span every plotted point, not only
            # the test targets, because the scatter contains all three splits.
            plotted_values = predictions_df[['actual', 'predicted']].to_numpy(dtype=float)
            low, high = np.nanmin(plotted_values), np.nanmax(plotted_values)
            ax.plot([low, high], [low, high], color='red', linestyle='--', lw=2)

            ax.set_title(f'{run_label} - Actual vs. Predicted')
            ax.set_xlabel('Actual Values')
            ax.set_ylabel('Predicted Values')
            ax.grid(True)
            fig.savefig(plot_path)
        finally:
            # Always release the figure so it is neither displayed in the
            # notebook nor kept in memory when plotting fails.
            plt.close(fig)
        mlflow.log_artifact(plot_path)

        print(f"{name} - Val RMSE: {metrics['val_rmse']:.4f}, Val WMAE: {metrics['val_wmae']:.2f}, Test RMSE: {metrics['test_rmse']:.4f}, Test WMAE: {metrics['test_wmae']:.2f}")

        return {
            'model_name': run_label, 'val_rmse': metrics['val_rmse'], 'test_rmse': metrics['test_rmse'],
            'val_wmae': metrics['val_wmae'], 'test_wmae': metrics['test_wmae'], 'val_r2': metrics['val_r2'], 'test_r2': metrics['test_r2']
        }


def _native_mlflow_flavor(model):
    """Return 'xgboost', 'lightgbm' or 'catboost' if the model's class (or a base class) comes from that package, else 'sklearn'."""
    for klass in type(model).__mro__:
        root_package = (getattr(klass, '__module__', '') or '').split('.')[0]
        if root_package in ('xgboost', 'lightgbm', 'catboost'):
            return root_package
    return 'sklearn'


In [59]:
mlflow.set_experiment("testing_treebased_models")

# Candidate regressors, keyed by the label used for their MLflow run.
# Evaluation order follows the insertion order below.
models = dict(
    LGBM=lgb.LGBMRegressor(random_state=0, verbose=-1),
    XGBoost=xgb.XGBRegressor(random_state=0, objective='reg:squarederror'),
    CatBoost=cb.CatBoostRegressor(random_state=0, verbose=False),
    HGBR=HistGradientBoostingRegressor(random_state=0),
    ExtraTrees=ExtraTreesRegressor(bootstrap=True, random_state=0),
    RandomForest=RandomForestRegressor(random_state=0),
)

print("\nStarting comprehensive model evaluation with MLflow logging...")
print("="*80)

# The same splits are passed to every model, in the order the evaluation
# function expects them.
split_args = (
    X_train, y_train,
    X_val, y_val,
    X_test, y_test,
    train_df, val_df, test_df,
)

results = []
for name, model in models.items():
    result = comprehensive_model_evaluation(name, model, *split_args)
    results.append(result)


Starting comprehensive model evaluation with MLflow logging...




LGBM - Val RMSE: 10671.3733, Val WMAE: 6396.49, Test RMSE: 8699.8937, Test WMAE: 5638.97
🏃 View run LGBM_run at: https://dagshub.com/qetibakh/Final.mlflow/#/experiments/3/runs/34452289960e4ce49e3fb1489ddaf381
🧪 View experiment at: https://dagshub.com/qetibakh/Final.mlflow/#/experiments/3




XGBoost - Val RMSE: 9451.0222, Val WMAE: 5281.80, Test RMSE: 7527.1744, Test WMAE: 4609.96
🏃 View run XGBoost_run at: https://dagshub.com/qetibakh/Final.mlflow/#/experiments/3/runs/983e007030e545fcbde8b152633c7230
🧪 View experiment at: https://dagshub.com/qetibakh/Final.mlflow/#/experiments/3




CatBoost - Val RMSE: 9690.5881, Val WMAE: 5684.23, Test RMSE: 7609.6186, Test WMAE: 4880.81
🏃 View run CatBoost_run at: https://dagshub.com/qetibakh/Final.mlflow/#/experiments/3/runs/66e64764101d4e4498efa82508b409bb
🧪 View experiment at: https://dagshub.com/qetibakh/Final.mlflow/#/experiments/3




HGBR - Val RMSE: 10676.7914, Val WMAE: 6341.99, Test RMSE: 8598.8361, Test WMAE: 5450.53
🏃 View run HGBR_run at: https://dagshub.com/qetibakh/Final.mlflow/#/experiments/3/runs/c04e478a0c17460085957da4186d2a15
🧪 View experiment at: https://dagshub.com/qetibakh/Final.mlflow/#/experiments/3




ExtraTrees - Val RMSE: 7694.9019, Val WMAE: 3635.90, Test RMSE: 6676.5239, Test WMAE: 3477.52
🏃 View run ExtraTrees_run at: https://dagshub.com/qetibakh/Final.mlflow/#/experiments/3/runs/d355171eaaaa40c4b7538179dc1695fd
🧪 View experiment at: https://dagshub.com/qetibakh/Final.mlflow/#/experiments/3




RandomForest - Val RMSE: 11074.7647, Val WMAE: 4172.74, Test RMSE: 9389.3332, Test WMAE: 4365.94
🏃 View run RandomForest_run at: https://dagshub.com/qetibakh/Final.mlflow/#/experiments/3/runs/fde6ebae905649b18ef7abecba81d0ec
🧪 View experiment at: https://dagshub.com/qetibakh/Final.mlflow/#/experiments/3


In [60]:
if results:
    # One row per model, best (lowest) validation RMSE first.
    results_df = pd.DataFrame(results).sort_values('val_rmse')
    best_model = results_df.iloc[0]  # summary row of the top-ranked model

    separator = "="*80
    print("\n" + separator)
    print("MODEL COMPARISON SUMMARY")
    print(separator)
    print(results_df.to_string(index=False))

    # Record the comparison table and the winner's headline metrics in a
    # dedicated MLflow run.
    with mlflow.start_run(run_name="model_comparison_summary"):
        summary_path = 'model_comparison_results.csv'
        results_df.to_csv(summary_path, index=False)
        mlflow.log_artifact(summary_path)

        headline_metrics = ('val_rmse', 'test_rmse', 'val_wmae', 'test_wmae')
        mlflow.log_metrics({
            f'best_model_{metric}': best_model[metric] for metric in headline_metrics
        })
        mlflow.log_param('best_model_name', best_model['model_name'])

        print(f"\nBest Model: {best_model['model_name']}")
        print(f"Best Validation RMSE: {best_model['val_rmse']:.4f}")


MODEL COMPARISON SUMMARY
  model_name     val_rmse   test_rmse  val_wmae  test_wmae   val_r2  test_r2
  ExtraTrees  7694.901917 6676.523901   3635.90    3477.52 0.900333 0.907801
     XGBoost  9451.022227 7527.174359   5281.80    4609.96 0.849650 0.882810
    CatBoost  9690.588144 7609.618596   5684.23    4880.81 0.841931 0.880229
        LGBM 10671.373288 8699.893697   6396.49    5638.97 0.808316 0.843450
        HGBR 10676.791448 8598.836122   6341.99    5450.53 0.808121 0.847066
RandomForest 11074.764725 9389.333235   4172.74    4365.94 0.793550 0.817654

Best Model: ExtraTrees
Best Validation RMSE: 7694.9019
🏃 View run model_comparison_summary at: https://dagshub.com/qetibakh/Final.mlflow/#/experiments/3/runs/b7c8de07039b46bc9ff1fcb1830d072d
🧪 View experiment at: https://dagshub.com/qetibakh/Final.mlflow/#/experiments/3


რა არის გასაკეთებელი:
1. ჰიპერპარამეტრების დამატება
2. აუთლაიერების ამოგდება
3. უკეთესი featureების გამოყვანა https://www.kaggle.com/code/maxdiazbattan/wallmart-sales-top-3-eda-feature-engineering#-3.1-|-Sales-analysis
4. featureების შერჩევა