In [2]:
import mlflow
import dagshub
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import make_scorer, r2_score, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import joblib
import warnings
warnings.filterwarnings("ignore")

In [3]:
CONFIG = {
    "data_path": "C:\\ESG\\data\\processed_esg_dataset.csv",
    "test_size": 0.25,
    "mlflow_tracing_uri": "https://dagshub.com/virajdeshmukh080818/ESG.mlflow",
    "dagshub_repo_owner": "virajdeshmukh080818",
    "dagshub_repo_name": "ESG",
    "experiment_name": "Traing Advanced Models"
}

In [4]:
mlflow.set_tracking_uri(CONFIG['mlflow_tracing_uri'])
dagshub.init(repo_owner=CONFIG['dagshub_repo_owner'], repo_name=CONFIG['dagshub_repo_name'], mlflow=True)
mlflow.set_experiment(CONFIG['experiment_name'])

<Experiment: artifact_location='mlflow-artifacts:/47fefc0ac8fe470a9a3037536f9dffe9', creation_time=1754685677086, experiment_id='1', last_update_time=1754685677086, lifecycle_stage='active', name='Traing Advanced Models', tags={}>

In [5]:
data = pd.read_csv("C:\\ESG\data\\processed_esg_dataset.csv")

In [6]:
X = data.drop('MarketCap', axis=1)
y = data['MarketCap']

In [7]:
models = {
    "RandomForest": RandomForestRegressor(random_state=42, n_jobs=-1),
    "XGBoost": XGBRegressor(random_state=42, n_jobs=-1, verbosity=0),
    "LightGBM": LGBMRegressor(random_state=42, n_jobs=-1),
    "CatBoost": CatBoostRegressor(random_state=42, silent=True)
}

In [8]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
scoring_r2 = make_scorer(r2_score)
scoring_mae = make_scorer(mean_absolute_error, greater_is_better=False)

In [9]:
result = []
for name, model in models.items():
    r2_scores = cross_val_score(model, X,y, cv=kf, scoring=scoring_r2)
    mae_scores = cross_val_score(model, X,y, cv=kf, scoring=scoring_mae)

    result.append({
        "Model": name,
        "Mean R2": np.mean(r2_scores),
        "Std R2": np.std(r2_scores),
        "Mean MAE": -np.mean(mae_scores),
        "Std MAE": np.std(mae_scores)
    })

result_df = pd.DataFrame(result).sort_values(by="Mean R2", ascending=False)
print("Models Performance Comaparison: ")
print(result_df)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000971 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3086
[LightGBM] [Info] Number of data points in the train set: 8800, number of used features: 15
[LightGBM] [Info] Start training from score 13178.413854
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019128 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3085
[LightGBM] [Info] Number of data points in the train set: 8800, number of used features: 15
[LightGBM] [Info] Start training from score 13463.558632
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001164 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] To

In [10]:
top_models = result_df.head(2)['Model'].values
for name in top_models:
    model = models[name]
    model.fit(X,y)
    joblib.dump(model, f'{name}_best_model.pkl')
    print(f'Saved {name} as {name}_best_model.pkl')

Saved CatBoost as CatBoost_best_model.pkl
Saved RandomForest as RandomForest_best_model.pkl
