In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from datetime import datetime
import csv

In [2]:
# --- Load data and build the feature matrices --------------------------------
# Directory holding the train/test CSVs. The original absolute path remains the
# default so existing runs are unaffected; set ECOAIR_DATA_DIR to point the
# notebook at a different location without editing this cell.
PATH = os.environ.get(
    "ECOAIR_DATA_DIR",
    "/Users/beratzengin/Desktop/Github/EcoAir SmartCity Predictor/Feature Engineering",
)

train_df = pd.read_csv(os.path.join(PATH, 'train.csv'))
test_df = pd.read_csv(os.path.join(PATH, 'test.csv'))

target = 'PM10'

# Every numeric column of the training frame except the target is a predictor.
features = train_df.select_dtypes(include=[np.number]).columns.tolist()

if target in features:
    features.remove(target)

# Rows without a target value cannot supervise training or be scored fairly.
# Filling them with the column mean fabricates labels: it biases the fit toward
# the mean and evaluates predictions against invented values on the test side.
# Drop those rows instead; train_df / test_df themselves are left unmodified.
train_labeled = train_df.dropna(subset=[target])
test_labeled = test_df.dropna(subset=[target])

# Missing feature values are replaced with 0.
X_train = train_labeled[features].fillna(0)
y_train = train_labeled[target]

X_test = test_labeled[features].fillna(0)
y_test = test_labeled[target]

In [3]:
# --- Stacking ensemble: three boosted-tree learners + linear blender ----------
# Level-0 learners, each built separately with a fixed seed.
lgbm_regressor = lgb.LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    random_state=42,
    n_jobs=-1,
)
xgb_regressor = xgb.XGBRegressor(
    n_estimators=1000,
    max_depth=7,
    learning_rate=0.03,
    random_state=42,
    n_jobs=-1,
)
catboost_regressor = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.05,
    depth=8,
    random_seed=42,
    verbose=0,
)

# (name, estimator) pairs in the order the stacker receives them.
base_models = [
    ('lgb', lgbm_regressor),
    ('xgb', xgb_regressor),
    ('cat', catboost_regressor),
]

# Level-1 learner that combines the base models' predictions.
meta_model = LinearRegression()

# cv=5: the stacker is configured with 5-fold cross-validation.
stacking_model = StackingRegressor(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5,
    n_jobs=-1,
)

print("Training Stacking Ensemble")
stacking_model.fit(X_train, y_train)
print("Stacking Ensemble training completed!")

Training Stacking Ensemble
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001160 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3105
[LightGBM] [Info] Number of data points in the train set: 166766, number of used features: 16
[LightGBM] [Info] Start training from score 39.047001
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.037535 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3106
[LightGBM] [Info] Number of data points in the train set: 133413, number of used features: 16
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004689 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3102
[LightGBM] [Inf

In [4]:
# --- Evaluate on the held-out test set ----------------------------------------
y_pred = stacking_model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("--- Stacking Ensemble Performance ---")
print(f"MAE: {mae:.4f} | RMSE: {rmse:.4f} | R2 Score: {r2:.4f}")

# --- Append this run to the shared performance ledger -------------------------
log_path = os.path.join(PATH, 'model_performances.csv')
log_data = {
    'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    'model_id': '007',
    'model_name': 'Stacking Ensemble (LGBM, XGB, CAT)',
    # Built-in floats keep the stored values independent of how NumPy scalar
    # types are rendered as text.
    'mae': float(mae), 'rmse': float(rmse), 'r2_score': float(r2),
    'features': f'all_numeric_{len(features)}_cols',
    'comments': 'Combined top 3 models with Linear Meta-Regressor'
}

# A brand-new or empty ledger needs a header row; previously the first append
# produced a header-less CSV whose columns could not be identified later.
needs_header = (not os.path.exists(log_path)) or os.path.getsize(log_path) == 0

with open(log_path, 'a', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=list(log_data.keys()))
    if needs_header:
        writer.writeheader()
    writer.writerow(log_data)

--- Stacking Ensemble Performance ---
MAE: 6.6416 | RMSE: 12.9004 | R2 Score: 0.8887
