In [2]:
import pandas as pd
import numpy as np
import os
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from datetime import datetime
import csv

In [3]:
# Directory holding the engineered train/test CSVs. The original location stays
# as the default so existing runs are unaffected; set ECOAIR_DATA_DIR to point
# the notebook at a different folder without editing this cell.
PATH = os.environ.get(
    "ECOAIR_DATA_DIR",
    "/Users/beratzengin/Desktop/Github/EcoAir SmartCity Predictor/Feature Engineering",
)

train_df = pd.read_csv(os.path.join(PATH, 'train.csv'))
test_df = pd.read_csv(os.path.join(PATH, 'test.csv'))

target = 'PM10'
# Every numeric column of the training frame except the target is a model input.
features = train_df.select_dtypes(include=[np.number]).columns.tolist()

if target in features:
    features.remove(target)

# Rows with a missing target are dropped rather than mean-imputed: a fabricated
# label teaches the model a constant during training and, on the test side,
# makes MAE/RMSE/R2 partly measure the imputation instead of the model.
# When the target has no missing values this is identical to the previous code.
train_labeled = train_df.dropna(subset=[target])
test_labeled = test_df.dropna(subset=[target])

# Missing feature values are still replaced by 0, as before.
X_train = train_labeled[features].fillna(0)
y_train = train_labeled[target]

X_test = test_labeled[features].fillna(0)
y_test = test_labeled[target]

In [5]:
# Hyperparameters are collected in one mapping so they can be read (or logged)
# at a glance before being handed to the estimator.
lgbm_params = {
    'n_estimators': 1000,
    'learning_rate': 0.05,
    'num_leaves': 31,
    'max_depth': -1,
    'random_state': 42,
    'n_jobs': -1,
    'importance_type': 'gain',
}

model = lgb.LGBMRegressor(**lgbm_params)

# Fit on the training split prepared in the data-loading cell.
model.fit(X_train, y_train)
print("LightGBM training completed.")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000714 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3105
[LightGBM] [Info] Number of data points in the train set: 166766, number of used features: 16
[LightGBM] [Info] Start training from score 39.047001
LightGBM training completed.


In [6]:
# Predict on the held-out split with the fitted model.
y_pred = model.predict(X_test)

# The three metrics are kept under the names mae / rmse / r2 because the
# logging cell reads them. RMSE is the square root of the mean squared error.
squared_error = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(squared_error)
r2 = r2_score(y_test, y_pred)

# Report each metric rounded to four decimals.
print("--- LightGBM Performance ---")
print(f"MAE: {mae:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"R2 Score: {r2:.4f}")

--- LightGBM Performance ---
MAE: 6.6864
RMSE: 13.0145
R2 Score: 0.8868


In [8]:
# Append this run's metrics as one row of the shared model-comparison log.
log_path = os.path.join(PATH, 'model_performances.csv')
log_data = {
    'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    'model_id': '005',
    'model_name': 'LightGBM',
    'mae': mae,
    'rmse': rmse,
    'r2_score': r2,
    'features': f'all_numeric_{len(features)}_cols',
    'comments': '1000 estimators, leaf-wise growth'
}

# A brand-new or empty log gets a header row first; without it the file would
# hold unlabeled values. An existing, non-empty log is only appended to, so no
# duplicate header is written.
needs_header = not os.path.exists(log_path) or os.path.getsize(log_path) == 0

# newline='' lets the csv module control line endings itself.
with open(log_path, 'a', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=list(log_data.keys()))
    if needs_header:
        writer.writeheader()
    writer.writerow(log_data)