In [3]:
import pandas as pd
import numpy as np
import os
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from datetime import datetime
import csv

In [6]:
# Directory holding the feature-engineered train/test CSVs (and the performance
# ledger written later). The original location stays the default; set the
# ECOAIR_DATA_DIR environment variable to run the notebook on another machine.
PATH = os.environ.get(
    "ECOAIR_DATA_DIR",
    "/Users/beratzengin/Desktop/Github/EcoAir SmartCity Predictor/Feature Engineering",
)

train_df = pd.read_csv(os.path.join(PATH, 'train.csv'))
test_df = pd.read_csv(os.path.join(PATH, 'test.csv'))

target = 'PM10'
# Every numeric column of the training frame, minus the target, is a feature.
features = train_df.select_dtypes(include=[np.number]).columns.tolist()

if target in features:
    features.remove(target)

# Rows without a PM10 reading carry no label. Mean-imputing the target would
# make the model train on -- and, worse, be scored against -- fabricated values,
# so those rows are dropped instead. train_df / test_df themselves are untouched.
train_labeled = train_df.dropna(subset=[target])
test_labeled = test_df.dropna(subset=[target])

dropped_train = len(train_df) - len(train_labeled)
dropped_test = len(test_df) - len(test_labeled)
if dropped_train or dropped_test:
    print(f"Dropped {dropped_train} train / {dropped_test} test rows with missing {target}.")

# Missing feature values are replaced with 0, as before.
X_train = train_labeled[features].fillna(0)
y_train = train_labeled[target]

X_test = test_labeled[features].fillna(0)
y_test = test_labeled[target]

print(f"Defeating pollution with {len(features)} features!")

Defeating pollution with 16 features!


In [7]:
# Hyperparameters for the gradient-boosted regressor, collected in one mapping
# so the whole configuration can be read at a glance.
xgb_params = {
    "n_estimators": 1000,     # number of boosting rounds (trees)
    "max_depth": 7,           # maximum depth of each tree
    "learning_rate": 0.03,    # small step size, paired with the many rounds above
    "subsample": 0.8,         # fraction of training rows sampled for each tree
    "colsample_bytree": 0.8,  # fraction of feature columns sampled for each tree
    "random_state": 42,       # fixed seed so the row/column sampling is repeatable
    "n_jobs": -1,             # -1 requests all available CPU threads
}

# Fit on the training split prepared in the data-loading cell.
model = xgb.XGBRegressor(**xgb_params)
model.fit(X_train, y_train)
print("XGBoost training completed.")

XGBoost training completed.


In [8]:
# Predict on the test split and score the predictions against y_test.
y_pred = model.predict(X_test)

# RMSE is taken as the square root of the mean squared error; MAE and R2 come
# straight from scikit-learn. These three names are reused by the logging cell.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

# One print call, one line per item -- same text as four separate prints.
print(
    "--- XGBoost Performance ---",
    f"MAE: {mae:.4f}",
    f"RMSE: {rmse:.4f}",
    f"R2 Score: {r2:.4f}",
    sep="\n",
)

--- XGBoost Performance ---
MAE: 6.7169
RMSE: 13.1318
R2 Score: 0.8847


In [9]:
# Append this run's metrics as one row of the shared performance ledger.
log_path = os.path.join(PATH, 'model_performances.csv')
log_data = {
    'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    'model_id': '004',
    'model_name': 'XGBoost',
    'mae': mae,
    'rmse': rmse,
    'r2_score': r2,
    'features': f'all_numeric_{len(features)}_cols',
    'comments': '1000 estimators, learning_rate 0.03, max_depth 7'
}

# A ledger that does not exist yet (or is empty) needs a header row first;
# otherwise the CSV would start with a data row and have no column names.
needs_header = not os.path.exists(log_path) or os.path.getsize(log_path) == 0

# newline='' lets the csv module control line endings itself.
with open(log_path, 'a', newline='') as f:
    # Pass a concrete list so the column order is fixed independently of the dict.
    writer = csv.DictWriter(f, fieldnames=list(log_data.keys()))
    if needs_header:
        writer.writeheader()
    writer.writerow(log_data)

print("Logged to performance ledger.")

Logged to performance ledger.
