In [2]:
import pandas as pd
import numpy as np
import os
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from datetime import datetime
import csv


In [3]:
# Directory holding the engineered train/test CSV files. The ECOAIR_DATA_DIR
# environment variable overrides the original author's local default, so the
# notebook can run on another machine without editing this cell.
PATH = os.environ.get(
    "ECOAIR_DATA_DIR",
    "/Users/beratzengin/Desktop/Github/EcoAir SmartCity Predictor/Feature Engineering",
)

train_df = pd.read_csv(os.path.join(PATH, 'train.csv'))
test_df = pd.read_csv(os.path.join(PATH, 'test.csv'))

target = 'PM10'
# Every numeric column of the training frame except the target is a model input.
features = [
    col
    for col in train_df.select_dtypes(include=[np.number]).columns
    if col != target
]

# Rows without an observed target are dropped rather than mean-imputed:
# a filled-in label is not a measurement, and on the test split the fill value
# would be derived from the very labels the metrics are computed against.
train_labeled = train_df.dropna(subset=[target])
test_labeled = test_df.dropna(subset=[target])

# Missing feature values are replaced with 0 in both splits.
X_train = train_labeled[features].fillna(0)
y_train = train_labeled[target]

X_test = test_labeled[features].fillna(0)
y_test = test_labeled[target]

In [5]:
# Hyperparameters for the regressor, collected in one mapping so they can be
# read (and tweaked) in a single place before the model is built.
catboost_params = dict(
    iterations=1000,
    learning_rate=0.05,
    depth=8,
    loss_function='RMSE',
    random_seed=42,
    verbose=100,      # Shows progress every 100 iterations
    thread_count=-1,  # Use all cores
)

# Build the model from the mapping and fit it on the training split.
model = CatBoostRegressor(**catboost_params)
model.fit(X_train, y_train)
print("CatBoost training completed.")

0:	learn: 37.7242515	total: 9.85ms	remaining: 9.84s
100:	learn: 14.1618597	total: 565ms	remaining: 5.03s
200:	learn: 13.4257746	total: 1.11s	remaining: 4.42s
300:	learn: 12.9683449	total: 1.65s	remaining: 3.82s
400:	learn: 12.6215426	total: 2.18s	remaining: 3.25s
500:	learn: 12.3383317	total: 2.71s	remaining: 2.7s
600:	learn: 12.1004847	total: 3.24s	remaining: 2.15s
700:	learn: 11.8938745	total: 3.78s	remaining: 1.61s
800:	learn: 11.7052594	total: 4.3s	remaining: 1.07s
900:	learn: 11.5499157	total: 4.83s	remaining: 531ms
999:	learn: 11.3993735	total: 5.35s	remaining: 0us
CatBoost training completed.


In [6]:
# Predict on the test features, then score the predictions against y_test.
y_pred = model.predict(X_test)

# The three metrics are independent of each other; RMSE is the square root of
# the mean squared error.
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

# Assemble the report first, then emit it one line per entry.
report_lines = [
    "--- CatBoost Performance ---",
    f"MAE: {mae:.4f}",
    f"RMSE: {rmse:.4f}",
    f"R2 Score: {r2:.4f}",
]
print(*report_lines, sep="\n")

--- CatBoost Performance ---
MAE: 6.7030
RMSE: 13.0459
R2 Score: 0.8862


In [7]:
# Append this run's metrics as one row of model_performances.csv.
log_path = os.path.join(PATH, 'model_performances.csv')
log_data = {
    'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    'model_id': '006',
    'model_name': 'CatBoost',
    'mae': mae,
    'rmse': rmse,
    'r2_score': r2,
    'features': f'all_numeric_{len(features)}_cols',
    'comments': '1000 iterations, symmetric trees'
}

# A header row is written only when the log is being created or is still empty;
# otherwise the first data row would land in a file with no column names.
# An existing, non-empty log is appended to unchanged.
needs_header = not os.path.exists(log_path) or os.path.getsize(log_path) == 0

# newline='' lets the csv module control line endings itself.
with open(log_path, 'a', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=list(log_data))
    if needs_header:
        writer.writeheader()
    writer.writerow(log_data)
    