In [2]:
import pandas as pd
import numpy as np
import os
import lightgbm as lgb
import optuna
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

In [3]:
PATH = "/Users/beratzengin/Desktop/Github/EcoAir SmartCity Predictor/Feature Engineering"
train_df = pd.read_csv(os.path.join(PATH, 'train.csv'))
test_df = pd.read_csv(os.path.join(PATH, 'test.csv'))

target = 'PM10'
features = train_df.select_dtypes(include=[np.number]).columns.tolist()
if target in features:
    features.remove(target)

X_train = train_df[features].fillna(0)
y_train = train_df[target].fillna(train_df[target].mean())
X_test = test_df[features].fillna(0)
y_test = test_df[target].fillna(test_df[target].mean())

In [4]:
def objective(trial):
    param = {
        'objective': 'regression',
        'metric': 'rmse',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'random_state': 42,
        'n_jobs': -1,
        # Optuna'nın deneyeceği aralıklar
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'n_estimators': trial.suggest_int('n_estimators', 500, 2000),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 31, 512),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
    }

    model = lgb.LGBMRegressor(**param)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    return r2_score(y_test, preds)


In [5]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50) 

print ("Best Params: ",study.best_params)

final_params = study.best_params
final_params['objective'] = 'regression'
final_params['random_state'] = 42
final_params['n_jobs'] = -1

final_model = lgb.LGBMRegressor(**final_params)
final_model.fit(X_train, y_train)

[32m[I 2026-02-28 12:05:26,574][0m A new study created in memory with name: no-name-34f7dfa6-2167-40d5-8c09-b84998898ca3[0m
[32m[I 2026-02-28 12:05:59,542][0m Trial 0 finished with value: 0.88858562559131 and parameters: {'learning_rate': 0.023168635801878723, 'n_estimators': 1111, 'lambda_l1': 0.009034014286186649, 'lambda_l2': 0.06511096674584499, 'num_leaves': 278, 'feature_fraction': 0.7567472428142674, 'bagging_fraction': 0.916127596939109, 'bagging_freq': 4, 'min_child_samples': 73}. Best is trial 0 with value: 0.88858562559131.[0m
[32m[I 2026-02-28 12:07:08,304][0m Trial 1 finished with value: 0.8771533488300479 and parameters: {'learning_rate': 0.0937687088748134, 'n_estimators': 1829, 'lambda_l1': 0.034469189749602704, 'lambda_l2': 0.00019487747579652114, 'num_leaves': 419, 'feature_fraction': 0.667175045017825, 'bagging_fraction': 0.7628066519128339, 'bagging_freq': 6, 'min_child_samples': 48}. Best is trial 0 with value: 0.88858562559131.[0m
[32m[I 2026-02-28 12:07

Best Params:  {'learning_rate': 0.023643245982405643, 'n_estimators': 1239, 'lambda_l1': 0.0005463210400451073, 'lambda_l2': 6.512810945105947, 'num_leaves': 507, 'feature_fraction': 0.5637518768243246, 'bagging_fraction': 0.8006560231427697, 'bagging_freq': 5, 'min_child_samples': 5}


0,1,2
,boosting_type,'gbdt'
,num_leaves,507
,max_depth,-1
,learning_rate,0.023643245982405643
,n_estimators,1239
,subsample_for_bin,200000
,objective,'regression'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [8]:
from sklearn.metrics import mean_absolute_error
import csv
from datetime import datetime

y_pred = final_model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f"\n--- OPTIMIZED LIGHTGBM SONUÇLARI ---")
print(f"R2 SCORE: {r2:.4f}")
print(f"MAE: {mae:.4f}")

# Deftere işle
log_path = os.path.join(PATH, 'model_performances.csv')
with open(log_path, 'a', newline='') as f:
    writer = csv.writer(f)
    writer.writerow([
        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        '008',
        'LightGBM (Optuna Tuned)',
        mae,
        np.sqrt(((y_test - y_pred) ** 2).mean()),
        r2,
        f'all_numeric_{len(features)}_cols',
        'Tuned with 50 trials'
    ])


--- OPTIMIZED LIGHTGBM SONUÇLARI ---
R2 SCORE: 0.8899
MAE: 6.6125
