# Modeling
steps:
- Preprocessing the test set like the train data
- Scaling the data (test and train)
- function for model_evaluation
- function for model building
- 

In [1]:
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict, cross_validate
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.metrics import classification_report, confusion_matrix, roc_curve
from sklearn.metrics import fbeta_score, make_scorer, f1_score, accuracy_score

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import RandomizedSearchCV

import importlib
import preprocessing
importlib.reload(preprocessing)
from preprocessing import preprocess_for_air_quality

RSEED = 42

## Scaling the data

## Modeling

In [2]:
df = pd.read_csv('data/Train.csv')

In [3]:
df_processed = preprocess_for_air_quality(df)

In [4]:
#df_processed = pd.read_csv("data/df_processed.csv")
#df_processed.columns

In [5]:
df_processed.isna().sum()

Place_ID X Date                                    0
Date                                               0
Place_ID                                           0
target                                             0
target_min                                         0
target_max                                         0
target_variance                                    0
target_count                                       0
precipitable_water_entire_atmosphere               0
relative_humidity_2m_above_ground                  0
specific_humidity_2m_above_ground                  0
temperature_2m_above_ground                        0
u_component_of_wind_10m_above_ground               0
v_component_of_wind_10m_above_ground               0
L3_NO2_NO2_column_number_density                   0
L3_NO2_tropospheric_NO2_column_number_density      0
L3_O3_O3_column_number_density                     0
L3_CO_CO_column_number_density                     0
L3_CO_H2O_column_number_density               

In [6]:
# drop columns with any NaN values in training set
#df_processed = df_processed.dropna()


In [7]:
X = df_processed.drop(columns=['Place_ID X Date', 'Date', 'Place_ID', 'target','target_min', 'target_max', 'target_variance', 'target_count'])
y = df_processed['target']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RSEED,)

In [9]:
X_train.isna().sum()

precipitable_water_entire_atmosphere               0
relative_humidity_2m_above_ground                  0
specific_humidity_2m_above_ground                  0
temperature_2m_above_ground                        0
u_component_of_wind_10m_above_ground               0
v_component_of_wind_10m_above_ground               0
L3_NO2_NO2_column_number_density                   0
L3_NO2_tropospheric_NO2_column_number_density      0
L3_O3_O3_column_number_density                     0
L3_CO_CO_column_number_density                     0
L3_CO_H2O_column_number_density                    0
L3_HCHO_tropospheric_HCHO_column_number_density    0
L3_CLOUD_cloud_base_height                         0
L3_CLOUD_cloud_fraction                            0
L3_CLOUD_cloud_optical_depth                       0
L3_AER_AI_absorbing_aerosol_index                  0
L3_SO2_SO2_column_number_density                   0
dtype: int64

In [10]:

list_of_clf = [LinearRegression(),
               KNeighborsRegressor(),
               RandomForestRegressor(),
               GradientBoostingRegressor()
               ]

tscv = TimeSeriesSplit(n_splits=5)

scorers = {
    'r2': 'r2',
    'rmse': make_scorer(mean_squared_error, greater_is_better=False, squared=False)
}

def model_evaluation(estimator, scoring, X_train, y_train, cv=tscv):
    return cross_validate(estimator, X_train, y_train, cv=cv, scoring=scoring, return_train_score=False)

for reg in list_of_clf:
    results = model_evaluation(reg, scorers, X_train, y_train, cv=5)
    print(reg)
   
    r2_mean, r2_std = results['test_r2'].mean(), results['test_r2'].std()
    rmse_mean, rmse_std = (-results['test_rmse']).mean(), (-results['test_rmse']).std()  # Vorzeichen umdrehen
    print(f'R²   (mean ± std): {r2_mean:.3f} ± {r2_std:.3f}')
    print(f'RMSE (mean ± std): {rmse_mean:.3f} ± {rmse_std:.3f}')
    print('----'*10)

LinearRegression()
R²   (mean ± std): 0.333 ± 0.016
RMSE (mean ± std): 37.971 ± 1.325
----------------------------------------
KNeighborsRegressor()
R²   (mean ± std): 0.493 ± 0.019
RMSE (mean ± std): 33.106 ± 1.518
----------------------------------------
RandomForestRegressor()
R²   (mean ± std): 0.535 ± 0.018
RMSE (mean ± std): 31.686 ± 1.446
----------------------------------------
GradientBoostingRegressor()
R²   (mean ± std): 0.467 ± 0.020
RMSE (mean ± std): 33.940 ± 1.506
----------------------------------------


## Hyperparameter Tuning

- LinearRegression - GradientDescent
- KNN - GridSearchCV?
- Decision Tree - GridSearchCV?

## KNNReg parametertuning

In [12]:
KNeighborsRegressor().get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [None]:
# Defining parameter grid (as dictionary)
param_grid = {"n_neighbors" : [2,4,3,5,10], #this actually defines the model you use
              "weights" : ["uniform", "distance"],
              "p" : [1, 2, 3],
              "algorithm": ["ball_tree", "kd_tree", "brute"],
              "leaf_size": [10, 20, 30, 40],
              "weights": ["uniform", "distance"],
             }

# Instantiate gridsearch and define the metric to optimize 
gs = GridSearchCV(KNeighborsRegressor(), param_grid, scoring='neg_root_mean_squared_error',
                  cv=5, verbose=2, n_jobs=-1)

# Fit gridsearch object to data.. also lets see how long it takes

gs.fit(X_train, y_train)



Fitting 5 folds for each of 360 candidates, totalling 1800 fits


In [89]:
# Best score
print('Best score:', -round(gs.best_score_, 3))

# Best parameters
print('Best parameters:', gs.best_params_)

Best score: 23.487
Best parameters: {'algorithm': 'ball_tree', 'leaf_size': 10, 'n_neighbors': 4, 'p': 1, 'weights': 'distance'}


In [101]:
# Assigning the fitted KNNClassifier model with best parameter combination to a new variable knn_best
knn_best_gs = gs.best_estimator_

# Making predictions on the train set
y_pred_knn = knn_best_gs.predict(X_test)

print("R2 on train:", round(r2_score(y_test, y_pred_knn),3))


R2 on train: 0.777


In [116]:
def print_pretty_summary(name, y_actual, y_pred):
    print(name)
    print('=======================')
    
    r2 = r2_score(y_actual, y_pred)
    rmse = np.sqrt(mean_squared_error(y_actual, y_pred))
    
    print(f"R²:   {r2:.3f}")
    print(f"RMSE: {rmse:.3f}")
    print()

In [117]:
print_pretty_summary("KNNreg",y_test, y_pred_knn)

KNNreg
R²:   0.777
RMSE: 22.572



## RandomForestReg parametertuning

In [103]:
RandomForestRegressor().get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [None]:
# --- CV und Scorer definieren ---
tscv = TimeSeriesSplit(n_splits=5)
rmse_scorer = make_scorer(mean_squared_error, greater_is_better=False, squared=False)

# --- Modell ---
rf = RandomForestRegressor(random_state=42, n_jobs=-1)

# --- Parameter-Raum ---
param_dist = {
    "n_estimators": [200, 400, 600, 800, 1000],
    "max_depth": [None, 8, 12, 16, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["sqrt", 0.5, 0.8, 1.0],
    "bootstrap": [True, False],
}

# --- Randomized Search ---
rand_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=30,                  # Anzahl zufälliger Kombinationen
    scoring=rmse_scorer,
    cv=tscv,
    n_jobs=-1,
    random_state=42,
    verbose=2,
    refit=True
)

rand_search.fit(X, y)

# --- Ergebnisse ---
print("\nBeste Parameterkombination:")
print(rand_search.best_params_)
print("Bester RMSE (CV):", -rand_search.best_score_)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV] END bootstrap=False, max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=800; total time=  16.7s
[CV] END bootstrap=False, max_depth=16, max_features=1.0, min_samples_leaf=2, min_samples_split=2, n_estimators=400; total time=  35.0s
[CV] END bootstrap=False, max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=800; total time=  39.2s
[CV] END bootstrap=False, max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=800; total time= 1.0min
[CV] END bootstrap=True, max_depth=20, max_features=1.0, min_samples_leaf=1, min_samples_split=10, n_estimators=1000; total time=  57.5s
[CV] END bootstrap=False, max_depth=16, max_features=1.0, min_samples_leaf=2, min_samples_split=2, n_estimators=400; total time= 1.3min
[CV] END bootstrap=False, max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_esti



[CV] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=600; total time= 1.1min
[CV] END bootstrap=False, max_depth=8, max_features=1.0, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=  11.9s
[CV] END bootstrap=False, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=1000; total time=  55.5s
[CV] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=600; total time= 1.4min
[CV] END bootstrap=False, max_depth=8, max_features=1.0, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=  24.7s
[CV] END bootstrap=False, max_depth=None, max_features=0.8, min_samples_leaf=1, min_samples_split=10, n_estimators=1000; total time= 2.7min
[CV] END bootstrap=False, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=400; total time=   9.9s
[CV] END bootstrap=False, max_dep

In [None]:
Beste Parameterkombination:
{'n_estimators': 400, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 16, 'bootstrap': True}
Bester RMSE (CV): 36.65552053916143

In [119]:
print("Best RMSE (CV):", -round(rand_search.best_score_, 3))
print("Best Parameters:", rand_search.best_params_)

Best RMSE (CV): 36.656
Best Parameters: {'n_estimators': 400, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 16, 'bootstrap': True}


In [121]:
# Assigning the fitted random forest classifier model with best parameter combination to a new variable rf_best_gs
rf_best_rs = rand_search.best_estimator_

# Making predictions on the train set
y_pred_rf = rf_best_rs.predict(X_test)

print("R2 on train:", round(r2_score(y_test, y_pred_rf),3))


R2 on train: 0.824


In [122]:
print_pretty_summary("Random Forest",y_test, y_pred_rf)

Random Forest
R²:   0.824
RMSE: 20.064



## Evaluation

of different models

## Error Analysis