# Modeling
steps:
- Preprocessing the test set like the train data
- Scaling the data (test and train)
- function for model_evaluation
- function for model building
- 

In [65]:
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict, cross_validate
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.metrics import classification_report, confusion_matrix, roc_curve
from sklearn.metrics import fbeta_score, make_scorer, f1_score, accuracy_score

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.model_selection import TimeSeriesSplit

RSEED = 42

## Scaling the data

## Modeling

In [59]:
df_processed = pd.read_csv("data/df_processed.csv")
df_processed.columns

Index(['Unnamed: 0', 'Place_ID X Date', 'Date', 'Place_ID', 'target',
       'target_min', 'target_max', 'target_variance', 'target_count',
       'precipitable_water_entire_atmosphere',
       'relative_humidity_2m_above_ground',
       'specific_humidity_2m_above_ground', 'temperature_2m_above_ground',
       'u_component_of_wind_10m_above_ground',
       'v_component_of_wind_10m_above_ground',
       'L3_NO2_NO2_column_number_density',
       'L3_NO2_tropospheric_NO2_column_number_density',
       'L3_O3_O3_column_number_density', 'L3_CO_CO_column_number_density',
       'L3_CO_H2O_column_number_density',
       'L3_HCHO_tropospheric_HCHO_column_number_density',
       'L3_CLOUD_cloud_base_height', 'L3_CLOUD_cloud_fraction',
       'L3_CLOUD_cloud_optical_depth', 'L3_AER_AI_absorbing_aerosol_index',
       'L3_SO2_SO2_column_number_density'],
      dtype='object')

In [60]:
# drop columns with any NaN values in training set
df_processed = df_processed.dropna()


In [61]:
X = df_processed.drop(columns=['Place_ID X Date', 'Date', 'Place_ID', 'target','target_min', 'target_max', 'target_variance', 'target_count'])
y = df_processed['target']

In [62]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RSEED,)

In [52]:
X_train.isna().sum()

Unnamed: 0                                         0
precipitable_water_entire_atmosphere               0
relative_humidity_2m_above_ground                  0
specific_humidity_2m_above_ground                  0
temperature_2m_above_ground                        0
u_component_of_wind_10m_above_ground               0
v_component_of_wind_10m_above_ground               0
L3_NO2_NO2_column_number_density                   0
L3_NO2_tropospheric_NO2_column_number_density      0
L3_O3_O3_column_number_density                     0
L3_CO_CO_column_number_density                     0
L3_CO_H2O_column_number_density                    0
L3_HCHO_tropospheric_HCHO_column_number_density    0
L3_CLOUD_cloud_base_height                         2
L3_CLOUD_cloud_fraction                            0
L3_CLOUD_cloud_optical_depth                       2
L3_AER_AI_absorbing_aerosol_index                  0
L3_SO2_SO2_column_number_density                   0
dtype: int64

In [71]:

list_of_clf = [LinearRegression(),
               KNeighborsRegressor(),
               #RandomForestRegressor(),
               #GradientBoostingRegressor()
               ]

tscv = TimeSeriesSplit(n_splits=5)

scorers = {
    'r2': 'r2',
    'rmse': make_scorer(mean_squared_error, greater_is_better=False, squared=False)
}

def model_evaluation(estimator, scoring, X_train, y_train, cv=tscv):
    return cross_validate(estimator, X_train, y_train, cv=cv, scoring=scoring, return_train_score=False)

for reg in list_of_clf:
    results = model_evaluation(reg, scorers, X_train, y_train, cv=5)
    print(reg)
   
    r2_mean, r2_std = results['test_r2'].mean(), results['test_r2'].std()
    rmse_mean, rmse_std = (-results['test_rmse']).mean(), (-results['test_rmse']).std()  # Vorzeichen umdrehen
    print(f'R²   (mean ± std): {r2_mean:.3f} ± {r2_std:.3f}')
    print(f'RMSE (mean ± std): {rmse_mean:.3f} ± {rmse_std:.3f}')
    print('----'*10)

LinearRegression()
R²   (mean ± std): 0.333 ± 0.016
RMSE (mean ± std): 37.954 ± 1.330
----------------------------------------
KNeighborsRegressor()
R²   (mean ± std): 0.707 ± 0.022
RMSE (mean ± std): 25.156 ± 1.624
----------------------------------------


## Hyperparameter Tuning

- LinearRegression - GradientDescent
- KNN - GridSearchCV?
- Decision Tree - GridSearchCV?

## Evaluation

of different models

## Error Analysis