In [1]:
import pickle

import numpy as np
import pandas as pd
from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, RidgeClassifier
from sklearn.metrics import mean_squared_error, r2_score, median_absolute_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import LinearSVC, LinearSVR
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical

from data import df

In [None]:
# Preview the first rows of the imported DataFrame as a quick sanity check.
df.head()

In [2]:
# Parse the `result_timestamp` column into pandas datetime values.
df['result_timestamp'] = pd.to_datetime(df['result_timestamp'])

In [3]:
# Discard the four `lc_temp_qcl*` columns, the timestamp and the coordinates;
# none of them is used as a model feature below.
df = df.drop(
    columns=[
        'lc_temp_qcl0',
        'lc_temp_qcl1',
        'lc_temp_qcl2',
        'lc_temp_qcl3',
        'result_timestamp',
        'lat',
        'lon',
    ]
)

In [None]:
# Disabled: rows with missing values are kept — presumably because the
# preprocessing steps further down impute missing feature values instead.
# NOTE(review): missing values in the target `laeq` would not be covered by that.
#df = df.dropna()

In [4]:
# `laeq` is the prediction target; every other remaining column is a feature.
y = df['laeq']
X = df.drop(columns='laeq')

In [5]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=12,
)

In [None]:
# Number of missing values per feature column in the training split.
X_train.isna().sum()

In [None]:
# Number of missing target values in the training split.
y_train.isna().sum()

In [None]:
# (rows, columns) of the training feature matrix.
X_train.shape

GIULIA START

In [None]:
# Feature groups: each list is routed to its own preprocessing branch below.
categorical_cols = ['description', 'hour', 'month', 'day_of_week', 'night_of_week']
numerical_cols = ['lc_dwptemp', 'lc_rainin', 'lc_dailyrain', 'lc_windspeed', 'count']

# Numerical branch: fill gaps with the column mean, then standardise.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode. Dense output is requested because the result is returned as a
# pandas DataFrame; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

# Columns not listed in either group are passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_cols),
        ('num', numerical_transformer, numerical_cols),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False,
)
preprocessor.set_output(transform="pandas")

# Fit on the training features and inspect the transformed frame.
out = preprocessor.fit_transform(X_train)

# Missing values left per output column after preprocessing.
out.isna().sum()

GIULIA END

In [None]:
# Global scikit-learn setting: transformers return pandas DataFrames from
# their transform step from here on.
set_config(transform_output="pandas")

In [None]:
# Feature preprocessing used as the first step of the model pipeline.
#
# Fixes relative to the previous version of this cell:
#   * `impute_encode` was referenced but never defined (NameError on a fresh
#     kernel); it is now defined here.
#   * Imputation and scaling of the numerical columns were two separate
#     ColumnTransformer entries over the same columns, so the scaler received
#     the raw (un-imputed) data and both entries emitted the same column
#     names. They are now chained in a single pipeline.
categorical_features = ['description', 'hour', 'month', 'day_of_week', 'night_of_week']
numerical_features = ['lc_dwptemp', 'lc_rainin', 'lc_dailyrain', 'lc_windspeed', 'count']

# Categorical branch: impute with the most frequent value, then one-hot encode.
# Dense output is required when transformers are configured to return pandas
# DataFrames; unknown categories are ignored so unseen values in a CV fold or
# in the validation split do not raise.
# NOTE(review): imputation strategy chosen to mirror the categorical
# transformer defined earlier in this notebook — confirm this is intended.
impute_encode = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Numerical branch: mean imputation followed by standardisation.
impute_scale = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Columns not listed in either group are dropped (ColumnTransformer default).
featurisation = ColumnTransformer([
    ('impute_encode', impute_encode, categorical_features),
    ('impute_scale', impute_scale, numerical_features),
    ],
    verbose_feature_names_out = False,
    )

In [None]:
# Fit the featurisation step on the training features and keep the
# transformed output for inspection.
check = featurisation.fit_transform(X_train)

In [None]:
# Missing values remaining per transformed column.
check.isna().sum()

In [None]:
# Two-step model pipeline: feature preprocessing followed by an estimator.
# The 'classifier' step holds a real default estimator (previously an empty
# list, which is not an estimator) so that `pipe` is a valid, fittable
# pipeline on its own. GridSearchCV replaces this step with each candidate
# listed under the 'classifier' key of the parameter grid, which is why the
# step name is left unchanged.
pipe = Pipeline([
    ('features', featurisation),
    ('classifier', RandomForestRegressor())
])

In [None]:
# Candidate estimators tried in the 'classifier' pipeline step.
# The search is scored with RMSE and evaluated with R², i.e. this is a
# regression task, so the classifiers previously listed here (RidgeClassifier,
# LogisticRegression, LinearSVC) are replaced by their regression
# counterparts. Seeds are pinned on the stochastic estimators so the
# comparison is reproducible.
grid = [{'classifier': [
    RandomForestRegressor(random_state=12),
    Ridge(),
    LinearRegression(),
    LinearSVR(random_state=12),
]}]

In [None]:
# Cross-validated search over the candidate estimators, scored by negated
# RMSE and parallelised over all available CPU cores.
gridSearch = GridSearchCV(
    estimator=pipe,
    param_grid=grid,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
)

In [None]:
# Run the cross-validated search on the training split.
gridSearch.fit(X_train, y_train)

In [None]:
# Show the cross-validation results as a table, best-ranked candidate first,
# rather than dumping the raw dict of arrays.
pd.DataFrame(gridSearch.cv_results_).sort_values('rank_test_score')

In [None]:
# Mean cross-validated score of the winning candidate. Because the search is
# scored with 'neg_root_mean_squared_error', this is the negated RMSE
# (values closer to 0 are better).
print(gridSearch.best_score_)
# Display the pipeline that achieved that score.
gridSearch.best_estimator_

In [None]:
# Predict the target for the held-out validation features with the winning pipeline.
preds = gridSearch.best_estimator_.predict(X_val)

In [None]:
# Mean squared error on the validation split.
# The explicit `squared=True` was dropped: it is the default, and the keyword
# is deprecated and later removed in recent scikit-learn releases, so passing
# it breaks on newer versions. The computed value is unchanged.
# Note this is MSE, whereas the grid search reports (negated) RMSE, so the two
# numbers are not on the same scale.
error = mean_squared_error(y_val, preds)
error

In [None]:
# Coefficient of determination (R²) of the predictions on the validation split.
r2 = r2_score(y_val,preds)
r2

In [None]:
# Median absolute error on the validation split.
# NOTE: despite the name `mae`, this is the *median* (not mean) absolute error.
mae = median_absolute_error(y_val, preds)
mae

In [None]:
# Persist the fitted, best-scoring pipeline for use by the app.
# Previously `pipe` was pickled, which is only the unfitted template that
# GridSearchCV clones; loading it would not give a usable model.
# NOTE(review): the destination is an absolute, machine-specific path —
# consider a path relative to the project root.
with open('/Users/christianbutcher/Documents/MDA/project_real/mda_2023_monaco/app/pickle_rf_model.pkl', 'wb') as file:
    pickle.dump(gridSearch.best_estimator_, file)