In [1]:
import pickle

import numpy as np
import pandas as pd
from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, RidgeClassifier
from sklearn.metrics import mean_squared_error, r2_score, median_absolute_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import LinearSVC, LinearSVR
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical

from data import df

In [2]:
# Preview the first rows of the imported frame to check the available columns.
df.head()

Unnamed: 0,description,result_timestamp,laeq,hour,month,day_of_week,night_of_week,lat,lon,lc_dwptemp,lc_rainin,lc_dailyrain,lc_windspeed,lc_temp_qcl0,lc_temp_qcl1,lc_temp_qcl2,lc_temp_qcl3,count
0,MP 01: Naamsestraat 35 Maxim,2022-04-01 00:10:00,61.989333,0,4,Friday,Thursday,50.87725,4.700713,1.35,0.0,0.0,0.01,3.34,3.34,3.283,3.23854,13
1,MP 01: Naamsestraat 35 Maxim,2022-04-01 00:20:00,62.0955,0,4,Friday,Thursday,50.87725,4.700713,1.42,0.0,0.0,0.0,3.38,3.38,3.323,3.294944,13
2,MP 01: Naamsestraat 35 Maxim,2022-04-01 00:30:00,63.908667,0,4,Friday,Thursday,50.87725,4.700713,1.35,0.0,0.0,0.09,3.33,3.33,3.273,3.385025,13
3,MP 01: Naamsestraat 35 Maxim,2022-04-01 00:40:00,65.064833,0,4,Friday,Thursday,50.87725,4.700713,1.28,0.0,0.0,0.04,3.28,3.28,3.223,3.27909,13
4,MP 01: Naamsestraat 35 Maxim,2022-04-01 00:50:00,67.710833,0,4,Friday,Thursday,50.87725,4.700713,1.24,0.0,0.0,0.03,3.23,3.23,3.173,3.201228,13


In [3]:
# Parse the timestamp strings into pandas datetimes (in place on the imported frame).
# Note: `result_timestamp` is dropped again in the following cell.
df['result_timestamp'] = pd.to_datetime(df['result_timestamp'])

In [4]:
# Discard the temperature QC columns, the raw timestamp and the coordinates;
# none of them are used as model features below.
df = df.drop(
    columns=[
        'lc_temp_qcl0',
        'lc_temp_qcl1',
        'lc_temp_qcl2',
        'lc_temp_qcl3',
        'result_timestamp',
        'lat',
        'lon',
    ]
)

In [None]:
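# Intentionally disabled: rows with missing values are kept and imputed by the
# SimpleImputer steps of the preprocessing pipeline instead.
# df = df.dropna()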

In [5]:
# Target is the `laeq` column; every other column is a feature.
feature_mask = df.columns != 'laeq'
X = df.loc[:, feature_mask]
y = df['laeq']

In [6]:
# 80 % of the rows go to training, the rest to validation; the seed is fixed
# so the split is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=12,
)

In [None]:
# Number of missing values per feature column in the training split.
X_train.isna().sum()

In [None]:
# Number of missing target values in the training split.
y_train.isna().sum()

In [None]:
# (rows, columns) of the training features.
X_train.shape

In [None]:
# Preprocessing: impute + scale the numeric columns, impute + one-hot encode
# the categorical ones.
categorical_cols = ['description', 'hour', 'month', 'day_of_week', 'night_of_week']
numerical_cols = ['lc_dwptemp', 'lc_rainin', 'lc_dailyrain', 'lc_windspeed', 'count']

# Numeric branch: fill gaps with the column mean, then standardise.
numerical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode. Categories unseen during fit are ignored at transform time, and the
# encoder emits a dense array (needed for the pandas output requested below).
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ]
)

# NOTE(review): remainder='passthrough' forwards any column not listed above
# unchanged -- confirm the frame has no leftover columns (e.g. a saved index).
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_cols),
        ('num', numerical_transformer, numerical_cols),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False,
)
preprocessor.set_output(transform="pandas")

# Sanity check: after preprocessing the training data, count remaining NaNs
# per output column.
check = preprocessor.fit_transform(X_train)

check.isna().sum()

In [8]:
# Template pipeline: preprocessing followed by a model step. The empty list in
# the 'classifier' slot is only a placeholder; the parameter grid supplies the
# actual estimator under the 'classifier' key.
pipe = Pipeline(
    steps=[
        ('features', preprocessor),
        ('classifier', []),
    ]
)

In [9]:
# Candidate estimators that the search swaps into the pipeline's 'classifier'
# step (the step name is kept as-is so it keeps matching the pipeline).
# The target `laeq` is a continuous value and the search is scored with RMSE,
# so every candidate must be a regressor: the classifier variants
# (RidgeClassifier, LogisticRegression, LinearSVC) cannot be fitted on a
# continuous target. They are replaced by their regression counterparts.
grid = [{
    'classifier': [
        RandomForestRegressor(),
        Ridge(),
        LinearRegression(),
        LinearSVR(),
    ]
}]

In [10]:
# Cross-validated search over the candidate estimators, ranked by negated RMSE
# (higher is better) and run on all available cores.
gridSearch = GridSearchCV(
    estimator=pipe,
    param_grid=grid,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
)

In [11]:
# Run the cross-validated search on the training split only; the validation
# split is kept aside for the final evaluation.
gridSearch.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
# Full per-candidate cross-validation results of the search.
gridSearch.cv_results_

In [None]:
# Best cross-validation score; with 'neg_root_mean_squared_error' scoring this
# is the negated RMSE, so values closer to zero are better.
print(gridSearch.best_score_)
# Display the winning pipeline.
gridSearch.best_estimator_

In [None]:
# Predict the held-out validation features with the best pipeline from the search.
preds = gridSearch.best_estimator_.predict(X_val)

In [None]:
# Mean squared error on the validation split.
# The former `squared=True` argument only restated the default; the keyword was
# deprecated in scikit-learn 1.4 and removed in 1.6, where passing it raises a
# TypeError. Dropping it keeps the value identical on every version.
# Take np.sqrt(error) to compare against the RMSE-based search score.
error = mean_squared_error(y_val, preds)
error

In [None]:
# Coefficient of determination of the predictions on the validation split.
r2 = r2_score(y_true=y_val, y_pred=preds)
r2

In [None]:
# Median (not mean) absolute error on the validation split, despite the
# variable being called `mae`.
mae = median_absolute_error(y_true=y_val, y_pred=preds)
mae

In [None]:
# Persist the fitted, best-scoring pipeline found by the grid search.
# `pipe` is only the unfitted template (the search fits clones of it and it
# still holds the placeholder model step), so pickling `pipe` would store a
# model that cannot predict. The refitted `best_estimator_` is saved instead.
# NOTE(review): the destination is an absolute, user-specific path; consider
# making it configurable so the notebook runs on other machines.
with open('/Users/christianbutcher/Documents/MDA/project_real/mda_2023_monaco/app/pickle_rf_model.pkl', 'wb') as file:
    pickle.dump(gridSearch.best_estimator_, file)