In [1]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import LinearSVC
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score, median_absolute_error
import pandas as pd
import numpy as np
from skopt.space import Real, Integer, Categorical
from skopt import BayesSearchCV
from data import df
import pickle
from sklearn import set_config



In [2]:
df.head()

Unnamed: 0,description,result_timestamp,laeq,hour,month,day_of_week,night_of_week,lat,lon,lc_dwptemp,lc_rainin,lc_dailyrain,lc_windspeed,lc_temp_qcl0,lc_temp_qcl1,lc_temp_qcl2,lc_temp_qcl3,count
0,MP 01: Naamsestraat 35 Maxim,2022-04-01 00:10:00,61.989333,0,4,Friday,Thursday,50.87725,4.700713,1.35,0.0,0.0,0.01,3.34,3.34,3.283,3.23854,13
1,MP 01: Naamsestraat 35 Maxim,2022-04-01 00:20:00,62.0955,0,4,Friday,Thursday,50.87725,4.700713,1.42,0.0,0.0,0.0,3.38,3.38,3.323,3.294944,13
2,MP 01: Naamsestraat 35 Maxim,2022-04-01 00:30:00,63.908667,0,4,Friday,Thursday,50.87725,4.700713,1.35,0.0,0.0,0.09,3.33,3.33,3.273,3.385025,13
3,MP 01: Naamsestraat 35 Maxim,2022-04-01 00:40:00,65.064833,0,4,Friday,Thursday,50.87725,4.700713,1.28,0.0,0.0,0.04,3.28,3.28,3.223,3.27909,13
4,MP 01: Naamsestraat 35 Maxim,2022-04-01 00:50:00,67.710833,0,4,Friday,Thursday,50.87725,4.700713,1.24,0.0,0.0,0.03,3.23,3.23,3.173,3.201228,13


In [3]:
df.result_timestamp = pd.to_datetime(df.result_timestamp)

In [4]:
df = df.drop(['lc_temp_qcl0', 'lc_temp_qcl1', 'lc_temp_qcl2', 'lc_temp_qcl3', 'result_timestamp','lat','lon'], axis=1)

In [5]:
#df = df.dropna()

In [6]:
X = df.loc[:, df.columns != 'laeq']
y = df.loc[:,'laeq']

In [7]:
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, random_state=12)

In [8]:
X_train.isna().sum()

description       0
hour              0
month             0
day_of_week       0
night_of_week     0
lc_dwptemp       16
lc_rainin        16
lc_dailyrain     16
lc_windspeed     16
count             0
dtype: int64

In [9]:
y_train.isna().sum()

0

In [10]:
X_train.shape

(142292, 10)

In [11]:
set_config(transform_output="pandas")

In [12]:
impute_encode = Pipeline([
        ('impute', SimpleImputer(strategy='most_frequent')),
        ('encode', OneHotEncoder(handle_unknown='ignore',sparse_output=False))
    ])

In [13]:
featurisation = ColumnTransformer([
    ('impute_encode', impute_encode, ['description', 'hour', 'month', 'day_of_week', 'night_of_week']),
    ('impute_num', SimpleImputer(missing_values = np.nan , strategy='mean'), ['lc_dwptemp', 'lc_rainin', 'lc_dailyrain', 'lc_windspeed','count']),
    ('scaler', StandardScaler(), ['lc_dwptemp', 'lc_rainin', 'lc_dailyrain', 'lc_windspeed','count']),
    ],
    verbose_feature_names_out = False,
    )

In [14]:
check = featurisation.fit_transform(X_train)

In [15]:
check.isna().sum()

description_MP 01: Naamsestraat 35  Maxim        0
description_MP 02: Naamsestraat 57 Xior          0
description_MP 03: Naamsestraat 62 Taste         0
description_MP 04: His & Hears                   0
description_MP 05: Calvariekapel KU Leuven       0
description_MP 06: Parkstraat 2 La Filosovia     0
description_MP 07: Naamsestraat 81               0
description_MP08bis - Vrijthof                   0
hour_0                                           0
hour_1                                           0
hour_2                                           0
hour_3                                           0
hour_4                                           0
hour_5                                           0
hour_6                                           0
hour_7                                           0
hour_19                                          0
hour_20                                          0
hour_21                                          0
hour_22                        

In [72]:
pipe = Pipeline([
    ('features', featurisation),
    ('classifier', [])
])

In [84]:
grid = [{'classifier':[RandomForestRegressor(), RidgeClassifier(), LogisticRegression(), LinearSVC()]}]

In [86]:
gridSearch = GridSearchCV(pipe, grid, scoring='neg_root_mean_squared_error', n_jobs = -1)

In [87]:
gridSearch.fit(X_train, y_train)













15 fits failed out of a total of 20.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/christianbutcher/opt/anaconda3/envs/mdaproject/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/christianbutcher/opt/anaconda3/envs/mdaproject/lib/python3.8/site-packages/sklearn/pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/christianbutcher/opt/anaconda3/envs/mdaproject/lib/python3.8/site-packages/sklearn/linear_model/_ridge.py", line 1422, in fit
    X, y, sample_weight, Y = self._prepare_data

In [94]:
gridSearch.cv_results_

{'mean_fit_time': array([100.58538413,   1.4614295 ,   0.77493119,   0.43010716]),
 'std_fit_time': array([0.49807647, 0.28343996, 0.16769718, 0.04550631]),
 'mean_score_time': array([2.08630514, 0.        , 0.        , 0.        ]),
 'std_score_time': array([0.17330343, 0.        , 0.        , 0.        ]),
 'param_classifier': masked_array(data=[RandomForestRegressor(), RidgeClassifier(),
                    LogisticRegression(), LinearSVC()],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'classifier': RandomForestRegressor()},
  {'classifier': RidgeClassifier()},
  {'classifier': LogisticRegression()},
  {'classifier': LinearSVC()}],
 'split0_test_score': array([-2.47114372,         nan,         nan,         nan]),
 'split1_test_score': array([-2.50768563,         nan,         nan,         nan]),
 'split2_test_score': array([-2.45338189,         nan,         nan,         nan]),
 'split3_test_score': array([-2.469230

In [113]:
print(gridSearch.best_score_)
gridSearch.best_estimator_

-2.478242708055242


In [109]:
preds = gridSearch.best_estimator_.predict(X_val)

In [110]:
error = mean_squared_error(y_val, preds, squared=True)
error

5.733173473118081

In [111]:
r2 = r2_score(y_val,preds)
r2

0.8658412923030656

In [112]:
mae = median_absolute_error(y_val, preds)
mae

1.233143333333338

In [None]:
with open('/Users/christianbutcher/Documents/MDA/project_real/mda_2023_monaco/app/pickle_rf_model.pkl', 'wb') as file:
    pickle.dump(pipe, file)