In [29]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score, median_absolute_error
import pandas as pd
import numpy as np
from skopt.space import Real, Integer, Categorical
from skopt import BayesSearchCV
from data import df
import pickle
from sklearn import set_config

In [4]:
df.head()

Unnamed: 0,description,result_timestamp,laeq,hour,month,day_of_week,night_of_week,lat,lon,lc_dwptemp,lc_rainin,lc_dailyrain,lc_windspeed,lc_temp_qcl0,lc_temp_qcl1,lc_temp_qcl2,lc_temp_qcl3,count
0,MP 01: Naamsestraat 35 Maxim,2022-04-01 00:10:00,61.989333,0,4,Friday,Thursday,50.87725,4.700713,1.35,0.0,0.0,0.01,3.34,3.34,3.283,3.23854,13
1,MP 01: Naamsestraat 35 Maxim,2022-04-01 00:20:00,62.0955,0,4,Friday,Thursday,50.87725,4.700713,1.42,0.0,0.0,0.0,3.38,3.38,3.323,3.294944,13
2,MP 01: Naamsestraat 35 Maxim,2022-04-01 00:30:00,63.908667,0,4,Friday,Thursday,50.87725,4.700713,1.35,0.0,0.0,0.09,3.33,3.33,3.273,3.385025,13
3,MP 01: Naamsestraat 35 Maxim,2022-04-01 00:40:00,65.064833,0,4,Friday,Thursday,50.87725,4.700713,1.28,0.0,0.0,0.04,3.28,3.28,3.223,3.27909,13
4,MP 01: Naamsestraat 35 Maxim,2022-04-01 00:50:00,67.710833,0,4,Friday,Thursday,50.87725,4.700713,1.24,0.0,0.0,0.03,3.23,3.23,3.173,3.201228,13


In [5]:
df.result_timestamp = pd.to_datetime(df.result_timestamp)

In [35]:
df = df.drop(['lc_temp_qcl0', 'lc_temp_qcl1', 'lc_temp_qcl2', 'lc_temp_qcl3', 'result_timestamp','lat','lon'], axis=1)

In [36]:
df = df.dropna()

In [37]:
X = df.loc[:, df.columns != 'laeq']
y = df.loc[:,'laeq']

In [38]:
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, random_state=12)

In [11]:
X_train.isna().sum()

description      0
hour             0
month            0
day_of_week      0
night_of_week    0
lc_dwptemp       0
lc_rainin        0
lc_dailyrain     0
lc_windspeed     0
count            0
dtype: int64

In [12]:
y_train.isna().sum()

0

AttributeError: 'RandomForestRegressor' object has no attribute 'shape'

In [39]:
set_config(transform_output="pandas")

In [None]:
impute_and_one_hot_encode = Pipeline([
        ('impute', SimpleImputer(strategy='most_frequent')),
        ('encode', OneHotEncoder(handle_unknown='ignore',sparse_output=False))
    ])

In [100]:
featurisation = ColumnTransformer([
    ('impute_encode', impute_and_one_hot_encode, ['description', 'hour', 'month', 'day_of_week', 'night_of_week']),
    ('impute', SimpleImputer(missing_values=pd.NA, strategy='median'), ['lc_dwptemp', 'lc_rainin', 'lc_dailyrain', 'lc_windspeed','count']),
    ('scaler', StandardScaler(), ['lc_dwptemp', 'lc_rainin', 'lc_dailyrain', 'lc_windspeed','count']),
    ],
    verbose_feature_names_out = False,
    )

In [101]:
pipe = Pipeline([
    ('features', featurisation),
    ('model', RandomForestRegressor())
])

In [102]:
pipe.fit(X_train,y_train)



In [57]:
preds = pipe.predict(X_val)

In [58]:
error = mean_squared_error(y_val, preds, squared=True)
error

5.724330513529574

In [59]:
r2 = r2_score(y_val,preds)
r2

0.8660482213339371

In [60]:
mae = median_absolute_error(y_val, preds)
mae

1.2299907382550188

In [91]:
with open('/Users/christianbutcher/Documents/MDA/project_real/mda_2023_monaco/app/pickle_rf_model.pkl', 'wb') as file:
    pickle.dump(pipe, file)

In [85]:
n_estimators = [100,1000,2500,5000]
max_depth = [2,5,10,25]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

In [86]:
param_grid = {
    'model__n_estimators': n_estimators,
    'model__max_depth':max_depth,
    'model__min_samples_leaf':min_samples_leaf,
    'model__bootstrap':bootstrap
}

In [89]:
search = BayesSearchCV(pipe, search_spaces=param_grid,n_jobs=-1,n_iter=5,scoring='neg_root_mean_squared_error')

In [92]:
search.fit(X,y)











libc++abi: libc++abi: terminating with uncaught exception of type std::runtime_error: Couldn't close fileterminating with uncaught exception of type std::runtime_error: Couldn't close file

libc++abi: terminating with uncaught exception of type std::runtime_error: Couldn't close file
libc++abi: terminating with uncaught exception of type std::runtime_error: Couldn't close file




TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

The exit codes of the workers are {SIGABRT(-6)}

In [None]:
print(search.best_score_)
search.best_params_

-5.197479038128657


{'model__n_estimators': 2500,
 'model__min_samples_leaf': 4,
 'model__max_depth': 10,
 'model__bootstrap': False}

In [None]:
pipe_opt = Pipeline([
    ('features', featurisation),
    ('model', RandomForestRegressor(n_estimators=2500,
                                    min_samples_leaf=4,
                                    max_depth=10,
                                    bootstrap=False))
])

In [None]:
pipe_opt.fit(X_train,y_train)

In [None]:
preds_opt = pipe_opt.predict(X_val)

In [None]:
error_opt = mean_squared_error(y_val, preds_opt, squared=True)
error_opt

15.582625024741144

In [None]:
r2_opt = r2_score(y_val,preds_opt)
r2_opt

0.6353599196592591

In [None]:
mae_opt = median_absolute_error(y_val, preds_opt)
mae_opt

2.1539054287205346