In [1]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score, median_absolute_error
import pandas as pd
import numpy as np
from skopt.space import Real, Integer, Categorical
from skopt import BayesSearchCV
from data import df



In [2]:
df.head()

Unnamed: 0,description,result_timestamp,laeq,hour,month,day_of_week,night_of_week,lat,lon,lc_dwptemp,lc_rainin,lc_dailyrain,lc_windspeed,lc_temp_qcl0,lc_temp_qcl1,lc_temp_qcl2,lc_temp_qcl3,count
0,MP 01: Naamsestraat 35 Maxim,2022-04-01 00:10:00,61.989333,0,4,Friday,Thursday,50.87725,4.700713,1.35,0.0,0.0,0.01,3.34,3.34,3.283,3.23854,13
1,MP 01: Naamsestraat 35 Maxim,2022-04-01 00:20:00,62.0955,0,4,Friday,Thursday,50.87725,4.700713,1.42,0.0,0.0,0.0,3.38,3.38,3.323,3.294944,13
2,MP 01: Naamsestraat 35 Maxim,2022-04-01 00:30:00,63.908667,0,4,Friday,Thursday,50.87725,4.700713,1.35,0.0,0.0,0.09,3.33,3.33,3.273,3.385025,13
3,MP 01: Naamsestraat 35 Maxim,2022-04-01 00:40:00,65.064833,0,4,Friday,Thursday,50.87725,4.700713,1.28,0.0,0.0,0.04,3.28,3.28,3.223,3.27909,13
4,MP 01: Naamsestraat 35 Maxim,2022-04-01 00:50:00,67.710833,0,4,Friday,Thursday,50.87725,4.700713,1.24,0.0,0.0,0.03,3.23,3.23,3.173,3.201228,13


In [3]:
df.result_timestamp = pd.to_datetime(df.result_timestamp)

In [4]:
impute_and_one_hot_encode = Pipeline([
        ('impute', SimpleImputer(strategy='most_frequent')),
        ('encode', OneHotEncoder(handle_unknown='ignore'))
    ])

In [5]:
df = df.drop(['lc_temp_qcl0', 'lc_temp_qcl1', 'lc_temp_qcl2', 'lc_temp_qcl3', 'result_timestamp','lat','lon'], axis=1)

In [6]:
df = df.dropna()

In [7]:
X = df.loc[:, df.columns != 'laeq']
y = df.loc[:,'laeq']

In [8]:
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, random_state=12)

In [9]:
X_train.isna().sum()

description      0
hour             0
month            0
day_of_week      0
night_of_week    0
lc_dwptemp       0
lc_rainin        0
lc_dailyrain     0
lc_windspeed     0
count            0
dtype: int64

In [10]:
y_train.isna().sum()

0

In [11]:
featurisation = ColumnTransformer([
    ('impute_encode', impute_and_one_hot_encode, ['description', 'hour', 'month', 'day_of_week', 'night_of_week']),
    ('impute', SimpleImputer(missing_values=pd.NA, strategy='median'), ['lc_dwptemp', 'lc_rainin', 'lc_dailyrain', 'lc_windspeed','count']),
    ('scaler', StandardScaler(), ['lc_dwptemp', 'lc_rainin', 'lc_dailyrain', 'lc_windspeed','count']),
])

In [12]:
pipe = Pipeline([
    ('features', featurisation),
    ('model', RandomForestRegressor())
])

In [13]:
pipe.fit(X_train,y_train)

In [14]:
preds = pipe.predict(X_val)

In [15]:
error = mean_squared_error(y_val, preds, squared=True)
error

5.708770076653399

In [16]:
r2 = r2_score(y_val,preds)
r2

0.8664123422021257

In [17]:
mae = median_absolute_error(y_val, preds)
mae

1.2350974999999806

In [18]:
pipe.steps[1][1].feature_importances_

array([0.00504256, 0.00716835, 0.01023822, 0.00328778, 0.00204813,
       0.01732254, 0.00432061, 0.07250795, 0.01288884, 0.00914053,
       0.00529741, 0.00168303, 0.00174774, 0.00399322, 0.02769027,
       0.06510656, 0.04610002, 0.04069857, 0.03259607, 0.02841879,
       0.01762228, 0.01240328, 0.00307074, 0.01312082, 0.00360352,
       0.00909968, 0.00607235, 0.00756583, 0.00588953, 0.00339287,
       0.01526844, 0.00911305, 0.00547135, 0.00494152, 0.00692451,
       0.01271058, 0.01610662, 0.00327492, 0.00443862, 0.002666  ,
       0.01395571, 0.00530974, 0.01463094, 0.01143229, 0.00615798,
       0.00280248, 0.00400551, 0.06345245, 0.00098936, 0.00823797,
       0.01903463, 0.09638814, 0.06326192, 0.00101628, 0.00800673,
       0.01911133, 0.10215283])

In [None]:
n_estimators = [100,1000,2500,5000]
max_depth = [2,5,10,25]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

In [None]:
param_grid = {
    'model__n_estimators': n_estimators,
    'model__max_depth':max_depth,
    'model__min_samples_leaf':min_samples_leaf,
    'model__bootstrap':bootstrap
}

In [None]:
search = BayesSearchCV(pipe, search_spaces=param_grid,n_jobs=-1,n_iter=5,scoring='neg_root_mean_squared_error')

In [None]:
search.fit(X,y)

In [None]:
print(search.best_score_)
search.best_params_