In [17]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Ridge
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score, median_absolute_error
import pandas as pd
import numpy as np
from skopt.space import Real, Integer, Categorical
from skopt import BayesSearchCV
from data import df
import pickle
from sklearn import set_config

In [None]:
df.head()

In [17]:
df.describe()

Unnamed: 0,laeq,hour,month,lc_dwptemp,lc_rainin,lc_dailyrain,lc_windspeed,count
count,177866.0,177866.0,177866.0,177845.0,177845.0,177845.0,177845.0,177866.0
mean,48.040869,10.243177,6.901746,8.009622,6.7e-05,0.0008,0.075361,4.889777
std,6.551384,8.748248,3.30247,5.405742,0.000601,0.003185,0.162827,4.356244
min,29.197667,0.0,1.0,-7.61,0.0,0.0,0.0,0.0
25%,43.229833,3.0,4.0,4.28,0.0,0.0,0.0,1.0
50%,48.382,6.0,7.0,8.73,0.0,0.0,0.01,3.0
75%,53.07,20.0,10.0,12.02,0.0,0.0,0.08,10.0
max,84.31,23.0,12.0,20.6,0.029,0.049,2.82,14.0


In [3]:
df.result_timestamp = pd.to_datetime(df.result_timestamp)

In [3]:
#Drop the features we have decided not to use in the models
df = df.drop(['lc_temp_qcl0', 'lc_temp_qcl1', 'lc_temp_qcl2', 'lc_temp_qcl3', 'result_timestamp','lat','lon'], axis=1)

In [None]:
#df = df.dropna()

In [5]:
#Create a dataframe for the features and one for the target variable, laeq
X = df.loc[:, df.columns != 'laeq']
y = df.loc[:,'laeq']

In [5]:
#Create test and training sets
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, random_state=12)

In [None]:
X_train.isna().sum()

In [None]:
y_train.isna().sum()

In [None]:
X_train.shape

In [7]:
# Define the preprocessing steps we want to include in our pipeline
numerical_cols = ['lc_dwptemp', 'lc_rainin', 'lc_dailyrain', 'lc_windspeed', 'count']

categorical_cols = ['description', 'hour', 'month', 'day_of_week', 'night_of_week']

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer([
    ('cat', categorical_transformer, categorical_cols),
    ('num', numerical_transformer, numerical_cols)
],
remainder='passthrough',
verbose_feature_names_out = False,
).set_output(transform="pandas")

In [9]:
#Create the pipeline
pipe = Pipeline([
    ('preprocess', preprocessor),
    ('classifier', [])
])

In [10]:
#Define the different models we want to compare
grid = [{'classifier':[RandomForestRegressor(), Ridge(), SVR()]}]

In [11]:
#Define the grid search
gridSearch = GridSearchCV(pipe, grid, scoring='neg_root_mean_squared_error', n_jobs = -1, verbose=1, cv=3)

In [12]:
#Fit the search to our training data
gridSearch.fit(X_train, y_train)

Fitting 3 folds for each of 3 candidates, totalling 9 fits




KeyboardInterrupt: 

In [12]:
gridSearch.cv_results_

{'mean_fit_time': array([  80.43572934,    1.0424397 , 1036.12125047]),
 'std_fit_time': array([ 0.34884474,  0.13639568, 67.01143524]),
 'mean_score_time': array([3.39865335e+00, 3.37134997e-01, 8.51520972e+02]),
 'std_score_time': array([2.30599006e-02, 1.42020878e-02, 3.01026477e+01]),
 'param_classifier': masked_array(data=[RandomForestRegressor(), Ridge(), SVR()],
              mask=[False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'classifier': RandomForestRegressor()},
  {'classifier': Ridge()},
  {'classifier': SVR()}],
 'split0_test_score': array([-2.52258278, -4.22479101, -3.1123475 ]),
 'split1_test_score': array([-2.52294251, -4.23735282, -3.12515011]),
 'split2_test_score': array([-2.56985262, -4.23121362, -3.11584329]),
 'mean_test_score': array([-2.5384593 , -4.23111915, -3.1177803 ]),
 'std_test_score': array([0.02219892, 0.00512877, 0.00540313]),
 'rank_test_score': array([1, 3, 2], dtype=int32)}

In [13]:
#The best model as selected by the grid search
print(gridSearch.best_score_)
gridSearch.best_estimator_

-2.538459302675102


In [14]:
#Use the best model to predict on the test data
preds = gridSearch.best_estimator_.predict(X_val)

In [15]:
#Calculate performance measures
mse = mean_squared_error(y_val, preds, squared=True)
r2 = r2_score(y_val,preds)
mae = median_absolute_error(y_val, preds)

In [16]:
#Define the parameters we want to tune and the ranges over which to do so

n_estimators = Integer(25, 200)
max_features = ['sqrt', 'log2', 'auto', None]
max_depth = Integer(2,30)
min_samples_split = Integer(2,15)
bootstrap = [True, False]

In [17]:
#Define the parameter grid
param_grid = {
    'classifier__n_estimators': n_estimators,
    'classifier__max_features': max_features,
    'classifier__max_depth': max_depth,
    'classifier__min_samples_split': min_samples_split,
    'classifier__bootstrap': bootstrap,
}

In [18]:
#Initialise the Bayes search
bayesSearch = BayesSearchCV(gridSearch.best_estimator_, search_spaces=param_grid, scoring='neg_root_mean_squared_error', n_iter=20, n_jobs=-1)

In [19]:
#Fit it to the training data
bayesSearch.fit(X_train, y_train)




  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(




  warn(
  warn(
  warn(
  warn(
  warn(








In [20]:
#View the best score and the corresponding parameters
print(bayesSearch.best_score_)
bayesSearch.best_params_

-2.4773295540377496


OrderedDict([('classifier__bootstrap', True),
             ('classifier__max_depth', 30),
             ('classifier__max_features', None),
             ('classifier__min_samples_split', 2),
             ('classifier__n_estimators', 74)])

In [21]:
#Predict on the test set using the optimised model
preds_opt = bayesSearch.predict(X_val)

In [22]:
#Calculate performance measures
mse_opt = mean_squared_error(y_val, preds_opt, squared=True)
r2_opt = r2_score(y_val,preds_opt)
mae_opt = median_absolute_error(y_val, preds_opt)

In [23]:
#Compare performance measures between tuned and untuned model
data = {'grid':[gridSearch.best_score_,mse,r2,mae], 'bayes':[bayesSearch.best_score_,mse_opt,r2_opt,mae_opt]}
pd.DataFrame(data=data, index=['rmse','mse','r2','mae'])

Unnamed: 0,grid,bayes
rmse,-2.538459,-2.47733
mse,5.725596,5.70959
r2,0.867412,0.867782
mae,1.231092,1.23885


In [None]:
import boto3

access_key_id = 
secret_access_key = 

session = boto3.Session(
    aws_access_key_id=access_key_id ,
    aws_secret_access_key=secret_access_key,)

s3_resource = boto3.resource('s3')

bucket='mda.project.monaco'
key= 'pickle_model.pkl'

pickle_byte_obj = pickle.dumps(gridSearch)

s3_resource.Object(bucket,key).put(Body=pickle_byte_obj)

with open('/Users/christianbutcher/Documents/MDA/project_real/mda_2023_monaco/app/pickle_rf_model.pkl', 'wb') as file:
    pickle.dump(pipe, file)