In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Ridge
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score, median_absolute_error
import pandas as pd
import numpy as np
from skopt.space import Real, Integer, Categorical
from skopt import BayesSearchCV
from data import df
import pickle
from sklearn import set_config

In [None]:
# Preview the first rows of the imported dataframe (shown as the cell output)
df.head()

In [None]:
# Summary statistics of the dataframe (shown as the cell output)
df.describe()

In [None]:
# Parse the result_timestamp column into pandas datetimes
df['result_timestamp'] = pd.to_datetime(df['result_timestamp'])

In [None]:
# Remove the columns that were ruled out as model inputs
df = df.drop(
    columns=[
        'lc_temp_qcl0',
        'lc_temp_qcl1',
        'lc_temp_qcl2',
        'lc_temp_qcl3',
        'result_timestamp',
        'lat',
        'lon',
    ]
)

In [None]:
# Separate the target column (laeq) from the remaining columns, which become the features
y = df['laeq']
X = df.drop(columns='laeq')

In [None]:
# Keep 80% of the rows for training and hold the rest out for validation;
# the fixed random_state makes the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=12,
)

In [None]:
# Preprocessing building blocks used by the modelling pipeline

# Columns treated as categories and columns treated as continuous values
categorical_cols = ['description', 'hour', 'month', 'day_of_week', 'night_of_week']
numerical_cols = ['lc_dwptemp', 'lc_rainin', 'lc_dailyrain', 'lc_windspeed', 'count']

# Numerical columns: fill missing values with the column mean, then standardise
numerical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode into a dense array; categories unseen at fit time are ignored
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ]
)

# Route each column group through its transformer; columns in neither list are
# passed through unchanged, and feature names are not prefixed with the transformer name
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_cols),
        ('num', numerical_transformer, numerical_cols),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False,
)
# Have the transformer emit pandas DataFrames (set_output returns the same object)
preprocessor = preprocessor.set_output(transform="pandas")

In [None]:
# Modelling pipeline: preprocessing followed by a placeholder final step.
# 'passthrough' is scikit-learn's documented no-op step value, so the template is a
# valid Pipeline on its own (an empty list is not a valid step). The grid search
# replaces the 'classifier' slot with each candidate regressor; the step name is kept
# because the search spaces address it as 'classifier__<param>'.
pipe = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('classifier', 'passthrough'),
])

In [None]:
# Candidate models to compare in the 'classifier' slot of the pipeline.
# The random forest is seeded (same seed as the train/validation split) so that
# re-running the notebook selects the same model with the same score; Ridge and SVR
# are left at their defaults.
grid = [
    {
        'classifier': [
            RandomForestRegressor(random_state=12),
            Ridge(),
            SVR(),
        ]
    }
]

In [None]:
# Cross-validated comparison of the candidate models, scored by negative RMSE
gridSearch = GridSearchCV(
    estimator=pipe,
    param_grid=grid,
    scoring='neg_root_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Run the model comparison on the training split
gridSearch.fit(X=X_train, y=y_train)

In [None]:
# Best cross-validated score found by the grid search; the scoring is negative RMSE,
# so values closer to zero are better
print(gridSearch.best_score_)
# The pipeline that achieved that score (shown as the cell output)
gridSearch.best_estimator_

In [None]:
# Predict on the held-out validation split; the fitted search forwards predict()
# to its best estimator
preds = gridSearch.predict(X_val)

In [None]:
# Validation-set performance of the untuned model.
# `squared=True` is omitted: it is the default behaviour (plain MSE) and the keyword
# is deprecated in recent scikit-learn releases and removed in newer ones.
mse = mean_squared_error(y_val, preds)
r2 = r2_score(y_val, preds)
# NOTE: despite the variable name, this is the *median* absolute error
mae = median_absolute_error(y_val, preds)

In [None]:
# Hyper-parameters to tune and the range/choices explored for each

# Integer-valued ranges
n_estimators = Integer(low=25, high=200)
max_depth = Integer(low=2, high=30)
min_samples_split = Integer(low=2, high=15)

# Discrete choices
max_features = Categorical(['sqrt', 'log2', None])
bootstrap = Categorical([True, False])

In [None]:
# Search space keyed by pipeline parameter name: every hyper-parameter belongs to
# the 'classifier' step, so each key gets the 'classifier__' prefix
param_grid = {
    f'classifier__{param}': space
    for param, space in (
        ('n_estimators', n_estimators),
        ('max_features', max_features),
        ('max_depth', max_depth),
        ('min_samples_split', min_samples_split),
        ('bootstrap', bootstrap),
    )
}

In [None]:
# Initialise the Bayes search on the pipeline selected by the grid search.
# The search space only contains RandomForestRegressor hyper-parameters, so fail
# early with a clear message if the grid search selected a different model instead
# of erroring later inside the search with an "invalid parameter" message.
best_pipeline = gridSearch.best_estimator_
best_model = best_pipeline.named_steps['classifier']
if not isinstance(best_model, RandomForestRegressor):
    raise TypeError(
        "The Bayes search space is defined for RandomForestRegressor, but the grid "
        f"search selected {type(best_model).__name__}"
    )

bayesSearch = BayesSearchCV(
    best_pipeline,
    search_spaces=param_grid,
    scoring='neg_root_mean_squared_error',
    n_iter=20,
    n_jobs=-1,
    # Seed the optimiser (same seed as the train/validation split) so the sampled
    # hyper-parameter candidates are repeatable between runs
    random_state=12,
)

In [None]:
# Run the hyper-parameter optimisation on the training split
bayesSearch.fit(X=X_train, y=y_train)

In [None]:
# Best cross-validated score found by the Bayes search; the scoring is negative RMSE,
# so values closer to zero are better
print(bayesSearch.best_score_)
# The hyper-parameter values that achieved that score (shown as the cell output)
bayesSearch.best_params_

In [None]:
# Predict on the held-out validation split with the tuned model
preds_opt = bayesSearch.predict(X=X_val)

In [None]:
# Validation-set performance of the tuned model.
# `squared=True` is omitted: it is the default behaviour (plain MSE) and the keyword
# is deprecated in recent scikit-learn releases and removed in newer ones.
mse_opt = mean_squared_error(y_val, preds_opt)
r2_opt = r2_score(y_val, preds_opt)
# NOTE: despite the variable name, this is the *median* absolute error
mae_opt = median_absolute_error(y_val, preds_opt)

In [None]:
# Side-by-side table of the untuned ('grid') and tuned ('bayes') models.
# Row '-rmse' is the best cross-validated score of each search (negative RMSE);
# 'mse', 'r2' and 'mae' are computed on the validation split, where 'mae' holds
# the median absolute error.
data = {
    'grid': [gridSearch.best_score_, mse, r2, mae],
    'bayes': [bayesSearch.best_score_, mse_opt, r2_opt, mae_opt],
}
pd.DataFrame(data, index=['-rmse', 'mse', 'r2', 'mae'])

In [None]:
import os

import boto3

# Upload the pickled Bayes search object to S3.
# Credentials are read from the environment instead of being written into the
# notebook. If a variable is unset, None is passed and boto3 falls back to its own
# default credential lookup.
access_key_id = os.environ.get('AWS_ACCESS_KEY_ID')
secret_access_key = os.environ.get('AWS_SECRET_ACCESS_KEY')

session = boto3.Session(
    aws_access_key_id=access_key_id,
    aws_secret_access_key=secret_access_key,
)

# Create the resource from the session so the credentials above are actually used;
# the module-level boto3.resource() would use the default session and ignore them.
s3_resource = session.resource('s3')

bucket = 'mda.project.monaco'
key = 'pickle_model.pkl'

# Serialise in memory and store the bytes as the object body
pickle_byte_obj = pickle.dumps(bayesSearch)

s3_resource.Object(bucket, key).put(Body=pickle_byte_obj)

In [None]:

# Save the tuned, fitted pipeline for the app.
# `pipe` is only the unfitted template handed to the searches (it is never fitted
# in this notebook), so pickling it would store a model that cannot predict. The
# best estimator of the Bayes search is the fitted pipeline to persist.
# NOTE(review): this absolute, user-specific path only exists on one machine —
# consider a path relative to the project directory.
with open('/Users/christianbutcher/Documents/MDA/project_real/mda_2023_monaco/app/pickle_rf_model.pkl', 'wb') as file:
    pickle.dump(bayesSearch.best_estimator_, file)