# Modelling

In [1]:
import pandas as pd
import numpy as np
import re
import time
import os

from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Perceptron, LogisticRegression, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier

from imblearn.combine import SMOTETomek
from imblearn.pipeline import Pipeline

rng = np.random.RandomState(42)

# from keras import Sequential
# from keras.layers import Dense

import warnings
warnings.filterwarnings('ignore') # hide warnings to avoid cluttering the notebook output

In [2]:
class TimerError(Exception):
    """A custom exception used to report errors in use of Timer class"""

class Timer:
    def __init__(self):
        self._start_time = None
        self._elapsed_time = None

    def start(self):
        """Start a new timer"""
        if self._start_time is not None:
            raise TimerError(f"Timer is running. Use .stop() to stop it")

        self._elapsed_time = None
        self._start_time = time.perf_counter()

    def stop(self):
        """Stop the timer, and report the elapsed time"""
        if self._start_time is None:
            raise TimerError(f"Timer is not running. Use .start() to start it")

        self._elapsed_time = time.perf_counter() - self._start_time
        self._start_time = None
        print(f"Elapsed time: {self._elapsed_time:0.4f} seconds")

    def duration(self):
        """Return the elapsed time from the timer."""
        if self._elapsed_time is None:
            raise TimerError(f"Timer has not run. Use .start() and .stop() to start and stop the timer.")

        return self._elapsed_time

In [3]:
df = pd.read_csv('../assets/df_merge_final.csv')

In [4]:
def create_dt_features(dataframe):
    dataframe['daylight_duration'] = pd.to_timedelta(dataframe['daylight_duration'])
    dataframe['Daylight_hours'] = dataframe['daylight_duration'].dt.total_seconds() / (60*60)
    dataframe['Date'] = pd.to_datetime(dataframe['Date'])
    dataframe['Month'] = dataframe['Date'].dt.month_name()
    dataframe['Day'] = dataframe['Date'].dt.day_of_year
    return dataframe

In [5]:
df = create_dt_features(df)

In [6]:
# specify features
num_features = ['Latitude', 'Longitude', 'AddressAccuracy', 'Tavg', 'Depart',
       'Heat', 'PrecipTotal', 'SeaLevel', 'ResultDir', 'AvgSpeed',
       'Humidity', 'Daylight_hours', 'Day']
cat_features = ['Species', 'CodeSum', 'Month']

features = num_features + cat_features

# define metric to optimise during GridSearch
score_metric = 'roc_auc'

In [7]:
X = df[features]
y = df['WnvPresent']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=rng, stratify=y)

In [9]:
# Processing steps before modelling (standardisation, one hot encoding, over/under-sampling)
standardize = MinMaxScaler(feature_range=(0, 1))
ohe = OneHotEncoder(drop='if_binary', handle_unknown='infrequent_if_exist')

sample_smotetomek = SMOTETomek(random_state=rng, n_jobs=2)

In [10]:
# WORKING FILE
# perceptron = Perceptron(random_state=42)

# model = Sequential()
# model.add(Dense(50, input_dim=53, activation='relu')) # input
# model.add(Dense(100, activation='relu')) # hidden
# model.add(Dense(50, activation='relu'))  # hidden
# model.add(Dense(1, activation='sigmoid'))
# model.compile(loss="binary_crossentropy", optimizer="adam", metrics=['binary_crossentropy']) 


In [11]:
# Instantiating models for classification
lr = LogisticRegression(max_iter=200, random_state=rng)
rc = RidgeClassifier(random_state=rng)
knn = KNeighborsClassifier()
pct = Perceptron(random_state=rng)
rf = RandomForestClassifier(n_estimators=200, random_state=rng)
ada = AdaBoostClassifier(random_state=rng)
gbc = GradientBoostingClassifier(random_state=rng)

In [12]:
lr_params = {
    'clf__solver': ['liblinear'],
    'clf__C': [100, 10, 1.0, 0.1, 0.01],
}

rc_params = {
    'clf__alpha': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]   
}

knn_params = {
    'clf__n_neighbors': [3, 5, 9, 15],
    'clf__weights': ['uniform', 'distance']
}

pct_params = {
    'clf__penalty': [None, 'l1', 'l2' 'elasticnet'],
    'clf__class_weight': [None, 'balanced']
}

rf_params = {
    'clf__ccp_alpha': [0.0, 0.01, 0.1],
    'clf__max_features': ['sqrt', 'log2']
}

ada_params = {
    'clf__learning_rate': [1.0, 2.0, 10]
}

gbc_params = {
    'clf__learning_rate': [0.001, 0.01, 0.1],
    'clf__subsample': [0.5, 0.7, 1.0],
    'clf__max_depth': [3, 7, 9],
}

In [13]:
col_transform = ColumnTransformer([
    ('ss', standardize, num_features),
    ('ohe', ohe, cat_features),
], remainder='drop'
)

In [14]:
model_dict = {
    'model_list': [],
    'mean_train_score': [],
    'mean_test_score': [],
    'best_params': [],
    'runtime': []
}

In [15]:
# run pipe, fit params and return best estimator
def run_pipe(clf, pipe_params):
    t = Timer()
    model_name = re.match(r'^(\w+)(?=\()', str(clf)).group()
    print(f"Fitting {model_name}")

    # create pipe
    pipe = Pipeline([
        ('transform', col_transform),
        ('sample', sample_smotetomek),
        ('clf', clf)
    ])
    
    
    # create GridSearchCV
    grid = GridSearchCV(
        estimator = pipe,
        param_grid = pipe_params,
        scoring = score_metric,
        n_jobs = -3,
        cv = 3,
        verbose = 1,
        error_score=0
    )

    t.start()
    grid.fit(X_train, y_train)
    t.stop()
    
    test_score = cross_val_score(grid.best_estimator_, X_test, y_test, scoring=score_metric, cv=3, n_jobs=-3)
    # store average scores
    model_dict['model_list'].append(model_name)
    model_dict['mean_train_score'].append(grid.best_score_)
    model_dict['mean_test_score'].append(test_score.mean())
    model_dict['best_params'].append(grid.best_params_)
    model_dict['runtime'].append(t.duration())

    print("Best Score: ", grid.best_score_)
    print("Best Params: ", grid.best_params_)
    print()
    return grid.best_estimator_

In [16]:
# create table with model performance results
def table_model_results():
    model_performance = pd.DataFrame(model_dict)
    
    model_performance.insert(
        loc = 3,
        column = 'score_delta',
        value = abs(model_performance['mean_train_score'] - model_performance['mean_test_score'])
        )

    return model_performance.sort_values(by="mean_train_score", ascending=False).round({'mean_train_score': 4, 'mean_test_score':4, 'score_delta':4, 'runtime': 2})

In [17]:
lr_best = run_pipe(lr, lr_params)

Fitting LogisticRegression
Fitting 3 folds for each of 5 candidates, totalling 15 fits
Elapsed time: 23.6461 seconds
Best Score:  0.8236324909584477
Best Params:  {'clf__C': 10, 'clf__solver': 'liblinear'}



In [18]:
rc_best = run_pipe(rc, rc_params)

Fitting RidgeClassifier
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Elapsed time: 29.1371 seconds
Best Score:  0.8170834727337734
Best Params:  {'clf__alpha': 0.9}



In [19]:
knn_best = run_pipe(knn, knn_params)

Fitting KNeighborsClassifier
Fitting 3 folds for each of 8 candidates, totalling 24 fits
Elapsed time: 28.5399 seconds
Best Score:  0.7929891958696201
Best Params:  {'clf__n_neighbors': 15, 'clf__weights': 'uniform'}



In [20]:
pct_best = run_pipe(pct, pct_params)

Fitting Perceptron
Fitting 3 folds for each of 6 candidates, totalling 18 fits
Elapsed time: 21.5490 seconds
Best Score:  0.8052279679750282
Best Params:  {'clf__class_weight': None, 'clf__penalty': None}



In [21]:
rf_best = run_pipe(rf, rf_params)

Fitting RandomForestClassifier
Fitting 3 folds for each of 6 candidates, totalling 18 fits
Elapsed time: 64.4267 seconds
Best Score:  0.8134973836429381
Best Params:  {'clf__ccp_alpha': 0.01, 'clf__max_features': 'sqrt'}



In [22]:
ada_best = run_pipe(ada, ada_params)

Fitting AdaBoostClassifier
Fitting 3 folds for each of 3 candidates, totalling 9 fits
Elapsed time: 16.3228 seconds
Best Score:  0.8169208058881186
Best Params:  {'clf__learning_rate': 1.0}



In [23]:
gbc_best = run_pipe(gbc, gbc_params)

Fitting GradientBoostingClassifier
Fitting 3 folds for each of 27 candidates, totalling 81 fits
Elapsed time: 154.4913 seconds
Best Score:  0.829426360347667
Best Params:  {'clf__learning_rate': 0.1, 'clf__max_depth': 3, 'clf__subsample': 0.7}



In [24]:
table_model_results()

Unnamed: 0,model_list,mean_train_score,mean_test_score,score_delta,best_params,runtime
6,GradientBoostingClassifier,0.8294,0.7777,0.0517,"{'clf__learning_rate': 0.1, 'clf__max_depth': ...",154.49
0,LogisticRegression,0.8236,0.7684,0.0552,"{'clf__C': 10, 'clf__solver': 'liblinear'}",23.65
1,RidgeClassifier,0.8171,0.7589,0.0582,{'clf__alpha': 0.9},29.14
5,AdaBoostClassifier,0.8169,0.7702,0.0468,{'clf__learning_rate': 1.0},16.32
4,RandomForestClassifier,0.8135,0.7825,0.031,"{'clf__ccp_alpha': 0.01, 'clf__max_features': ...",64.43
3,Perceptron,0.8052,0.7489,0.0563,"{'clf__class_weight': None, 'clf__penalty': None}",21.55
2,KNeighborsClassifier,0.793,0.733,0.06,"{'clf__n_neighbors': 15, 'clf__weights': 'unif...",28.54


In [25]:
# create VotingClassifier ensemble with best performing classifiers
voting_clf = VotingClassifier([
    ('LogReg', lr),
#     ('Ridge', rc),
    ('AdaBoost', ada)
])

voting_params = {
    'sample__sampling_strategy': ['auto', 0.1, 0.2, 0.3],
    'clf__voting': ['hard', 'soft'],
    'clf__LogReg__solver': ['liblinear'],
    'clf__LogReg__C': [100, 10, 1.0, 0.1, 0.01],
#     'clf__Ridge__alpha': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]   
}

In [26]:
voting_best = run_pipe(voting_clf, voting_params)

Fitting VotingClassifier
Fitting 3 folds for each of 40 candidates, totalling 120 fits
Elapsed time: 56.6272 seconds
Best Score:  0.8266402163730341
Best Params:  {'clf__LogReg__C': 100, 'clf__LogReg__solver': 'liblinear', 'clf__voting': 'soft', 'sample__sampling_strategy': 0.3}



In [27]:
table_model_results()

Unnamed: 0,model_list,mean_train_score,mean_test_score,score_delta,best_params,runtime
6,GradientBoostingClassifier,0.8294,0.7777,0.0517,"{'clf__learning_rate': 0.1, 'clf__max_depth': ...",154.49
7,VotingClassifier,0.8266,0.7701,0.0565,"{'clf__LogReg__C': 100, 'clf__LogReg__solver':...",56.63
0,LogisticRegression,0.8236,0.7684,0.0552,"{'clf__C': 10, 'clf__solver': 'liblinear'}",23.65
1,RidgeClassifier,0.8171,0.7589,0.0582,{'clf__alpha': 0.9},29.14
5,AdaBoostClassifier,0.8169,0.7702,0.0468,{'clf__learning_rate': 1.0},16.32
4,RandomForestClassifier,0.8135,0.7825,0.031,"{'clf__ccp_alpha': 0.01, 'clf__max_features': ...",64.43
3,Perceptron,0.8052,0.7489,0.0563,"{'clf__class_weight': None, 'clf__penalty': None}",21.55
2,KNeighborsClassifier,0.793,0.733,0.06,"{'clf__n_neighbors': 15, 'clf__weights': 'unif...",28.54


In [28]:
preds = voting_best.predict(X_test)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.96      0.92      0.94      3983
           1       0.20      0.38      0.26       220

    accuracy                           0.89      4203
   macro avg       0.58      0.65      0.60      4203
weighted avg       0.92      0.89      0.90      4203



In [29]:
voting_best.fit(X, y)

In [30]:
df_test = pd.read_csv('../assets/df_merge_test.csv')

In [31]:
df_test = create_dt_features(df_test)

In [32]:
df_index = pd.read_csv('../assets/test.csv')

In [33]:
results = voting_best.predict(df_test[X.columns])

In [34]:
def create_predictions(predictions, filename):
    if not os.path.isdir('../output'):
        os.mkdir('../output')
    
    results_df = df_index[["Id"]].merge(
        pd.DataFrame(predictions), left_index=True, right_index=True
    )
    results_df.to_csv(f"../output/{filename}.csv", header=["Id", "WnvPresent"], index=False)

In [38]:
prediction_filename = 'predictions_2'

In [39]:
create_predictions(results, prediction_filename)

In [40]:
table_model_results().to_csv(f'../output/params_{prediction_filename}.csv', index=False)