In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression, RidgeClassifier, LinearRegression, LassoCV
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, RandomForestClassifier, VotingClassifier, BaggingClassifier, StackingClassifier
import sklearn.metrics as metrics
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA

import pickle

# Show up to 100 columns when rendering DataFrames so wide frames are not truncated.
pd.options.display.max_columns = 100

In [2]:
# Load the reports CSV (path is relative to the notebook's working directory)
# and preview the first rows.
df = pd.read_csv('./model_data/winni_reports.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,year,date,air_temp_f,water_temp_f,wind_speed_mph,wind_dir,weather,location,time_caught,fish_type,fish_length_in,water_depth_ft,skunked,lines_in,lines_out,general_loc,duration_min,month,hour,time_caught_bucket
0,0,2016,2016-08-23,47.0,73.1,6,sw,sunny,south of welch,8:25,horned pout,10.0,35.0,False,7:10,9:00,welch,110.0,8,8,"[7, 9)"
1,1,2015,2015-09-03,69.0,75.5,4,nw,hazy,north of diamond,6:45,lake trout,17.0,29.0,False,6:30,9:30,diamond,180.0,9,6,"[5, 7)"
2,2,2017,2017-07-17,68.0,74.8,0,no_wind,hazy,north of welch,6:50,lake trout,16.0,31.0,False,6:45,7:50,welch,65.0,7,6,"[5, 7)"
3,3,2015,2015-07-18,62.0,73.4,15,se,raining,south of sandy,6:55,lake trout,20.0,135.0,False,6:15,10:30,sandy,255.0,7,6,"[5, 7)"
4,4,2017,2017-07-17,68.0,74.8,0,no_wind,hazy,north of welch,7:05,lake trout,16.0,31.0,False,6:45,7:50,welch,65.0,7,7,"[7, 9)"


In [3]:
# Count missing values per column.
df.isnull().sum()

Unnamed: 0             0
year                   0
date                   0
air_temp_f             0
water_temp_f           0
wind_speed_mph         0
wind_dir               0
weather                0
location               0
time_caught            0
fish_type              0
fish_length_in         0
water_depth_ft         0
skunked                0
lines_in               0
lines_out              0
general_loc            0
duration_min           0
month                  0
hour                   0
time_caught_bucket    73
dtype: int64

In [4]:
# Drop the 'Unnamed: 0' column (presumably an index saved into the CSV -- confirm).
# errors='ignore' keeps this cell safe to re-run: because `df` is overwritten in
# place, a second execution without it raises KeyError (the column is already gone).
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [5]:
# List the remaining column names.
df.columns

Index(['year', 'date', 'air_temp_f', 'water_temp_f', 'wind_speed_mph',
       'wind_dir', 'weather', 'location', 'time_caught', 'fish_type',
       'fish_length_in', 'water_depth_ft', 'skunked', 'lines_in', 'lines_out',
       'general_loc', 'duration_min', 'month', 'hour', 'time_caught_bucket'],
      dtype='object')

In [6]:
# One-hot encode the categorical columns. drop_first=True omits the first level of
# each variable, so that level is represented by all of its dummies being 0.
model_df = pd.get_dummies(df, columns = ['wind_dir', 'weather', 'general_loc'], drop_first = True)
model_df.head()

Unnamed: 0,year,date,air_temp_f,water_temp_f,wind_speed_mph,location,time_caught,fish_type,fish_length_in,water_depth_ft,skunked,lines_in,lines_out,duration_min,month,hour,time_caught_bucket,wind_dir_ene,wind_dir_n,wind_dir_ne,wind_dir_no_wind,wind_dir_nw,wind_dir_s,wind_dir_se,wind_dir_sw,wind_dir_w,weather_hazy,weather_no_weather_recorded,weather_overcast,weather_raining,weather_sunny,weather_windy,general_loc_alton bay,general_loc_birch,general_loc_carr point,general_loc_diamond,general_loc_governors,general_loc_harilla bay,general_loc_little bear bay,general_loc_lockes,general_loc_long island,general_loc_rattlesnake,general_loc_sanders bay,general_loc_sandy,general_loc_spindle point,general_loc_timber,general_loc_tip witches,general_loc_varney,general_loc_varney point,general_loc_weirs marina,general_loc_welch,general_loc_witches,general_loc_wolfboro bay
0,2016,2016-08-23,47.0,73.1,6,south of welch,8:25,horned pout,10.0,35.0,False,7:10,9:00,110.0,8,8,"[7, 9)",0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,2015,2015-09-03,69.0,75.5,4,north of diamond,6:45,lake trout,17.0,29.0,False,6:30,9:30,180.0,9,6,"[5, 7)",0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2017,2017-07-17,68.0,74.8,0,north of welch,6:50,lake trout,16.0,31.0,False,6:45,7:50,65.0,7,6,"[5, 7)",0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,2015,2015-07-18,62.0,73.4,15,south of sandy,6:55,lake trout,20.0,135.0,False,6:15,10:30,255.0,7,6,"[5, 7)",0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
4,2017,2017-07-17,68.0,74.8,0,north of welch,7:05,lake trout,16.0,31.0,False,6:45,7:50,65.0,7,7,"[7, 9)",0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [7]:
# Inspect dtypes to see which columns are still non-numeric (object).
model_df.dtypes

year                             int64
date                            object
air_temp_f                     float64
water_temp_f                   float64
wind_speed_mph                   int64
location                        object
time_caught                     object
fish_type                       object
fish_length_in                 float64
water_depth_ft                 float64
skunked                           bool
lines_in                        object
lines_out                       object
duration_min                   float64
month                            int64
hour                             int64
time_caught_bucket              object
wind_dir_ene                     uint8
wind_dir_n                       uint8
wind_dir_ne                      uint8
wind_dir_no_wind                 uint8
wind_dir_nw                      uint8
wind_dir_s                       uint8
wind_dir_se                      uint8
wind_dir_sw                      uint8
wind_dir_w               

### Modeling

In [8]:
# Encode the boolean target as integers: True -> 1, False -> 0.
model_df['skunked'] = model_df['skunked'].astype('int64')

In [9]:
# Columns excluded from the feature matrix (the last entry is the target itself).
non_feature_cols = [
    'year', 'time_caught_bucket', 'lines_in', 'lines_out', 'location', 'date',
    'time_caught', 'fish_length_in', 'fish_type', 'hour', 'water_depth_ft',
    'duration_min', 'skunked',
]

# Features are every remaining column; the target is the encoded 'skunked' flag.
y = model_df['skunked']
X = model_df.drop(columns=non_feature_cols)


### Polynomial

In [None]:
# Expand the feature matrix with degree-2 polynomial terms: squares and pairwise
# interactions, plus (by PolynomialFeatures' default) a constant bias column.
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X)

In [None]:
# Check the dimensions of the expanded feature matrix.
X_poly.shape

In [None]:
# Hold out a test set from the polynomial-expanded features using
# train_test_split's default split size; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X_poly, y, random_state=42)

In [None]:
# Confirm the training features and labels have matching row counts.
X_train.shape, y_train.shape

In [None]:
# Standardize the features. The scaler learns its statistics from the training
# split only and is then applied unchanged to both splits.
ss = StandardScaler().fit(X_train)
X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)

In [None]:
# Scaling should leave the shape unchanged; confirm rows still match the labels.
X_train_sc.shape, y_train.shape

### PCA

In [None]:
# Reduce dimensionality with PCA fitted on the scaled training data. With a float
# n_components and svd_solver='full', PCA keeps the smallest number of components
# whose cumulative explained variance exceeds 95%.
# The last line shows (n_components kept, n_input_features).
pca = PCA(svd_solver='full', n_components=.95)
pca.fit(X_train_sc)
pca.components_.shape

In [None]:
# Fraction of total variance explained by each retained component (first 20 shown).
var_exp = pca.explained_variance_ratio_
print(f'Explained variance (first 20 components): {var_exp[:20]}')

In [None]:
# Project both splits onto the principal components fitted on the training data.
Z_train = pca.transform(X_train_sc)
Z_test = pca.transform(X_test_sc)

In [None]:
# Dimensions of the PCA-projected training data.
Z_train.shape

In [None]:
# Variance of each projected component, rounded to the nearest integer.
pd.DataFrame(Z_train).var().map(round)

In [None]:
# Total share of variance retained by the kept components.
var_exp = pca.explained_variance_ratio_
var_exp.sum()

## Baseline Model

The score to beat!

In [None]:
# Baseline: class proportions of the target. The larger proportion is the accuracy
# obtained by always predicting the majority class.
y.value_counts(normalize=True)

## AdaBoost - For Streamlit

In [None]:
# AdaBoost with a random-forest base learner, trained on the PCA-projected data.
# AdaBoost re-seeds every cloned base estimator from its own random_state, so the
# forest's random_state=42 alone does not make the ensemble reproducible. Seeding
# the booster as well makes the fitted (and later pickled) model repeatable.
ada = AdaBoostClassifier(
    base_estimator=RandomForestClassifier(
        n_estimators=20,
        max_depth=13,
        max_features=5,
        min_samples_split=2,
        random_state=42,
    ),
    random_state=42,
)
ada.fit(Z_train, y_train)

In [None]:
# Mean accuracy on the training and test splits (a large gap indicates overfitting).
ada.score(Z_train, y_train), ada.score(Z_test, y_test)

In [None]:
# Predicted class labels for the test split; preview the first ten.
preds = ada.predict(Z_test)
preds[:10]

In [None]:
# NOTE(review): apart from ConfusionMatrixDisplay and RocCurveDisplay, these names
# are already imported in the first cell; consider consolidating all imports there.
from sklearn.metrics import plot_confusion_matrix, confusion_matrix, ConfusionMatrixDisplay, RocCurveDisplay
import sklearn.metrics as metrics
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 for the AdaBoost predictions on the test split.
print(classification_report(y_test, preds))

In [None]:
# Plot the AdaBoost model's confusion matrix on the test split.
# The trailing semicolon suppresses the text repr of the title object.
plot_confusion_matrix(ada, Z_test, y_test, cmap='terrain')
plt.title('Confusion Matrix', fontdict = {'fontsize':12});

In [None]:
# For binary labels, confusion_matrix(...).ravel() yields the counts in the order
# tn, fp, fn, tp (rows are true classes, columns are predicted classes).
# Unpacking in any other order silently swaps the counts.
tn, fp, fn, tp = confusion_matrix(y_test, preds).ravel()

print(f'Accuracy score is: {metrics.accuracy_score(y_test, preds)}')
print(f'Precision score is: {metrics.precision_score(y_test, preds)}')
print(f'Recall score is: {metrics.recall_score(y_test, preds)}')
# Specificity (true-negative rate) = TN / (TN + FP).
print(f'Specificity score is: {tn / (tn + fp)}')

In [None]:
# Persist the fitted AdaBoost model. The context manager guarantees the file is
# flushed and closed even if pickling raises.
filename = 'finalized_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(ada, model_file)

## Additional Models Tried

### LogReg - Z_train
This model uses the Polynomial / PCA fit data

In [None]:
# Logistic regression (liblinear solver) fitted on the PCA-projected training data.
lr = LogisticRegression(max_iter=1000, solver='liblinear')

lr.fit(Z_train, y_train)

In [None]:
# Mean cross-validated accuracy. cross_val_score clones and refits `lr` on folds of
# whatever data it is given, so neither number uses the model fitted above.
# NOTE(review): "Test CV" therefore trains and validates within the test split; to
# evaluate the fitted model on held-out data use lr.score(Z_test, y_test) instead.
f'Test CV: {cross_val_score(lr, Z_test, y_test).mean()}', f'Train CV: {cross_val_score(lr, Z_train, y_train).mean()}'

In [None]:
# display_labels follow the sorted class order [0, 1]. 'skunked' was encoded
# False -> 0, True -> 1, so class 0 is 'Not Skunked' and class 1 is 'Skunked'.
plot_confusion_matrix(lr, Z_test, y_test, display_labels = ['Not Skunked', 'Skunked']);

## LogReg

This model is fit on `X_train` / `X_test`, i.e. the polynomial-expanded features without scaling or PCA.

In [None]:
# Logistic regression fitted on X_train. X_train is the split of X_poly made
# earlier, so it is polynomial-expanded but neither scaled nor PCA-projected.
# This rebinds `lr`, replacing the model fitted on Z_train.
lr = LogisticRegression(max_iter=1000, solver='liblinear')

lr.fit(X_train, y_train)

In [None]:
# Predicted class labels for the test split.
lr.predict(X_test)

In [None]:
# Predicted probability of class 1 (skunked) for each test row.
lr.predict_proba(X_test)[:, 1]

In [None]:
# Mean cross-validated accuracy. cross_val_score clones and refits `lr` on folds of
# whatever data it is given, so neither number uses the model fitted above.
# NOTE(review): "Test CV" therefore trains and validates within the test split; to
# evaluate the fitted model on held-out data use lr.score(X_test, y_test) instead.
f'Test CV: {cross_val_score(lr, X_test, y_test).mean()}', f'Train CV: {cross_val_score(lr, X_train, y_train).mean()}'

In [None]:
# Mean accuracy of the fitted model on the training split.
lr.score(X_train, y_train)

In [None]:
# display_labels follow the sorted class order [0, 1]. 'skunked' was encoded
# False -> 0, True -> 1, so class 0 is 'Not Skunked' and class 1 is 'Skunked'.
plot_confusion_matrix(lr, X_test, y_test, display_labels = ['Not Skunked', 'Skunked']);

## Random Forest - Z-train

This model uses the Polynomial / PCA fit data

In [None]:
# Exhaustive grid search over random-forest hyperparameters on the PCA-projected
# training data, using GridSearchCV's default cross-validation and scoring.
# NOTE(review): `rf` is not used by the search below (GridSearchCV gets its own
# seeded estimator); kept in case later cells reference it.
rf = RandomForestClassifier()
params = {
    'max_depth': [7, 11, 12, 13],
    'max_features': [5, 6, 7, 8],
    'min_samples_split': [2, 3, 4, 7, 9],
    'n_estimators': [10, 13, 20, 23, 26, 30, 33],
}

gs = GridSearchCV(RandomForestClassifier(random_state=42), param_grid=params, verbose=0)
gs.fit(Z_train, y_train)
print(gs.best_score_)
# No trailing comma here: it would wrap the dict in a 1-tuple in the cell output.
gs.best_params_

## Stacking

In [None]:
# Base learners for the stack. KNN and ridge are each wrapped in a pipeline with
# their own StandardScaler; the forest and bagging models take the data as given.
rf_base = RandomForestClassifier(n_estimators=20, max_depth=13, max_features=5, min_samples_split=2, random_state=42)
knn_base = make_pipeline(StandardScaler(), KNeighborsClassifier())
bag_base = BaggingClassifier()
ridge_base = make_pipeline(StandardScaler(), RidgeClassifier())

estimators = [
    ('rf', rf_base),
    ('knn', knn_base),
    ('bag', bag_base),
    ('ridge', ridge_base),
]

# A logistic regression combines the base learners' outputs into the final prediction.
stack = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())

In [None]:
# Fit the stack on X_train (the unscaled polynomial-expanded split), then report
# train accuracy, test accuracy, and mean cross-validated accuracy on the training split.
stack.fit(X_train, y_train)
stack.score(X_train, y_train), stack.score(X_test, y_test), cross_val_score(stack, X_train, y_train).mean()

In [None]:
# display_labels follow the sorted class order [0, 1]. 'skunked' was encoded
# False -> 0, True -> 1, so class 0 is 'Not Skunked' and class 1 is 'Skunked'.
plot_confusion_matrix(stack, X_test, y_test, display_labels = ['Not Skunked', 'Skunked']);