In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, roc_auc_score, classification_report
from sklearn.model_selection import cross_val_score


In [2]:
def eval_sklearn_model(y_true, predictions, model=None, X=None):
    """This function takes the true values for y and the predictions made by the model and prints out the confusion matrix along with Accuracy, Precision, and, if model and X provided, Roc_Auc Scores."""
    cnf_matrix = confusion_matrix(y_true, predictions)

    print('True Negative: ', cnf_matrix[0, 0], '| False Positive: ', cnf_matrix[0, 1])
    print('False Negative: ', cnf_matrix[1, 0], '| True Positive: ', cnf_matrix[1, 1], '\n')

    sensitivity = cnf_matrix[1, 1]/ (cnf_matrix[1, 0] + cnf_matrix[1, 1])
    specificity = cnf_matrix[0, 0]/ (cnf_matrix[0, 1] + cnf_matrix[0, 0])

    print('Sensitivity (TP/ TP + FN): ', sensitivity)
    print('Specificity (TN/ TN + FP): ', specificity, '\n')

    print('Accuracy: ', accuracy_score(y_true, predictions, normalize=True))
    print('Precision: ', precision_score(y_true, predictions))
    if model != None:
        print('Roc-Auc: ', roc_auc_score(y_true, [x[1] for x in model.predict_proba(X)]))
    else:
        pass
    print('\n')


In [3]:
model_data = pd.read_csv('../data/model_data.csv').drop('date', axis=1)


In [4]:
model_data.head()


Unnamed: 0,nummosquitos,wnvpresent,culex_erraticus,culex_pipiens,culex_pipiens/restuans,culex_restuans,culex_salinarius,culex_tarsalis,culex_territans,week,ten_day_avg_percip,ten_day_avg_temp,ten_day_avg_dewpoint,ten_day_avg_pressure,ten_day_avg_windspeed
0,1,0,0,0,1,0,0,0,0,22,0.200893,65.6,50.6,29.4375,7.9
1,1,0,0,0,0,1,0,0,0,22,0.200893,65.6,50.6,29.4375,7.9
2,1,0,0,0,0,1,0,0,0,22,0.200893,65.6,50.6,29.4375,7.9
3,1,0,0,0,1,0,0,0,0,22,0.200893,65.6,50.6,29.4375,7.9
4,4,0,0,0,0,1,0,0,0,22,0.200893,65.6,50.6,29.4375,7.9


In [5]:
model_data[model_data['wnvpresent'] == 1].shape


(551, 15)

In [6]:
model_data = model_data.drop('nummosquitos', axis=1)
y = model_data.pop('wnvpresent')
X = model_data


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5, random_state=42)


In [8]:
# help(RandomForestClassifier)

In [9]:
forest = RandomForestClassifier(n_jobs=-1)

In [None]:
purities = ['gini', 'entropy']
weights = ['balanced_subsample']
min_sample = [1/i for i in range(2, 11)]
depths = [i for i in range(5, 20)]
features = [i for i, j in enumerate(X_train.columns, start=1)]

In [None]:
param_grid = dict(max_depth=depths,
                  max_features=features,
                  min_samples_leaf=min_sample,
                  class_weight=weights)

grid = GridSearchCV(forest,
                    param_grid,
                    cv=3,
                    scoring='neg_log_loss',
                    n_jobs=-1,
                    verbose=True)

grid.fit(X_train, y_train)

best_results = {'params': list(grid.best_params_.items()), 'score': grid.best_score_}
print(best_results)

best_forest = grid.best_estimator_

Fitting 3 folds for each of 1755 candidates, totalling 5265 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    7.7s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   18.4s


In [None]:
best_forest

In [None]:
score_forest = cross_val_score(best_forest, X_train, y_train, cv=10, n_jobs=-1)

score_forest

In [None]:
best_forest.fit(X_train, y_train)
predict_forest = best_forest.predict(X_test)

print(accuracy_score(y_test, predict_forest), '\n')
print(classification_report(y_test, predict_forest))

eval_sklearn_model(y_test, predict_forest, model=best_forest, X=X_test)
