# Modeling the Appearance of West Nile Virus in Chicago

**TODO:** Add a summary of the modeling approach and the best results here, along with supporting visualizations.

In [1]:
# import all libraries used
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score, train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import binarize
from sklearn.grid_search import GridSearchCV
import warnings
warnings.filterwarnings("ignore")

# Notebook-wide settings shared by the cells below:
#   rs  - seed handed to every `random_state` argument so runs are repeatable
#   tts - fraction of rows intended for the test side of a train/test split
rs = 8
tts = 0.2

In [2]:
# helper functions for evaluating classification models
def doClassifMetrics(y_test, y_pred):
    """Print the confusion matrix and classification report for predictions.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels, in the same order as ``y_test``.

    Returns
    -------
    None
        Both summaries are written to standard output.
    """
    # Each heading and its body are merged into one string so a single print
    # call emits them; the body starts on the line after the heading.
    conf_matrix = confusion_matrix(y_test, y_pred)
    print('Confusion Matrix \n%s' % (conf_matrix,))
    report = classification_report(y_test, y_pred)
    print('Classification Report \n%s' % (report,))

def modelEval(name, model, X, y, binarize_threshold):
    """Cross-validate, fit and print hold-out metrics for a binary classifier.

    Prints the mean ``cross_val_score`` computed on all of ``X``/``y``, then
    fits ``model`` on a stratified training split and prints the confusion
    matrix and classification report for the held-out rows twice: once for
    ``model.predict`` and once for predictions made with a custom probability
    cut-off.

    Parameters
    ----------
    name : str
        Label used in the printed headings.
    model : estimator
        Classifier exposing ``fit``, ``predict`` and ``predict_proba``.
    X : array-like or DataFrame
        Feature matrix; used both for cross-validation and for the split.
    y : array-like or Series
        Target labels; also used to stratify the split.
    binarize_threshold : float
        Rows whose probability in column 1 of ``predict_proba`` is strictly
        greater than this value are predicted as 1, all others as 0.

    Returns
    -------
    None
        All results are printed. ``model`` is left fitted on the training
        split. Relies on the notebook-level settings ``rs`` and ``tts``.
    """
    # Split the X supplied by the caller (not a notebook global) so the
    # hold-out metrics describe exactly the features this call was given.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=tts, stratify=y, random_state=rs)

    meancvscore = cross_val_score(model, X, y, n_jobs=-1, verbose=1).mean()
    print('Model %s cross_val_score: %f' % (name, meancvscore))

    model.fit(X_train, y_train)
    # predict() applies the model's own default decision rule
    y_pred = model.predict(X_test)
    # Same rule as sklearn's binarize (strictly greater than the threshold
    # becomes 1), done in numpy so the result is a 1-D label vector that lines
    # up with y_test without any reshaping.
    y_pred_adj = (model.predict_proba(X_test)[:, 1] > binarize_threshold).astype(int)

    print('Model %s classification metrics:' % name)
    doClassifMetrics(y_test, y_pred)
    print('Model %s using prediction threshold %f:' % (name, binarize_threshold))
    doClassifMetrics(y_test, y_pred_adj)

In [3]:
# Read in the cleaned and merged data; the path is relative to the directory
# the notebook is run from.
df = pd.read_csv('Assets/merged.csv')

## Approach 1: Select the 10 Best Features, then Fit Logistic Regression and Random Forest Classifiers

In [7]:
# Target is the WnvPresent column; the feature matrix is every remaining
# column except the ones listed below, which are not used as predictors.
non_feature_cols = ['Date', 'Trap', 'Year', 'WnvPresent', 'date_station_id']
y = df['WnvPresent']
X = df.drop(non_feature_cols, axis=1)

In [5]:
# select the 10 best features
kbest = SelectKBest(k=10)
selected_values = kbest.fit_transform(X, y)

# Look up the names of the kept columns with the selector's boolean mask.
# This deliberately avoids a list comprehension over (x, y) pairs: under
# Python 2 the comprehension variables leak into the notebook namespace and
# would overwrite the target `y` with a bool, breaking every later cell.
best_features = X.columns[kbest.get_support()].tolist()

# Wrap the selected values in a DataFrame so the columns keep their names
X_kbest = pd.DataFrame(selected_values, columns=best_features)

In [8]:
# Logistic regression on the selected features; metrics are reported for the
# default predictions and again for a 0.2 probability cut-off.
lg = LogisticRegression(n_jobs=-1, random_state=rs)
modelEval('LogisticRegression', lg, X_kbest, y, 0.2)

Model LogisticRegression cross_val_score: 0.946431
Model LogisticRegression classification metrics:
Confusion Matrix 
[[1602    2]
 [  88    3]]
Classification Report 
             precision    recall  f1-score   support

          0       0.95      1.00      0.97      1604
          1       0.60      0.03      0.06        91

avg / total       0.93      0.95      0.92      1695

Model LogisticRegression using prediction threshold 0.200000:
Confusion Matrix 
[[1599    5]
 [  84    7]]
Classification Report 
             precision    recall  f1-score   support

          0       0.95      1.00      0.97      1604
          1       0.58      0.08      0.14        91

avg / total       0.93      0.95      0.93      1695



[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s finished


In [9]:
# Random forest on the selected features, with no limit on tree depth; metrics
# are reported for the default predictions and again for a 0.2 probability
# cut-off.
rf = RandomForestClassifier(max_depth=None, n_jobs=-1, random_state=rs)
modelEval('RandomForestClassifier', rf, X_kbest, y, binarize_threshold=0.2)

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.3s finished


Model RandomForestClassifier cross_val_score: 0.810239
Model RandomForestClassifier classification metrics:
Confusion Matrix 
[[1587   17]
 [  84    7]]
Classification Report 
             precision    recall  f1-score   support

          0       0.95      0.99      0.97      1604
          1       0.29      0.08      0.12        91

avg / total       0.91      0.94      0.92      1695

Model RandomForestClassifier using prediction threshold 0.200000:
Confusion Matrix 
[[1530   74]
 [  68   23]]
Classification Report 
             precision    recall  f1-score   support

          0       0.96      0.95      0.96      1604
          1       0.24      0.25      0.24        91

avg / total       0.92      0.92      0.92      1695

