In [288]:
from sklearn.cluster import DBSCAN
from sklearn import metrics
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.cross_validation import cross_val_score, cross_val_predict, train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from pymongo import *
from mpl_toolkits.basemap import Basemap
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier

%matplotlib inline

In [289]:
# Load the mosquito-spray log; its coordinates are what the clustering below runs on.
spray_csv_url = 'https://raw.githubusercontent.com/cl65610/west_nile/master/assets/spray.csv'
df = pd.read_csv(spray_csv_url)

In [290]:
# Preview the first five spray records.
df.head(n=5)

Unnamed: 0,Date,Time,Latitude,Longitude
0,2011-08-29,6:56:58 PM,42.391623,-88.089163
1,2011-08-29,6:57:08 PM,42.391348,-88.089163
2,2011-08-29,6:57:18 PM,42.391022,-88.089157
3,2011-08-29,6:57:28 PM,42.390637,-88.089158
4,2011-08-29,6:57:38 PM,42.39041,-88.088858


In [291]:
# The clustering runs on the spray coordinates only (longitude first, then latitude).
cluster_columns = ['Longitude', 'Latitude']
X = df[cluster_columns]

In [292]:
# eps and min_samples were settled on by trial and error, judging by how many groups came out.
# X holds raw longitude/latitude values, so eps is expressed in those same units.
dbscn = DBSCAN(eps=0.0049, min_samples=80)
dbscn.fit(X)
labels = dbscn.labels_

In [293]:
# Report how many distinct labels DBSCAN produced. DBSCAN marks noise points with the
# label -1, so this count may include the noise bucket as well as the real clusters.
# Written as a call with one argument so it runs on both Python 2 and Python 3.
print('This DBSCAN results in the creation of %d different labels.' % len(set(labels)))

This DBSCAN results in the creation of 15 different labels.


In [294]:
# Treat the cluster labels as a new feature of the spray data. Attaching them to the
# original dataframe lets us plot them and judge the clustering visually.
# The frame takes its length and index from df itself instead of a hard-coded row count,
# so the later join still lines up if the spray file gains or loses rows.
labels_df = pd.DataFrame({'labels': labels}, index=df.index)

In [295]:
# Spray records plus their cluster label, matched on the row index. This is also the
# dataframe that gets sent up to sql to be joined on.
merged_df = df.join(other=labels_df, how='left')
merged_df.head(n=5)

Unnamed: 0,Date,Time,Latitude,Longitude,labels
0,2011-08-29,6:56:58 PM,42.391623,-88.089163,0
1,2011-08-29,6:57:08 PM,42.391348,-88.089163,0
2,2011-08-29,6:57:18 PM,42.391022,-88.089157,0
3,2011-08-29,6:57:28 PM,42.390637,-88.089158,0
4,2011-08-29,6:57:38 PM,42.39041,-88.088858,0


In [296]:
# Target: the DBSCAN label of each spray record. Features: its coordinates
# (latitude first here, matching the column order used when predicting later on).
y = merged_df['labels']
coordinate_columns = ['Latitude', 'Longitude']
X = merged_df[coordinate_columns]

In [297]:
# Learn a mapping from coordinates to DBSCAN label so that a label can be assigned to
# locations that were not part of the clustering. ExtraTreesClassifier is already
# imported in the notebook's import cell, so it is not imported again here.
# A fixed random_state (the same seed the later models use) keeps the fitted trees,
# and therefore the predicted labels, reproducible from run to run.
etc = ExtraTreesClassifier(random_state = 42)
etc_fit = etc.fit(X, y)

# NOTE: this score is computed on the same rows the model was fitted on, so it only
# shows that the trees reproduce the cluster labels; it is not a held-out accuracy.
print('The score for our Extra Trees model is %.5f.' % etc.score(X, y))

The score for our Extra Trees model is 1.00000.


In [298]:
# Bring in the training data so the fitted label classifier can be applied to it and
# the resulting labels checked visually.
train_csv_url = 'https://raw.githubusercontent.com/cl65610/west_nile/master/assets/joined_pca.csv'
train_df = pd.read_csv(train_csv_url)

In [299]:
# Same two coordinate features, in the same order the label classifier was fitted with.
train_coordinate_columns = ['Latitude', 'Longitude']
X2 = train_df[train_coordinate_columns]

In [300]:
# Predict a spray-zone label for every training row and wrap the result in a dataframe
# that can be joined onto the training data.
label_predicts = etc_fit.predict(X2)
# Carry over the index of the rows that were predicted on. The later join matches on
# index, so a fresh 0..n-1 index would only line up if the source frame happens to
# use the default index as well.
xt_labels = pd.DataFrame(label_predicts, columns = ['xt_labels'], index = X2.index)

In [301]:
# Attach the predicted labels to the training rows, matching on the row index.
labeled_df = train_df.join(other=xt_labels, how='left')

In [302]:
# Sanity check: row/column count and a preview of the labelled training data.
# print is written as a call so the cell runs on both Python 2 and Python 3.
print(labeled_df.shape)
labeled_df.head()

(10506, 27)


Unnamed: 0.1,Unnamed: 0,Date,Species,Trap,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent,day,...,PrecipTotal,Sunrise,Sunset,month_weather,day_weather,pca1,pca2,pca3,pca4,xt_labels
0,0,2007-05-29,2,1,41.95469,-87.800991,9,1,0,179,...,0.0,421.0,1917.0,5.0,179.0,-0.663872,3.033573,-1.434144,1.092123,7
1,1,2007-05-29,3,1,41.95469,-87.800991,9,1,0,179,...,0.0,421.0,1917.0,5.0,179.0,-0.663872,3.033573,-1.434144,1.092123,7
2,2,2007-05-29,3,6,41.994991,-87.769279,9,1,0,179,...,0.0,421.0,1917.0,5.0,179.0,-0.663872,3.033573,-1.434144,1.092123,2
3,3,2007-05-29,2,13,41.974089,-87.824812,8,1,0,179,...,0.0,421.0,1917.0,5.0,179.0,-0.663872,3.033573,-1.434144,1.092123,1
4,4,2007-05-29,3,13,41.974089,-87.824812,8,4,0,179,...,0.0,421.0,1917.0,5.0,179.0,-0.663872,3.033573,-1.434144,1.092123,1


In [303]:
from sklearn.preprocessing import scale

In [304]:
# Columns fed to the West Nile models, including the predicted label ('xt_labels').
features = [
    'Species', 'Trap', 'Latitude', 'Longitude', 'AddressAccuracy',
    'day_weather', 'PrecipTotal',
    'pca1', 'pca2', 'pca3', 'pca4',
    'xt_labels', 'year',
]

# Target: the WnvPresent column of the labelled training data.
y = labeled_df['WnvPresent']

# Features are used unscaled; the scaled variant is left disabled:
#X = scale(labeled_df[features])
X = labeled_df[features]

In [305]:
## This is the dataframe we will be building all our models off of; export it to csv.
## With to_csv's defaults the row index is written too, as an extra leading column.

labeled_df.to_csv('model_building_df.csv')

In [306]:
# Re-wrap the feature matrix as a dataframe with the feature names as columns, then preview it.
X = pd.DataFrame(data=X, columns=features)
X.head(n=5)

Unnamed: 0,Species,Trap,Latitude,Longitude,AddressAccuracy,day_weather,PrecipTotal,pca1,pca2,pca3,pca4,xt_labels,year
0,2,1,41.95469,-87.800991,9,179.0,0.0,-0.663872,3.033573,-1.434144,1.092123,7,2007
1,3,1,41.95469,-87.800991,9,179.0,0.0,-0.663872,3.033573,-1.434144,1.092123,7,2007
2,3,6,41.994991,-87.769279,9,179.0,0.0,-0.663872,3.033573,-1.434144,1.092123,2,2007
3,2,13,41.974089,-87.824812,8,179.0,0.0,-0.663872,3.033573,-1.434144,1.092123,1,2007
4,3,13,41.974089,-87.824812,8,179.0,0.0,-0.663872,3.033573,-1.434144,1.092123,1,2007


In [None]:
# Base models for the ensemble. Every estimator that takes a seed gets the same one.
_seeded = dict(random_state=42)

# k-nearest neighbours is built without a seed; it is also the base model being bagged.
knn = KNeighborsClassifier()
bc = BaggingClassifier(knn, **_seeded)

# Linear model and SVM (probability=True so the SVM can supply class probabilities).
lr = LogisticRegression(**_seeded)
svm = SVC(probability=True, **_seeded)

# Tree ensembles.
rf = RandomForestClassifier(**_seeded)
etc = ExtraTreesClassifier(**_seeded)
gbc = GradientBoostingClassifier(**_seeded)
ada = AdaBoostClassifier(**_seeded)

In [None]:
from sklearn.ensemble import VotingClassifier

# Soft voting combines the predicted class probabilities of the eight base models;
# the all-ones weights give every model the same say.
voter = VotingClassifier(estimators = [('knn', knn),
                                       ('lr', lr),
                                       ('extra trees', etc),
                                       ('random forest', rf),
                                       ('svm', svm),
                                       ('gbc', gbc),
                                       ('ada', ada),
                                       ('bc', bc)
                                      ],
                        voting = 'soft', weights = [1, 1, 1, 1, 1, 1, 1, 1])

voter_fit = voter.fit(X, y)

# NOTE(review): the score and the probabilities below are computed on the same rows the
# ensemble was just fitted on, so they are optimistic. Use a held-out split or
# cross-validation before quoting them as model performance.
# print is written as a call so the cell runs on both Python 2 and Python 3.
print(voter.score(X, y))

predicts = voter.predict_proba(X)
# Keep the second probability column; presumably P(WnvPresent == 1), assuming the
# classes are ordered [0, 1] -- confirm against voter.classes_.
pred_proba = predicts[:,1]

In [None]:
from sklearn.metrics import roc_curve

# ROC curve and AUC of the voting classifier. y and pred_proba come from the rows the
# ensemble was fitted on, so this is an in-sample curve.
fpr, tpr, _ = roc_curve(y, pred_proba)
auc = metrics.auc(fpr, tpr)

plt.style.use('fivethirtyeight')

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label="Voting Classifier")
# A little head-room on every side so a curve running along TPR = 1 or FPR = 1 is not
# drawn on top of the axes frame.
ax.set_xlim([-.05, 1.05])
ax.set_ylim([-.05, 1.05])
ax.set_xlabel('False Positive Rate (1 - Specificity)')
ax.set_ylabel('True Positive Rate (Sensitivity)')
ax.set_title('AUC: %f' % auc)
# Without a legend the line's label is never shown.
ax.legend(loc='lower right')

plt.show()

In [None]:
## Attempt to grid search the voting classifier (left commented out: the draft below is incomplete, `estimators =` is given no value)

# from sklearn.grid_search import GridSearchCV

# voter_grid = VotingClassifier(estimators = )

# params = {'estimators':[('knn', knn), ('etc', etc)], 'voting':['soft', 'hard'], 'weights':[1, 2, 3]}

# vote_gscv = GridSearchCV(voter_grid, param_grid = params, cv = 2)
# vote_grid_fit = vote_gscv.fit(X, y)

# print vote_grid_fit.best_score_
# print
# print vote_grid_fit.best_estimator_
# print
# print vote_grid_fit.best_params_

In [None]:
import os

# Load the final_joined_pca.csv asset. The directory can be overridden with the
# WEST_NILE_ASSETS_DIR environment variable so the notebook is not tied to one
# machine; when the variable is unset the original location is used.
assets_dir = os.environ.get('WEST_NILE_ASSETS_DIR',
                            '/Users/TerryONeill/west_nile/west_nile/assets')
final_joined_pca = pd.read_csv(os.path.join(assets_dir, 'final_joined_pca.csv'))
# print is written as a call so the cell runs on both Python 2 and Python 3.
print(final_joined_pca.shape)
final_joined_pca.head()

In [None]:
## Drop the 'Unnamed: 0' column, which forms from the index of the original dataframe.
## errors='ignore' makes the cell safe to re-run: once the column is gone, the drop is
## simply a no-op instead of raising.
final_joined_pca = final_joined_pca.drop('Unnamed: 0', axis = 1, errors = 'ignore')
print(final_joined_pca.shape)
final_joined_pca.head()

In [None]:
# Coordinates of the final_joined_pca rows, in the column order the label classifier was fitted with.
final_coordinate_columns = ['Latitude', 'Longitude']
X2 = final_joined_pca[final_coordinate_columns]

In [None]:
# Predict a spray-zone label for every row of the test data and wrap the result in a
# dataframe that can be joined onto final_joined_pca.
label_predicts = etc_fit.predict(X2)
# Carry over the index of the rows that were predicted on. The later join matches on
# index, so a fresh 0..n-1 index would only line up if the source frame happens to
# use the default index as well.
xt_labels = pd.DataFrame(label_predicts, columns = ['xt_labels'], index = X2.index)
# print is written as a call so the cell runs on both Python 2 and Python 3.
print(xt_labels.shape)
xt_labels.head()

In [None]:
# Attach the predicted labels to the final_joined_pca rows, matching on the row index.
labeled_df = final_joined_pca.join(other=xt_labels, how='left')

In [None]:
# Sanity check: row/column count and a preview of the labelled test data.
# print is written as a call so the cell runs on both Python 2 and Python 3.
print(labeled_df.shape)
labeled_df.head()

In [None]:
# Select the model inputs with the very same `features` list that was used to build the
# training matrix, rather than retyping it, so the two column sets (and their order)
# cannot drift apart. A missing column raises a KeyError here.
X = labeled_df[features]


In [None]:
# Class probabilities from the fitted voting classifier for the feature matrix built in
# the previous cell; one row per input row, one column per class.
test_pred_proba = voter_fit.predict_proba(X)

In [None]:
# Put the two probability columns into a dataframe and preview it.
proba_columns = ['prob_0', 'prob_1']
test_proba_df = pd.DataFrame(data=test_pred_proba, columns=proba_columns)
test_proba_df.head(n=5)

In [None]:
# Keep only the second probability column and index the rows by a 1-based 'Id'.
# The frame is rebuilt from test_pred_proba instead of being modified in place, so the
# cell can be re-run safely, and the Id range follows the actual number of predictions
# instead of a hard-coded row count.
n_predictions = len(test_pred_proba)
test_proba_df = pd.DataFrame({'prob_1': test_pred_proba[:, 1]},
                             index = pd.Index(range(1, n_predictions + 1), name = 'Id'))
test_proba_df.head()

In [None]:
# Name the frame's columns; only one column is left at this point, and it becomes 'WnvPresent'.
test_proba_df.columns = ['WnvPresent']
test_proba_df.head()

In [None]:
# Write the predictions to disk. With to_csv's defaults the index (named 'Id' earlier)
# is written as the first column, followed by 'WnvPresent'.
test_proba_df.to_csv('test_data_predictions.csv')