In [40]:
from sklearn.cluster import DBSCAN
from sklearn import metrics
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.cross_validation import cross_val_score, cross_val_predict, train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from pymongo import *
from mpl_toolkits.basemap import Basemap
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier

%matplotlib inline

In [41]:
# Import the spray data so that we can perform clustering analysis on it.
df = pd.read_csv('https://raw.githubusercontent.com/cl65610/west_nile/master/assets/spray.csv')

In [42]:
df.head()

Unnamed: 0,Date,Time,Latitude,Longitude
0,2011-08-29,6:56:58 PM,42.391623,-88.089163
1,2011-08-29,6:57:08 PM,42.391348,-88.089163
2,2011-08-29,6:57:18 PM,42.391022,-88.089157
3,2011-08-29,6:57:28 PM,42.390637,-88.089158
4,2011-08-29,6:57:38 PM,42.39041,-88.088858


In [43]:
# Set our features for this clustering. 
X = df[['Longitude', 'Latitude']]

In [44]:
# After much tinkering, these parameters gave us what appeared to be an ideal number of groups
dbscn = DBSCAN(eps = .0049, min_samples = 80).fit(X)  
labels = dbscn.labels_

In [45]:
print 'This DBSCAN results in the creation of %d different labels.' % len(set(labels))

This DBSCAN results in the creation of 15 different labels.


In [46]:
# We will now treat these labels as a new feature of our data. Attaching them to our original dataframe will allow us to 
# plot them and visualize our accuracy.
labels_df = pd.DataFrame(labels.reshape(14835,1), columns = ['labels'])

In [47]:
# This merged dataframe is also the dataframe that we'll send up to sql to be joined on. 
merged_df = df.join(labels_df)
merged_df.head()

Unnamed: 0,Date,Time,Latitude,Longitude,labels
0,2011-08-29,6:56:58 PM,42.391623,-88.089163,0
1,2011-08-29,6:57:08 PM,42.391348,-88.089163,0
2,2011-08-29,6:57:18 PM,42.391022,-88.089157,0
3,2011-08-29,6:57:28 PM,42.390637,-88.089158,0
4,2011-08-29,6:57:38 PM,42.39041,-88.088858,0


In [48]:
X = merged_df[['Latitude', 'Longitude']]
y = merged_df['labels']

In [49]:
from sklearn.ensemble import ExtraTreesClassifier
etc = ExtraTreesClassifier()
etc_fit = etc.fit(X, y)
print 'The score for our Extra Trees model is %.5f.' % etc.score(X, y)

The score for our Extra Trees model is 1.00000.


In [50]:
# In order to test our accuracy, we will import our train data, apply the fit, and check visually to see how well our
# classifier performs.
train_df = pd.read_csv('https://raw.githubusercontent.com/cl65610/west_nile/master/assets/joined_pca.csv')

In [51]:
# As above, our features for this will be latitude and longitude. 
X2 = train_df[['Latitude', 'Longitude']]

In [52]:
# Here we predict what our train data spray zones will be and set those results equal to a dataframe that we can then join 
# onto our training data.
label_predicts = etc_fit.predict(X2)
xt_labels = pd.DataFrame(label_predicts, columns = ['xt_labels'])

In [53]:
labeled_df = train_df.join(xt_labels)

In [54]:
print labeled_df.shape
labeled_df.head()

(10506, 27)


Unnamed: 0.1,Unnamed: 0,Date,Species,Trap,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent,day,...,PrecipTotal,Sunrise,Sunset,month_weather,day_weather,pca1,pca2,pca3,pca4,xt_labels
0,0,2007-05-29,2,1,41.95469,-87.800991,9,1,0,179,...,0.0,421.0,1917.0,5.0,179.0,-0.663872,3.033573,-1.434144,1.092123,7
1,1,2007-05-29,3,1,41.95469,-87.800991,9,1,0,179,...,0.0,421.0,1917.0,5.0,179.0,-0.663872,3.033573,-1.434144,1.092123,7
2,2,2007-05-29,3,6,41.994991,-87.769279,9,1,0,179,...,0.0,421.0,1917.0,5.0,179.0,-0.663872,3.033573,-1.434144,1.092123,2
3,3,2007-05-29,2,13,41.974089,-87.824812,8,1,0,179,...,0.0,421.0,1917.0,5.0,179.0,-0.663872,3.033573,-1.434144,1.092123,1
4,4,2007-05-29,3,13,41.974089,-87.824812,8,4,0,179,...,0.0,421.0,1917.0,5.0,179.0,-0.663872,3.033573,-1.434144,1.092123,1


In [55]:
from sklearn.preprocessing import scale

In [56]:
features = ['Species', 'Trap', 'Latitude', 'Longitude', 'AddressAccuracy', 'day_weather', 'PrecipTotal', 
           'pca1', 'pca2', 'pca3', 'pca4', 'xt_labels', 'year']
X = labeled_df[features]

y = labeled_df.WnvPresent

In [57]:
knn = KNeighborsClassifier()
lr = LogisticRegression()
svm = SVC(probability = True)
rf = RandomForestClassifier()
etc = ExtraTreesClassifier()
gbc = GradientBoostingClassifier()

In [58]:
from sklearn.ensemble import VotingClassifier

voter = VotingClassifier(estimators = [('knn', knn), ('lr', lr), ('gbc', gbc),
                                      ('extra trees', etc), ('random forest', rf)],
                        voting = 'soft', weights = [1, 1, 1, 1, 1])

voter_fit = voter.fit(X, y)
print voter.score(X, y)

0.958595088521


In [59]:
final_joined_pca = pd.read_csv('/Users/TerryONeill/west_nile/west_nile/assets/final_joined_pca.csv')
print final_joined_pca.shape
final_joined_pca.head()

(116293, 24)


Unnamed: 0.1,Unnamed: 0,Date,Species,Trap,Latitude,Longitude,AddressAccuracy,day,month,year,...,DewPoint,PrecipTotal,Sunrise,Sunset,month_weather,day_weather,pca1,pca2,pca3,pca4
0,0,2008-06-11,2,1,41.95469,-87.800991,9,191,6,2008,...,55.5,0.0,416.0,1926.0,6.0,191.0,-1.514379,-1.779891,-0.967131,0.524445
1,1,2008-06-11,3,1,41.95469,-87.800991,9,191,6,2008,...,55.5,0.0,416.0,1926.0,6.0,191.0,-1.514379,-1.779891,-0.967131,0.524445
2,2,2008-06-11,1,1,41.95469,-87.800991,9,191,6,2008,...,55.5,0.0,416.0,1926.0,6.0,191.0,-1.514379,-1.779891,-0.967131,0.524445
3,3,2008-06-11,4,1,41.95469,-87.800991,9,191,6,2008,...,55.5,0.0,416.0,1926.0,6.0,191.0,-1.514379,-1.779891,-0.967131,0.524445
4,4,2008-06-11,6,1,41.95469,-87.800991,9,191,6,2008,...,55.5,0.0,416.0,1926.0,6.0,191.0,-1.514379,-1.779891,-0.967131,0.524445


In [60]:
## need to drop unnamed column that forms from the index of the original dataframe
final_joined_pca = final_joined_pca.drop('Unnamed: 0', axis = 1)
print final_joined_pca.shape
final_joined_pca.head()

(116293, 23)


Unnamed: 0,Date,Species,Trap,Latitude,Longitude,AddressAccuracy,day,month,year,Tmax,...,DewPoint,PrecipTotal,Sunrise,Sunset,month_weather,day_weather,pca1,pca2,pca3,pca4
0,2008-06-11,2,1,41.95469,-87.800991,9,191,6,2008,86.0,...,55.5,0.0,416.0,1926.0,6.0,191.0,-1.514379,-1.779891,-0.967131,0.524445
1,2008-06-11,3,1,41.95469,-87.800991,9,191,6,2008,86.0,...,55.5,0.0,416.0,1926.0,6.0,191.0,-1.514379,-1.779891,-0.967131,0.524445
2,2008-06-11,1,1,41.95469,-87.800991,9,191,6,2008,86.0,...,55.5,0.0,416.0,1926.0,6.0,191.0,-1.514379,-1.779891,-0.967131,0.524445
3,2008-06-11,4,1,41.95469,-87.800991,9,191,6,2008,86.0,...,55.5,0.0,416.0,1926.0,6.0,191.0,-1.514379,-1.779891,-0.967131,0.524445
4,2008-06-11,6,1,41.95469,-87.800991,9,191,6,2008,86.0,...,55.5,0.0,416.0,1926.0,6.0,191.0,-1.514379,-1.779891,-0.967131,0.524445


In [61]:
# As above, our features for this will be latitude and longitude. 
X2 = final_joined_pca[['Latitude', 'Longitude']]

In [62]:
# Here we predict what our test data spray zones will be and set those results equal to a dataframe that we can then join 
# onto our joined_pca data.



label_predicts = etc_fit.predict(X2)
xt_labels = pd.DataFrame(label_predicts, columns = ['xt_labels'])
print xt_labels.shape
xt_labels.head()

(116293, 1)


Unnamed: 0,xt_labels
0,7
1,7
2,7
3,7
4,7


In [63]:
labeled_df = final_joined_pca.join(xt_labels)

In [64]:
print labeled_df.shape
labeled_df.head()

(116293, 24)


Unnamed: 0,Date,Species,Trap,Latitude,Longitude,AddressAccuracy,day,month,year,Tmax,...,PrecipTotal,Sunrise,Sunset,month_weather,day_weather,pca1,pca2,pca3,pca4,xt_labels
0,2008-06-11,2,1,41.95469,-87.800991,9,191,6,2008,86.0,...,0.0,416.0,1926.0,6.0,191.0,-1.514379,-1.779891,-0.967131,0.524445,7
1,2008-06-11,3,1,41.95469,-87.800991,9,191,6,2008,86.0,...,0.0,416.0,1926.0,6.0,191.0,-1.514379,-1.779891,-0.967131,0.524445,7
2,2008-06-11,1,1,41.95469,-87.800991,9,191,6,2008,86.0,...,0.0,416.0,1926.0,6.0,191.0,-1.514379,-1.779891,-0.967131,0.524445,7
3,2008-06-11,4,1,41.95469,-87.800991,9,191,6,2008,86.0,...,0.0,416.0,1926.0,6.0,191.0,-1.514379,-1.779891,-0.967131,0.524445,7
4,2008-06-11,6,1,41.95469,-87.800991,9,191,6,2008,86.0,...,0.0,416.0,1926.0,6.0,191.0,-1.514379,-1.779891,-0.967131,0.524445,7


In [65]:
features = ['Species', 'Trap', 'Latitude', 'Longitude', 'AddressAccuracy', 'day_weather', 'PrecipTotal', 
           'pca1', 'pca2', 'pca3', 'pca4', 'xt_labels', 'year']
X = labeled_df[features]


In [66]:
test_pred_proba = voter_fit.predict_proba(X)

In [67]:
test_pred_proba

array([[ 0.94081447,  0.05918553],
       [ 0.9550988 ,  0.0449012 ],
       [ 0.93944564,  0.06055436],
       ..., 
       [ 0.94822441,  0.05177559],
       [ 0.97127337,  0.02872663],
       [ 0.85170922,  0.14829078]])

In [68]:
test_proba_df = pd.DataFrame(test_pred_proba, columns = ['prob_0', 'prob_1'])
test_proba_df.head()

Unnamed: 0,prob_0,prob_1
0,0.940814,0.059186
1,0.955099,0.044901
2,0.939446,0.060554
3,0.957377,0.042623
4,0.977378,0.022622


In [69]:
test_proba_df['Id'] = range(1, 116294)
test_proba_df.set_index('Id', inplace = True)
test_proba_df.drop('prob_0', axis = 1, inplace = True)
test_proba_df.head()

Unnamed: 0_level_0,prob_1
Id,Unnamed: 1_level_1
1,0.059186
2,0.044901
3,0.060554
4,0.042623
5,0.022622


In [70]:
test_proba_df.columns = ['WnvPresent']
test_proba_df.head()

Unnamed: 0_level_0,WnvPresent
Id,Unnamed: 1_level_1
1,0.059186
2,0.044901
3,0.060554
4,0.042623
5,0.022622


In [71]:
test_proba_df.to_csv('test_data_predictions.csv')