In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score, cross_val_predict
from sklearn.metrics import confusion_matrix, classification_report
from sklearn import svm
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import roc_curve, auc
from sklearn.cross_validation import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline

### Clean the main dataset

In [None]:
# Load the main training data and keep only the columns used downstream.
main_columns = ['Date', 'Species', 'Trap', 'Latitude', 'Longitude', 'NumMosquitos', 'WnvPresent']
data = pd.read_csv('train.csv')[main_columns]

In [None]:
# Collapse rows that share date, species and trap location into a single row:
# both the mosquito counts and the WnvPresent values are summed per group.
group_keys = ['Date', 'Species', 'Trap', 'Latitude', 'Longitude']
grouped = data.groupby(group_keys)
mos = grouped['NumMosquitos'].sum()
wnv = grouped['WnvPresent'].sum()

# Put the two aggregates side by side and turn the group keys back into columns.
df = pd.concat([mos, wnv], axis=1).reset_index()

# A summed WnvPresent of 1 or more means the virus was seen at least once -> 0/1 flag.
df['WnvPresent'] = (df['WnvPresent'] >= 1).astype('int64')

In [None]:
# Parse the Date strings into datetimes (needed for the weather merge and the .dt accessors).
df['Date'] = pd.to_datetime(df['Date'])

### Clean the weather dataset

In [None]:
# Load the weather observations; the cells below rely on its Station, Date, CodeSum and Tavg columns.
weather = pd.read_csv("weather.csv")

In [None]:
# CodeSum is not used by any later cell, so discard it.
weather = weather.drop('CodeSum', axis=1)

In [None]:
# Map every non-numeric placeholder code in the weather table to the sentinel -1.
placeholder_codes = ['M', '-', 'T', ' T', '  T']
weather = weather.replace(placeholder_codes, -1)

In [None]:
# Tavg is converted to a numeric dtype once its placeholders have been replaced by -1.
weather['Tavg'] = pd.to_numeric(weather['Tavg'])

In [None]:
# Split the weather table per station; the Station column is dropped from each
# part because it is constant within it.
weather1 = weather.loc[weather['Station'] == 1].drop('Station', axis=1)
weather2 = weather.loc[weather['Station'] == 2].drop('Station', axis=1)

In [None]:
# Tag every column with its station number so a selected feature can be traced
# back to its station; the Date column keeps its plain name because it is the merge key.
weather1 = weather1.add_suffix('1').rename(columns={'Date1': 'Date'})
weather2 = weather2.add_suffix('2').rename(columns={'Date2': 'Date'})

In [None]:
# Join both stations on Date so each day is one row holding both stations' readings.
weather = pd.merge(weather1, weather2, on='Date')

In [None]:
# Use the same datetime dtype as df.Date so the merge keys match.
weather['Date'] = pd.to_datetime(weather['Date'])

In [None]:
# Attach the daily weather readings to every trap record (left join keeps all trap rows).
df = df.merge(weather, on='Date', how='left')

In [None]:
# Derive calendar parts from Date; Year and Week are the keys used to merge the spray data.
for part in ('year', 'month', 'week', 'day'):
    df[part.capitalize()] = getattr(df['Date'].dt, part)

### Utilize spray data

In [None]:
# We assume one spray has a lasting effect of 3 weeks in a given area,
# so the following code merges the spray data into the main dataset on spray date and location,
# and assigns spray=1 to the records that fall within 3 weeks after a spray.

In [None]:
# We assume a spray affects a wide area, so coordinates are rounded to 2 decimal
# places and the spray records are matched to traps on those rounded values.

def _round2(value):
    """Round a coordinate to two decimal places."""
    return round(value, 2)

spray = pd.read_csv("spray.csv")
for frame in (df, spray):
    frame['merge_latitude'] = frame['Latitude'].apply(_round2)
    frame['merge_longitude'] = frame['Longitude'].apply(_round2)

In [None]:
# Derive the Year/Week merge keys from the spray date; the Time column is not needed.
spray['Date'] = pd.to_datetime(spray['Date'])
spray['Year'] = spray['Date'].dt.year
spray['Week'] = spray['Date'].dt.week
spray = spray.drop('Time', axis=1)

In [None]:
# Keep everything from the fourth column onwards (selection is positional).
# NOTE(review): presumably this discards Date/Latitude/Longitude and keeps only the merge keys -- confirm against spray.csv's column order.
spray = spray[spray.columns[3:]]

In [None]:
# Prepare the columns from which three frames are built, one per week of assumed spray effect:
# Week (spray week), Week2 (one week later), Week3 (two weeks later), each with its own flag.
spray['Week2'] = spray['Week'] + 1
spray['Week3'] = spray['Week'] + 2
for flag in ('Spray1', 'Spray2', 'Spray3'):
    spray[flag] = 1

In [None]:
# Identical rows would multiply trap rows in the left merges below; keep the first of each.
spray = spray.drop_duplicates(keep='first')

In [None]:
# Build one frame per week of assumed spray effect. Each frame exposes its week
# under the common name 'Week' so all three can be merged on the same keys.
# The rename is done on the freshly selected frame (not in place afterwards) so
# pandas does not emit a SettingWithCopyWarning.
spray_week1 = spray[['merge_latitude', 'merge_longitude', 'Year', 'Week', 'Spray1']]
spray_week2 = (spray[['merge_latitude', 'merge_longitude', 'Year', 'Week2', 'Spray2']]
               .rename(columns={'Week2': 'Week'}))
spray_week3 = (spray[['merge_latitude', 'merge_longitude', 'Year', 'Week3', 'Spray3']]
               .rename(columns={'Week3': 'Week'}))

In [None]:
# Finally, left-merge each weekly spray frame onto the main dataframe; rows with
# no matching spray get NaN in the corresponding Spray column.
spray_keys = ['Year', 'Week', 'merge_latitude', 'merge_longitude']
for weekly_spray in (spray_week1, spray_week2, spray_week3):
    df = pd.merge(df, weekly_spray, how='left', on=spray_keys)

In [None]:
# Get rid of NaNs: a flag is 1 where the merge found a spray and 0 everywhere else.
for flag in ('Spray1', 'Spray2', 'Spray3'):
    df[flag] = (df[flag] == 1).astype('int64')

In [None]:
# Combine the three weekly flags into a single column.
df['Sprayed'] = df[['Spray1', 'Spray2', 'Spray3']].sum(axis=1)

# Drop the helper columns used for merging plus the date parts no longer needed.
df = df.drop(['merge_latitude', 'merge_longitude', 'Spray1', 'Spray2', 'Spray3',
              'Date', 'Month', 'Day'], axis=1)

# Move the target to the end under the name 'wnv' (later cells take "all but the last column" as X).
df['wnv'] = df.pop('WnvPresent')

In [None]:
# Persist the cleaned training frame (without the index) so modelling can restart from this file.
# NOTE(review): this path lives under the user's home directory, unlike the relative paths used for the other CSVs.
df.to_csv('~/desktop/wnv.csv', index=False)

### Done with cleaning

In [None]:
# Reload the cleaned training data saved at the end of the cleaning section.
df = pd.read_csv('~/desktop/wnv.csv')

In [None]:
# Label-encode every object-typed (categorical) column.
# The single encoder is refit on each column in turn, so after this cell `le`
# only remembers the classes of the last column it processed.
le = LabelEncoder()
cols = list(df.select_dtypes(include=['object']).columns)
df[cols] = df[cols].apply(le.fit_transform)

In [None]:
# Features are every column except the last; the target is the 'wnv' column.
X = df.iloc[:, :-1]
y = df['wnv']

### Feature Selection

In [None]:
# Fit a random forest on all features to obtain feature importances.
# A fixed random_state makes the importances (and the grid search that later
# clones this estimator) reproducible across kernel restarts; 42 matches the
# seed used for the train/test splits in this notebook.
rf = RandomForestClassifier(random_state=42)
rf.fit(X, y)

In [None]:
# Tabulate the forest's importance per feature, most important first.
importance_table = pd.DataFrame(rf.feature_importances_,
                                index=X.columns,
                                columns=['importance'])
feature_importances = importance_table.sort_values('importance', ascending=False)

In [None]:
# NumMosquitos is not available in the test dataset, and the training data covers far fewer Traps than the test dataset.
# Sunset time is perfectly correlated with Sunrise time, so including it is not expected to add anything.
X = df.loc[:, ['Longitude', 'Latitude', 'Species', 'Sunrise1', 'Week']]

### Random Forest

In [None]:
# Hold out 30% of the rows, stratified on the target so both parts keep the class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    stratify=df['wnv'],
    random_state=42,
)

In [None]:
# Grid-search the forest's hyper-parameters, selecting by ROC AUC on the training split.
rf_params = {
    'n_estimators': [5, 10, 15, 20, 30],
    'criterion': ['gini', 'entropy'],
    'max_features': ['auto', 'sqrt', 'log2', None],
    'max_depth': [3, 5, 10, None],
}
rfgs = GridSearchCV(rf, rf_params, scoring='roc_auc')
rfgs.fit(X_train, y_train)

In [None]:
# Score the grid search's best estimator on the held-out split (the search was built with scoring='roc_auc').
rfgs.score(X_test, y_test)

In [None]:
# Report the winning parameter combination and its cross-validated score.
# Parenthesised single-argument print runs under both Python 2 and Python 3.
print(rfgs.best_params_)
print(rfgs.best_score_)

In [None]:
# Use the best model from the grid search to calculate the cross-validation score.
# class_weight='balanced' is set to capture more true-positive predictions.
# random_state is fixed so that the score below and the out-of-fold predictions
# come from identically seeded forests and the cell is reproducible.
rf = RandomForestClassifier(max_features='log2', n_estimators=15, criterion='gini', max_depth=5,
                            class_weight='balanced', random_state=42)
# cross_val_score uses the estimator's default scorer here (no `scoring` argument is passed).
print(cross_val_score(rf, X, y, cv=5, n_jobs=-1).mean())
rf_pred = cross_val_predict(rf, X, y, cv=5, n_jobs=-1)

In [None]:
# Confusion matrix of the out-of-fold predictions, positive class (1) listed first.
conmat = confusion_matrix(y, rf_pred, labels=[1, 0])
confusion = pd.DataFrame(conmat,
                         index=['wnv', 'no wnv'],
                         columns=['pred wnv', 'pred no wnv'])
confusion

In [None]:
# Fit on the training split, then plot the ROC curve and compute the AUC on the held-out split.
rf.fit(X_train, y_train)
rf_prob = rf.predict_proba(X_test)[:, 1]  # probability of the positive class

rffpr, rftpr, rfroc_auc = {}, {}, {}
rffpr[1], rftpr[1], _ = roc_curve(y_test, rf_prob)
rfroc_auc[1] = auc(rffpr[1], rftpr[1])

plt.figure(figsize=[11,9])
plt.plot(rffpr[1], rftpr[1],
         label='ROC curve (area = %0.2f)' % rfroc_auc[1],
         linewidth=4)
plt.plot([0, 1], [0, 1], 'k--', linewidth=4)  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize=18)
plt.ylabel('True Positive Rate', fontsize=18)
plt.title('Receiver operating characteristic for WNV (rf)', fontsize=18)
plt.legend(loc="lower right")
plt.show()

### SVM

In [None]:
# Standardize the numeric features in place: subtract the column mean, divide by the column std.
numerical_features = ['Longitude', 'Latitude', 'Sunrise1', 'Week']
numeric_block = df[numerical_features]
df[numerical_features] = (numeric_block - numeric_block.mean()) / numeric_block.std()

In [None]:
# Theoretically we should have created dummies for the categorical feature, but doing so lowered our Kaggle score.

X = df.loc[:, ['Longitude', 'Latitude', 'Species', 'Sunrise1', 'Week']]
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    stratify=df['wnv'],
    random_state=42,
)

In [None]:
# Grid-search C and gamma of an RBF-kernel SVM over powers of ten, selecting by ROC AUC.
clf = svm.SVC(kernel='rbf')
clf_params = {
    'C': 10.**np.arange(-2,3),
    'gamma': 10.**np.arange(-5,2),
}
clfgs = GridSearchCV(clf, clf_params, scoring='roc_auc')
clfgs.fit(X_train, y_train)

In [None]:
# Score the SVM grid search's best estimator on the held-out split (the search was built with scoring='roc_auc').
clfgs.score(X_test, y_test)

In [None]:
# Report the winning C/gamma pair and its cross-validated score.
# Parenthesised single-argument print runs under both Python 2 and Python 3.
print(clfgs.best_params_)
print(clfgs.best_score_)

In [None]:
# Build 5 shuffled, stratified folds over the target with a fixed seed.
# NOTE(review): this is the pre-0.18 sklearn.cross_validation signature (labels passed to the constructor).
from sklearn.cross_validation import StratifiedKFold
cv = StratifiedKFold(df.wnv, n_folds=5, shuffle=True, random_state=7)

In [None]:
# Cross-validate the SVM with hard-coded C/gamma on the stratified folds defined above.
# probability=True is required for the predict_proba calls made later in the notebook.
clf = svm.SVC(kernel='rbf', C=100, gamma=1, probability=True, class_weight='balanced')
clfscore = cross_val_score(clf, X, y, cv=cv, n_jobs=-1).mean()
clf_pred = cross_val_predict(clf, X, y, cv=cv, n_jobs=-1)
# Parenthesised single-argument print runs under both Python 2 and Python 3.
print(clfscore)

In [None]:
# Confusion matrix of the SVM's out-of-fold predictions, positive class (1) listed first.
cm_clf = confusion_matrix(y, clf_pred, labels=[1, 0])
confusion_clf = pd.DataFrame(cm_clf,
                             index=['wnv', 'no wnv'],
                             columns=['pred wnv', 'pred no wnv'])
confusion_clf

In [None]:
# Fit on the training split, then plot the ROC curve and compute the AUC on the held-out split.
clf.fit(X_train, y_train)
clf_prob = clf.predict_proba(X_test)[:, 1]  # probability of the positive class

clffpr, clftpr, clfroc_auc = {}, {}, {}
clffpr[1], clftpr[1], _ = roc_curve(y_test, clf_prob)
clfroc_auc[1] = auc(clffpr[1], clftpr[1])

plt.figure(figsize=[11,9])
plt.plot(clffpr[1], clftpr[1],
         label='ROC curve (area = %0.2f)' % clfroc_auc[1],
         linewidth=4)
plt.plot([0, 1], [0, 1], 'k--', linewidth=4)  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize=18)
plt.ylabel('True Positive Rate', fontsize=18)
plt.title('Receiver operating characteristic for WNV (svm)', fontsize=18)
plt.legend(loc="lower right")
plt.show()

### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Grid-search the neighbour count (3..79) and the weighting scheme, selecting by ROC AUC.
knn = KNeighborsClassifier()
knn_params = {
    'n_neighbors': range(3, 80),
    'weights': ['uniform', 'distance'],
}
knngs = GridSearchCV(knn, knn_params, scoring='roc_auc')
knngs.fit(X_train, y_train)

In [None]:
# Report the winning KNN parameters, their cross-validated score, and the score on the held-out split.
# Parenthesised single-argument print runs under both Python 2 and Python 3.
print(knngs.best_params_)
print(knngs.best_score_)
print(knngs.score(X_test, y_test))

In [None]:
# KNN with hard-coded settings.
# NOTE(review): presumably these are the grid-search winners -- confirm against knngs.best_params_.
knn = KNeighborsClassifier(n_neighbors=41, weights='uniform')

In [None]:
# Fit the final KNN on the training split; this fitted model produces the KNN submission later on.
knn.fit(X_train, y_train)

### Neural Network

In [None]:
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils

In [None]:
# Keras expects plain arrays: convert the feature frame to a float ndarray.
# `.values` returns the same array as the deprecated `as_matrix()` and is
# available in both old and current pandas releases.
xs = X.astype(float).values
ys = y

In [None]:
# define the neural network

def baseline_model(input_dim=5):
    """Build and compile the feed-forward network used as the Keras classifier.

    Architecture: three Dense(30, relu) hidden layers, each followed by 25%
    dropout, and a single sigmoid output unit. Only the first hidden layer
    uses the 'normal' initializer, as in the original definition.

    Parameters
    ----------
    input_dim : int, optional
        Number of input features. Defaults to 5, the size of the feature set
        selected in this notebook, so calling with no arguments (as
        KerasClassifier does) behaves exactly as before.

    Returns
    -------
    A compiled Sequential model (binary cross-entropy loss, adam optimizer,
    accuracy metric).
    """
    model = Sequential()
    model.add(Dense(30, input_dim=input_dim, init='normal', activation='relu'))
    model.add(Dropout(0.25))
    model.add(Dense(30, activation='relu'))
    model.add(Dropout(0.25))
    model.add(Dense(30, activation='relu'))
    model.add(Dropout(0.25))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [None]:
# Split the array form of the features with the same test_size, stratification and random_state as the earlier splits.
xs_train, xs_test, ys_train, ys_test = train_test_split(xs, ys, test_size=0.3, stratify=df.wnv, random_state=42)

In [None]:
# Wrap baseline_model in the scikit-learn style Keras wrapper: 20 epochs, batches of 50, silent training.
estimator = KerasClassifier(build_fn=baseline_model, nb_epoch=20, batch_size=50, verbose=0)

In [None]:
# Train the network on its own training split and compute the ROC AUC on its own held-out split.
estimator.fit(xs_train, ys_train)
nn_prob = estimator.predict_proba(xs_test)[:,1]  # second column of the wrapper's probability output
nnfpr = dict()
nntpr = dict()
nnroc_auc=dict()
# Score against ys_test, the labels produced by the same split as xs_test,
# rather than y_test left over from the SVM section's split.
nnfpr[1], nntpr[1], _ = roc_curve(ys_test, nn_prob)
nnroc_auc[1] = auc(nnfpr[1], nntpr[1])

In [None]:
# Display the neural network's ROC AUC on the held-out split.
nnroc_auc[1]

### Naive Bayes (Gaussian)

In [None]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB

# Fit Gaussian naive Bayes on the training split and compute its ROC AUC on the held-out split.
gnb = GaussianNB()
gnb.fit(X_train, y_train)
gnb_prob = gnb.predict_proba(X_test)[:,1]  # probability of the positive class
gnbfpr = dict()
gnbtpr = dict()
gnbroc_auc=dict()
# Use the naive Bayes probabilities here; the previous version passed clf_prob
# (the SVM's probabilities), so the reported AUC was the SVM's, not this model's.
gnbfpr[1], gnbtpr[1], _ = roc_curve(y_test, gnb_prob)
gnbroc_auc[1] = auc(gnbfpr[1], gnbtpr[1])

In [None]:
# Display the Gaussian naive Bayes ROC AUC on the held-out split.
gnbroc_auc[1]

### EDA

In [None]:
import seaborn as sns

In [None]:
# Scatter Latitude (x axis) against Longitude (y axis), coloured by the wnv target.
# NOTE(review): when cells run top to bottom, df's Latitude/Longitude were already standardized
# in the SVM section, so the axes show z-scores rather than raw coordinates.
sns.set()
sns.pairplot(df, x_vars='Latitude', y_vars='Longitude', hue='wnv', size=8)

### Submitting results

In [None]:
# Prepare the testing dataframe. Only station 1's weather is joined, because
# the selected weather feature (Sunrise1) comes from that station.

testing = pd.read_csv('test.csv')
weather_test = pd.read_csv('weather.csv')
weather_test = weather_test.loc[weather_test['Station'] == 1]

# Attach the weather readings to each test row by date.
testing = testing.merge(weather_test, on='Date', how='left')

# Derive the Week feature and give Sunrise the numeric dtype and name used in training.
testing['Date'] = pd.to_datetime(testing['Date'])
testing['Week'] = testing['Date'].dt.week
testing['Sunrise'] = pd.to_numeric(testing['Sunrise'])
testing = testing.rename(columns={'Sunrise': 'Sunrise1'})

testing.to_csv('cleanedtest.csv', index=False)

In [None]:
# Reload the prepared test frame written by the previous cell.
testing = pd.read_csv('cleanedtest.csv')

In [None]:
# Standardize the test features with the TRAINING mean and std.
# The models were fit on features scaled with the training statistics, so the
# test set must go through the same transformation; scaling it with its own
# mean/std would put the two sets in different feature spaces.
# The training frame in memory has already been standardized in place, so the
# original statistics are recomputed from the cleaned training CSV.
train_numeric = pd.read_csv('~/desktop/wnv.csv', usecols=numerical_features)
for i in numerical_features:
    testing[i] = (testing[i] - train_numeric[i].mean()) / train_numeric[i].std()

In [None]:
# Label-encode every object-typed column of the test frame.
# NOTE(review): the encoder is refit on the test values, so the integer codes match
# the training codes only if both sets sort their categories the same way -- verify for Species.
test_cols = list(testing.select_dtypes(include=['object']).columns)
testing[test_cols] = testing[test_cols].apply(le.fit_transform)

In [None]:
# Select the same five features, in the same order, that the models were trained on.
X_testing = testing[['Longitude', 'Latitude', 'Species', 'Sunrise1', 'Week']]

### Random Forest

In [None]:
# The random forest was fit on the raw (unstandardized) features, whereas
# X_testing holds standardized numeric columns. Rebuild the forest's input with
# the raw numeric values from the prepared test file so its split thresholds
# are applied on the scale they were learned on. The encoded Species column is
# taken from X_testing unchanged.
raw_numeric = pd.read_csv('cleanedtest.csv', usecols=numerical_features)
X_testing_rf = X_testing.copy()
X_testing_rf[numerical_features] = raw_numeric[numerical_features].values

# Predicted probability of the positive class, written out as the submission.
rf_test = rf.predict_proba(X_testing_rf)[:,1]
testing['WnvPresent'] = rf_test
submit_rf = testing[['Id', 'WnvPresent']]
submit_rf.to_csv('score_rf.csv', index=False)

### SVM

In [None]:
# Score the test rows with the fitted SVM and write the submission file.
svm_test = clf.predict_proba(X_testing)[:, 1]  # probability of the positive class
testing['WnvPresent'] = svm_test
submit_svm = testing.loc[:, ['Id', 'WnvPresent']]
submit_svm.to_csv('score_svm.csv', index=False)

### KNN

In [None]:
# Score the test rows with the fitted KNN and write the submission file.
knn_test = knn.predict_proba(X_testing)[:, 1]  # probability of the positive class
testing['WnvPresent'] = knn_test
submit_knn = testing.loc[:, ['Id', 'WnvPresent']]
submit_knn.to_csv('score_knn.csv', index=False)

### Neural Network

In [None]:
# Convert the test features to an ndarray for Keras. `.values` returns the same
# array as the deprecated `as_matrix()` and works in old and current pandas alike.
xs_testing = X_testing.values

In [None]:
# Score the test rows with the trained network and write the submission file.
nn_test = estimator.predict_proba(xs_testing)[:, 1]  # second column of the wrapper's probability output
testing['WnvPresent'] = nn_test
submit_nn = testing.loc[:, ['Id', 'WnvPresent']]
submit_nn.to_csv('score_nn.csv', index=False)

### Naive Bayes (gaussian)

In [None]:
# Score the test rows with the fitted Gaussian naive Bayes model and write the submission file.
gnb_test = gnb.predict_proba(X_testing)[:, 1]  # probability of the positive class
testing['WnvPresent'] = gnb_test
submit_gnb = testing.loc[:, ['Id', 'WnvPresent']]
submit_gnb.to_csv('score_gnb.csv', index=False)