In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
import datetime

## EDA Weather

In [10]:
# Read the weather observations; the path is relative to the notebook's working directory.
# NOTE(review): the preview of this frame shows the literal string 'M' in several
# columns (e.g. Depart, Water1) - presumably a missing-value marker; confirm before
# treating those columns as numeric.
data_weather = pd.read_csv(filepath_or_buffer='./Data/weather.csv')

In [11]:
# Preview the first five weather rows to eyeball column names and value formats.
data_weather.head(n=5)

Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,...,CodeSum,Depth,Water1,SnowFall,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed
0,1,2007-05-01,83,50,67,14,51,56,0,2,...,,0,M,0.0,0.0,29.1,29.82,1.7,27,9.2
1,2,2007-05-01,84,52,68,M,51,57,0,3,...,,M,M,M,0.0,29.18,29.82,2.7,25,9.6
2,1,2007-05-02,59,42,51,-3,42,47,14,0,...,BR,0,M,0.0,0.0,29.38,30.09,13.0,4,13.4
3,2,2007-05-02,60,43,52,M,42,47,13,0,...,BR HZ,M,M,M,0.0,29.44,30.08,13.3,2,13.4
4,1,2007-05-03,66,46,56,2,40,48,9,0,...,,0,M,0.0,0.0,29.39,30.12,11.7,7,11.9


In [12]:
# (rows, columns) of the raw weather frame, before the weather_day flag is added.
data_weather.shape

(2944, 22)

In [13]:
# Earliest weather date. Date is still a string column at this point, so this is a
# lexicographic minimum; for 'YYYY-MM-DD' strings that coincides with chronological order.
data_weather['Date'].min()

'2007-05-01'

In [14]:
# Latest weather date (string comparison; see the 'YYYY-MM-DD' format of the values).
data_weather['Date'].max()

'2014-10-31'

In [15]:
# Flag every weather row with weather_day = 1 so that weather rows stay identifiable
# once this frame is combined with the trap data.
data_weather.loc[:, 'weather_day'] = 1

## EDA Train

In [16]:
# Read the labelled trap observations; the path is relative to the notebook's working directory.
data_train = pd.read_csv(filepath_or_buffer='./Data/train.csv')

In [18]:
# List the column labels of the training frame.
data_train.columns

Index(['Date', 'Address', 'Species', 'Block', 'Street', 'Trap',
       'AddressNumberAndStreet', 'Latitude', 'Longitude', 'AddressAccuracy',
       'NumMosquitos', 'WnvPresent'],
      dtype='object')

In [19]:
# (rows, columns) of the raw training frame.
data_train.shape

(10506, 12)

In [20]:
# Earliest training date (Date is a string column here, so this compares strings).
data_train['Date'].min()

'2007-05-29'

In [21]:
# Latest training date (Date is a string column here, so this compares strings).
data_train['Date'].max()

'2013-09-26'

In [22]:
# Baseline for the model: the mean of WnvPresent, i.e. the share of training rows
# where the virus was found. Always predicting "absent" scores 1 minus this value.
data_train['WnvPresent'].mean()

0.05244622120692937

In [23]:
# Number of distinct Block values in the training data (missing values are not counted).
data_train['Block'].nunique()

64

Address, Street, Trap, Latitude and Longitude all describe the same thing (the trap's location), so they are redundant with one another.

## Combine Data

In [130]:
# Stack the trap observations and the weather observations into a single frame with a
# fresh 0..n-1 index. The two frames have different column sets, so any column missing
# from one source is filled with NaN for that source's rows.
# sort=True is passed explicitly: it keeps the sorted column order this cell has always
# produced, makes the result independent of the pandas version's default, and silences
# the FutureWarning raised when the column axes are not aligned.
# NOTE(review): this appends weather rows beneath the trap rows rather than joining
# weather onto each trap row by Date - confirm that stacking is what is intended.
df = pd.concat([data_train, data_weather], ignore_index=True, sort=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  """Entry point for launching an IPython kernel.


In [131]:
# Rows that did not come from the weather file have no weather_day value; mark them 0.
# Assign the result back instead of calling fillna(..., inplace=True) on df['weather_day']:
# the in-place form operates on a column selection, is deprecated, and leaves df
# unchanged when pandas Copy-on-Write is enabled.
df['weather_day'] = df['weather_day'].fillna(0)

In [132]:
# Parse Date into datetimes and derive DateFrom: whole days elapsed since BeginDate.
df['Date'] = pd.to_datetime(df['Date'])
BeginDate = datetime.date(2007, 4, 30)
# Read the day count directly from the timedelta via .dt.days. The previous approach
# converted the timedelta to text and stripped a hard-coded ' days 00:00:00.000000000'
# suffix, which fails as soon as the string rendering of a Timedelta differs.
df['DateFrom'] = (df['Date'] - pd.Timestamp(BeginDate)).dt.days

## Feature Engineering

In [67]:
# One-hot encode the trap identifier: one indicator column per distinct Trap value.
df_traps = pd.get_dummies(data=df.loc[:, 'Trap'])
# Show the generated column labels.
df_traps.columns

Index(['T001', 'T002', 'T003', 'T004', 'T005', 'T006', 'T007', 'T008', 'T009',
       'T011',
       ...
       'T230', 'T231', 'T232', 'T233', 'T235', 'T236', 'T237', 'T238', 'T900',
       'T903'],
      dtype='object', length=136)

In [86]:
# Append the trap indicator columns to the right of the combined frame (aligned on index).
# Re-running this cell appends the same columns again, so run it once per fresh df.
df = pd.concat(objs=[df, df_traps], axis='columns')

In [87]:
# (rows, columns) after the trap indicator columns were appended.
# NOTE(review): the execution counts around this cell are out of order (this cell ran
# before the combine cells were last executed), so the recorded output may not match
# a fresh top-to-bottom run.
df.shape

(13450, 171)

In [133]:
# One-hot encode Block (one indicator column per distinct value) and append the
# indicators to the right of the combined frame. Re-running this cell appends the
# same columns again.
block_source = df.loc[:, 'Block']
df_blocks = pd.get_dummies(data=block_source)
df = pd.concat(objs=[df, df_blocks], axis='columns')

In [142]:
# (rows, columns) after the Block indicator columns were appended.
# NOTE(review): execution counts in this section are out of order, so the recorded
# column count reflects whichever feature cells had run in that kernel session.
df.shape

(13450, 99)

## Model

In [119]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

In [152]:
# Columns handed to the model: two location features, the day offset, and the target
# (WnvPresent is kept here so that rows lacking a label are dropped with the features).
model_cols = ['Latitude', 'Longitude', 'DateFrom', 'WnvPresent']

In [153]:
# Keep only the modelling columns and discard every row with a missing value in any of them.
df_formodel = df[model_cols].dropna(axis=0, how='any')

In [154]:
# Feature matrix: every modelling column except the target.
X = df_formodel.drop(columns=['WnvPresent'])

In [155]:
# Target vector; it keeps df_formodel's index.
y = df_formodel.loc[:, 'WnvPresent']

In [156]:
# k-nearest-neighbours classifier with the library's default of 5 neighbours, spelled out.
# NOTE(review): the features are passed in unscaled; DateFrom is measured in days while
# Latitude/Longitude are in degrees, so DateFrom presumably dominates the distance -
# consider standardising the features first.
knn = KNeighborsClassifier(n_neighbors=5)

In [157]:
# Fit the classifier on all available rows of X / y (no hold-out split is made here).
knn.fit(X, y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [158]:
# Accuracy gained over always predicting the majority class (no virus present).
# The baseline is taken from y - the labels actually being scored - rather than from
# data_train, so the two terms stay comparable even if dropna removed some rows.
# NOTE: X and y are the same rows the model was fitted on, so this is in-sample
# accuracy and overstates how the model would do on unseen data.
knn.score(X, y) - (1 - y.mean())

0.0022844089091946795

## Assess

In [127]:
from sklearn.metrics import confusion_matrix

In [128]:
# Predictions for the rows in X, carried on the same index as y. Without index=y.index
# the Series would get a fresh 0..n-1 index, and the 'Actuals' assignment below - which
# aligns on index labels - would only line up if y's labels happened to be 0..n-1 too.
preds = pd.Series(knn.predict(X), index=y.index)
# Side-by-side frame of predicted and actual labels.
df_residuals = pd.DataFrame(preds, columns=['Predictions'])
df_residuals['Actuals'] = y

In [129]:
# Confusion matrix of actual labels against predicted labels, wrapped in a labelled
# frame for display: rows are the actual classes, columns the predicted classes.
cm = confusion_matrix(y_true=y, y_pred=preds)
df_cm = pd.DataFrame(
    data=cm,
    index=['Actual 0', 'Actual 1'],
    columns=['Predicted 0', 'Pred 1'],
)
df_cm

Unnamed: 0,Predicted 0,Pred 1
Actual 0,9872,83
Actual 1,438,113
