In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
import datetime

In [187]:
data_test = pd.read_csv('./Data/test.csv')

## EDA Weather

In [3]:
data_weather = pd.read_csv('./Data/weather.csv')

In [4]:
data_weather.head()

Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,...,CodeSum,Depth,Water1,SnowFall,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed
0,1,2007-05-01,83,50,67,14,51,56,0,2,...,,0,M,0.0,0.0,29.1,29.82,1.7,27,9.2
1,2,2007-05-01,84,52,68,M,51,57,0,3,...,,M,M,M,0.0,29.18,29.82,2.7,25,9.6
2,1,2007-05-02,59,42,51,-3,42,47,14,0,...,BR,0,M,0.0,0.0,29.38,30.09,13.0,4,13.4
3,2,2007-05-02,60,43,52,M,42,47,13,0,...,BR HZ,M,M,M,0.0,29.44,30.08,13.3,2,13.4
4,1,2007-05-03,66,46,56,2,40,48,9,0,...,,0,M,0.0,0.0,29.39,30.12,11.7,7,11.9


In [5]:
data_weather.shape

(2944, 22)

In [6]:
data_weather.Date.min()

'2007-05-01'

In [7]:
data_weather.Date.max()

'2014-10-31'

In [8]:
data_weather['weather_day'] = 1

## EDA Train

In [9]:
data_train = pd.read_csv('./Data/train.csv')

In [10]:
data_train.columns

Index(['Date', 'Address', 'Species', 'Block', 'Street', 'Trap',
       'AddressNumberAndStreet', 'Latitude', 'Longitude', 'AddressAccuracy',
       'NumMosquitos', 'WnvPresent'],
      dtype='object')

In [14]:
#model baseline
data_train.WnvPresent.mean()

0.05244622120692937

In [120]:
temp = pd.to_datetime(data_train.Date).dt.month


Address, Street, Trap, Lat & Long are all the same

## Combine Data

In [116]:
df = pd.concat([data_train, data_weather], ignore_index=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  """Entry point for launching an IPython kernel.


In [117]:
df['weather_day'].fillna(0,  inplace=True)

In [121]:
df['Month'] = pd.to_datetime(df.Date).dt.month

In [123]:
df['Year'] = pd.to_datetime(df.Date).dt.year

In [125]:
df['Date'] = pd.to_datetime(df['Date'])
BeginDate = datetime.date(2007, 4, 30)
temp = (df['Date']-pd.Timestamp(BeginDate)).astype(str)
df['DateFrom'] = temp.map(lambda x: x.replace(' days 00:00:00.000000000', '')).astype(int)

## Feature Engineering

In [19]:
df_traps = pd.get_dummies(df['Trap'])
df_traps.columns

Index(['T001', 'T002', 'T003', 'T004', 'T005', 'T006', 'T007', 'T008', 'T009',
       'T011',
       ...
       'T230', 'T231', 'T232', 'T233', 'T235', 'T236', 'T237', 'T238', 'T900',
       'T903'],
      dtype='object', length=136)

In [20]:
df = pd.concat([df, df_traps], axis=1)

In [87]:
df.shape

(13450, 171)

In [133]:
df_blocks = pd.get_dummies(df['Block'])
df = pd.concat([df, df_blocks], axis=1)

In [142]:
df.shape

(13450, 99)

## Model

In [30]:
from sklearn.cluster import KMeans
import operator

In [235]:
model_cols = ['Latitude', 'Longitude', 'Month', 'WnvPresent']

In [236]:
df_formodel = df.loc[:, model_cols].dropna()

In [237]:
checkn = 20
kmm=KMeans(n_clusters=checkn, random_state=42)
kmm.fit(df_formodel)
k_means_list = [kmm.cluster_centers_[i][len(model_cols)-1] for i in range(checkn)]

k_means_dict = {}
for i in range(len(k_means_list)):
    k_means_dict[i] = k_means_list[i]
sorted_x = sorted(k_means_dict.items(), key=operator.itemgetter(1))
sorted_x[-10:]

[(16, 6.314393452555578e-16),
 (0, 6.453171330633722e-16),
 (14, 7.771561172376096e-16),
 (7, 8.673617379884035e-16),
 (3, 0.0017182130584197583),
 (5, 0.007246376811594159),
 (4, 0.9999999999999966),
 (10, 0.9999999999999966),
 (6, 0.9999999999999978),
 (12, 1.0000000000000007)]

## Assess

In [182]:
from sklearn.metrics import confusion_matrix

In [183]:
df_residuals = df_formodel

df_residuals['class_pred'] = kmm.predict(df_formodel)

df_residuals['rate_pred'] =df_residuals['class_pred'].map(k_means_dict)

df_residuals['Prediction'] =1*( df_residuals['rate_pred']>.2)

In [186]:
cm = confusion_matrix(df_residuals['WnvPresent'], df_residuals['Prediction'])
df_cm = pd.DataFrame(cm, columns=['Predicted 0', 'Pred 1'], index=['Actual 0', 'Actual 1'])

print('   Accurate Predictions', df_cm.iloc[1][1] / (df_cm.iloc[1][1] +df_cm.iloc[0][1]) )
print('   % Identified', df_cm.iloc[1][1] / (df_cm.iloc[1][0]+df_cm.iloc[1][1]) )
df_cm

   Accurate Predictions 1.0
   % Identified 0.9945553539019963


Unnamed: 0,Predicted 0,Pred 1
Actual 0,9955,0
Actual 1,3,548


In [185]:
df_residuals['class_pred'].value_counts().sort_index()

0      703
1     1390
2      507
3      582
4      189
5      276
6      125
7     1095
8      521
9      523
10     188
11      84
12      46
13     671
14     889
15     642
16     692
17     468
18     487
19     428
Name: class_pred, dtype: int64

In [229]:
from sklearn.neighbors import KNeighborsClassifier

In [257]:
KNC = KNeighborsClassifier(n_neighbors=20)

In [253]:
X = df.loc[:, [i for i in model_cols if i != 'WnvPresent']].dropna()

In [263]:
y = kmm.predict(df_formodel)

In [265]:
KNC.fit(X, y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=20, p=2,
           weights='uniform')

In [262]:
KNC.score(X,y)

0.9475537787930707

## Assess

In [215]:
df_predictions = data_test

In [216]:
df_predictions['Month'] = pd.to_datetime(df_predictions.Date).dt.month

In [217]:
df_predictions['Year'] = pd.to_datetime(df_predictions.Date).dt.year

In [218]:
df_predictions['Date'] = pd.to_datetime(df_predictions['Date'])
temp = (df_predictions['Date']-pd.Timestamp(BeginDate)).astype(str)
df_predictions['DateFrom'] = temp.map(lambda x: x.replace(' days 00:00:00.000000000', '')).astype(int)

In [222]:
model_cols.pop(3)

'Id'

In [223]:
predic_cols = model_cols

In [224]:
predic_cols

['Latitude', 'Longitude', 'Month']

In [225]:
df_predictions = df_predictions[predic_cols]

In [267]:
KNC.predict(df_predictions)

array([17, 17, 17, ...,  5,  5,  5])

In [268]:
df_residuals = df_predictions

df_residuals['class_pred'] = KNC.predict(df_predictions)

df_residuals['rate_pred'] =df_residuals['class_pred'].map(k_means_dict)

df_residuals['Prediction'] =1*( df_residuals['rate_pred']>.2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [277]:
df_residuals[df_residuals['class_pred']==4]

Unnamed: 0,Latitude,Longitude,Month,class_pred,rate_pred,Prediction
