In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import keras
import os

from utils import *
from model import *

%matplotlib inline

In [10]:
# Show the contents of the current working directory (os.listdir() with no argument lists '.').
os.listdir()

['.ipynb_checkpoints',
 '__pycache__',
 'measurement_locations.ipynb',
 'measurement_locations.py',
 'model.py',
 'utils.py',
 'west_nile_virus_analysis.ipynb']

## Exploratory Analysis

In [2]:
# Load the Train/Test Data
# All raw inputs are read from one directory; change INPUT_DIR here if the data lives elsewhere.
INPUT_DIR = os.path.join('..', 'input')

mapdata = np.loadtxt(os.path.join(INPUT_DIR, "mapdata_copyright_openstreetmap_contributors.txt"))
train = pd.read_csv(os.path.join(INPUT_DIR, 'train.csv'))
test = pd.read_csv(os.path.join(INPUT_DIR, 'test.csv'))
weather = pd.read_csv(os.path.join(INPUT_DIR, "weather.csv"))

# Per-record trap coordinates and infection flag, used by the trap-location map.
traps = train[['Date', 'Trap','Longitude', 'Latitude', 'WnvPresent']]
# Rows / columns of the map raster; passed to imshow as its `aspect`.
aspect = mapdata.shape[0] / mapdata.shape[1]

FileNotFoundError: [Errno 2] No such file or directory: '../input/mapdata_copyright_openstreetmap_contributors.txt'

In [None]:
# Location of Traps
# (lon min, lon max, lat min, lat max) handed to imshow as the extent of the map image.
lon_lat_box = (-88, -87.5, 41.6, 42.1)

plt.figure(figsize=(20,14))
plt.imshow(mapdata,
           cmap=plt.get_cmap('gray'),
           extent=lon_lat_box,
           aspect=aspect)

# One 'x' per distinct (Longitude, Latitude) pair in the trap records.
locations = traps[['Longitude', 'Latitude']].drop_duplicates().values
plt.scatter(locations[:,0], locations[:,1], marker='x')

# Sum of WnvPresent per location. The column is selected *before* aggregating so the
# string columns of `train` (e.g. Date, Trap) are never summed.
wnvPresent = (
    train.groupby(['Latitude', 'Longitude'])['WnvPresent']
         .sum()
         .reset_index()
         .values
)
# Array columns are (Latitude, Longitude, sum); marker area is half the sum.
plt.scatter(wnvPresent[:,1], wnvPresent[:,0], marker='o', color='r', s=wnvPresent[:,2]/2);


In [None]:
# Number of training rows for each WnvPresent value (class balance of the target).
train.groupby('WnvPresent').size().plot(kind='bar')

In [None]:
# Show the first training record (by position) with one field per line.
train.iloc[0]

In [None]:
# Preview the first rows of the raw training frame.
train.head()

In [None]:
# Preview the first rows of the raw test frame.
test.head()

In [None]:
# Number of test rows per Species value.
test.groupby('Species').size()

In [None]:
# Spray Data: read the spray records and preview the first rows.
spray = pd.read_csv("../input/spray.csv")
spray.head()

In [None]:
# Number of spray records per date.
# Author's note: spray data is not available for the test years.
spray.groupby('Date').size()

In [None]:
# Preview the first rows of the raw weather frame read from weather.csv.
weather.head()

In [None]:
# Mosquito Count by Trap and Month
def _get_trap_stats(dt):
    get_month = lambda d: float(d.split('-')[1])
    dt['month'] = dt['Date'].apply(get_month)
    trap_stats = dt.groupby(['Trap','month']).agg([np.mean,len])['NumMosquitos'].reset_index().sort_values('mean',ascending=False)
    trap_stats.index = trap_stats.Trap
    trap_stats.index = trap_stats[['Trap','month']]
    trap_stats = trap_stats.drop(['len'],axis=1)
    return trap_stats.pivot(index='Trap',columns='month',values='mean').fillna(0)

# Horizontal stacked bars: one bar per trap, one segment per month column.
trap_stats = _get_trap_stats(train)
trap_stats.plot(kind='barh', stacked=True, figsize=(15, 15))

In [None]:
# Infection probability given number of mosquitoes in the trap:
# mean of WnvPresent within 10 equal-width bins of NumMosquitos.
_by_count = train[['NumMosquitos', 'WnvPresent']].copy()
_by_count['m_count'] = pd.cut(_by_count['NumMosquitos'], 10)
_temp = _by_count.groupby('m_count')['WnvPresent'].mean()
_temp.plot.bar()

In [None]:
# List the column labels currently present on `train`.
train.columns

In [None]:
# NOTE(review): `_data` is not assigned at notebook level in any cell shown here; the only
# visible assignment is a local variable inside `_fit_neighborhood_model`. On a fresh kernel
# this cell presumably raises NameError — confirm, then remove or replace it.
_data.head()

In [None]:
def _fit_neighborhood_model(dt, n_neighbors=3):
    """Fit one nearest-neighbour regressor per month on per-location mean counts.

    For every month found in ``dt`` the records are averaged per
    (Longitude, Latitude) pair and a ``KNeighborsRegressor`` is fitted that
    maps coordinates to the mean 'NumMosquitos' at that location.

    Parameters
    ----------
    dt : pandas.DataFrame
        Must contain 'Date' (a string whose second '-'-separated field is the
        month), 'Longitude', 'Latitude' and 'NumMosquitos'. Not modified.
    n_neighbors : int, default 3
        Neighbours used by each regressor. It is capped at the number of
        distinct locations available for the month.

    Returns
    -------
    dict
        ``{month (int): fitted regressor}``.
    """
    # NOTE(review): KNeighborsRegressor is not imported in this notebook's import cell;
    # presumably it is scikit-learn's class arriving through a star import — confirm.
    months = dt['Date'].apply(lambda d: int(d.split('-')[1]))
    neighborhood_model = {}
    for _month in sorted(set(months.tolist())):
        in_month = (months == _month).values
        site_means = (
            dt.loc[in_month, ['Longitude', 'Latitude', 'NumMosquitos']]
            .groupby(['Longitude', 'Latitude'])['NumMosquitos']
            .mean()
            .reset_index()
        )
        # Never ask for more neighbours than there are fitted points; assuming the
        # scikit-learn regressor, doing so is rejected when predicting.
        neigh = KNeighborsRegressor(n_neighbors=min(n_neighbors, len(site_means)))
        _X = np.array(site_means[['Longitude', 'Latitude']])
        _y = site_means['NumMosquitos']
        neigh.fit(_X, _y)
        neighborhood_model[_month] = neigh
    return neighborhood_model

# Fit the per-month models, then spot-check one record against the month-8 model.
neigh = _fit_neighborhood_model(train)
i = 2001
# Fetch the record once, by position, so the displayed row and the queried coordinates
# always belong to the same row (a label lookup only matches on a default 0..n-1 index).
_row = train.iloc[i]
_row, neigh[8].predict(np.array([_row.Longitude, _row.Latitude]).reshape(1,-1))[0]

In [None]:
# Preview `train` again after the cells above have run.
train.head()

In [None]:
# Fitted model versus actual mosquito count
# Work on a copy: the 'month' and 'app' columns added later in this cell go on `_temp`.
_temp = train.copy()
def _get_mosquito_bias(dt, model):
    get_month = lambda d: int(d.split('-')[1])
    dt['month'] = dt['Date'].apply(get_month)
    # Compute the approx mosquito count
    def compute_bias(row):
        _x = np.array([row.Longitude,row.Latitude]).reshape(1,-1)
        return model[row.month].predict(_x)[0]
    return dt.apply(compute_bias,axis=1)
get_month = lambda d: int(d.split('-')[1])
_temp['month'] = _temp['Date'].apply(get_month)
# Predict from the working copy rather than from `train`, so this comparison never
# writes to `train`; `_temp` has the same rows and index, so the values are the same.
_temp['app'] = _get_mosquito_bias(_temp, neigh)
# Approximation ('app') next to the recorded count for the last 20 rows.
_temp[['Trap','month','app','NumMosquitos']].tail(20)

In [None]:
# Evaluate the month-8 neighbourhood model on a 20 x 20 grid spanning the
# longitude/latitude range of the training records.
_lon_min, _lon_max = train['Longitude'].min(), train['Longitude'].max()
_lat_min, _lat_max = train['Latitude'].min(), train['Latitude'].max()
lons = np.linspace(_lon_min, _lon_max, 20)
lats = np.linspace(_lat_min, _lat_max, 20)

# indexing='ij' followed by ravel() lists every (lon, lat) pair with longitude varying
# slowest, i.e. the same order as a nested "for lon: for lat:" loop.
_lon_grid, _lat_grid = np.meshgrid(lons, lats, indexing='ij')
_grid = np.column_stack([_lon_grid.ravel(), _lat_grid.ravel()])

# The fitted per-month models are held in `neigh` (the value returned by
# `_fit_neighborhood_model`); all grid points are predicted in a single call.
_out = pd.DataFrame({
    'val': neigh[8].predict(_grid),
    'lon': _grid[:, 0],
    'lat': _grid[:, 1],
})

In [None]:
# Neighbourhood-model estimates on the grid (blue) versus recorded mosquito totals (red)
# (lon min, lon max, lat min, lat max) handed to imshow as the extent of the map image.
lon_lat_box = (-88, -87.5, 41.6, 42.1)

plt.figure(figsize=(20,14))
plt.imshow(mapdata,
           cmap=plt.get_cmap('gray'),
           extent=lon_lat_box,
           aspect=aspect)

# Array columns are (lat, lon, val); marker area equals the estimated value.
neighbor_app = _out.groupby(['lat', 'lon'])['val'].sum().reset_index().values
plt.scatter(neighbor_app[:,1], neighbor_app[:,0], marker='o', color='b', s=neighbor_app[:,2])

# Total NumMosquitos per location. The column is selected before summing so that the
# other columns of `train` (including text columns) are never aggregated.
num_m = train.groupby(['Latitude', 'Longitude'])['NumMosquitos'].sum().reset_index().values
plt.scatter(num_m[:,1], num_m[:,0], marker='o', color='r', s=num_m[:,2]/10);

In [None]:
# Species counts: number of training rows per Species value, most frequent first.
train['Species'].value_counts().plot(kind='bar')

In [None]:
#################################################

In [None]:
# Processed weather data
# Rebinds `weather`: from here on it holds the result of process_weather_data(),
# no longer the raw frame read from weather.csv.
weather = process_weather_data()
weather.head()

In [None]:
# Show precipitation histogram (PrecipTotal, 30 bins)
weather.PrecipTotal.hist(bins=30)

In [None]:
# 5-row rolling mean of PrecipTotal; windows that yield NaN (e.g. the first four rows)
# are replaced with 0 before plotting the histogram.
weather['AvgPrecip'] = weather['PrecipTotal'].rolling(window=5).mean().fillna(0)
weather['AvgPrecip'].hist(bins=30)

In [None]:
# Scatter the gap between average temperature and dew point (Tavg - DewPoint)
# against total precipitation. Adds a 'dew_diff' column to `weather`.
weather['dew_diff'] = weather['Tavg'] - weather['DewPoint']
weather.plot(kind='scatter',x='dew_diff',y='PrecipTotal')

In [None]:
# Weather data averaged over months
# numeric_only=True averages only the numeric columns, so a text or date column in the
# processed weather frame is skipped rather than making mean() fail on recent pandas.
monthly_avgs = weather.groupby('month').mean(numeric_only=True).drop(['week'],axis=1)
monthly_avgs.head()

### Processed Training Data

In [None]:
# Rebinds `train`: from here on it holds the result of get_train_or_test_data(),
# no longer the raw frame read from train.csv.
train = get_train_or_test_data()
train.head()

In [None]:
# Target variable distribution by month: mean of WnvPresent per month.
# The column is selected before averaging so no other column of the processed frame
# (possibly non-numeric) takes part in the mean.
train.groupby('month')['WnvPresent'].mean().plot(kind='bar')

In [None]:
# Number of processed training rows per month.
train.groupby('month').size().plot(kind='bar')

In [None]:
# Wind speed: mean of WnvPresent within 10 equal-width bins of ResultSpeed.
_wind = train[['ResultSpeed', 'WnvPresent']].copy()
_wind['wind_speed'] = pd.cut(_wind['ResultSpeed'], 10)
_temp = _wind.groupby('wind_speed')['WnvPresent'].mean()
_temp.plot.bar()

In [None]:
# Precipitation: histogram of PrecipTotal, then WnvPresent statistics split by
# whether any precipitation was recorded.
_rain = train[['PrecipTotal', 'WnvPresent']].copy()
_rain['PrecipTotal'].hist(bins=30)
_rain['has_rained'] = _rain['PrecipTotal'] > 0
_temp = _rain.drop('PrecipTotal', axis=1)
# String names select pandas' own mean/std (sample std); passing the NumPy functions
# is deprecated in recent pandas and would later change what 'std' computes.
_temp.groupby('has_rained').agg(['mean', 'std', len])

In [None]:
# Gap between average temperature and dew point: histogram of the raw gap, then the
# mean of WnvPresent within 10 equal-width bins of that gap.
_spread = train[['Tavg', 'DewPoint', 'WetBulb', 'WnvPresent']].copy()
_spread['diff'] = _spread['Tavg'] - _spread['DewPoint']
_spread['diff'].hist(bins=30)
_spread['diff'] = pd.cut(_spread['diff'], 10)
# Only the target is averaged; Tavg/DewPoint/WetBulb are not needed in the result.
# `_temp` keeps its name because the next cell plots it.
_temp = _spread.groupby('diff')['WnvPresent'].mean()

In [None]:
# Saturation - Detection
# Plots `_temp` as left by the previous cell (mean WnvPresent per 'diff' bin);
# this cell must be run directly after it.
_temp.plot(kind='bar')

In [None]:
# Mean of WnvPresent for each Species value.
_temp = train[['Species', 'WnvPresent']].groupby('Species')['WnvPresent'].mean()
_temp.plot.bar()

In [None]:
# Number of processed training rows per Species value.
train.groupby('Species').size().plot(kind='bar')

In [None]:
# Preprocessing: one indicator column per distinct Species value.
species = pd.get_dummies(train['Species'])
species.head()

## Pipeline

In [None]:
# Prepare data
train_data = get_train_or_test_data()
X, Y, scaler = preprocess_data(train_data)

# Fit
fitted_model = train_model(X, Y)

# Predict: the test features are preprocessed with the scaler returned for the training
# data. The two unused return values get explicit names so IPython's `_` is not rebound.
test_data = get_train_or_test_data(train=False)
X_test, _unused_y, _unused_scaler = preprocess_data(test_data, train=False, scaler=scaler)

# NOTE(review): assumes `fitted_model` is a Keras model (it is called with verbose=0 and
# has .summary()). predict() returns the network outputs and exists on every Keras
# version, whereas predict_proba() is not available on current Keras models — confirm.
preds = fitted_model.predict(X_test, verbose=0)
save_predictions(preds)

In [None]:
# Histogram (30 bins) of the second column of the predictions
# (presumably the positive-class probability — confirm against the model's output layer).
_preds = pd.DataFrame(preds[:,1])
_preds.hist(bins=30)

In [None]:
# Print the summary of the fitted model.
fitted_model.summary()

In [None]:
# Show the last 20 rows of the processed test frame.
test_data.tail(20)