# Mosquito Research

In [1]:
% matplotlib inline
import pandas as pd
import numpy as np
from datetime import datetime as dt
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
# Seaborn look: dark grid background, then the large "poster" scaling.
sns.set_style('darkgrid')
sns.set_context('poster')

# Never truncate the column list when a wide DataFrame is displayed.
pd.set_option('display.max_columns', None)

In [2]:
def get_lags(data, x, lag):
    """Return the trailing moving average of column ``x`` over ``lag`` rows.

    Every entry of ``data[x]`` is converted with ``np.float64``; entries that
    cannot be converted (non-numeric placeholders such as ``'M'``) become NaN.
    For each row the result is the mean of that row's value and the
    ``lag - 1`` values before it, ignoring NaN. The first rows therefore
    average over however many values exist so far.

    The window counts rows, not calendar days, and follows the row order of
    ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the column to smooth. It is not modified.
    x : column label
        Name of the column in ``data`` to average.
    lag : int
        Window length in rows, including the current row. With ``lag <= 1``
        the converted column is returned unchanged.

    Returns
    -------
    pandas.Series
        One average per input row, on a fresh ``0..n-1`` index (the index of
        ``data`` is not kept). A row is NaN when its whole window is NaN.
    """
    def assign_na(value):
        # Only conversion failures count as missing data. A bare ``except``
        # would also swallow KeyboardInterrupt and genuine programming errors.
        try:
            return np.float64(value)
        except (TypeError, ValueError):
            # ``np.nan`` rather than the ``np.NaN`` alias, which NumPy 2.0 removed.
            return np.nan

    # Drop the caller's index up front so the result is positional, as before.
    values = data[x].apply(assign_na).reset_index(drop=True)

    # Column 0 is the current row; column i holds the value from i rows earlier.
    window = pd.concat(
        [values] + [values.shift(i) for i in range(1, lag)],
        axis=1,
        ignore_index=True,
    )

    # Row-wise mean over the window, skipping missing values.
    return window.mean(axis=1, skipna=True)


In [3]:
# List the files available in the shared data directory.
! ls ../data

WNV_Pop_Age.csv      model-building.ipynb test.csv
WNV_Pop_Age_Temp.csv model_data.csv       train.csv
census_age.csv       sampleSubmission.csv weather.csv
census_info.csv      spray.csv
data-clean.ipynb     test copy.csv


Mosquito research [paper](https://academic.oup.com/ee/article-abstract/44/4/1022/2465753?redirectedFrom=fulltext)

In [4]:
# Load the training observations (one row per trap / species / date record).
train = pd.read_csv('../data/train.csv')

In [5]:
# One-hot encode the mosquito species.
# The dtype is pinned so the indicators stay 0/1 integers on every pandas
# version; pandas 2.0 changed the default to bool, which would put
# True/False into the exported CSV.
species_dummies = pd.get_dummies(train['Species'], dtype='uint8')

In [6]:
# Peek at the first row to check the generated indicator columns.
species_dummies.head(1)

Unnamed: 0,CULEX ERRATICUS,CULEX PIPIENS,CULEX PIPIENS/RESTUANS,CULEX RESTUANS,CULEX SALINARIUS,CULEX TARSALIS,CULEX TERRITANS
0,0,0,1,0,0,0,0


In [7]:
# Attach the species indicators to the training rows; join aligns on the
# row index, which species_dummies inherits from train.
df = train.join(species_dummies)

In [8]:
# Normalise column names to lower case with underscores instead of spaces.
df = df.rename(columns=lambda name: name.lower().replace(' ', '_'))

In [9]:
# Parse the date strings into datetimes so the date-based features can be derived.
df['date'] = pd.to_datetime(df['date'])

In [10]:
# Calendar year of each observation.
df['year'] = df['date'].dt.year

In [11]:
# ISO week number of each observation (second field of isocalendar()).
df['week'] = df['date'].map(lambda stamp: stamp.isocalendar()[1])

In [12]:
# Check the renamed columns and the new year / week features on one row.
df.head(1)

Unnamed: 0,date,address,species,block,street,trap,addressnumberandstreet,latitude,longitude,addressaccuracy,nummosquitos,wnvpresent,culex_erraticus,culex_pipiens,culex_pipiens/restuans,culex_restuans,culex_salinarius,culex_tarsalis,culex_territans,year,week
0,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0,0,0,1,0,0,0,0,2007,22


In [13]:
# Load the weather observations.
weather = pd.read_csv('../data/weather.csv')

In [14]:
# Preview the raw weather rows.
weather.head()

Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,Sunrise,Sunset,CodeSum,Depth,Water1,SnowFall,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed
0,1,2007-05-01,83,50,67,14,51,56,0,2,0448,1849,,0,M,0.0,0.0,29.1,29.82,1.7,27,9.2
1,2,2007-05-01,84,52,68,M,51,57,0,3,-,-,,M,M,M,0.0,29.18,29.82,2.7,25,9.6
2,1,2007-05-02,59,42,51,-3,42,47,14,0,0447,1850,BR,0,M,0.0,0.0,29.38,30.09,13.0,4,13.4
3,2,2007-05-02,60,43,52,M,42,47,13,0,-,-,BR HZ,M,M,M,0.0,29.44,30.08,13.3,2,13.4
4,1,2007-05-03,66,46,56,2,40,48,9,0,0446,1851,,0,M,0.0,0.0,29.39,30.12,11.7,7,11.9


In [15]:
# Parse the weather dates into datetimes so they can be matched to the trap dates.
weather['Date'] = pd.to_datetime(weather['Date'])

In [16]:
# (weather column, name of the smoothed feature built from it), in output order.
# NOTE(review): the weather preview shows one row per station per date
# (stations 1 and 2), so a 10-row window appears to cover about five calendar
# days rather than ten. Confirm this is intended.
lag_features = [
    ('PrecipTotal', 'ten_day_avg_percip'),
    ('Tavg', 'ten_day_avg_temp'),
    ('DewPoint', 'ten_day_avg_dewpoint'),
    ('StnPressure', 'ten_day_avg_pressure'),
    ('AvgSpeed', 'ten_day_avg_windspeed'),
]

df_2 = pd.DataFrame()
for source_col, feature_name in lag_features:
    df_2[feature_name] = get_lags(weather, source_col, 10)


In [17]:
# Inspect the first smoothed values; early rows average over fewer observations.
df_2.head()


Unnamed: 0,ten_day_avg_percip,ten_day_avg_temp,ten_day_avg_dewpoint,ten_day_avg_pressure,ten_day_avg_windspeed
0,0.0,67.0,51.0,29.1,9.2
1,0.0,67.5,51.0,29.14,9.4
2,0.0,62.0,48.0,29.22,10.733333
3,0.0,59.5,46.5,29.275,11.4
4,0.0,58.8,45.2,29.298,11.5


In [18]:
# Single-column frame holding the date of each weather row.
weather_dates = weather['Date'].to_frame()


In [19]:
# Pair each weather date with its smoothed features. The join aligns on the
# row index: weather keeps read_csv's default 0..n-1 index and get_lags
# returns its values on the same positional index.
w = weather_dates.join(df_2)


In [20]:
# Collapse to one row per date by averaging all rows that share a date.
w = w.groupby('Date', as_index=False).mean()


In [21]:
# Lower-case the column names so 'Date' becomes the 'date' key used by the training frame.
w = w.rename(columns=str.lower)


In [22]:
# Attach the per-date weather features to every training row, then index by date.
# merge defaults to an inner join, so training rows whose date has no weather
# record are dropped.
model_data = pd.merge(df, w, on="date").set_index('date')


In [23]:
# Check one merged row: training fields followed by the weather averages.
model_data.head(1)


Unnamed: 0_level_0,address,species,block,street,trap,addressnumberandstreet,latitude,longitude,addressaccuracy,nummosquitos,wnvpresent,culex_erraticus,culex_pipiens,culex_pipiens/restuans,culex_restuans,culex_salinarius,culex_tarsalis,culex_territans,year,week,ten_day_avg_percip,ten_day_avg_temp,ten_day_avg_dewpoint,ten_day_avg_pressure,ten_day_avg_windspeed
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0,0,0,1,0,0,0,0,2007,22,0.200893,65.6,50.6,29.4375,7.9


In [24]:
# List every merged column before deciding which ones to drop.
print(model_data.columns)


Index(['address', 'species', 'block', 'street', 'trap',
       'addressnumberandstreet', 'latitude', 'longitude', 'addressaccuracy',
       'nummosquitos', 'wnvpresent', 'culex_erraticus', 'culex_pipiens',
       'culex_pipiens/restuans', 'culex_restuans', 'culex_salinarius',
       'culex_tarsalis', 'culex_territans', 'year', 'week',
       'ten_day_avg_percip', 'ten_day_avg_temp', 'ten_day_avg_dewpoint',
       'ten_day_avg_pressure', 'ten_day_avg_windspeed'],
      dtype='object')


In [25]:
# Columns removed before export: address and location fields, the raw species
# label (its indicator columns are kept), the trap id and the year.
unused_columns = [
    'address', 'species', 'block', 'street', 'trap', 'year',
    'addressnumberandstreet', 'latitude', 'longitude', 'addressaccuracy',
]
model_data = model_data.drop(columns=unused_columns)


In [26]:
# Persist the modelling table; the date index is written as the first column.
model_data.to_csv('../data/model_data.csv')


In [27]:
# Final look at the table that was exported.
model_data.head()


Unnamed: 0_level_0,nummosquitos,wnvpresent,culex_erraticus,culex_pipiens,culex_pipiens/restuans,culex_restuans,culex_salinarius,culex_tarsalis,culex_territans,week,ten_day_avg_percip,ten_day_avg_temp,ten_day_avg_dewpoint,ten_day_avg_pressure,ten_day_avg_windspeed
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2007-05-29,1,0,0,0,1,0,0,0,0,22,0.200893,65.6,50.6,29.4375,7.9
2007-05-29,1,0,0,0,0,1,0,0,0,22,0.200893,65.6,50.6,29.4375,7.9
2007-05-29,1,0,0,0,0,1,0,0,0,22,0.200893,65.6,50.6,29.4375,7.9
2007-05-29,1,0,0,0,1,0,0,0,0,22,0.200893,65.6,50.6,29.4375,7.9
2007-05-29,4,0,0,0,0,1,0,0,0,22,0.200893,65.6,50.6,29.4375,7.9


In [28]:
# List the data directory again to confirm model_data.csv is present.
!ls ../data/

WNV_Pop_Age.csv      model-building.ipynb test.csv
WNV_Pop_Age_Temp.csv model_data.csv       train.csv
census_age.csv       sampleSubmission.csv weather.csv
census_info.csv      spray.csv
data-clean.ipynb     test copy.csv
