# Capstone 2: West Nile Virus Prediction

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
train = pd.read_csv("train.csv", parse_dates=True)
weather = pd.read_csv("weather.csv", parse_dates=True)

In [3]:
train.shape

(10506, 12)

In [4]:
weather.shape

(2944, 22)

In [5]:
train.head(2)

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent
0,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0
1,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0


In [6]:
train.describe()

Unnamed: 0,Block,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent
count,10506.0,10506.0,10506.0,10506.0,10506.0,10506.0
mean,35.687797,41.841139,-87.699908,7.819532,12.853512,0.052446
std,24.339468,0.112742,0.096514,1.452921,16.133816,0.222936
min,10.0,41.644612,-87.930995,3.0,1.0,0.0
25%,12.0,41.732984,-87.76007,8.0,2.0,0.0
50%,33.0,41.846283,-87.694991,8.0,5.0,0.0
75%,52.0,41.95469,-87.627796,9.0,17.0,0.0
max,98.0,42.01743,-87.531635,9.0,50.0,1.0


In [7]:
weather.head(2)

Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,...,CodeSum,Depth,Water1,SnowFall,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed
0,1,2007-05-01,83,50,67,14,51,56,0,2,...,,0,M,0.0,0.0,29.1,29.82,1.7,27,9.2
1,2,2007-05-01,84,52,68,M,51,57,0,3,...,,M,M,M,0.0,29.18,29.82,2.7,25,9.6


In [8]:
weather.describe()

Unnamed: 0,Station,Tmax,Tmin,DewPoint,ResultSpeed,ResultDir
count,2944.0,2944.0,2944.0,2944.0,2944.0,2944.0
mean,1.5,76.166101,57.810462,53.45788,6.960666,17.494905
std,0.500085,11.46197,10.381939,10.675181,3.587527,10.063609
min,1.0,41.0,29.0,22.0,0.1,1.0
25%,1.0,69.0,50.0,46.0,4.3,7.0
50%,1.5,78.0,59.0,54.0,6.4,19.0
75%,2.0,85.0,66.0,62.0,9.2,25.0
max,2.0,104.0,83.0,75.0,24.1,36.0


In [9]:
def fill_Tavg(x1,x2,x3):
    if pd.isnull(x3):
        x3=np.ceil((x1+x2)/2)
        return x3
    else:
        return x3
def calc_decadent_normal(x1,x2):
    if pd.isnull(x2):
        return np.nan
    else:
        return int(x1)-int(x2)
    

In [10]:
#filling NAs and dropping unneeded columns. FFill method used for missing values as asssociated columns have very similar values
weather=weather.replace('-', np.nan)
weather=weather.replace('M', np.nan)
median_speed = weather['AvgSpeed'].median()
weather['AvgSpeed'] = weather['AvgSpeed'].fillna(value=median_speed)

col = ['Sunrise', 'WetBulb', 'Sunset', 'Heat', 'Cool','SeaLevel','StnPressure']

for x in col: 
    weather[x] = weather[x].fillna(method='ffill')

weather['PrecipTotal'] = weather['PrecipTotal'].apply(lambda x: 0.00 if str(x).strip()=='T' or pd.isnull(x) else x)
weather['PrecipBool'] = weather['PrecipTotal'].apply(lambda x: 'N' if x=='0.00' else 'Y')
weather['Tavg']=weather.apply(lambda x: fill_Tavg(x['Tmax'], x['Tmin'], x['Tavg']), axis=1)
weather['decadent_normal']=weather.apply(lambda x: calc_decadent_normal(x['Tavg'], x['Depart']), axis=1)
weather['decadent_normal']=weather['decadent_normal'].fillna(method='ffill')
weather['Depart']=weather.apply(lambda x: int(x['Tavg'])-int(x['decadent_normal']), axis=1)

weather=weather.drop(['decadent_normal'], axis=1)
weather=weather.drop(['Water1','Depth', 'SnowFall', 'CodeSum'], axis=1)
train = train.drop(['Address', 'AddressNumberAndStreet','WnvPresent', 'NumMosquitos'], axis = 1)


In [11]:
weather.isnull().sum()

Station        0
Date           0
Tmax           0
Tmin           0
Tavg           0
Depart         0
DewPoint       0
WetBulb        0
Heat           0
Cool           0
Sunrise        0
Sunset         0
PrecipTotal    0
StnPressure    0
SeaLevel       0
ResultSpeed    0
ResultDir      0
AvgSpeed       0
PrecipBool     0
dtype: int64

In [12]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10506 entries, 0 to 10505
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Date             10506 non-null  object 
 1   Species          10506 non-null  object 
 2   Block            10506 non-null  int64  
 3   Street           10506 non-null  object 
 4   Trap             10506 non-null  object 
 5   Latitude         10506 non-null  float64
 6   Longitude        10506 non-null  float64
 7   AddressAccuracy  10506 non-null  int64  
dtypes: float64(2), int64(2), object(4)
memory usage: 656.8+ KB


In [13]:
weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2944 entries, 0 to 2943
Data columns (total 19 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Station      2944 non-null   int64  
 1   Date         2944 non-null   object 
 2   Tmax         2944 non-null   int64  
 3   Tmin         2944 non-null   int64  
 4   Tavg         2944 non-null   object 
 5   Depart       2944 non-null   int64  
 6   DewPoint     2944 non-null   int64  
 7   WetBulb      2944 non-null   object 
 8   Heat         2944 non-null   object 
 9   Cool         2944 non-null   object 
 10  Sunrise      2944 non-null   object 
 11  Sunset       2944 non-null   object 
 12  PrecipTotal  2944 non-null   object 
 13  StnPressure  2944 non-null   object 
 14  SeaLevel     2944 non-null   object 
 15  ResultSpeed  2944 non-null   float64
 16  ResultDir    2944 non-null   int64  
 17  AvgSpeed     2944 non-null   object 
 18  PrecipBool   2944 non-null   object 
dtypes: flo

In [14]:
train = train.merge(weather, on='Date')

In [24]:
def create_month(x):
    return x.split('-')[1]

def create_day(x):
    return x.split('-')[2]

def create_year(x): 
    return x.split('-')[0]

train['month'] = train.Date.apply(create_month)
train['day'] = train.Date.apply(create_day)
train['year']=train.Date.apply(create_year)
train.drop('Date', axis = 1, inplace = True)

In [25]:
dummies = pd.get_dummies(train, columns = ['Species', 'PrecipBool', 'month', 'day', 'year'])
dummies

Unnamed: 0,Block,Street,Trap,Latitude,Longitude,AddressAccuracy,Station,Tmax,Tmin,Tavg,...,day_26,day_27,day_28,day_29,day_30,day_31,year_2007,year_2009,year_2011,year_2013
0,41,N OAK PARK AVE,T002,41.954690,-87.800991,9,1,88,60,74,...,0,0,0,1,0,0,1,0,0,0
1,41,N OAK PARK AVE,T002,41.954690,-87.800991,9,2,88,65,77,...,0,0,0,1,0,0,1,0,0,0
2,41,N OAK PARK AVE,T002,41.954690,-87.800991,9,1,88,60,74,...,0,0,0,1,0,0,1,0,0,0
3,41,N OAK PARK AVE,T002,41.954690,-87.800991,9,2,88,65,77,...,0,0,0,1,0,0,1,0,0,0
4,62,N MANDELL AVE,T007,41.994991,-87.769279,9,1,88,60,74,...,0,0,0,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21007,17,N ASHLAND AVE,T232,41.912563,-87.668055,9,2,75,55,65,...,1,0,0,0,0,0,0,0,0,1
21008,71,N HARLEM AVE,T233,42.009876,-87.807277,9,1,75,50,63,...,1,0,0,0,0,0,0,0,0,1
21009,71,N HARLEM AVE,T233,42.009876,-87.807277,9,2,75,55,65,...,1,0,0,0,0,0,0,0,0,1
21010,42,W 65TH ST,T235,41.776428,-87.627096,8,1,75,50,63,...,1,0,0,0,0,0,0,0,0,1
