### Header

In [None]:
# import libraries

# maths
import numpy as np
import pandas as pd

# visual
#from matplotlib_venn import venn2
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
plt.rcParams['font.size'] = 14
sns.set_style('whitegrid')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import descartes
import geopandas as gpd
from shapely.geometry import Point, Polygon

# others
import os
import re
import time
import datetime as datetime

In [None]:
# file paths

# Directory prefixes, relative to the notebook's working directory.
# Each ends with '/' so a file name can be appended by string concatenation.
input_path = '../data/2_input/'    # raw inputs (the Chicago boundary shapefile lives under this folder)
clean_path = '../data/3_clean/'    # cleaned CSVs loaded by the "Import Data" cell
output_path = '../data/4_output/'  # not referenced in the visible cells - presumably for generated outputs

image_path = '../images/'          # not referenced in the visible cells - presumably for saved figures

### Functions

### Import Data

In [None]:
# import clean data

# Cleaned tabular data sets.
df_train = pd.read_csv(clean_path + 'train_clean.csv')
df_test = pd.read_csv(clean_path + 'test_clean.csv')
df_weather = pd.read_csv(clean_path + 'weather_clean.csv')
df_spray = pd.read_csv(clean_path + 'spray_clean.csv')

# Chicago boundary shapefile, used as the base layer of the maps.
# Built from input_path so the data root is configured in a single place.
street_map = gpd.read_file(
    input_path + 'Chicago Boundaries/geo_export_d41dc94d-31ce-478c-902c-864695385885.shp')

### EDA

In [None]:
# The target is imbalanced, which will be taken care of before modelling is done.

# value_counts() orders by frequency, not by class label; sort by label so
# each bar is guaranteed to sit above the class it actually counts.
target_counts = df_train['wnvpresent'].value_counts().sort_index()

sns.barplot(x=target_counts.index, y=target_counts.values)
plt.xlabel('Presence of West Nile Virus', fontsize=15)
plt.ylabel('Count', fontsize=15)
plt.tick_params(labelsize=14)
plt.title('Count of target', fontsize=16, y=1.03)
plt.show()

In [None]:
# Highest number of mosquitos caught in Jul, Aug

# Rows = year, columns = month, values = total mosquitos caught.
catch_by_month = df_train.groupby(['year', 'month'])['nummosquitos'].sum().unstack(fill_value=0)

ax = catch_by_month.plot.bar(figsize=(10, 7))
ax.tick_params(axis='x', labelrotation=0, labelsize=14)
ax.tick_params(axis='y', labelsize=14)
ax.set_title('Count of mosquitos by year & month', fontsize=16, y=1.01)
ax.set_ylabel('Count', fontsize=14)
ax.set_xlabel('Year', fontsize=14)
ax.legend(fontsize=14)
plt.show()

In [None]:
# Most common species of mosquitos caught are Culex Pipiens and Culex Restuans
# Proportion of each species are the same except for 2007

# Rows = year, columns = species, values = total mosquitos caught.
catch_by_species = df_train.groupby(['year', 'species'])['nummosquitos'].sum().unstack(fill_value=0)

ax = catch_by_species.plot.bar(figsize=(10, 7))
ax.tick_params(axis='x', labelrotation=0, labelsize=14)
ax.tick_params(axis='y', labelsize=14)
ax.set_title('Count of mosquitos by species', fontsize=16, y=1.01)
ax.set_ylabel('Count', fontsize=14)
ax.set_xlabel('Year', fontsize=14)
ax.legend(fontsize=14)
plt.show()

In [None]:
# TODO: not much insight to draw from this plot - consider removing it

# Rows = (year, month), columns = species, values = total mosquitos caught.
catch_by_period_species = (
    df_train.groupby(['year', 'month', 'species'])['nummosquitos'].sum().unstack(fill_value=0)
)

ax = catch_by_period_species.plot.bar(figsize=(12, 7))
ax.tick_params(axis='x', labelrotation=45, labelsize=14)
ax.tick_params(axis='y', labelsize=14)
ax.set_title('Count of mosquitoes by year, month & species', fontsize=16, y=1.01)
ax.set_ylabel('Count', fontsize=14)
ax.set_xlabel('Year', fontsize=14)
ax.legend(fontsize=14)
plt.show()

In [None]:
# Same data as the previous chart, one panel per year so the
# month/species breakdown is easier to read.
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))
ax = ax.ravel()
fig.suptitle('Number of mosquitoes by year', fontsize=20, y=1.05)

for panel_idx, yr in enumerate(df_train['year'].unique()):
    panel = ax[panel_idx]
    rows_for_year = df_train[df_train['year'] == yr]
    counts = rows_for_year.groupby(['month', 'species'])['nummosquitos'].sum().unstack(fill_value=0)
    counts.plot.bar(ax=panel)
    panel.set_title(yr)
    panel.tick_params(axis='x', labelrotation=0)

fig.tight_layout()

In [None]:
# Highest percentage of virus occurring in Aug, Sep, which lags the peak in number of mosquitos

# Rows = year, columns = month, values = mean of wnvpresent.
virus_rate = df_train.groupby(['year', 'month'])['wnvpresent'].mean().unstack(fill_value=0)

ax = virus_rate.plot.bar(figsize=(10, 7))
ax.tick_params(axis='x', labelrotation=0, labelsize=14)
ax.tick_params(axis='y', labelsize=14)
ax.set_title('Probability of virus by year & month', fontsize=16, y=1.01)
ax.set_ylabel('Probability', fontsize=14)
ax.set_xlabel('Year', fontsize=14)
ax.legend(fontsize=14)
plt.show()

In [None]:
# Highest probability of virus in Aug except for 2011, where highest probability of virus was in Sep
# Higher number of mosquito does not indicate higher probability of virus
fig, (prob_ax, count_ax) = plt.subplots(nrows=1, ncols=2, figsize=(14, 7))

# One line per year, month on the x-axis.
by_month_year = df_train.groupby(['month', 'year'])
by_month_year['wnvpresent'].mean().unstack(fill_value=0).plot(ax=prob_ax)
by_month_year['nummosquitos'].sum().unstack(fill_value=0).plot(ax=count_ax)

# Both panels share the same formatting; only title and y label differ.
panel_specs = [
    (prob_ax, 'Probability of virus by year & month', 'Probability'),
    (count_ax, 'Number of mosquitos by year & month', 'Count'),
]
for panel, panel_title, y_label in panel_specs:
    panel.set_title(panel_title, fontsize=16, y=1.01)
    panel.set_ylabel(y_label, fontsize=14)
    panel.set_xlabel('Month', fontsize=14)
    panel.legend(fontsize=14)
    panel.tick_params(labelsize=14)

fig.subplots_adjust(wspace=0.3)

In [None]:
# Number of records with the virus present, per mosquito species.
plt.figure(figsize=(8,6))

# Pass the column as `x=` explicitly: a bare positional argument is
# interpreted as `data` by newer seaborn releases.
virus_species = df_train.loc[df_train['wnvpresent'] == 1, 'species']
sns.countplot(x=virus_species)

plt.title('Count of mosquito species carrying the virus', fontsize=16, y=1.01)
plt.tick_params(labelsize=12)
plt.xlabel('Species', fontsize=14)
plt.ylabel('Count', fontsize=14)
plt.show()

In [None]:
# Mean of wnvpresent (share of records with the virus present) per species and year.
# NOTE(review): the takeaway for this chart still needs to be written up.

# Rows = year, columns = species.
virus_rate_by_species = df_train.groupby(['year', 'species'])['wnvpresent'].mean().unstack(fill_value=0)

ax = virus_rate_by_species.plot.bar(figsize=(8, 6))
ax.tick_params(axis='x', labelrotation=0, labelsize=14)
ax.tick_params(axis='y', labelsize=14)
ax.set_title('Probability of mosquito species carrying virus', fontsize=16, y=1.01)
ax.set_ylabel('Probability', fontsize=14)
ax.set_xlabel('Year', fontsize=14)
ax.legend(fontsize=14)
plt.show()

In [None]:
# looks like there is an outlier - to remove the graph?

# One point per (year, month): total mosquitos caught vs. share of records
# with the virus present. Aggregate once instead of grouping twice.
monthly_summary = df_train.groupby(['year', 'month']).agg(
    {'nummosquitos': 'sum', 'wnvpresent': 'mean'})

ax = sns.scatterplot(x='nummosquitos', y='wnvpresent', data=monthly_summary)
ax.set_title('Probability of virus vs number of mosquitos (per year & month)', fontsize=16, y=1.01)
ax.set_xlabel('Number of mosquitos', fontsize=14)
ax.set_ylabel('Probability of virus', fontsize=14)
plt.show()

### Weather Data

In [None]:
# Show which years, months and days are covered by the weather data.
for date_part in ('year', 'month', 'day'):
    print(date_part + ':')
    print(df_weather[date_part].unique())
    print('')

In [None]:
# Mean daily max / min / average temperature for every (year, month).
# The aggregation is named by string: passing the np.mean callable triggers
# a FutureWarning on recent pandas releases.
temp_summary_table = pd.pivot_table(
    df_weather,
    values=['tmax', 'tmin', 'tavg'],
    index=['year', 'month'],
    aggfunc='mean',
)
temp_summary_table

- We have weather data for 8 years from 2007 to 2014.
- Only 6 out of 12 months of data are available (late spring, summer, early autumn).

In [None]:
# Mean tmax / tmin / tavg per (year, month).
# Columns are selected with a list: selecting several columns with a bare
# tuple (['tmax','tmin','tavg'] without the inner brackets) was removed in pandas 2.0.
# The result is already a DataFrame, so no extra wrapping is needed.
df_yr_mth = df_weather.groupby(['year', 'month'])[['tmax', 'tmin', 'tavg']].mean()
df_yr_mth

In [None]:
# Split the (year, month) summary into one month-indexed frame per year,
# in the same order as year_list.
year_list = df_weather['year'].unique()
df_list = [df_yr_mth.loc[yr] for yr in year_list]

In [None]:
# One panel per year: monthly mean of tmax / tmin / tavg.
# The grid is sized from the number of years (4 x 2 for eight years).
n_cols = 2
n_rows = (len(df_list) + n_cols - 1) // n_cols
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 6 * n_rows), squeeze=False)

series_colors = [('tmax', 'red'), ('tmin', 'blue'), ('tavg', 'orange')]

for idx, df in enumerate(df_list):
    panel = axes[idx // n_cols, idx % n_cols]

    # Draw first: each line carries a label so the legend has entries, and
    # `marker` (not `markers`, which only applies with `style=`) shows the points.
    for column, color in series_colors:
        sns.lineplot(data=df, x=df.index, y=column, ax=panel,
                     color=color, marker='o', label=column)

    # Decorate after drawing so seaborn's automatic axis labels cannot
    # replace the ones set here.
    panel.set_title(year_list[idx], fontsize=14)
    panel.set_ylim(40, 100)
    panel.set_xlabel('Month', fontsize=14)
    panel.set_ylabel('Temperature', fontsize=14)
    panel.tick_params(labelsize=14)
    panel.legend(fontsize=14)

# Hide any grid cell left over when the number of years is odd.
for spare in axes.ravel()[len(df_list):]:
    spare.set_visible(False)

In [None]:
# Mean of tavg per month, one bar per year (error bars disabled).
fig, ax = plt.subplots(figsize=(20, 8))

sns.barplot(data=df_weather, x='month', y='tavg', hue='year', ci=None, ax=ax)
ax.set_title('Average Temperature (F) by year and month')
ax.set_ylabel('Average Temperature (F)', fontsize=14)
ax.set_xlabel('Month', fontsize=14)
ax.tick_params(labelsize=14)

In [None]:
# clean incorrect sunset time
# example: 1860 -> 1900

# Sunset is stored as an HHMM number. A minutes part of 60 is invalid and
# really means the next full hour; on an HHMM integer that is a shift of +40
# (1860 + 40 = 1900). Doing this vectorised keeps the column numeric instead
# of mixing in strings, and also handles 3-digit values correctly.
sunset_hhmm = df_weather['sunset'].astype(int)
minutes_overflow = (sunset_hhmm % 100) == 60

df_weather['sunset'] = sunset_hhmm.where(~minutes_overflow, sunset_hhmm + 40)

In [None]:
# Parse the HHMM sunrise / sunset readings into timestamps (stored in
# '<column>_2'), then take their difference as the length of daylight.
for clock_column in ('sunrise', 'sunset'):
    df_weather[clock_column + '_2'] = pd.to_datetime(df_weather[clock_column], format='%H%M')

df_weather['daylight'] = df_weather['sunset_2'].sub(df_weather['sunrise_2'])
#df_weather['daylight_2'] = df_weather['daylight'].dt.total_seconds() / 3600

In [None]:
# Daylight length in hours, computed for the whole column at once
# rather than row by row.
df_weather['daylight_2'] = df_weather['daylight'].dt.total_seconds() / 3600

In [None]:
# Mean daylight hours per (year, month), kept as a one-column DataFrame.
df_daylight = df_weather.groupby(['year', 'month'])['daylight_2'].mean().to_frame()

In [None]:
# Mean daylight hours per month, one bar per year.
plt.figure(figsize=(20,8))

sns.barplot(data=df_weather,x='month',y='daylight_2',hue='year')

# Title and axis labels so the figure can be read on its own, matching the
# temperature bar chart.
plt.title('Daylight (hours) by year and month')
plt.ylabel('Daylight (hours)', fontsize=14)
plt.xlabel('Month', fontsize=14)
plt.tick_params(labelsize=14)

In [None]:
# Distribution of average temperature does not follow a normal distribution
# Average temperature peaks in Jul or Aug each year
# Similar distribution and trends observed for both stations

fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(14, 12))
ax = ax.ravel()
fig.suptitle('Average Temperature', fontsize=18)

# One line colour per year in the monthly-trend panels.
year_colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'darkred']

# Grid row k shows station k+1: distribution on the left, monthly trend on the right.
for grid_row, station_id in enumerate([1, 2]):
    dist_ax = ax[2 * grid_row]
    trend_ax = ax[2 * grid_row + 1]
    station_weather = df_weather[df_weather['station'] == station_id]

    sns.distplot(station_weather.tavg, ax=dist_ax)
    monthly_tavg = station_weather.groupby(['month', 'year']).tavg.mean().unstack(fill_value=0)
    monthly_tavg.plot(ax=trend_ax, color=year_colors)

    dist_ax.set_title('Distribution of Average Temperature (Station {})'.format(station_id),
                      fontsize=16, y=1.01)
    dist_ax.set_xlabel('Average Temperature (F)', fontsize=14)
    dist_ax.tick_params(labelsize=14)

    trend_ax.set_title('Monthly Average of Average Temperature (Station {})'.format(station_id),
                       fontsize=16, y=1.01)
    trend_ax.set_xlabel('Month', fontsize=14)
    trend_ax.set_ylabel('Average Temperature (F)', fontsize=14)
    trend_ax.tick_params(labelsize=12)
    trend_ax.legend(fontsize=14)

fig.subplots_adjust(wspace=0.2)

In [None]:
# Distribution of dewpoint temperature does not follow a normal distribution
# Dewpoint temperature peaks in Jul or Aug each year
fig, (dist_ax, trend_ax) = plt.subplots(nrows=1, ncols=2, figsize=(14, 7))

# Left: distribution over all weather rows (both stations).
sns.distplot(df_weather['dewpoint'], ax=dist_ax)

# Right: monthly mean for station 1 only, one line per year.
station_1_weather = df_weather[df_weather['station'] == 1]
monthly_dewpoint = station_1_weather.groupby(['month', 'year'])['dewpoint'].mean().unstack(fill_value=0)
monthly_dewpoint.plot(ax=trend_ax, color=['b', 'g', 'r', 'c', 'm', 'y', 'k', 'darkred'])

dist_ax.set_title('Distribution of Dewpoint Temperature', fontsize=16, y=1.01)
dist_ax.set_xlabel('Dewpoint Temperature (F)', fontsize=14)
dist_ax.tick_params(labelsize=14)

trend_ax.set_title('Monthly Average of Dewpoint Temperature', fontsize=16, y=1.01)
trend_ax.set_xlabel('Month', fontsize=14)
trend_ax.set_ylabel('Average Dewpoint Temperature (F)', fontsize=14)
trend_ax.tick_params(labelsize=12)
trend_ax.legend(fontsize=14)

fig.subplots_adjust(wspace=0.2)

In [None]:
# Distribution of wetbulb temperature does not follow a normal distribution
# Wetbulb temperature peaks in Jul or Aug each year
fig, (dist_ax, trend_ax) = plt.subplots(nrows=1, ncols=2, figsize=(14, 7))

# Left: distribution over all weather rows (both stations).
sns.distplot(df_weather['wetbulb'], ax=dist_ax)

# Right: monthly mean for station 1 only, one line per year.
station_1_weather = df_weather[df_weather['station'] == 1]
monthly_wetbulb = station_1_weather.groupby(['month', 'year'])['wetbulb'].mean().unstack(fill_value=0)
monthly_wetbulb.plot(ax=trend_ax, color=['b', 'g', 'r', 'c', 'm', 'y', 'k', 'darkred'])

dist_ax.set_title('Distribution of Wetbulb Temperature', fontsize=16, y=1.01)
dist_ax.set_xlabel('Wetbulb Temperature (F)', fontsize=14)
dist_ax.tick_params(labelsize=14)

trend_ax.set_title('Monthly Average of Wetbulb Temperature', fontsize=16, y=1.01)
trend_ax.set_xlabel('Month', fontsize=14)
trend_ax.set_ylabel('Average Wetbulb Temperature (F)', fontsize=14)
trend_ax.tick_params(labelsize=12)
trend_ax.legend(fontsize=14)

fig.subplots_adjust(wspace=0.2)

In [None]:
# Distribution of precipitation does not follow a normal distribution
# Monthly average precipitation does not show any trend throughout the years
fig, (dist_ax, trend_ax) = plt.subplots(nrows=1, ncols=2, figsize=(14, 7))

# Left: distribution over all weather rows (both stations).
sns.distplot(df_weather['preciptotal'], ax=dist_ax)

# Right: monthly mean for station 1 only, one line per year.
station_1_weather = df_weather[df_weather['station'] == 1]
monthly_precip = station_1_weather.groupby(['month', 'year'])['preciptotal'].mean().unstack(fill_value=0)
monthly_precip.plot(ax=trend_ax, color=['b', 'g', 'r', 'c', 'm', 'y', 'k', 'darkred'])

dist_ax.set_title('Distribution of Precipitation', fontsize=16, y=1.01)
dist_ax.set_xlabel('Precipitation(Inches)', fontsize=14)
dist_ax.tick_params(labelsize=14)

trend_ax.set_title('Monthly Average Precipitation', fontsize=16, y=1.01)
trend_ax.set_xlabel('Month', fontsize=14)
trend_ax.set_ylabel('Average Precipitation(Inches)', fontsize=14)
trend_ax.tick_params(labelsize=12)
trend_ax.legend(fontsize=14)

fig.subplots_adjust(wspace=0.2)

In [None]:
# Sprayed in Jul, Aug and Sep

# Rows = spray year, columns = spray month, values = number of spray records.
spray_counts = df_spray.groupby(['year_spray', 'month_spray'])['day_spray'].count().unstack(fill_value=0)

ax = spray_counts.plot.bar(figsize=(8, 6))
ax.tick_params(axis='x', labelrotation=0, labelsize=14)
ax.tick_params(axis='y', labelsize=14)
ax.set_title('Count of spray by year and month', fontsize=16, y=1.01)
ax.set_ylabel('Count', fontsize=14)
ax.set_xlabel('Year', fontsize=14)
ax.legend(fontsize=14)
plt.show()

### Visualisation on map

In [None]:
# Coordinate reference system for plain longitude/latitude points (WGS 84).
# Given as an authority string: the {'init': 'epsg:4326'} dict form is
# deprecated by pyproj / geopandas and emits a FutureWarning.
crs = 'EPSG:4326'

# One point per training record, built as (longitude, latitude) = (x, y).
geometry_train = [Point(xy) for xy in zip(df_train['longitude'], df_train['latitude'])]

geo_df = gpd.GeoDataFrame(df_train,
                      crs=crs,
                      geometry=geometry_train)
geo_df.head()

In [None]:
# One point per spray record, built as (longitude, latitude) = (x, y).
spray_coordinates = zip(df_spray['longitude'], df_spray['latitude'])
geometry_spray = [Point(lon, lat) for lon, lat in spray_coordinates]

geo_spray_df = gpd.GeoDataFrame(df_spray, crs=crs, geometry=geometry_spray)
geo_spray_df.head()

In [None]:
#Station 1: CHICAGO O'HARE INTERNATIONAL AIRPORT Lat: 41.995 Lon: -87.933 Elev: 662 ft. above sea level
#Station 2: CHICAGO MIDWAY INTL ARPT Lat: 41.786 Lon: -87.752 Elev: 612 ft. above sea level

# (name, latitude, longitude) for the two weather stations listed above.
station_records = [
    ("O'HARE INTERNATIONAL AIRPORT", 41.995, -87.933),
    ("MIDWAY INTL ARPT", 41.786, -87.752),
]
df_station = pd.DataFrame(station_records, columns=['station', 'latitude', 'longitude'])

# Points are built as (longitude, latitude) = (x, y).
geometry_station = [
    Point(lon, lat) for lon, lat in zip(df_station['longitude'], df_station['latitude'])
]

geo_station_df = gpd.GeoDataFrame(df_station, crs=crs, geometry=geometry_station)
geo_station_df.head()

In [None]:
# Plot location of traps, spray and weather station
# There were spray locations outside Chicago and were filtered out from the plot

fig, ax = plt.subplots(figsize=(11, 11))
street_map.plot(ax=ax, alpha=0.4, color='grey')

# Spray records at latitude >= 42.3 are the ones outside the city area.
spray_in_city = geo_spray_df[geo_spray_df['latitude'] < 42.3]

# Layers are drawn in this order, so later ones sit on top.
point_layers = [
    (spray_in_city, dict(markersize=20, color='yellow', alpha=0.2, label='Spray')),
    (geo_df, dict(markersize=20, color='blue', label='Traps')),
    (geo_station_df, dict(markersize=40, color='orange', label='Station')),
]
for layer, layer_style in point_layers:
    layer.plot(ax=ax, marker='o', **layer_style)

ax.legend(prop={'size': 15})
ax.set_title('Locations of traps, spray and weather station', fontsize=16, y=1.01)
ax.set_xlabel('Longitude', fontsize=14)
ax.set_ylabel('Latitude', fontsize=14)
plt.show()

In [None]:
# all years
# Presence of virus still observed at locations sprayed - spray was not effective/ time sprayed was not correct?

# One map per year found in the training data; the grid is sized from the
# number of years instead of assuming exactly four.
years = df_train['year'].unique()
n_cols = 2
n_rows = (len(years) + n_cols - 1) // n_cols

fig, ax = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(12, 6 * n_rows),
                       sharex=True, sharey=True, squeeze=False)
ax = ax.ravel()
plt.suptitle('Location of virus and spray by year', fontsize=16, y=1.01)

# Spray records at latitude >= 42.3 lie outside the city area and are left out.
mask_remove_outliers = geo_spray_df['latitude'] < 42.3
mask_wnvpresent = geo_df['wnvpresent'] == 1

for i, y in enumerate(years):

    spray_points = geo_spray_df[mask_remove_outliers & (geo_spray_df['year_spray'] == y)]
    virus_points = geo_df[mask_wnvpresent & (geo_df['year'] == y)]

    street_map.plot(ax=ax[i], alpha=0.4, color='grey')

    # A year may have no spray (or no virus) records; plotting an empty
    # GeoDataFrame only produces a warning, so skip empty layers.
    if not spray_points.empty:
        spray_points.plot(ax=ax[i], markersize=20, color='yellow', marker='o', alpha=0.2, label='Spray')
    if not virus_points.empty:
        virus_points.plot(ax=ax[i], markersize=20, color='red', marker='o', label='Virus')

    # Only build a legend when at least one labelled layer was drawn.
    if ax[i].get_legend_handles_labels()[0]:
        ax[i].legend(prop={'size': 15})
    ax[i].set_title(y)
    ax[i].set_xlabel('Longitude', fontsize=14)
    ax[i].set_ylabel('Latitude', fontsize=14)

# Hide grid cells that have no year assigned.
for spare in ax[len(years):]:
    spare.set_visible(False)

plt.tight_layout()
plt.subplots_adjust(wspace=0.2)

In [None]:
# year 2011
# Presence of virus still observed at locations sprayed - spray was not effective/ time sprayed was not correct?

year = 2011

# One map per month present in the training data for this year; the grid is
# sized from the number of months instead of assuming exactly four.
months = df_train.loc[df_train['year'] == year, 'month'].unique()
n_cols = 2
n_rows = max(1, (len(months) + n_cols - 1) // n_cols)

fig, ax = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(12, 6 * n_rows),
                       sharex=True, sharey=True, squeeze=False)
ax = ax.ravel()
plt.suptitle('Year 2011: Location of virus and spray by month', fontsize=16, y=1.01)

# Spray records at latitude >= 42.3 lie outside the city area and are left out.
mask_spray = (geo_spray_df['latitude'] < 42.3) & (geo_spray_df['year_spray'] == year)
mask_virus = (geo_df['wnvpresent'] == 1) & (geo_df['year'] == year)

for i, y in enumerate(months):

    spray_points = geo_spray_df[mask_spray & (geo_spray_df['month_spray'] == y)]
    virus_points = geo_df[mask_virus & (geo_df['month'] == y)]

    street_map.plot(ax=ax[i], alpha=0.4, color='grey')

    # A month may have no spray (or no virus) records; plotting an empty
    # GeoDataFrame only produces a warning, so skip empty layers.
    if not spray_points.empty:
        spray_points.plot(ax=ax[i], markersize=20, color='yellow', marker='o', alpha=0.2, label='Spray')
    if not virus_points.empty:
        virus_points.plot(ax=ax[i], markersize=20, color='red', marker='o', label='Virus')

    # Only build a legend when at least one labelled layer was drawn.
    if ax[i].get_legend_handles_labels()[0]:
        ax[i].legend(prop={'size': 15})
    ax[i].set_title(y)
    ax[i].set_xlabel('Longitude', fontsize=14)
    ax[i].set_ylabel('Latitude', fontsize=14)

# Hide grid cells that have no month assigned.
for spare in ax[len(months):]:
    spare.set_visible(False)

plt.tight_layout()
plt.subplots_adjust(wspace=0.2)

In [None]:
# year 2013
# Presence of virus still observed at locations sprayed - spray was not effective/ time sprayed was not correct?

year = 2013

# One map per month present in the training data for this year; the grid is
# sized from the number of months instead of assuming exactly four.
months = df_train.loc[df_train['year'] == year, 'month'].unique()
n_cols = 2
n_rows = max(1, (len(months) + n_cols - 1) // n_cols)

fig, ax = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(12, 6 * n_rows),
                       sharex=True, sharey=True, squeeze=False)
ax = ax.ravel()
plt.suptitle('Year 2013: Location of virus and spray by month', fontsize=16, y=1.01)

# Spray records at latitude >= 42.3 lie outside the city area and are left out.
mask_spray = (geo_spray_df['latitude'] < 42.3) & (geo_spray_df['year_spray'] == year)
mask_virus = (geo_df['wnvpresent'] == 1) & (geo_df['year'] == year)

for i, y in enumerate(months):

    spray_points = geo_spray_df[mask_spray & (geo_spray_df['month_spray'] == y)]
    virus_points = geo_df[mask_virus & (geo_df['month'] == y)]

    street_map.plot(ax=ax[i], alpha=0.4, color='grey')

    # A month may have no spray (or no virus) records; plotting an empty
    # GeoDataFrame only produces a warning, so skip empty layers.
    if not spray_points.empty:
        spray_points.plot(ax=ax[i], markersize=20, color='yellow', marker='o', alpha=0.2, label='Spray')
    if not virus_points.empty:
        virus_points.plot(ax=ax[i], markersize=20, color='red', marker='o', label='Virus')

    # Only build a legend when at least one labelled layer was drawn.
    if ax[i].get_legend_handles_labels()[0]:
        ax[i].legend(prop={'size': 15})
    ax[i].set_title(y)
    ax[i].set_xlabel('Longitude', fontsize=14)
    ax[i].set_ylabel('Latitude', fontsize=14)

# Hide grid cells that have no month assigned.
for spare in ax[len(months):]:
    spare.set_visible(False)

plt.tight_layout()
plt.subplots_adjust(wspace=0.2)

In [None]:
# Attach station-1 weather readings to every training record by calendar date.
# A left join keeps all training rows even when no weather row matches.
is_station_1 = df_weather['station'] == 1
only_station_1 = df_weather.loc[is_station_1]
combined = pd.merge(df_train, only_station_1, how='left', on=['year', 'month', 'day'])

In [None]:
# Preview the first rows of the merged train + weather frame.
combined.head(n=5)

In [None]:
# Correlation heatmap of the merged train + weather frame.
plt.figure(figsize=(8,8))

# Restrict to numeric / boolean columns before correlating: since pandas 2.0
# DataFrame.corr() raises on non-numeric columns instead of silently dropping them.
numeric_columns = combined.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_columns.corr())

In [None]:
def combined_date(row):
    """Build a datetime (at midnight) from a row's year, month and day parts.

    Parameters
    ----------
    row : object with ``year``, ``month`` and ``day`` attributes
        Typically a DataFrame row passed by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    datetime.datetime

    Raises
    ------
    ValueError
        If the parts do not form a valid calendar date.
    """
    # Cast to int first: when a frame holds only numeric columns,
    # apply(axis=1) hands over float rows (2007.0, 5.0, 29.0), which cannot be
    # parsed once formatted into a 'Y-m-d' string.
    return datetime.datetime(int(row.year), int(row.month), int(row.day))

In [None]:
# Assemble the date column from the year / month / day parts in one
# vectorised call instead of parsing a string for every row.
df_train['date'] = pd.to_datetime(df_train[['year', 'month', 'day']])
df_train.head()

In [None]:
# Assemble the date column from the year / month / day parts in one
# vectorised call instead of parsing a string for every row.
df_weather['date'] = pd.to_datetime(df_weather[['year', 'month', 'day']])
df_weather.head()

In [None]:
# Daily average temperature (left axis) against the number of mosquitos
# caught per day (right axis) for a single year.
year = 2007

fig, ax1 = plt.subplots(figsize=(12,6))

# Filter each frame once and reuse the result.
mask_year = df_weather['year'] == year
weather_year = df_weather[mask_year]

sns.lineplot(x=weather_year.date, y=weather_year.tavg, ci=None, ax=ax1)
ax1.set_ylabel('Average Temperature (F)', color='blue', fontsize=14)
ax1.set_xlabel('Date', fontsize=14)
ax1.tick_params(labelsize=12)

# Total mosquitos caught on each date of the year.
mosquito_day = df_train[df_train['year'] == year].groupby(['date']).nummosquitos.sum()

# Second y-axis sharing the same date axis.
ax2 = ax1.twinx()
sns.lineplot(x=mosquito_day.index, y=mosquito_day,
             ci=None, ax=ax2, color='r')
ax2.set_ylabel('Number of mosquitos', color='r', fontsize=14)
ax2.tick_params(labelsize=12)

plt.title('Average Temperature and Number of Mosquitos ({})'.format(year), fontsize=16)
fig.tight_layout()
plt.show()

In [None]:
# Inspect the weather rows recorded for 2009.
df_weather.loc[df_weather['year'] == 2009]

In [None]:
# Daily average temperature (left axis) against the number of mosquitos
# caught per day (right axis) for a single year.
year = 2009

fig, ax1 = plt.subplots(figsize=(12,6))

# Filter each frame once and reuse the result.
mask_year = df_weather['year'] == year
weather_year = df_weather[mask_year]

sns.lineplot(x=weather_year.date, y=weather_year.tavg, ci=None, ax=ax1)
ax1.set_ylabel('Average Temperature (F)', color='blue', fontsize=14)
ax1.set_xlabel('Date', fontsize=14)
ax1.tick_params(labelsize=12)

# Total mosquitos caught on each date of the year.
mosquito_day = df_train[df_train['year'] == year].groupby(['date']).nummosquitos.sum()

# Second y-axis sharing the same date axis.
ax2 = ax1.twinx()
sns.lineplot(x=mosquito_day.index, y=mosquito_day,
             ci=None, ax=ax2, color='r')
ax2.set_ylabel('Number of mosquitos', color='r', fontsize=14)
ax2.tick_params(labelsize=12)

plt.title('Average Temperature and Number of Mosquitos ({})'.format(year), fontsize=16)
fig.tight_layout()
plt.show()

In [None]:
# Daily average temperature (left axis) against the number of mosquitos
# caught per day (right axis) for a single year.
year = 2011

fig, ax1 = plt.subplots(figsize=(12,6))

# Filter each frame once and reuse the result.
mask_year = df_weather['year'] == year
weather_year = df_weather[mask_year]

sns.lineplot(x=weather_year.date, y=weather_year.tavg, ci=None, ax=ax1)
ax1.set_ylabel('Average Temperature (F)', color='blue', fontsize=14)
ax1.set_xlabel('Date', fontsize=14)
ax1.tick_params(labelsize=12)

# Total mosquitos caught on each date of the year.
mosquito_day = df_train[df_train['year'] == year].groupby(['date']).nummosquitos.sum()

# Second y-axis sharing the same date axis.
ax2 = ax1.twinx()
sns.lineplot(x=mosquito_day.index, y=mosquito_day,
             ci=None, ax=ax2, color='r')
ax2.set_ylabel('Number of mosquitos', color='r', fontsize=14)
ax2.tick_params(labelsize=12)

plt.title('Average Temperature and Number of Mosquitos ({})'.format(year), fontsize=16)
fig.tight_layout()
plt.show()

In [None]:
# 2013: daily average temperature (left axis) against the number of
# mosquitos caught per day (right axis).
fig, ax1 = plt.subplots(figsize=(12, 6))

mask_year = df_weather['year'] == 2013
weather_2013 = df_weather[mask_year]

sns.lineplot(x=weather_2013.date, y=weather_2013.tavg, ci=None, ax=ax1)
ax1.set_ylabel('Average Temperature (F)', color='blue', fontsize=14)
ax1.set_xlabel('Date', fontsize=14)
ax1.tick_params(labelsize=12)

# Total mosquitos caught on each date, as a one-column frame.
train_2013 = df_train[df_train['year'] == 2013]
mosquito_day = train_2013.groupby(['date'])['nummosquitos'].sum()
df_mosquito_day = mosquito_day.to_frame()

# Second y-axis sharing the same date axis.
ax2 = ax1.twinx()
sns.lineplot(x=df_mosquito_day.index, y=df_mosquito_day['nummosquitos'],
             ci=None, ax=ax2, color='r')
ax2.set_ylabel('Number of mosquitos', color='r', fontsize=14)
ax2.tick_params(labelsize=12)

plt.title('Average Temperature and Number of Mosquitos (2013)', fontsize=16)
fig.tight_layout()
plt.show()

In [None]:
# sns.lineplot(x=df_train[df_train['year'] == 2013].date, y=df_train[df_train['year'] == 2013].wnvpresent, 
#              ci=None, color='r')

In [None]:
# 2013: daily average temperature (left axis) against the number of
# virus-positive records per day (right axis).
fig, ax1 = plt.subplots(figsize=(12, 6))

mask_year = df_weather['year'] == 2013
weather_2013 = df_weather[mask_year]

sns.lineplot(x=weather_2013.date, y=weather_2013.tavg, ci=None, ax=ax1)
ax1.set_ylabel('Average Temperature (F)', color='blue', fontsize=14)
ax1.set_xlabel('Date', fontsize=14)
ax1.tick_params(labelsize=12)

# Sum of wnvpresent on each date, as a one-column frame.
train_2013 = df_train[df_train['year'] == 2013]
wnv_day = train_2013.groupby(['date'])['wnvpresent'].sum()
df_wmn_day = wnv_day.to_frame()

# Second y-axis sharing the same date axis.
ax2 = ax1.twinx()
sns.lineplot(x=df_wmn_day.index, y=df_wmn_day['wnvpresent'],
             ci=None, ax=ax2, color='r')
ax2.set_ylabel('Number of virus', color='r', fontsize=14)
ax2.tick_params(labelsize=12)

plt.title('Average Temperature and Number of Virus (2013)', fontsize=16)
fig.tight_layout()
plt.show()

In [None]:
# Assemble the date column from the year / month / day merge keys in one
# vectorised call instead of parsing a string for every row.
combined['date'] = pd.to_datetime(combined[['year', 'month', 'day']])
combined.head()

In [None]:
# combined.date = combined.date.map(lambda x:datetime.datetime.strptime(x, '%Y-%m-%d'))

In [None]:
# combined[combined['year'] == 2007].tavg

In [None]:
# fig, ax1 = plt.subplots(figsize=(8,6))

# color = 'tab:red'
# ax1.set_xlabel('time (s)')
# ax1.set_ylabel('Number of mosquitos', color=color)
# ax1.plot(combined[combined['year'] == 2007].date, combined[combined['year'] == 2007].nummosquitos, color=color)
# ax1.tick_params(axis='y', labelcolor=color)

# ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis

# color = 'tab:blue'
# ax2.set_ylabel('Temperature(F)', color=color)  # we already handled the x-label with ax1
# ax2.plot(combined[combined['year'] == 2007].date, combined[combined['year'] == 2007].tavg, color=color)
# ax2.tick_params(axis='y', labelcolor=color)

# fig.tight_layout()  # otherwise the right y-label is slightly clipped
# plt.show()