In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import urllib.request as ur
import json
import datetime
%matplotlib inline

In [2]:
# cases data per location
df_cases = pd.read_csv('Data/zika_clusters.csv', low_memory=False)
df_cases.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1673 entries, 0 to 1672
Data columns (total 3 columns):
location          1673 non-null object
date              1673 non-null object
zika_confirmed    1673 non-null int64
dtypes: int64(1), object(2)
memory usage: 39.3+ KB


In [3]:
# population data
df_population = pd.read_csv('Data/population_density.csv', low_memory=False)

In [4]:
df_population.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1714 entries, 0 to 1713
Data columns (total 2 columns):
location          1714 non-null object
density_per_km    1714 non-null float64
dtypes: float64(1), object(1)
memory usage: 26.9+ KB


In [5]:
df = pd.merge(df_cases, df_population, on='location', how='left')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1678 entries, 0 to 1677
Data columns (total 4 columns):
location          1678 non-null object
date              1678 non-null object
zika_confirmed    1678 non-null int64
density_per_km    1614 non-null float64
dtypes: float64(1), int64(1), object(2)
memory usage: 65.5+ KB


In [6]:
# latutude and longitude data
df_lat_long = pd.read_csv('Data/latitude_longitude.csv', low_memory=False)
df_lat_long.head()

Unnamed: 0,location,location_type,country,province,county,city,latitude,longitude
0,Argentina-Buenos_Aires,province,Argentina,Buenos Aires,,,-34.603684,-58.381559
1,Argentina-CABA,province,Argentina,Ciudad de Buenos Aires,,,-34.603684,-58.381559
2,Argentina-Cordoba,province,Argentina,Cordoba,,,-31.420083,-64.188776
3,Argentina-Entre_Rios,province,Argentina,Entre Rios,,,-31.774665,-60.495646
4,Argentina-Santa_Fe,province,Argentina,Santa Fe,,,-31.610658,-60.697294


In [7]:
# merge latitude and longitude data
df = pd.merge(df, df_lat_long, on='location', how='left')

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1688 entries, 0 to 1687
Data columns (total 11 columns):
location          1688 non-null object
date              1688 non-null object
zika_confirmed    1688 non-null int64
density_per_km    1624 non-null float64
location_type     1624 non-null object
country           1624 non-null object
province          1576 non-null object
county            1353 non-null object
city              1392 non-null object
latitude          1624 non-null float64
longitude         1624 non-null float64
dtypes: float64(3), int64(1), object(7)
memory usage: 158.2+ KB


In [9]:
# gdp data
df_gdp = pd.read_csv('Data/gdp_info.csv', low_memory=False)
df_gdp

Unnamed: 0,country,2015,2016,2017
0,Argentina,642.5,554.1,637.7
1,Brazil,1799.7,1793.1,2055.0
2,Colombia,291.5,280.0,309.2
3,Dominican Republic,68.2,71.7,75.0
4,Ecuador,99.3,98.6,102.3
5,El Salvador,26.1,26.8,28.0
6,Guatemala,63.8,68.8,75.7
7,Haiti,8.7,8.2,8.6
8,Mexico,1169.6,1076.9,1149.2
9,Nicaragua,12.7,13.2,13.7


In [10]:
# merge gdp 
df = pd.merge(df, df_gdp, on='country', how='left')
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1688 entries, 0 to 1687
Data columns (total 14 columns):
location          1688 non-null object
date              1688 non-null object
zika_confirmed    1688 non-null int64
density_per_km    1624 non-null float64
location_type     1624 non-null object
country           1624 non-null object
province          1576 non-null object
county            1353 non-null object
city              1392 non-null object
latitude          1624 non-null float64
longitude         1624 non-null float64
2015              1624 non-null float64
2016              1624 non-null float64
2017              1624 non-null float64
dtypes: float64(6), int64(1), object(7)
memory usage: 197.8+ KB


In [11]:
# mosquito data
df_mosquito = pd.read_csv('Data/mosquito.csv', low_memory=False)


In [12]:
df_mosquito.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24019 entries, 0 to 24018
Data columns (total 12 columns):
vector           24019 non-null object
occurrence_id    24019 non-null int64
source_type      24019 non-null object
location_type    23769 non-null object
polygon_admin    23368 non-null object
latitude         24019 non-null float64
longitude        24019 non-null float64
year             24019 non-null int64
country          24019 non-null object
country_id       24019 non-null object
gaul_ad0         24019 non-null int64
status           183 non-null object
dtypes: float64(2), int64(3), object(7)
memory usage: 2.2+ MB


In [13]:
# Closest mosquito sighting
mosquito_coords = df_mosquito[['latitude', 'longitude']].values[np.newaxis, :]
places_coords = np.rollaxis(df[['latitude','latitude']].values[np.newaxis, :], 0, -1)
dist_coords = ((places_coords - mosquito_coords)**2).sum(axis=-1)
min_dist = dist_coords.min(axis=1)


In [14]:
# new column mosquito distance
df['mosquito_dist'] = min_dist
df.head()

Unnamed: 0,location,date,zika_confirmed,density_per_km,location_type,country,province,county,city,latitude,longitude,2015,2016,2017,mosquito_dist
0,Argentina-Buenos_Aires,2016-03-19,1,12625.800781,province,Argentina,Buenos Aires,,,-34.603684,-58.381559,642.5,554.1,637.7,193.07096
1,Argentina-CABA,2016-03-19,1,12625.800781,province,Argentina,Ciudad de Buenos Aires,,,-34.603684,-58.381559,642.5,554.1,637.7,193.07096
2,Argentina-Catamarca,2016-03-19,1,460.153595,province,Argentina,Catamarca,,,-28.469581,-65.779544,642.5,554.1,637.7,204.293453
3,Argentina-Chaco,2016-03-19,1,121.33165,province,Argentina,Chaco,,,-27.425717,-59.024378,642.5,554.1,637.7,215.267115
4,Argentina-Chubut,2016-03-19,1,37.095642,province,Argentina,Chubut,,,-43.293425,-65.111482,642.5,554.1,637.7,205.408153


In [15]:
df.info(0)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1688 entries, 0 to 1687
Data columns (total 15 columns):
location          1688 non-null object
date              1688 non-null object
zika_confirmed    1688 non-null int64
density_per_km    1624 non-null float64
location_type     1624 non-null object
country           1624 non-null object
province          1576 non-null object
county            1353 non-null object
city              1392 non-null object
latitude          1624 non-null float64
longitude         1624 non-null float64
2015              1624 non-null float64
2016              1624 non-null float64
2017              1624 non-null float64
mosquito_dist     1624 non-null float64
dtypes: float64(7), int64(1), object(7)
memory usage: 211.0+ KB


In [16]:
# weather data
df_weather = pd.read_csv('Data/weather.csv', low_memory=False)

In [28]:
df_weather.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112644 entries, 0 to 112643
Data columns (total 8 columns):
date             112644 non-null object
location         112644 non-null object
max_temp         112644 non-null float64
mean_temp        112644 non-null float64
min_temp         112644 non-null float64
dew_point        112644 non-null float64
precipitation    112644 non-null float64
wind             112644 non-null float64
dtypes: float64(6), object(2)
memory usage: 6.9+ MB


In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1688 entries, 0 to 1687
Data columns (total 15 columns):
location          1688 non-null object
date              1688 non-null object
zika_confirmed    1688 non-null int64
density_per_km    1624 non-null float64
location_type     1624 non-null object
country           1624 non-null object
province          1576 non-null object
county            1353 non-null object
city              1392 non-null object
latitude          1624 non-null float64
longitude         1624 non-null float64
2015              1624 non-null float64
2016              1624 non-null float64
2017              1624 non-null float64
mosquito_dist     1624 non-null float64
dtypes: float64(7), int64(1), object(7)
memory usage: 211.0+ KB


In [24]:
# make date column the same format in both dataframes to be able to merge
df_weather['date'] = pd.to_datetime(df_weather['date']).astype(str)

In [25]:
df['date'] = pd.to_datetime(df['date']).astype(str)

In [30]:
df = pd.merge(df, df_weather, on=['location','date'], how='left')

In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1688 entries, 0 to 1687
Data columns (total 21 columns):
location          1688 non-null object
date              1688 non-null object
zika_confirmed    1688 non-null int64
density_per_km    1624 non-null float64
location_type     1624 non-null object
country           1624 non-null object
province          1576 non-null object
county            1353 non-null object
city              1392 non-null object
latitude          1624 non-null float64
longitude         1624 non-null float64
2015              1624 non-null float64
2016              1624 non-null float64
2017              1624 non-null float64
mosquito_dist     1624 non-null float64
max_temp          1074 non-null float64
mean_temp         1074 non-null float64
min_temp          1074 non-null float64
dew_point         1074 non-null float64
precipitation     1074 non-null float64
wind              1074 non-null float64
dtypes: float64(13), int64(1), object(7)
memory usage: 290.1+ 

In [32]:
# impute missing data 
df['max_temp'] = df['max_temp'].interpolate(method='linear', limit_direction='both')
df['mean_temp'] = df['mean_temp'].interpolate(method='linear', limit_direction='both')
df['min_temp'] = df['min_temp'].interpolate(method='linear', limit_direction='both')
df['dew_point'] = df['dew_point'].interpolate(method='linear', limit_direction='both')
df['precipitation'] = df['precipitation'].interpolate(method='linear', limit_direction='both')
df['wind'] = df['wind'].interpolate(method='linear', limit_direction='both')

In [33]:
df['gdp'] = df['2016'] 

In [35]:
df = df.drop(['2015','2016','2017'],axis=1)  

In [36]:
df = df.drop(['province','county','city'],axis=1)  

In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1688 entries, 0 to 1687
Data columns (total 16 columns):
location          1688 non-null object
date              1688 non-null object
zika_confirmed    1688 non-null int64
density_per_km    1624 non-null float64
location_type     1624 non-null object
country           1624 non-null object
latitude          1624 non-null float64
longitude         1624 non-null float64
mosquito_dist     1624 non-null float64
max_temp          1688 non-null float64
mean_temp         1688 non-null float64
min_temp          1688 non-null float64
dew_point         1688 non-null float64
precipitation     1688 non-null float64
wind              1688 non-null float64
gdp               1624 non-null float64
dtypes: float64(11), int64(1), object(4)
memory usage: 224.2+ KB


In [38]:
# take not null values for latitude
df = df[df['latitude'].notnull()]

In [39]:
# save csv for future needs
df.to_csv('Data_merged.csv', encoding='utf-8', index=False)