In [1]:
import pandas as pd
import numpy as np
import dill
from datetime import timedelta
from csv_pkl_sql import save_it, csv_it, pkl_it

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

## Location data

In [2]:
location_key = pd.read_pickle('../pkl/00_cleaned_city_names.pkl')
location_key.head(1)

Unnamed: 0,location,location_type,country,province,county,city
0,Argentina-Buenos_Aires,province,Argentina,Buenos Aires,,


In [3]:
lat_long = pd.read_pickle('../pkl/01_latitude_longitude_google.pkl')
lat_long.head(1)

Unnamed: 0,location,latitude,longitude
0,Argentina-Buenos_Aires,-34.603684,-58.381559


In [4]:
location_key.shape, lat_long.shape

((1606, 6), (1606, 3))

In [5]:
location = pd.merge(location_key, lat_long, on='location', how='inner')
location.head(1)

Unnamed: 0,location,location_type,country,province,county,city,latitude,longitude
0,Argentina-Buenos_Aires,province,Argentina,Buenos Aires,,,-34.603684,-58.381559


In [6]:
location.shape

(1616, 8)

In [7]:
save_it(location[['location','country']], '11_feature_engineering_location')

## Airport information

In [8]:
airport = pd.read_pickle('../pkl/02_airport_information_fallingrain.pkl')
airport.head(1)

Unnamed: 0,city,FAA,IATA,ICAO,kind,latitude,longitude,max_runway,name,country,state
56,BAHIA BLANCA,,BHI,SAZB,Medium,-38.725,-62.169,8579.0,COMANDANTE ESPORA,Argentina,


In [9]:
airport2 = pd.read_pickle('../pkl/04_merged_latitude_longitude_airport_checkpoint.pkl')
airport2.head(1)

Unnamed: 0,location,latitude,longitude,airport_index,country,name,FAA,IATA,ICAO
0,Argentina-Buenos_Aires,-34.603684,-58.381559,80,Argentina,AEROPARQUE JORGE NEWBERY,,AEP,SABE


Create a dataframe of distance to airport and distance to major airport.x

In [10]:
airport.shape, airport2.shape

((2062, 11), (1606, 9))

In [11]:
airport.kind.unique()

array([u'Medium', u'Large'], dtype=object)

In [12]:
save_it(airport, '11_feature_engineering_airport')

###  <font color='red'>TODO--convert this to an actual distance</font>

In [13]:
# Closest medium or large airport
airport_coords = airport[['latitude', 'longitude']].values[np.newaxis, :]
places_coords = np.rollaxis(lat_long[['latitude','longitude']].values[np.newaxis, :], 0, -1)
dist_coords = ((places_coords - airport_coords)**2).sum(axis=-1)
min_dist = dist_coords.min(axis=1)

airport_distance = lat_long[['location']].copy()
airport_distance['airport_dist_any'] = min_dist

# Closest large airport
airport_coords = airport.loc[airport.kind=='Large', 
                             ['latitude', 'longitude']].values[np.newaxis, :]
places_coords = np.rollaxis(lat_long[['latitude','longitude']].values[np.newaxis, :], 0, -1)
dist_coords = ((places_coords - airport_coords)**2).sum(axis=-1)
min_dist = dist_coords.min(axis=1)

airport_distance['airport_dist_large'] = min_dist

In [14]:
airport_distance.head()

Unnamed: 0,location,airport_dist_any,airport_dist_large
0,Argentina-Buenos_Aires,0.003183,0.071514
1,Argentina-CABA,0.003183,0.071514
2,Argentina-Cordoba,0.009602,43.526915
3,Argentina-Entre_Rios,0.000658,13.126461
4,Argentina-Santa_Fe,0.023428,14.98391


In [15]:
save_it(airport_distance, '11_feature_engineering_airport_distance')

###  <font color='red'>TODO--add flight information, how to adjust it for on-going outbreaks?</font>

## Weather information

In [16]:
weather = pd.read_pickle('../pkl/04_weekly_weather.pkl')
weather.head(2)

Unnamed: 0,date,max_temp,max_temp1,max_temp2,location,mean_temp,mean_temp1,mean_temp2,min_temp,min_temp1,min_temp2,dew_point,dew_point1,dew_point2,precipitation,precipitation1,precipitation2,wind,wind1,wind2
0,2015-11-28,67.0,70.0,68.0,United_States-Florida-Columbia_County,53.0,57.0,56.0,38.0,43.0,44.0,41.0,50.0,47.0,0.0,2.05,2.32,4.0,5.0,6.0
1,2015-12-05,66.0,67.0,70.0,United_States-Florida-Columbia_County,56.0,53.0,57.0,45.0,38.0,43.0,48.0,41.0,50.0,0.5,0.0,2.05,5.0,4.0,5.0


In [17]:
save_it(weather, '11_feature_engineering_weather')

## Mosquito sightings

In [18]:
mosquito = pd.read_pickle('../pkl/05_mosquito_sightings.pkl')
mosquito.head(1)

Unnamed: 0,vector,occurrence_id,source_type,location_type,polygon_admin,latitude,longitude,year,country,country_id,gaul_ad0,status
34478,Aedes albopictus,34479,unpublished,point,-999,22.89,120.44,2006,Taiwan,TWN,886,


###  <font color='red'>TODO--convert this to an inverse sum of distances</font>

In [19]:
# Closest mosquito sighting
mosquito_coords = mosquito[['latitude', 'longitude']].values[np.newaxis, :]
places_coords = np.rollaxis(lat_long[['latitude','longitude']].values[np.newaxis, :], 0, -1)
dist_coords = ((places_coords - mosquito_coords)**2).sum(axis=-1)
min_dist = dist_coords.min(axis=1)

mosquito_distance = lat_long[['location']].copy()
mosquito_distance['mosquito_dist'] = min_dist

In [20]:
mosquito_distance.head()

Unnamed: 0,location,mosquito_dist
0,Argentina-Buenos_Aires,0.008009
1,Argentina-CABA,0.008009
2,Argentina-Cordoba,0.00048
3,Argentina-Entre_Rios,1.244226
4,Argentina-Santa_Fe,0.742703


In [21]:
save_it(mosquito_distance, '11_feature_engineering_mosquito_distance')

## Population density

In [22]:
population = pd.read_pickle('../pkl/06_population_density.pkl')
population.head(1)

Unnamed: 0,location,density_per_km
0,Argentina-Buenos_Aires,12346.605469


In [23]:
save_it(population, '11_feature_engineering_population')

## GDP and PPP GDP

In [24]:
gdp = pd.read_pickle('../pkl/09_GDP_table.pkl')[['country', '2015']]
gdp_ppp = pd.read_pickle('../pkl/09_GDP_PPP_table.pkl')[['country', '2015']]

In [25]:
gdp.head(1)

Unnamed: 0,country,2015
0,United States,17947.0


In [26]:
gdp.rename(columns={'2015':'gdp'}, inplace=True)
gdp_ppp.rename(columns={'2015':'gdp_ppp'}, inplace=True)
gdp = pd.merge(gdp, gdp_ppp, on='country')

In [27]:
save_it(gdp, '11_feature_engineering_gdp')

## Combine dataframes

In [28]:
print weather.shape

model = pd.merge(weather,
                 population,
                 on='location', 
                 how='left')

print model.shape

model = pd.merge(model,
                 airport_distance,
                 on='location',
                 how='left')

print model.shape, model.isnull().sum().max()

model = pd.merge(model,
                 mosquito_distance,
                 on='location',
                 how='left')

print model.dropna().shape

model = pd.merge(model,
         location[['location','country']],
         on='location',
         how='left')

print model.dropna().shape

model = pd.merge(model,
                 gdp,
                 on='country',
                 how='left')

print model.dropna().shape

model.drop(['country'], axis=1, inplace=True)

save_it(model, '11_features_engineered')

(107965, 20)
(108373, 21)
(109189, 23) 102
(110719, 24)
(120511, 25)
(120511, 27)


In [29]:
model.head(1).T

Unnamed: 0,0
date,2015-11-28 00:00:00
max_temp,67
max_temp1,70
max_temp2,68
location,United_States-Florida-Columbia_County
mean_temp,53
mean_temp1,57
mean_temp2,56
min_temp,38
min_temp1,43
