# Preliminary feature engineering and cleaning
This notebook contains preliminary feature engineering and additional data cleaning that was performed for creation of an intermediate MVP. This notebook should be considered obsolete by the later feature engineering notebook, which also uses SQL.

In [2]:
import pandas as pd
import numpy as np
import dill
from datetime import timedelta
from csv_pkl_sql import save_it, csv_it, pkl_it

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

## Location data

In [100]:
with open('../pkl/00_cleaned_city_names.pkl', 'r') as fh:
    location_key = dill.load(fh)
location_key.head(1)

Unnamed: 0,location,location_type,country,province,county,city
0,Argentina-Buenos_Aires,province,Argentina,Buenos Aires,,


In [101]:
with open('../pkl/01_latitude_longitude_google.pkl', 'r') as fh:
    lat_long = dill.load(fh)
lat_long.head(1)

Unnamed: 0,location,latitude,longitude
0,Argentina-Buenos_Aires,-34.603684,-58.381559


In [102]:
location_key.shape, lat_long.shape

((1606, 6), (1606, 3))

In [103]:
location = pd.merge(location_key, lat_long, on='location', how='inner')
location.head(1)

Unnamed: 0,location,location_type,country,province,county,city,latitude,longitude
0,Argentina-Buenos_Aires,province,Argentina,Buenos Aires,,,-34.603684,-58.381559


In [104]:
location.shape

(1616, 8)

## Airport information

In [13]:
with open('../pkl/02_airport_information_fallingrain.pkl', 'r') as fh:
    airport = dill.load(fh)
airport.head(1)

Unnamed: 0,city,FAA,IATA,ICAO,kind,latitude,longitude,max_runway,name,country,state
56,BAHIA BLANCA,,BHI,SAZB,Medium,-38.725,-62.169,8579.0,COMANDANTE ESPORA,Argentina,


In [14]:
with open('../pkl/04_merged_latitude_longitude_airport_checkpoint.pkl', 'r') as fh:
    airport2 = dill.load(fh)
airport2.head(1)

Unnamed: 0,location,latitude,longitude,airport_index,country,name,FAA,IATA,ICAO
0,Argentina-Buenos_Aires,-34.603684,-58.381559,80,Argentina,AEROPARQUE JORGE NEWBERY,,AEP,SABE


Create a dataframe of distance to airport and distance to major airport.

In [15]:
airport.shape, airport2.shape

((2062, 11), (1606, 9))

In [107]:
airport.kind.unique()

array([u'Medium', u'Large'], dtype=object)

# TODO--convert this to an actual distance

In [108]:
# Closest medium or large airport
airport_coords = airport[['latitude', 'longitude']].values[np.newaxis, :]
places_coords = np.rollaxis(lat_long[['latitude','longitude']].values[np.newaxis, :], 0, -1)
dist_coords = ((places_coords - airport_coords)**2).sum(axis=-1)
min_dist = dist_coords.min(axis=1)

airport_distance = lat_long[['location']].copy()
airport_distance['airport_dist_any'] = min_dist

# Closest large airport
airport_coords = airport.loc[airport.kind=='Large', 
                             ['latitude', 'longitude']].values[np.newaxis, :]
places_coords = np.rollaxis(lat_long[['latitude','longitude']].values[np.newaxis, :], 0, -1)
dist_coords = ((places_coords - airport_coords)**2).sum(axis=-1)
min_dist = dist_coords.min(axis=1)

airport_distance['airport_dist_large'] = min_dist

In [109]:
airport_distance.head()

Unnamed: 0,location,airport_dist_any,airport_dist_large
0,Argentina-Buenos_Aires,0.003183,0.071514
1,Argentina-CABA,0.003183,0.071514
2,Argentina-Cordoba,0.009602,43.526915
3,Argentina-Entre_Rios,0.000658,13.126461
4,Argentina-Santa_Fe,0.023428,14.98391


## Weather information

In [11]:
with open('../pkl/04_weekly_weather.pkl', 'r') as fh:
    weather = dill.load(fh)

weather.head(2)

Unnamed: 0,date,max_temp,max_temp1,max_temp2,location,mean_temp,mean_temp1,mean_temp2,min_temp,min_temp1,min_temp2,dew_point,dew_point1,dew_point2,precipitation,precipitation1,precipitation2,wind,wind1,wind2
0,2015-11-28,67.0,70.0,68.0,United_States-Florida-Columbia_County,53.0,57.0,56.0,38.0,43.0,44.0,41.0,50.0,47.0,0.0,2.05,2.32,4.0,5.0,6.0
1,2015-12-05,66.0,67.0,70.0,United_States-Florida-Columbia_County,56.0,53.0,57.0,45.0,38.0,43.0,48.0,41.0,50.0,0.5,0.0,2.05,5.0,4.0,5.0


## Mosquito sightings

In [127]:
with open('../pkl/05_mosquito_sightings.pkl', 'r') as fh:
    mosquito = dill.load(fh)
mosquito.head(1)

Unnamed: 0,vector,occurrence_id,source_type,location_type,polygon_admin,latitude,longitude,year,country,country_id,gaul_ad0,status
34478,Aedes albopictus,34479,unpublished,point,-999,22.89,120.44,2006,Taiwan,TWN,886,


In [128]:
# Closest mosquito sighting
mosquito_coords = mosquito[['latitude', 'longitude']].values[np.newaxis, :]
places_coords = np.rollaxis(lat_long[['latitude','longitude']].values[np.newaxis, :], 0, -1)
dist_coords = ((places_coords - mosquito_coords)**2).sum(axis=-1)
min_dist = dist_coords.min(axis=1)

mosquito_distance = lat_long[['location']].copy()
mosquito_distance['mosquito_dist'] = min_dist

In [129]:
mosquito_distance.head()

Unnamed: 0,location,mosquito_dist
0,Argentina-Buenos_Aires,0.008009
1,Argentina-CABA,0.008009
2,Argentina-Cordoba,0.00048
3,Argentina-Entre_Rios,1.244226
4,Argentina-Santa_Fe,0.742703


## Population density

In [130]:
with open('../pkl/06_population_density.pkl', 'r') as fh:
    population = dill.load(fh)
population.head(1)

Unnamed: 0,location,density_per_km
0,Argentina-Buenos_Aires,12346.605469


## Zika infection

In [4]:
with open('../pkl/03_infection_data_initial_import.pkl', 'r') as fh:
    infection = dill.load(fh)
infection.head(1)

Unnamed: 0,report_date,location,location_type,data_field,data_field_code,value,unit
0,2016-03-19,Argentina-Buenos_Aires,province,cumulative_confirmed_local_cases,AR0001,0,cases


In [5]:
infection = (infection[['report_date', 'location', 'value', 'data_field']]
             .rename(columns={'report_date':'date','value':'zika_cases'}))

In [6]:
for x in infection.zika_cases.iteritems():
    try:
        float(x[1])
    except:
        print(x)

(2414, '125*5')
(2783, '149*5')
(5192, '5*')


In [7]:
infection.loc[2414, 'zika_cases'] = 0
infection.loc[2783, 'zika_cases'] = 0
infection.loc[5192, 'zika_cases'] = 0
infection['zika_cases'] = infection.zika_cases.fillna(0)
infection['zika_cases'] = infection.zika_cases.astype(int)

In [10]:
infection = (infection
             .groupby(['location','date']).sum()
             .reset_index()
            )

infection.sort_values('zika_cases',ascending=False).head(20)

Unnamed: 0,location,date,zika_cases
342,Brazil,2016-05-28,161241
341,Brazil,2016-05-21,148905
339,Brazil,2016-05-07,138108
340,Brazil,2016-05-14,138108
338,Brazil,2016-04-30,127822
337,Brazil,2016-04-23,120161
30828,El_Salvador,2016-04-23,91896
336,Brazil,2016-04-02,91387
32613,Sudeste,2016-05-28,65328
32612,Sudeste,2016-05-21,61309


## Combine dataframes

In [141]:
model = pd.merge(weather_final,
                 infection,
                 on=['date','location'],
                 how='left')

model['zika_cases'] = model.zika_cases.fillna(0)

print model.shape, model.isnull().sum().max()

model = pd.merge(model,
                 population,
                 on='location', 
                 how='left')

print model.shape, model.isnull().sum().max()

model = pd.merge(model,
                 airport_distance,
                 on='location',
                 how='left')

print model.shape, model.isnull().sum().max()

model = pd.merge(model,
                 mosquito_distance,
                 on='location',
                 how='left')

print model.dropna().shape, model.isnull().sum().max()

(107965, 21) 0
(108373, 22) 102
(109189, 24) 102
(110719, 25) 102


In [142]:
save_it(model, '07_feature_engineering_and_cleaning')