# Load and clean Zika infection data
Although the locations of Zika outbreaks were imported previously, the infection data themselves were never imported. This notebook loads the raw case reports and cleans them.

In [57]:
import pandas as pd
import numpy as np
import re
import time
from glob import glob
from csv_pkl_sql import save_it

import matplotlib.pyplot as plt

%matplotlib inline

## Import infection data

In [73]:
# Locate every report CSV that sits in a `data` folder two levels below ../zika.
# NOTE: glob returns paths in arbitrary order, so the integer row labels
# created by reset_index below depend on the filesystem's listing order.
data_file_locations = glob('../zika/*/*/data/*.csv')

# Read each file and stack them row-wise under a fresh 0..n-1 index.
frames = [pd.read_csv(csv_path) for csv_path in data_file_locations]
data = pd.concat(frames, axis=0).reset_index(drop=True)

# Discard the two time-period columns.
data = data.drop(['time_period', 'time_period_type'], axis=1)

# Swap underscores for hyphens in the date strings, then parse them as datetimes.
data['report_date'] = pd.to_datetime(data.report_date.str.replace('_', '-'))

# Checkpoint of the initial import (currently disabled).
#save_it(data, '03_infection_data_initial_import')

In [74]:
# Show the first row to check the columns of the combined frame.
data.head(1)

Unnamed: 0,report_date,location,location_type,data_field,data_field_code,value,unit
0,2016-03-19,Argentina-Buenos_Aires,province,cumulative_confirmed_local_cases,AR0001,0,cases


In [76]:
# (rows, columns) of the combined frame.
data.shape

(106847, 7)

## Clean the data

In [None]:
# Keep only rows whose unit is something other than 'municipalities'
# (rows with a missing unit are kept as well).
data = data[data['unit'].ne('municipalities')]

In [77]:
# Narrow the frame to the four columns used from here on ...
data = data.loc[:, ['report_date', 'location', 'value', 'data_field']]
# ... and rename report_date -> date, value -> zika_cases.
data = data.rename(columns={'report_date': 'date', 'value': 'zika_cases'})

In [78]:
# Print every (row label, value) pair whose value cannot be read as a number,
# so the offending entries can be inspected before the column is converted.
# Series.items() replaces iteritems(), which was removed in pandas 2.0.
for label, value in data.zika_cases.items():
    try:
        float(value)
    except (TypeError, ValueError):
        # Only conversion failures are reported; a bare `except` would also
        # swallow KeyboardInterrupt and hide unrelated errors.
        print((label, value))

(2414, '125*5')
(2783, '149*5')
(5192, '5*')


In [79]:
# Convert zika_cases to integers.
# Values that cannot be parsed as numbers (e.g. '125*5', '5*') and missing
# values both become 0. Coercing by value instead of assigning to hard-coded
# row labels keeps the cell correct when the file order - and therefore the
# row labels - changes, and avoids .loc silently appending a row when a label
# is absent from the frame.
data['zika_cases'] = (pd.to_numeric(data.zika_cases, errors='coerce')
                        .fillna(0)
                        .astype(int))

In [80]:
# Shape of the rows reporting at least one case, next to the shape of the whole frame.
data[data['zika_cases'] > 0].shape, data.shape

((36407, 4), (106847, 4))

In [81]:
# Data fields that don't appear to be directly associated with Zika.
# Rows carrying one of them are flagged in `mask`.
excluded_fields = [
    'cumulative_cases_discarded',
    'microcephaly_not',
    'gbs_reported',
    'zika_not',
    'confirmed_acute_fever',
    'confirmed_arthralgia',
    'confirmed_arthritis',
    'confirmed_rash',
    'confirmed_conjunctivitis',
    'confirmed_eyepain',
    'confirmed_headache',
    'confirmed_malaise',
    'zika_reported_travel',
    'yearly_reported_travel_cases',
]

mask = data['data_field'].isin(excluded_fields)

# Flagged row count, case total within the flagged rows, overall case total.
print(mask.sum(), data.zika_cases[mask].sum(), data['zika_cases'].sum())

(3217, 61606, 5195700)


In [82]:
# Keep only the rows that were not flagged by `mask`.
data = data.loc[~mask]

In [83]:
# Remaining row count and the case total after filtering.
len(data), data['zika_cases'].sum()

(103630, 5134094)

In [86]:
# Show the dtype of each column.
data.dtypes

date          datetime64[ns]
location              object
zika_cases             int64
data_field            object
dtype: object

In [88]:
# Hand the cleaned frame to save_it (imported from csv_pkl_sql) under the
# label '03_infection_data_final'.
# NOTE(review): presumably this writes the frame to disk; the output
# format(s) and location are not visible here - confirm.
save_it(data, '03_infection_data_final')