In [13]:
import pandas as pd
import numpy as np
from glob import glob
from csv_pkl_sql import save_it

# Turn off pandas' chained-assignment check (SettingWithCopyWarning) for the
# whole notebook.
# NOTE(review): this only hides the warning; assignments made on a slice of
# another DataFrame are still ambiguous. Prefer explicit `.copy()` / `.loc`.
pd.options.mode.chained_assignment = None

In [14]:
# Collect every "*Places.csv" file and stack them into one table.
# glob returns matches in arbitrary (filesystem-dependent) order; sorting the
# paths makes the row order -- and therefore the index produced by
# reset_index -- reproducible from one machine/run to the next.
location_files = sorted(glob('../zika/*/*Places.csv'))
locations = pd.concat([pd.read_csv(path) for path in location_files],
                      axis=0).reset_index(drop=True)

In [15]:
data_file_locations = glob('../zika/*/*/data/*.csv')

# Read only the second column of each data file (usecols=[1]; later code
# accesses it as `location`), de-duplicate per file to keep the pieces small,
# then de-duplicate again across all files.
per_file_locations = [pd.read_csv(path, usecols=[1]).drop_duplicates()
                      for path in data_file_locations]
data_locations = pd.concat(per_file_locations, axis=0)
data_locations = data_locations.drop_duplicates().reset_index(drop=True)

In [16]:
# Keep only the locations that appear in at least one data file
known_locations = data_locations['location']
mask = locations['location'].isin(known_locations)
locations = locations.loc[mask]

In [17]:
# Exclude country-, region- and district-level rows.
# District data will be difficult to incorporate into the model, so drop for now.
excluded_types = ['country', 'region', 'district']
mask = ~locations['location_type'].isin(excluded_types)
locations = locations[mask]

In [18]:
# Discard columns that contain no values at all
locations = locations.dropna(axis='columns', how='all')

In [19]:
# Build the lookup table that maps each raw location string to its parts.
# Take an explicit copy: the column selection is a slice of `locations`, and
# adding columns to such a slice is exactly the chained-assignment hazard that
# pandas warns about (SettingWithCopyWarning).
location_key = locations[['location', 'location_type']].copy()

# Raw locations are hyphen-separated, e.g. 'United_States-Florida-Walton_County'.
# Splitting with expand=True yields one column per part (missing parts are
# empty), which are stored positionally as country / province / county / city.
location_parts = location_key['location'].str.split('-', expand=True)
location_key[['country', 'province', 'county', 'city']] = location_parts

In [20]:
# Aliases for `location_type` values, collapsed onto the column names used in
# the location key. Built once rather than on every call.
LOCATION_TYPE_MAP = {'state': 'province',
                     'municipality': 'city',
                     'department': 'province',
                     'Region': 'province',
                     'Collectivity': 'province',
                     'territory': 'province'}


def map_locations(x):
    """Normalize a raw location type to 'province' or 'city' where an alias exists.

    Parameters
    ----------
    x : hashable
        A raw `location_type` value (matching is case-sensitive).

    Returns
    -------
    The mapped type if `x` is a known alias, otherwise `x` unchanged.
    """
    # dict.get with the input as default replaces the membership test + lookup.
    return LOCATION_TYPE_MAP.get(x, x)
    
# Normalize every location type through map_locations (element-wise)
location_key['location_type'] = location_key['location_type'].apply(map_locations)

In [21]:
# Fix the US Virgin Islands entries.
# Raw locations look like 'United_States_Virgin_Islands' or
# 'United_States_Virgin_Islands-Saint_Thomas', so after splitting on '-' the
# territory name sits in `country` and the island name sits in `province`.

# Island-level rows: move the island name into `county`, then fill in the
# proper province and country.
mask = ( location_key.county.isnull() &
         (location_key.location_type=='county') &
         (location_key.country=='United_States_Virgin_Islands')
        )

location_key.loc[mask, 'county'] = location_key.loc[mask, 'province']
location_key.loc[mask, 'province'] = 'Virgin Islands'
location_key.loc[mask, 'country'] = 'United States'


# Territory-level row. The country condition is required here: without it,
# every province-typed row that has no `province` value (any single-part
# location) would be relabelled as 'Virgin Islands' / 'United States'.
mask = ( location_key.province.isnull() &
         (location_key.location_type=='province') &
         (location_key.country=='United_States_Virgin_Islands'))
location_key.loc[mask, 'province'] = 'Virgin Islands'
location_key.loc[mask, 'country'] = 'United States'


# Alternate spelling of the territory nested under United_States
mask = (location_key.location=='United_States-US_Virgin_Islands')
location_key.loc[mask, 'province'] = 'Virgin Islands'

In [22]:
# Fix remaining counties (mainly in Ecuador and Panama):
# county-typed rows with an empty `county` have their name in `province`,
# so shift it over and clear `province`.
is_county_row = location_key['location_type'] == 'county'
county_missing = location_key['county'].isnull()
mask = county_missing & is_county_row

province_values = location_key.loc[mask, 'province']
location_key.loc[mask, 'county'] = province_values
location_key.loc[mask, 'province'] = None

In [23]:
# Move cities to correct column:
# city-typed rows with an empty `city` take the value held in `county`,
# and `county` is cleared afterwards.
is_city_row = location_key['location_type'] == 'city'
city_missing = location_key['city'].isnull()
mask = city_missing & is_city_row

county_values = location_key.loc[mask, 'county']
location_key.loc[mask, 'city'] = county_values
location_key.loc[mask, 'county'] = None

In [24]:
# More fixes for cities:
# city-typed rows whose `city` is empty at this point take the value held in
# `province`, and `province` is cleared afterwards.
is_city_row = location_key['location_type'] == 'city'
city_missing = location_key['city'].isnull()
mask = city_missing & is_city_row

province_values = location_key.loc[mask, 'province']
location_key.loc[mask, 'city'] = province_values
location_key.loc[mask, 'province'] = None

In [25]:
# Drop rows whose city is a placeholder rather than a real name
placeholder_city = location_key['city'].isin(['Unknown', 'Not_Reported'])
location_key = location_key.loc[~placeholder_city]

In [26]:
# Fix for Dade County Florida: the hyphen inside 'Miami-Dade' is treated as a
# separator by the location split, so set the county and city explicitly.
mask = location_key['location'].eq('United_States-Florida-Miami-Dade_County')
location_key.loc[mask, 'city'] = 'Miami'
location_key.loc[mask, 'county'] = 'Dade_County'

In [27]:
# Replace abbreviated Argentine province names (Santiago Del Estero and
# Ciudad de Buenos Aires) with their full names
argentina_province_names = {
    'Argentina-Sgo_Del_Estero': 'Santiago Del Estero',
    'Argentina-CABA': 'Ciudad de Buenos Aires',
}
for raw_location, province_name in argentina_province_names.items():
    location_key.loc[location_key['location'] == raw_location, 'province'] = province_name

In [28]:
# Strip the '_County' text from county names
county_names = location_key['county'].str.replace('_County', '')
location_key['county'] = county_names

# Drop rows whose county is a placeholder rather than a real name
placeholder_county = location_key['county'].isin(['Unknown', 'Not_Reported'])
location_key = location_key[~placeholder_county]

In [29]:
# Replace underscores with spaces in every location-part column
name_columns = ('country', 'province', 'county', 'city')
for column_name in name_columns:
    cleaned = location_key[column_name].str.replace('_', ' ')
    location_key[column_name] = cleaned

In [44]:
# Manual inspection helper: show one "page" of the cleaned table.
# `nsize` is the number of rows per page, `i` the zero-based page number.
i = 32
nsize = 50
location_key.iloc[slice(i * nsize, (i + 1) * nsize)]

Unnamed: 0,location,location_type,country,province,county,city
1744,United_States-Florida-Walton_County,county,United States,Florida,Walton,
1745,United_States-Florida-Washington_County,county,United States,Florida,Washington,
1746,United_States_Virgin_Islands,province,United States,Virgin Islands,,
1747,United_States_Virgin_Islands-Saint_Thomas,county,United States,Virgin Islands,Saint Thomas,
1748,United_States_Virgin_Islands-Saint_Croix,county,United States,Virgin Islands,Saint Croix,
1749,United_States_Virgin_Islands-Saint_John,county,United States,Virgin Islands,Saint John,


In [45]:
# Persist the cleaned location key under the name '00_cleaned_city_names'.
# NOTE(review): `save_it` is imported from the project-local `csv_pkl_sql`
# module; the output format(s) and destination are not visible in this
# notebook -- confirm against that module.
save_it(location_key, '00_cleaned_city_names')