In [None]:
import json
from collections import defaultdict

import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.geometry import mapping
from jenkspy import jenks_breaks
from unidecode import unidecode

from covidcaremap.data import external_data_path, processed_data_path
from covidcaremap.ihme import IHME

Configuration that will be used in the visualization

In [None]:
# Settings written out for the visualization (see the 'ihme-config.json' dump
# at the end of the notebook). Entries that are None here are placeholders
# that later cells fill in.
ihme_config = {
    'dates': None, # Calculated below
    'ihme_version': None, # Calculated below
    'aggregations': {
        'country': {
            # Country values are normalized to "per 1,000,000 people".
            'per_capita_base': 1000000,
            'breaks': { 'totals': None, 'per_capita': None } # Calculated below
        },
        'region': {
            # Region values are normalized to "per 100,000 people".
            'per_capita_base': 100000,
            'breaks': { 'totals': None, 'per_capita': None } # Calculated below
        }
    }
}

Read in a range of country/region boundaries to match up to IHME location names.

In [None]:
# Boundaries shipped with the project's external data.
state_gdf = gpd.read_file(external_data_path('us_states.geojson'), encoding='utf-8')
countries_gdf = gpd.read_file(external_data_path('admin0.geojson'))
# The Spain and Italy boundaries are fetched over the network from the
# `master` branch of third-party repositories, so this cell needs internet
# access and its inputs are not pinned to a fixed revision.
spain_gdf = gpd.read_file('https://raw.githubusercontent.com/deldersveld/'
                          'topojson/master/countries/spain/spain-comunidad.json')
canary_islands_gdf = gpd.read_file('https://raw.githubusercontent.com/deldersveld/'
                                   'topojson/master/countries/spain/canary-islands-province.json')
italy_gdf = gpd.read_file('https://raw.githubusercontent.com/openpolis/'
                          'geojson-italy/master/geojson/limits_IT_regions.geojson')
admin1_gdf = gpd.read_file(external_data_path('admin1.geojson'))


Read population data

In [None]:
# US state populations come from a processed project file; the remaining
# populations come from WorldPop CSV extracts (country, region and admin1).
state_pop_gdf = gpd.read_file(processed_data_path('us_states_with_pop.geojson'))
worldpop_country_df = pd.read_csv(external_data_path('worldpop-country-pop-for-ihme-2020.csv'))
worldpop_region_df = pd.read_csv(external_data_path('worldpop-region-pop-for-ihme-2020.csv'))
worldpop_admin1_df = pd.read_csv(external_data_path('worldpop-admin1-2020.csv'))

Process Admin0 and Admin1 worldpop data to support new countries and regions added by IHME.

In [None]:
# Admin1 populations, re-keyed from `adm1_code` to the admin1 boundary name.
worldpop_admin1_processed_df = (
    worldpop_admin1_df[['adm1_code', 'population']]
    .merge(admin1_gdf[['adm1_code', 'name']], on='adm1_code')
    .drop(columns=['adm1_code'])
    .rename(columns={'name': 'location_name'})
)

# Country populations derived by summing the admin1 populations per country
# code and labelling each total with the country's name.
# NOTE(review): names come from the `NAME` column here, while country
# geometries are keyed by `ADMIN` further down -- confirm the two columns
# agree for every country that relies on this table.
worldpop_admin0_processed_df = (
    worldpop_admin1_df.groupby('adm0_a3')['population'].sum()
    .to_frame()
    .merge(countries_gdf[['ADM0_A3', 'NAME']], left_on='adm0_a3', right_on='ADM0_A3')
    .drop(columns=['ADM0_A3'])
    .rename(columns={'NAME': 'location_name'})
)

# Keep only the countries that the country-level WorldPop file does not
# already cover.
country_level_pops = set(worldpop_country_df['location_name'])
worldpop_admin0_processed_df = worldpop_admin0_processed_df.loc[
    lambda frame: ~frame['location_name'].isin(country_level_pops)
]

Get the latest IHME projections.

In [None]:
# Load the newest IHME projections together with their model version, and
# record the version in the config that is written out for the visualization.
ihme_df, ihme_version = IHME.get_latest(include_version=True)
ihme_config['ihme_version'] = ihme_version
print('\n\nMODEL VERSION: {}\n\n'.format(ihme_version))

Rename columns between IHME and the geospatial datasets so that they can be matched up.

In [None]:
# Shorten the bilingual Aosta Valley name in the Italy boundaries for
# better matching.
italy_gdf = italy_gdf.replace({
    "Valle d'Aosta/Vallée d'Aoste": "Valle d'Aosta"
})

# Map IHME spellings onto the spellings used by the boundary datasets.
# `DataFrame.replace` matches whole cell values in every column.
ihme_df = ihme_df.replace({
    # Spain regions
    "Andalucia": "Andalucía",
    "Aragon": "Aragón",
    "Castile and Leon": "Castilla y León",
    "Castile and León": "Castilla y León",
    "Catalonia": "Cataluña",
    "Basque Country": "País Vasco",
    "Canary Islands": "Islas Canarias",
    "Valencian Community": "Comunidad Valenciana",

    # Italy
    "Provincia autonoma di Bolzano": "Bozen",
    "Provincia autonoma di Trento": "Trento",

    # Germany
    "Baden-Wurttemberg": "Baden-Württemberg",

    # Canada
    "Quebec": "Québec",

    # Mexico
    "State of Mexico": "México",
    "Mexico_two": "México",  # This changed in model version 2020_06_08
    "Veracruz de Ignacio de la Llave": "Veracruz",
    "Michoacan de Ocampo": "Michoacán",
    "Michoacán de Ocampo": "Michoacán",

    # Brazil
    "Ceara": "Ceará",
    "Maranhao": "Maranhão",
    "Sao Paulo": "São Paulo",
    "Goias": "Goiás",
    "Amapa": "Amapá",
    "Paraiba": "Paraíba",
    "Espirito Santo": "Espírito Santo",
    "Parana": "Paraná",  # presumably the Brazilian state; was listed under countries

    # Countries
    "Republic of Korea": "South Korea",
    "Republic of Moldova": "Moldova",
    "Mexico_country": "Mexico",
    "Bolivia (Plurinational State of)": "Bolivia",
    "Russian Federation": "Russia",
    "Serbia": "Republic of Serbia",
})

Ensure we'll capture all the regions we want to match.

In [None]:
# Every name that any of the boundary datasets can supply.
available_regions = (
    set(state_gdf['NAME'].values) |
    set(countries_gdf['ADMIN'].values) |
    set(admin1_gdf['name_en'].values) |
    set(admin1_gdf['name'].values) |
    set(spain_gdf['NAME_1'].values) |
    set(canary_islands_gdf['NAME_1'].values) |
    set(italy_gdf['reg_name'].values)
)

# We check against 'unidecoded' strings as well
# as IHME does not use accented characters.
# Only real strings are unidecoded: missing names (None/NaN) cannot be
# transliterated and are already part of `available_regions` itself.
missing_regions = (
    set(ihme_df['location_name'].values)
    - available_regions
    - {unidecode(name) for name in available_regions if isinstance(name, str)}
)

# IHME locations that are intentionally left without a boundary match.
regions_to_ignore = set([
   'Other Counties, WA',
   'Life Care Center, Kirkland, WA',
   'King and Snohomish Counties (excluding Life Care Center), WA'
])

# Report only the names that actually need attention (the ignored ones are
# expected to be missing), in a stable order.
unmatched_regions = missing_regions - regions_to_ignore
if unmatched_regions:
    raise Exception("Missing regions: {}".format(
        ', '.join(sorted(map(str, unmatched_regions)))
    ))

Account for some location_names that could tie to multiple geometries

In [None]:
def avoid_geom(feature):
    """Tell whether a boundary row must be skipped when indexing geometries.

    `feature` is a mapping-like row (it must support `.get` and item access).
    A row is skipped only when its 'name' is 'La Rioja' and its 'iso_a2'
    code is not 'ES': that name exists in both Argentina and Spain, and the
    Spanish one is the one wanted.
    """
    if feature.get('name') != 'La Rioja':
        return False
    return bool(feature['iso_a2'] != 'ES')


Create a dictionary of location name to geometries and population.

In [None]:
# Geometry sources: (dataframe, name column or list of name columns).
country_dfs = [
    (countries_gdf, 'ADMIN'),
]

# Population sources: (dataframe, name column, population column).
# Order matters: get_pop_by_name keeps the value from the first dataframe
# in which a name appears.
country_pop_dfs = [
    (worldpop_country_df, 'location_name', 'population'),
    (worldpop_admin0_processed_df, 'location_name', 'population')
]

# Same layout as country_dfs; admin1 rows are indexed under both of their
# name columns.
region_dfs = [
    (state_gdf, 'NAME'),
    (admin1_gdf, ['name', 'name_en']),
    (spain_gdf, 'NAME_1'),
    (canary_islands_gdf, 'NAME_1'),
    (italy_gdf, 'reg_name')
]

# Same layout as country_pop_dfs.
region_pop_dfs = [
    (state_pop_gdf, 'State Name', 'Population'),
    (worldpop_region_df, 'location_name', 'population'),
    (worldpop_admin1_processed_df, 'location_name', 'population')
]

# NOTE(review): not referenced by the code visible in this notebook --
# confirm whether it is still needed.
state_names = set(state_gdf['NAME'].values)

def get_geoms_by_name(dfs):
    """Index geometries by location name.

    Parameters
    ----------
    dfs : iterable of (dataframe, name_cols)
        `name_cols` is a column name or a list of column names; each row is
        indexed under every distinct name found in those columns.

    Returns
    -------
    (result, duplicates)
        `result` maps a name to the geometry of the first row that carried
        it, plus the unidecoded (ASCII) form of that name when no entry
        exists for it yet. `duplicates` is the set of names that showed up
        on more than one row; later rows never replace the first geometry.
    """
    result = {}
    seen = set()
    duplicates = set()

    for df, name_cols in dfs:
        if isinstance(name_cols, str):
            name_cols = [name_cols]
        for _, feature in df.iterrows():
            if avoid_geom(feature):
                continue
            row_names = set()
            for name_col in name_cols:
                name = feature[name_col]
                # Missing names can surface as None or NaN; neither is a
                # usable key and unidecode() only accepts strings.
                if pd.isna(name) or name in row_names:
                    continue
                row_names.add(name)
                if name in seen:
                    duplicates.add(name)
                    continue
                seen.add(name)
                geometry = feature['geometry']
                result[name] = geometry
                # Also expose the ASCII spelling, without displacing an entry
                # that already exists (a no-op when the name is already ASCII).
                result.setdefault(unidecode(name), geometry)
    return result, duplicates

def get_pop_by_name(dfs):
    """Index populations by location name.

    Parameters
    ----------
    dfs : iterable of (dataframe, name_col, pop_col)
        Sources in priority order: a name keeps the value from the first
        dataframe in which it appears.

    Returns
    -------
    (result, duplicates)
        `result` maps a name to its population, plus the unidecoded (ASCII)
        form of the name when no entry exists for it yet. A name that occurs
        more than once within the dataframe that introduced it is ambiguous
        and is removed from `result` (together with the ASCII alias it
        created) so that it has to be supplied manually. `duplicates` is the
        set of every name encountered more than once.
    """
    result = {}
    duplicates = set()
    seen = set()
    # ASCII alias -> the accented name that created it, so that an alias can
    # be withdrawn together with its source name.
    alias_sources = {}

    for df, name_col, pop_col in dfs:
        seen_in_this_df = set()
        for _, row in df.iterrows():
            name = row[name_col]
            # Missing names can surface as None or NaN; skip both.
            if pd.isna(name):
                continue

            if name in seen:
                duplicates.add(name)
                # A repeat in a later dataframe is simply ignored (the
                # earlier source wins). A repeat within the dataframe that
                # introduced the name is ambiguous: drop it to force manual
                # handling.
                if name in seen_in_this_df:
                    print('POP DUPLICATE NAME: {}, HANDLE MANUALLY IF NECESSARY'.format(name))
                    result.pop(name, None)
                    decoded = unidecode(name)
                    # Without this the ASCII alias would keep serving the
                    # first row's population for an ambiguous name.
                    if alias_sources.get(decoded) == name:
                        del alias_sources[decoded]
                        result.pop(decoded, None)
                continue

            seen.add(name)
            seen_in_this_df.add(name)
            result[name] = row[pop_col]
            # A real name takes over any alias slot an earlier accented name
            # left under the same key.
            alias_sources.pop(name, None)
            decoded = unidecode(name)
            if decoded != name:
                if decoded not in result:
                    result[decoded] = row[pop_col]
                    alias_sources[decoded] = name
                else:
                    print('POP UNIDECODE DUPLICATE: {}'.format(decoded))
    return result, duplicates

print('==Processing country geoms....')
country_geom_by_name, _ = get_geoms_by_name(country_dfs)
print('==Processing country populations....')
country_pop_by_name, country_pop_duplicates = get_pop_by_name(country_pop_dfs)
print('==Processing region geoms....')
region_geom_by_name, _ = get_geoms_by_name(region_dfs)  
print('==Processing region populations....')
region_pop_by_name, region_pop_duplicates = get_pop_by_name(region_pop_dfs)
    

Manually input some population data

##### Dominican Republic

["Estimaciones y proyecciones de la población total" (xlsx)](https://www.one.gob.do/categoria/tablagrafico?Gid=23). Oficina Nacional de Estadística.

In [None]:
# Manual entry; the figure is taken from the source cited in the text above.
country_pop_by_name['Dominican Republic'] = 10735896

##### Mexico City

["Mexico Demographics Profile 2018"](https://www.indexmundi.com/mexico/demographics_profile.html)

In [None]:
# Manual entry; the figure is taken from the source cited in the text above.
# NOTE(review): 21,672,000 looks like a metropolitan-area figure -- confirm
# it matches the boundary that IHME's "Mexico City" location refers to,
# since it is the denominator of every per-capita value for this location.
region_pop_by_name['Mexico City'] = 21672000

##### Republic of Serbia

2019 Estimate

["PBC Stats"](http://www.stat.gov.rs/)

In [None]:
# Manual entry, keyed by the name that IHME's 'Serbia' is renamed to earlier
# in the notebook.
country_pop_by_name['Republic of Serbia'] = 6963764

##### Handle duplicate entries in world pop

The admin1 worldpop data has duplicate `location_name` entries. Handle those cases here.

In [None]:
# Each of these names belongs to more than one admin1 region worldwide, so
# the population of the region IHME refers to is set explicitly.

# Spain, not Argentina
region_pop_by_name['La Rioja'] = 315675 # (2018) Sources include:Instituto Nacional de Estadística, Eurostat

# Brazil, not Cape Verde
region_pop_by_name['Santa Catarina'] = 6727000 # (2014) Instituto Brasileiro de Geografia e Estatística

# Brazil, not Colombia, Peru, or Venezuela
region_pop_by_name['Amazonas'] = 4144597 # (2019 estimate)  Population estimates for the Brazilian municipalities and Federation Units on July 1, 2018

# Brazil, not Mexico
region_pop_by_name['Distrito Federal'] = 3015268 # (2019 estimate)  IBGE - Projeção da população

Ensure we're not missing geoms or population for any regions

In [None]:
names_with_geom = set(country_geom_by_name.keys()).union(set(region_geom_by_name))
names_with_pop = set(country_pop_by_name.keys()).union(set(region_pop_by_name))

# Compare the unique IHME names with set arithmetic instead of walking every
# row of the projections: the frame holds many rows per location.
ihme_location_names = set(ihme_df['location_name'])
geom_not_found = ihme_location_names - names_with_geom
pop_not_found = ihme_location_names - names_with_pop

if len(geom_not_found) > 0 or len(pop_not_found) > 0:
    # Sorted so the report is stable between runs; str() keeps the report
    # printable even if a name is missing (None/NaN).
    raise Exception("""  
        Geoms not found: {}
        
        Population not found: {}
    """.format(
        '\n'.join(sorted(map(str, geom_not_found))),
        '\n'.join(sorted(map(str, pop_not_found)))
    ))
        

Define methods for retrieving the geometry and population for each location.

In [None]:
# Search region first then country to account for names like Georgia.
# Account for other edge cases, like Mexico

def get_geom(location_name):
    """Return the geometry for an IHME location name.

    Regions are searched before countries so that a name shared by both
    (e.g. 'Georgia') resolves to the region. 'Mexico' is the exception: it
    always resolves to the country.

    Raises
    ------
    Exception
        If no geometry is known for `location_name`, including 'Mexico'.
    """
    if location_name == 'Mexico':
        result = country_geom_by_name.get(location_name)
    else:
        result = region_geom_by_name.get(
            location_name,
            country_geom_by_name.get(location_name)
        )
    # Checked on every path so a missing 'Mexico' fails here, with a clear
    # message, instead of handing None to the caller.
    if result is None:
        raise Exception("IHME location_name {} geometry not found".format(location_name))
    return result

def get_pop(location_name):
    """Return the population for an IHME location name.

    Regions are searched before countries so that a name shared by both
    (e.g. 'Georgia') resolves to the region. 'Mexico' is the exception: it
    always resolves to the country.

    Raises
    ------
    Exception
        If no population is known for `location_name`, including 'Mexico'.
    """
    if location_name == 'Mexico':
        result = country_pop_by_name.get(location_name)
    else:
        result = region_pop_by_name.get(
            location_name,
            country_pop_by_name.get(location_name)
        )
    # Checked on every path so a missing 'Mexico' fails here, with a clear
    # message, instead of handing None to the per-capita math downstream.
    if result is None:
        raise Exception("IHME location_name {} population not found".format(location_name))
    return result

Rename the smoothed and estimated-infection columns so that they fit the `_` delimiter used by the code (`<metric>_<level>`).

In [None]:
# Later code splits every value column on '_' and reads the first piece as
# the metric and the second as the level. These columns carry extra
# underscores, so they are renamed to a hyphenated metric followed by a
# single '_' and the level.
ihme_df = ihme_df.rename(columns={
    'deaths_mean_smoothed': 'smoothed-deaths_mean',
    'deaths_lower_smoothed': 'smoothed-deaths_lower',
    'deaths_upper_smoothed': 'smoothed-deaths_upper',
    'totdea_mean_smoothed': 'smoothed-totdea_mean',
    'totdea_lower_smoothed': 'smoothed-totdea_lower',
    'totdea_upper_smoothed': 'smoothed-totdea_upper',
    'est_infections_mean': 'est-infections_mean',
    'est_infections_lower': 'est-infections_lower',
    'est_infections_upper': 'est-infections_upper',
})

Define methods that generate the IHME GeoJSON for each of our region groups.

In [None]:
# Every value column is named '<metric>_<level>'. The list is generated from
# the metric names (outer loop) and the levels (inner loop), which yields the
# columns grouped by metric.
value_cols = [
    '{}_{}'.format(metric_name, level_name)
    for metric_name in (
        'allbed',
        'ICUbed',
        'InvVen',
        'deaths',
        'smoothed-deaths',
        'admis',
        'newICU',
        'totdea',
        'smoothed-totdea',
        'bedover',
        'icuover',
        'est-infections',
    )
    for level_name in ('mean', 'lower', 'upper')
]

# Distinct metric and level names, recovered by splitting on the delimiter.
metrics = {col.split('_')[0] for col in value_cols}
levels = {col.split('_')[1] for col in value_cols}

def gather_data_for_locations(df):
    """Build map features and per-location time series from IHME rows.

    `df` needs a 'location_name' column, a 'date' column and one
    '<metric>_<level>' column for every combination of `metrics` and
    `levels`. Geometry and population are looked up through `get_geom` and
    `get_pop`, which raise for unknown names.

    Returns a tuple `(features, data, dates)`:
      * features - list of GeoJSON-style feature dicts, one per location
      * data     - dict keyed by location id holding 'location_name',
                   'population' and 'values'[metric][level][date]
                   (NaN values are stored as None)
      * dates    - set of every date seen
    """
    features = []
    data = {}
    dates = set()
    ids_by_name = {}

    def setup_location(name, location_id):
        # Register the feature and an empty value table for a new location.
        geometry = get_geom(name)
        population = get_pop(name)
        features.append({
            'id': location_id,
            'type': 'Feature',
            'geometry': mapping(geometry),
            'properties': {
                'id': location_id,
                'location_name': name,
                'population': population
            }
        })
        data[location_id] = {
            'location_name': name,
            'population': population,
            'values': {
                metric: {level: {} for level in levels}
                for metric in metrics
            }
        }

    for _, row in df.sort_values(by='location_name').iterrows():
        name = row['location_name']
        date = row['date']
        dates.add(date)

        # Ids are handed out sequentially in order of first appearance.
        location_id = ids_by_name.get(name)
        if location_id is None:
            location_id = len(ids_by_name)
            ids_by_name[name] = location_id
            setup_location(name, location_id)

        location_values = data[location_id]['values']
        for metric in metrics:
            for level in levels:
                value = row['{}_{}'.format(metric, level)]
                location_values[metric][level][date] = None if np.isnan(value) else value

    return features, data, dates


Calculate the features and data for all locations

In [None]:
# Features and time series for every IHME location, plus the dates covered.
all_features, all_data, dates = gather_data_for_locations(ihme_df)

# The visualization receives the dates in ascending order.
ihme_config['dates'] = sorted(dates)

Define methods for generating breaks that will be used to color the map.

In [None]:
def get_prop_values(data, per_capita_base=None):
    """Pool the non-missing values of all locations per metric and level.

    `data` has the shape produced by `gather_data_for_locations`. When
    `per_capita_base` is given, each location's values are divided by
    `population / per_capita_base`; otherwise they are used as they are.

    Returns `{metric: {level: [values...]}}`.
    """
    pooled = {m: {l: [] for l in levels} for m in metrics}

    for location_data in data.values():
        if per_capita_base is None:
            denom = 1
        else:
            denom = location_data['population'] / per_capita_base

        location_values = location_data['values']
        for m in metrics:
            for l in levels:
                pooled[m][l].extend(
                    v / denom
                    for v in location_values[m][l].values()
                    if v is not None
                )

    return pooled

def compute_breaks(data, per_capita_base=None):
    """Compute Jenks breaks (6 classes) for every metric and level.

    The values are pooled with `get_prop_values`, so passing
    `per_capita_base` yields breaks for the per-capita values instead of the
    totals.

    Returns `{metric: {level: breaks}}`.
    """
    breaks = {m: {l: None for l in levels} for m in metrics}

    pooled = get_prop_values(data, per_capita_base)
    for m, values_by_level in pooled.items():
        for l, values in values_by_level.items():
            # NOTE(review): newer jenkspy releases appear to have renamed
            # `nb_class` to `n_classes` -- confirm against the pinned version.
            breaks[m][l] = jenks_breaks(values, nb_class=6)

    return breaks

Define methods for creating GeoJSON and breaks files

In [None]:
def create_data(location_names, agg_id):
    """Assemble the GeoJSON, data and output paths for one aggregation.

    Parameters
    ----------
    location_names : iterable of hashable names
        Locations to include; features and data of every other location are
        left out.
    agg_id : str
        Key into `ihme_config['aggregations']` (e.g. 'country', 'region').

    Side effect: stores the 'totals' and 'per_capita' breaks computed from
    the selected data in `ihme_config['aggregations'][agg_id]['breaks']`.

    Returns
    -------
    (feature_collection, data, geojson_path, data_path)
    """
    # Membership is tested once per feature and once per data entry; a set
    # keeps each test constant-time instead of scanning a list every time.
    wanted_names = set(location_names)

    feature_collection = {
        'type': 'FeatureCollection',
        'features': [
            f for f in all_features
            if f['properties']['location_name'] in wanted_names
        ]
    }

    data = {
        location_id: location_data
        for location_id, location_data in all_data.items()
        if location_data['location_name'] in wanted_names
    }

    # Set breaks
    aggregation = ihme_config['aggregations'][agg_id]
    aggregation['breaks']['totals'] = compute_breaks(data)
    aggregation['breaks']['per_capita'] = compute_breaks(
        data, aggregation['per_capita_base']
    )

    return (
        feature_collection,
        data,
        processed_data_path('ihme-{}.geojson'.format(agg_id)),
        processed_data_path('ihme-{}-data.json'.format(agg_id))
    )

Create datasets.

In [None]:
# For Countries
## Account for Georgia as a country name (not in IHME data...yet)
## Filtering (rather than list.remove) keeps this cell working even if a
## boundary refresh no longer contains the excluded name.
country_names = [name for name in country_geom_by_name if name != 'Georgia']

(
    country_feature_collection,
    country_data,
    country_geojson_path,
    country_data_path
) = create_data(country_names, 'country')

# For Regions
## Account for Mexico as a subnational region name
region_names = [name for name in region_geom_by_name if name != 'Mexico']
(
    region_feature_collection,
    region_data,
    region_geojson_path,
    region_data_path
) = create_data(region_names, 'region')


#### Output files

In [None]:
# Country boundaries with their properties, as a GeoJSON FeatureCollection.
with open(country_geojson_path, 'w') as country_geojson_file:
    country_geojson_file.write(
        json.dumps(country_feature_collection, sort_keys=True)
    )

In [None]:
# Per-country time series, keyed by location id.
with open(country_data_path, 'w') as country_data_file:
    country_data_file.write(
        json.dumps(country_data, sort_keys=True)
    )

In [None]:
# Region boundaries with their properties, as a GeoJSON FeatureCollection.
with open(region_geojson_path, 'w') as region_geojson_file:
    region_geojson_file.write(
        json.dumps(region_feature_collection, sort_keys=True)
    )

In [None]:
# Per-region time series, keyed by location id.
with open(region_data_path, 'w') as region_data_file:
    region_data_file.write(
        json.dumps(region_data, sort_keys=True)
    )

In [None]:
# Visualization config: dates, model version and the computed breaks.
with open(processed_data_path('ihme-config.json'), 'w') as config_file:
    config_file.write(
        json.dumps(ihme_config, indent=4, sort_keys=True)
    )

In [None]:
# Sanity check: display the per-capita breaks for mean deaths (countries).
ihme_config['aggregations']['country']['breaks']['per_capita']['deaths']['mean']

In [None]:
# Sanity check: display the per-capita breaks for mean deaths (regions).
ihme_config['aggregations']['region']['breaks']['per_capita']['deaths']['mean']

In [None]:
# Sanity check: display the breaks on raw totals for mean deaths (regions).
ihme_config['aggregations']['region']['breaks']['totals']['deaths']['mean']