In [None]:
import json
from collections import defaultdict

import pandas as pd
import geopandas as gpd
from shapely.geometry import mapping
from jenkspy import jenks_breaks

from covidcaremap.data import external_data_path, processed_data_path
from covidcaremap.ihme import IHME

Configuration that will be used in the visualization

In [None]:
# Configuration consumed by the visualization; it is written out as
# 'ihme-config.json' at the end of the notebook. Entries initialised to None
# are placeholders that later cells fill in.
ihme_config = {
    'dates': None, # Sorted list of projection dates; set after the datasets are created
    'ihme_version': None, # Model version returned by IHME.get_latest
    'aggregations': {
        'country': {
            # Per-capita values are expressed per this many people.
            'per_capita_base': 1000000,
            'breaks': { 'totals': None, 'per_capita': None } # Filled in by create_data
        },
        'region': {
            # Per-capita values are expressed per this many people.
            'per_capita_base': 100000,
            'breaks': { 'totals': None, 'per_capita': None } # Filled in by create_data
        }
    }
}

Read in a range of country/region boundaries to match up to IHME location names.

In [None]:
# Boundary datasets shipped with the project's external data.
state_gdf = gpd.read_file(
    external_data_path('us_states.geojson'),
    encoding='utf-8'
)
countries_gdf = gpd.read_file(external_data_path('admin0.geojson'))

# Boundary datasets fetched over the network from GitHub.
# NOTE(review): these URLs track the `master` branch, so the data can change
# between runs — consider pinning to a commit for reproducibility.
spain_gdf = gpd.read_file(
    'https://raw.githubusercontent.com/deldersveld/topojson/master/'
    'countries/spain/spain-comunidad.json'
)
canary_islands_gdf = gpd.read_file(
    'https://raw.githubusercontent.com/deldersveld/topojson/master/'
    'countries/spain/canary-islands-province.json'
)
italy_gdf = gpd.read_file(
    'https://raw.githubusercontent.com/openpolis/geojson-italy/master/'
    'geojson/limits_IT_regions.geojson'
)

# Admin1 (first-level subdivision) boundaries from the external data directory.
admin1_gdf = gpd.read_file(external_data_path('admin1.geojson'))


Read population data

In [None]:
# US states with population attributes, read from the processed data directory.
state_pop_gdf = gpd.read_file(processed_data_path('us_states_with_pop.geojson'))
# Population tables read from the external data directory; later cells use the
# `location_name`/`population` columns of the first two and the
# `adm0_a3`/`adm1_code`/`population` columns of the admin1 table.
worldpop_country_df = pd.read_csv(external_data_path('worldpop-country-pop-for-ihme-2020.csv'))
worldpop_region_df = pd.read_csv(external_data_path('worldpop-region-pop-for-ihme-2020.csv'))
worldpop_admin1_df = pd.read_csv(external_data_path('worldpop-admin1-2020.csv'))

In [None]:
# Inspection only: show the admin1 names matched for Canada (adm0_a3 == 'CAN').
# `adm0_a3_y` is the country-code column contributed by admin1_gdf in the merge.
(
    worldpop_admin1_df[worldpop_admin1_df['adm0_a3'] == 'CAN']
    .merge(admin1_gdf, on='adm1_code')
    .drop(columns=['adm1_code'])
    .rename(columns={'name': 'location_name'})
    [['adm0_a3_y', 'location_name']]
)

Process Admin0 and Admin1 worldpop data to support new countries and regions added by IHME.

In [None]:
# Population per admin1 region, labelled with the boundary dataset's `name` value.
# NOTE(review): only `name` is used as the label here, while region geometries
# are also matched on `name_en`; a region matched solely through `name_en` may
# have no population entry — confirm.
worldpop_admin1_processed_df = (
    worldpop_admin1_df[['adm1_code', 'population']]
    .merge(admin1_gdf[['adm1_code', 'name']], on='adm1_code')
    .drop(columns=['adm1_code'])
    .rename(columns={'name': 'location_name'})
)

# Population per country: admin1 populations summed by country code, then
# labelled with the country name from the admin0 boundaries.
# NOTE(review): the label comes from `NAME`, whereas country geometries are
# looked up by `ADMIN` elsewhere in this notebook — verify both columns agree
# for the countries IHME reports.
worldpop_admin0_processed_df = (
    worldpop_admin1_df.groupby('adm0_a3')['population'].sum()
    .to_frame()
    .merge(countries_gdf[['ADM0_A3', 'NAME']], left_on='adm0_a3', right_on='ADM0_A3')
    .drop(columns=['ADM0_A3'])
    .rename(columns={'NAME': 'location_name'})
)

Get the latest IHME projections.

In [None]:
# Fetch the latest IHME projections together with their model version, and
# record that version in the visualization config.
ihme_df, ihme_version = IHME.get_latest(include_version=True)
ihme_config['ihme_version'] = ihme_version
print('Model version: {}'.format(ihme_version))

Rename location names in the IHME data and the geospatial datasets so that they can be matched up.

In [None]:
# Renames for better matching
italy_gdf = italy_gdf.replace({
    '''Valle d'Aosta/Vallée d'Aoste''': '''Valle d'Aosta'''
})

# Rename regions to line up with GeoJSON values.
ihme_df = ihme_df.replace({
    # Spain regions
    'Andalucia': 'Andalucía',
    'Aragon': 'Aragón',
    'Castile and Leon': 'Castilla y León',
    'Catalonia': 'Cataluña',  
    'Basque Country': 'País Vasco',
    'Canary Islands': 'Islas Canarias',
    'Valencian Community': 'Comunidad Valenciana',
    
    # Italy
    'Provincia autonoma di Bolzano': 'Bozen',
    'Provincia autonoma di Trento': 'Trento',
    
    # Germany
    'Baden-Wurttemberg': 'Baden-Württemberg',
    
    # Canada
    'Quebec': 'Québec' 
})

Ensure we'll capture all the regions we want to match.

In [None]:
# IHME location names that do not appear in any name column of the boundary
# datasets searched below.
missing_regions = (
    set(ihme_df['location_name'].values) -
    set(state_gdf['NAME'].values) -
    set(countries_gdf['ADMIN'].values) -
    set(admin1_gdf['name_en'].values) -
    set(admin1_gdf['name'].values) -
    set(spain_gdf['NAME_1'].values) -
    set(canary_islands_gdf['NAME_1'].values) -
    set(italy_gdf['reg_name'].values)
)

# Locations that are allowed to remain unmatched.
regions_to_ignore = {
    'Other Counties, WA',
    'Life Care Center, Kirkland, WA',
    'King and Snohomish Counties (excluding Life Care Center), WA'
}

# Fail with the offending names listed so a new IHME location is easy to spot.
unexpected_missing_regions = missing_regions - regions_to_ignore
assert not unexpected_missing_regions, (
    'IHME locations without a matching boundary: {}'.format(
        sorted(map(str, unexpected_missing_regions))
    )
)

Create a dictionary of location name to geometries and population.

In [None]:
# Boundary sources for countries, as (frame, name column) pairs.
country_dfs = [
    (countries_gdf, 'ADMIN'),
]

# Population sources for countries, as (frame, name column, population column).
# Order matters: the lookup builders keep the first value seen for each name,
# so earlier entries take precedence over later ones.
country_pop_dfs = [
    (worldpop_country_df, 'location_name', 'population'),
    (worldpop_admin0_processed_df, 'location_name', 'population')
]

# Boundary sources for regions, as (frame, name column) pairs; earlier entries
# win when the same name occurs in several sources. admin1_gdf is listed twice
# so that both of its name columns are searched.
region_dfs = [
    (state_gdf, 'NAME'),
    (admin1_gdf, 'name_en'),
    (admin1_gdf, 'name'),
    (spain_gdf, 'NAME_1'),
    (canary_islands_gdf, 'NAME_1'),
    (italy_gdf, 'reg_name')
]

# Population sources for regions, as (frame, name column, population column);
# earlier entries take precedence.
region_pop_dfs = [
    (state_pop_gdf, 'State Name', 'Population'),
    (worldpop_region_df, 'location_name', 'population'),
    (worldpop_admin1_processed_df, 'location_name', 'population')
]

def get_geoms_by_name(dfs):
    """Build a lookup from location name to geometry.

    Parameters
    ----------
    dfs : iterable of (frame, name_col)
        Frames that each have a ``geometry`` column, paired with the column
        holding the location name. Frames are scanned in order.

    Returns
    -------
    dict
        Maps each name to the geometry of the first row carrying that name.
        Rows whose name is missing (``None`` or NaN) are skipped.
    """
    result = {}

    for df, name_col in dfs:
        # Iterate the two columns directly instead of building a Series per row.
        for name, geom in zip(df[name_col], df['geometry']):
            # pd.isna covers both None and NaN; a NaN name would otherwise
            # become a meaningless dictionary key.
            if pd.isna(name):
                continue
            # First occurrence wins, so earlier frames take precedence.
            if name not in result:
                result[name] = geom
    return result

def get_pop_by_name(dfs):
    """Build a lookup from location name to population.

    Parameters
    ----------
    dfs : iterable of (frame, name_col, pop_col)
        Frames paired with the columns holding the location name and the
        population. Frames are scanned in order.

    Returns
    -------
    dict
        Maps each name to the population of the first row carrying that name.
        Rows whose name is missing (``None`` or NaN) are skipped.
    """
    result = {}

    for df, name_col, pop_col in dfs:
        # Iterate the two columns directly instead of building a Series per row.
        for name, pop in zip(df[name_col], df[pop_col]):
            # Names read from CSV are NaN when empty, which `is not None`
            # would let through as a dictionary key.
            if pd.isna(name):
                continue
            # First occurrence wins, so earlier frames take precedence.
            if name not in result:
                result[name] = pop
    return result

# Name -> geometry and name -> population lookups for each aggregation level.
country_geom_by_name = get_geoms_by_name(country_dfs)
country_pop_by_name = get_pop_by_name(country_pop_dfs)
region_geom_by_name = get_geoms_by_name(region_dfs)  
region_pop_by_name = get_pop_by_name(region_pop_dfs)
    

Define methods that generate the IHME GeoJSON for each of our region groups.

In [None]:
# IHME projection columns copied into the GeoJSON properties: every metric is
# reported at three levels, in the order mean, lower, upper.
value_cols = [
    '{}_{}'.format(metric, level)
    for metric in (
        'allbed', 'ICUbed', 'InvVen',
        'deaths', 'admis', 'newICU',
        'totdea', 'bedover', 'icuover'
    )
    for level in ('mean', 'lower', 'upper')
]

def gather_features_by_name(geoms_by_name, pop_by_name, df):
    """Turn per-date projection rows into one GeoJSON feature per location.

    Parameters
    ----------
    geoms_by_name : dict
        Location name -> geometry.
    pop_by_name : dict
        Location name -> population.
    df : DataFrame
        Rows with ``location_name``, ``date`` and every column in ``value_cols``.

    Returns
    -------
    tuple
        ``(features, dates, names_found, names_not_found)``: the list of
        GeoJSON feature dicts, the set of dates seen in ``df``, and the sets
        of location names with and without a geometry.

    Raises
    ------
    KeyError
        If a location has a geometry but no entry in ``pop_by_name``.
    """
    dates = set()
    props_by_name = defaultdict(dict)
    for _, row in df.iterrows():
        date = row['date']
        dates.add(date)
        location_props = props_by_name[row['location_name']]
        # Flatten the time series into '<date>_<column>' properties.
        for col in value_cols:
            location_props['{}_{}'.format(date, col)] = row[col]

    features = []
    names_found = set()
    names_not_found = set()
    for name, value_props in props_by_name.items():
        if name not in geoms_by_name:
            names_not_found.add(name)
            continue

        # Say which location is missing instead of raising a bare KeyError.
        if name not in pop_by_name:
            raise KeyError(
                'No population found for location {!r}, which has a geometry; '
                'add it to one of the population datasets.'.format(name)
            )

        props = {'location_name': name, 'population': pop_by_name[name]}
        props.update(value_props)
        features.append({
            'type': 'Feature',
            'geometry': mapping(geoms_by_name[name]),
            'properties': props
        })
        names_found.add(name)
    return features, dates, names_found, names_not_found
    

Define methods for generating breaks that will be used to color the map.

In [None]:
# Distinct metric prefixes and level suffixes of the '<metric>_<level>' column names.
metrics = {col.split('_')[0] for col in value_cols}
levels = {col.split('_')[1] for col in value_cols}

def get_prop_values(features, per_capita_base=None):
    """Collect every projected value, grouped by metric and level.

    Parameters
    ----------
    features : list of dict
        GeoJSON features whose properties hold ``location_name``,
        ``population`` and '<date>_<metric>_<level>' values.
    per_capita_base : number, optional
        When given, each value is divided by ``population / per_capita_base``,
        i.e. expressed per ``per_capita_base`` people. When ``None`` the raw
        totals are returned.

    Returns
    -------
    dict
        ``result[metric][level]`` is the list of values across all features
        and dates.
    """
    result = {m: {l: [] for l in levels} for m in metrics}

    for feat in features:
        props = feat['properties']
        if per_capita_base is None:
            denom = 1
        else:
            pop = props['population']
            # A missing or non-positive population would yield inf/NaN
            # per-capita values, so leave such features out of the
            # per-capita value lists.
            if pd.isna(pop) or pop <= 0:
                continue
            denom = pop / per_capita_base

        for k, v in props.items():
            if k in ('location_name', 'population'):
                continue
            # Split from the right so an underscore inside the date prefix
            # cannot break the unpacking.
            _, metric, level = k.rsplit('_', 2)
            result[metric][level].append(v / denom)

    return result

def compute_breaks(features, per_capita_base=None, nb_class=6):
    """Compute Jenks natural breaks for every metric and level.

    Parameters
    ----------
    features : list of dict
        GeoJSON features as accepted by ``get_prop_values``.
    per_capita_base : number, optional
        Forwarded to ``get_prop_values``; ``None`` means breaks on raw totals.
    nb_class : int, optional
        Number of classes requested from ``jenks_breaks`` (default 6).

    Returns
    -------
    dict
        ``result[metric][level]`` is the value returned by ``jenks_breaks``
        for that metric and level.
    """
    prop_values = get_prop_values(features, per_capita_base)
    # NOTE(review): newer jenkspy releases appear to rename this keyword to
    # `n_classes` — confirm against the pinned jenkspy version before upgrading.
    return {
        metric: {
            level: jenks_breaks(values, nb_class=nb_class)
            for level, values in values_by_level.items()
        }
        for metric, values_by_level in prop_values.items()
    }

Define the function that creates the GeoJSON data and the breaks for each aggregation level.

In [None]:
def create_data(geom_by_name, pop_by_name, agg_id):
    """Build the GeoJSON feature collection for one aggregation level.

    As a side effect, stores the computed 'totals' and 'per_capita' breaks
    under ``ihme_config['aggregations'][agg_id]['breaks']``.

    Parameters
    ----------
    geom_by_name : dict
        Location name -> geometry.
    pop_by_name : dict
        Location name -> population.
    agg_id : str
        Key into ``ihme_config['aggregations']``; also used in the output
        file name.

    Returns
    -------
    tuple
        ``(feature_collection, output_path, dates, names_found, names_not_found)``.
    """
    features, dates, names_found, names_not_found = gather_features_by_name(
        geom_by_name, pop_by_name, ihme_df
    )

    # Record the colour breaks for this aggregation level in the shared config.
    agg_config = ihme_config['aggregations'][agg_id]
    breaks = agg_config['breaks']
    breaks['totals'] = compute_breaks(features)
    breaks['per_capita'] = compute_breaks(features, agg_config['per_capita_base'])

    geojson_path = processed_data_path('ihme-{}-data.geojson'.format(agg_id))
    feature_collection = {'type': 'FeatureCollection', 'features': features}
    return feature_collection, geojson_path, dates, names_found, names_not_found

Create datasets.

In [None]:
# For Countries
# Country-level dataset.
(country_feature_collection,
 country_geojson_path,
 country_dates,
 country_names_found,
 country_names_not_found) = create_data(
    country_geom_by_name, country_pop_by_name, 'country'
)

# Region-level dataset.
(region_feature_collection,
 region_geojson_path,
 region_dates,
 region_names_found,
 region_names_not_found) = create_data(
    region_geom_by_name, region_pop_by_name, 'region'
)

# The visualization needs every date present at either level, in sorted order.
ihme_config['dates'] = sorted(country_dates | region_dates)

Perform final check that we didn't miss any regions.

In [None]:
# Last check that we didn't miss any regions.
# A location counts as covered if it was matched at either level, so only
# names matched at neither level (and not deliberately ignored) are a problem.
unmatched_names = (
    (region_names_not_found | country_names_not_found)
    - (region_names_found | country_names_found)
    - regions_to_ignore
)
# List the offending names so a failure is actionable.
assert not unmatched_names, (
    'IHME locations matched at neither country nor region level: {}'.format(
        sorted(map(str, unmatched_names))
    )
)

#### Output files

In [None]:
# Write the country-level feature collection as GeoJSON.
# NOTE(review): json.dumps emits NaN/Infinity for non-finite floats by default,
# which strict JSON parsers reject — confirm the projections contain none.
with open(country_geojson_path, 'w') as f:
    f.write(json.dumps(country_feature_collection))

In [None]:
# Write the region-level feature collection as GeoJSON.
# NOTE(review): json.dumps emits NaN/Infinity for non-finite floats by default,
# which strict JSON parsers reject — confirm the projections contain none.
with open(region_geojson_path, 'w') as f:
    f.write(json.dumps(region_feature_collection))

In [None]:
# Write the visualization config (dates, model version, breaks) as indented JSON.
with open(processed_data_path('ihme-config.json'), 'w') as f:
    f.write(json.dumps(ihme_config, indent=4))

In [None]:
# Inspection only: country-level per-capita breaks for the 'deaths' metric, 'mean' level.
ihme_config['aggregations']['country']['breaks']['per_capita']['deaths']['mean']

In [None]:
# Inspection only: region-level per-capita breaks for the 'deaths' metric, 'mean' level.
ihme_config['aggregations']['region']['breaks']['per_capita']['deaths']['mean']

In [None]:
# Inspection only: region-level breaks on raw totals for the 'deaths' metric, 'mean' level.
ihme_config['aggregations']['region']['breaks']['totals']['deaths']['mean']