In [None]:
import json
from urllib.request import urlopen
from pathlib import Path

import pandas as pd
import geopandas as gpd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

# Plot defaults shared by every seaborn/matplotlib figure in the notebook.
sns.set_style('whitegrid')
sns.set_palette('colorblind')

# Country outlines used as the base layer of every choropleth below.
with urlopen('https://raw.githubusercontent.com/datasets/geo-countries/master/data/countries.geojson') as response:
    countries = json.load(response)

# Every country name (the 'ADMIN' property) present in the GeoJSON.
geo_loc_countries = {feature['properties']['ADMIN'] for feature in countries['features']}

## GBD2019 AMR Map

In [None]:
# Stack every IHME CSV found in the data directory and discard exact duplicate rows.
burden_df = pd.concat(
    [pd.read_csv(csv_path) for csv_path in Path('../data/ihme_microbe').glob("*.csv")]
).drop_duplicates()

# Sum the 'Value' column over all rows of each location.
bacterial_burden_per_country = (
    burden_df.groupby('Location')['Value']
    .sum()
    .reset_index(name='Bacterial Burden (DALYs per 100k)')
    .rename(columns={'Location': 'Country'})
)

# Natural log of the summed burden; this is the column the choropleth colours by.
bacterial_burden_per_country['Bacterial Burden (log DALYs per 100k)'] = np.log(
    bacterial_burden_per_country['Bacterial Burden (DALYs per 100k)']
)
# Lookup from the country spelling used in the source tables to the spelling used by the
# GeoJSON 'ADMIN' property. Names that are absent from this table are used unchanged.
# 'Tokelau' maps to the sentinel value 'drop' and is filtered out after the lookup.
ihme_to_geoloc_countries = {
    'Bahamas': 'The Bahamas',
    'Bolivia (Plurinational State of)': 'Bolivia',
    'Brunei Darussalam': 'Brunei',
    'Cabo Verde': 'Cape Verde',
    'Congo': 'Republic of Congo',
    'Czechia': 'Czech Republic',
    "Côte d'Ivoire": 'Ivory Coast',
    "Democratic People's Republic of Korea": 'North Korea',
    'Eswatini': 'Swaziland',
    'Guinea-Bissau': 'Guinea Bissau',
    'Iran (Islamic Republic of)': 'Iran',
    "Lao People's Democratic Republic": 'Laos',
    'Micronesia (Federated States of)': 'Federated States of Micronesia',
    'North Macedonia': 'Macedonia',
    'Republic of Korea': 'South Korea',
    'Republic of Moldova': 'Moldova',
    'Russian Federation': 'Russia',
    'Serbia': 'Republic of Serbia',
    'Syrian Arab Republic': 'Syria',
    'Taiwan (Province of China)': 'Taiwan',
    'Timor-Leste': 'East Timor',
    'Tokelau': 'drop',
    'Venezuela (Bolivarian Republic of)': 'Venezuela',
    'Viet Nam': 'Vietnam',
    'USA': 'United States of America',
}

# Translate IHME country names into the GeoJSON spelling; unmapped names pass through unchanged.
bacterial_burden_per_country['Map Locations'] = bacterial_burden_per_country['Country'].map(
    lambda ihme_name: ihme_to_geoloc_countries.get(ihme_name, ihme_name)
)
# Tokelau was mapped to the sentinel 'drop' because there is no map location for it.
bacterial_burden_per_country = bacterial_burden_per_country.loc[
    lambda frame: frame['Map Locations'] != 'drop'
]


In [None]:
# World map coloured by log-transformed bacterial burden; rows are joined to the
# GeoJSON features through the 'ADMIN' country name.
fig = px.choropleth(
    bacterial_burden_per_country,
    geojson=countries,
    featureidkey='properties.ADMIN',
    locations='Map Locations',
    color='Bacterial Burden (log DALYs per 100k)',
    color_continuous_scale="Viridis",
    projection='eckert4',
)
fig.show()

## SRA Reads per Pathogen

The SRA metadata was pulled from BigQuery in June 2023.

In [None]:
# Read the compressed SRA metadata export and parse the release date column as datetimes.
sra_metadata = pd.read_csv('../data/sra_metadata/all_non_human_sra_metadata.csv.xz').assign(
    releasedate=lambda frame: pd.to_datetime(frame['releasedate'], yearfirst=True)
)

def link_ihme_pathogens_to_sra_taxa(sra_taxa):
    """
    Map an SRA organism name onto the IHME pathogen category it belongs to.

    Names are matched on whole leading words: a name matches a prefix only when it
    equals the prefix or continues with a space. A bare ``startswith`` would also
    accept longer words that merely share the prefix, e.g. "Enterobacteriaceae
    bacterium" for the genus "Enterobacter", or "... serovar Typhimurium" for
    "... serovar Typhi".

    Parameters
    ----------
    sra_taxa : str
        Organism name as recorded in the SRA metadata.

    Returns
    -------
    str
        The IHME pathogen label, or "Non-IHME Taxa" when the name matches none of
        the categories or is not a string (e.g. a missing value).
    """
    # Missing organism values arrive as float NaN; they cannot belong to any category.
    if not isinstance(sra_taxa, str):
        return "Non-IHME Taxa"

    def begins_with(prefix):
        # True when the name is exactly `prefix` or `prefix` followed by a new word.
        return sra_taxa == prefix or sra_taxa.startswith(prefix + ' ')

    # Categories IHME reports at genus level ("<Genus> spp.").
    spp = ['Aeromonas spp.', 'Campylobacter spp.', 'Chlamydia spp.', 'Citrobacter spp.', 'Enterobacter spp.',
           'Legionella spp.', 'Morganella spp.', 'Mycoplasma spp.', 'Proteus spp.', 'Providencia spp.', 'Serratia spp.',
           'Shigella spp.']

    # Categories IHME reports at species level.
    species = ['Acinetobacter baumannii', 'Clostridioides difficile', 'Enterococcus faecalis', 'Enterococcus faecium',
               'Escherichia coli', 'Haemophilus influenzae', 'Klebsiella pneumoniae', 'Listeria monocytogenes', 'Mycobacterium tuberculosis',
               'Neisseria gonorrhoeae', 'Neisseria meningitidis', 'Pseudomonas aeruginosa',
               'Staphylococcus aureus', 'Streptococcus pneumoniae', 'Vibrio cholerae']

    for spp_taxa in spp:
        # Compare against the genus word only (drop the trailing "spp.").
        if begins_with(spp_taxa.split()[0]):
            return spp_taxa

    for spec_taxa in species:
        if begins_with(spec_taxa):
            return spec_taxa

    # Remaining categories need bespoke rules. Klebsiella pneumoniae, Enterococcus
    # faecalis and Enterococcus faecium have already been returned by the species loop,
    # so any Klebsiella / Enterococcus name reaching this point is an "other" member.
    # NOTE(review): names such as "Salmonella phage ..." or "Klebsiella phage ..." also
    # begin with the genus word and are still counted here - confirm whether phage
    # records should be excluded.
    if begins_with("Klebsiella"):
        return 'Other Klebsiella species'
    elif begins_with('Enterococcus'):
        return "Other enterococci"
    elif begins_with('Salmonella enterica subsp. enterica serovar Typhi'):
        return 'Salmonella enterica serovar Typhi'
    elif begins_with('Salmonella enterica subsp. enterica serovar Paratyphi'):
        return 'Salmonella enterica serovar Paratyphi'
    elif begins_with('Salmonella'):
        return 'Non-typhoidal Salmonella'
    # note this will slightly undercount GAS and GBS as there is other Strep with A antigen and B antigens
    elif begins_with('Streptococcus pyogenes'):
        return 'Group A Streptococcus'
    elif begins_with('Streptococcus agalactiae'):
        return "Group B Streptococcus"

    return "Non-IHME Taxa"

# Label every SRA record with the IHME pathogen category of its organism.
sra_metadata['IHME_taxa'] = sra_metadata['organism'].apply(link_ihme_pathogens_to_sra_taxa)

# Keep only records that belong to an IHME pathogen and have a calculated country,
# restricted to the columns used downstream. The explicit .copy() makes the result an
# independent frame, so adding the 'Country' column below does not assign into a
# slice of the unfiltered table (which triggers pandas' SettingWithCopyWarning).
sra_metadata = sra_metadata.loc[
    (sra_metadata['IHME_taxa'] != 'Non-IHME Taxa')
    & (sra_metadata['geo_loc_name_country_calc'] != 'uncalculated'),
    ['IHME_taxa', 'organism', 'geo_loc_name_country_calc', 'releasedate'],
].copy()

# Harmonise SRA country names with the GeoJSON spelling; unmapped names pass through unchanged.
sra_metadata['Country'] = sra_metadata['geo_loc_name_country_calc'].map(
    lambda sra_name: ihme_to_geoloc_countries.get(sra_name, sra_name)
)


In [None]:
# Reference lists: every pathogen in the burden table and every mappable country.
all_pathogens = burden_df['Pathogen'].unique()
all_countries = bacterial_burden_per_country['Map Locations'].unique()

# Number of distinct IHME taxa with at least one SRA record, per country.
ihme_taxa_sequenced_per_country = sra_metadata.groupby('Country')['IHME_taxa'].nunique()

def filter_metadata(metadata, countries):
    """
    Count the distinct IHME taxa with SRA records in each country.

    Parameters
    ----------
    metadata : pandas.DataFrame
        SRA records with at least the columns 'Country' and 'IHME_taxa'.
    countries : iterable of str
        Country names that must all appear in the result. Note that inside this
        function the name shadows the notebook-level GeoJSON object `countries`.

    Returns
    -------
    pandas.DataFrame
        Columns 'Country' and 'Highest Burden Bacterial Taxa with Raw Data (SRA)'.
        Countries present in `metadata` come first (in groupby order), followed by
        the entries of `countries` that have no records, each with a count of 0.
    """
    pathogens_per_country = metadata.groupby('Country')['IHME_taxa'].nunique().reset_index()

    # Build the lookup once instead of recomputing .unique() for every country.
    countries_with_data = set(pathogens_per_country['Country'])
    missing_countries = [country for country in countries if country not in countries_with_data]

    if missing_countries:
        zero_rows = pd.DataFrame({'Country': missing_countries, 'IHME_taxa': 0})
        # DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement.
        pathogens_per_country = pd.concat([pathogens_per_country, zero_rows], ignore_index=True)

    return pathogens_per_country.rename(
        columns={'IHME_taxa': 'Highest Burden Bacterial Taxa with Raw Data (SRA)'}
    )


# Split the SRA records by release date at 1 January 2018 and count, for each side of
# the split, how many distinct IHME taxa each country has records for.
pre_2018_pathogens_sequenced = filter_metadata(
    sra_metadata.loc[
        lambda frame: frame['releasedate'].dt.date < pd.to_datetime('2018-01-01', yearfirst=True).date()
    ],
    all_countries,
)
post_2018_pathogens_sequenced = filter_metadata(
    sra_metadata.loc[
        lambda frame: frame['releasedate'].dt.date >= pd.to_datetime('2018-01-01', yearfirst=True).date()
    ],
    all_countries,
)

In [None]:
# Map of the number of IHME taxa with SRA records released before 2018, per country.
fig = px.choropleth(
    pre_2018_pathogens_sequenced,
    geojson=countries,
    featureidkey='properties.ADMIN',
    locations='Country',
    color='Highest Burden Bacterial Taxa with Raw Data (SRA)',
    color_continuous_scale="Blackbody",
    projection='eckert4',
)
fig.show()

In [None]:
# Map of the number of IHME taxa with SRA records released from 2018 onwards, per country.
fig = px.choropleth(
    post_2018_pathogens_sequenced,
    geojson=countries,
    featureidkey='properties.ADMIN',
    locations='Country',
    color='Highest Burden Bacterial Taxa with Raw Data (SRA)',
    color_continuous_scale="Blackbody",
    projection='eckert4',
)
fig.show()

In [None]:
# Display the per-country burden table (raw and log burden plus the map-compatible name).
bacterial_burden_per_country

In [None]:
# Attach the GeoJSON-compatible country name to every burden row (unmapped names are kept as-is).
burden_df['Map Locations'] = burden_df['Location'].map(
    lambda location: ihme_to_geoloc_countries.get(location, location)
)
# Tokelau is mapped to the sentinel 'drop' because we don't have a map location for it.
burden_df = burden_df.loc[lambda frame: frame['Map Locations'] != 'drop']


In [None]:
# For every country that has SRA records, add up the burden ('Value') of the pathogens
# that have no SRA record from that country.
# NOTE(review): only countries present in `sra_metadata` are iterated, so countries with
# no SRA records at all do not appear in this table - confirm that is intended.
unsampled_burden_df = pd.DataFrame(
    [
        (
            country,
            burden_df.loc[
                (burden_df['Map Locations'] == country)
                & (burden_df['Pathogen'].isin(set(all_pathogens) - set(sampled_taxa))),
                'Value',
            ].sum(),
        )
        for country, sampled_taxa in sra_metadata.groupby('Country')['IHME_taxa'].unique().items()
    ],
    columns=['Country', 'Unsampled Bacterial Burden (DALYs per 100k)'],
)

In [None]:
# Countries ranked from the largest to the smallest unsampled burden.
unsampled_burden_df.sort_values(by='Unsampled Bacterial Burden (DALYs per 100k)', ascending=False)

In [None]:
# The original cell (`unsampled_burden_df[]`) was an unfinished indexing expression and a
# SyntaxError, which stops Restart & Run All. Show a short preview of the table instead.
unsampled_burden_df.head()

In [None]:
# Diagnostic: countries in the unsampled-burden table whose name has no match among the
# World Bank economy names. The sheet is read directly here because `world_bank_groups`
# is only created in the next cell (and ends up as a Series without an 'Economy' column),
# so referring to it at this point raised NameError on a fresh kernel.
world_bank_economies = pd.read_excel('../data/world_bank_groups_2023.xlsx')['Economy'].map(
    lambda economy: ihme_to_geoloc_countries.get(economy, economy)
)
set(unsampled_burden_df['Country'].unique()) - set(world_bank_economies.unique())

In [None]:
# Load the World Bank income classification and apply the shared country-name lookup
# to the 'Economy' column; names without an entry are left unchanged.
world_bank_groups = pd.read_excel('../data/world_bank_groups_2023.xlsx').assign(
    Economy=lambda frame: frame['Economy'].map(
        lambda economy: ihme_to_geoloc_countries.get(economy, economy)
    )
)

# World Bank economy names that differ from the GeoJSON 'ADMIN' spelling.
world_bank_to_geoloc_countries = {
    'Congo, Dem. Rep.': 'Democratic Republic of the Congo',
    'Egypt, Arab Rep.': 'Egypt',
    'Gambia, The': 'Gambia',
    'Hong Kong SAR, China': 'Hong Kong',
    'Iran, Islamic Rep.': 'Iran',
    'Kyrgyz Republic': 'Kyrgyzstan',
    'Lao PDR': 'Laos',
    "Korea, Dem. People's Rep.": 'North Korea',
    'Slovak Republic': 'Slovakia',
    'Korea, Rep.': 'South Korea',
    'Taiwan, China': 'Taiwan',
    'Bahamas, The': 'The Bahamas',
    'Türkiye': 'Turkey',
    'United States': 'United States of America',
    'Yemen, Rep.': 'Yemen',
}
# Apply the World Bank specific renames, then reduce the table to a Series that maps
# each economy name to its income group.
# (Name-mismatch check used while building the lookup:
#  set(unsampled_burden_df['Country'].unique()) - set(world_bank_groups['Economy'].unique()))
world_bank_groups = (
    world_bank_groups
    .assign(
        Economy=lambda frame: frame['Economy'].map(
            lambda economy: world_bank_to_geoloc_countries.get(economy, economy)
        )
    )
    .set_index('Economy')['Income group']
)




In [None]:
# Look up each country's income group; countries missing from the World Bank index are
# tagged 'unknown' and then removed.
unsampled_burden_df['World Bank Income Group'] = unsampled_burden_df['Country'].map(
    lambda country_name: world_bank_groups[country_name]
    if country_name in world_bank_groups.index
    else 'unknown'
)
unsampled_burden_df = unsampled_burden_df.loc[
    lambda frame: frame['World Bank Income Group'] != 'unknown'
]

In [None]:
# One strip per World Bank income group, ordered from high to low income,
# showing each country's unsampled burden; the figure is also written to disk.
sns.stripplot(
    data=unsampled_burden_df,
    x='Unsampled Bacterial Burden (DALYs per 100k)',
    y='World Bank Income Group',
    order=['High income', 'Upper middle income', 'Lower middle income', 'Low income'],
)
plt.savefig('unsampled_bacterial_burden.png', dpi=300, bbox_inches='tight')

In [None]:
all_countries = bacterial_burden_per_country['Map Locations'].unique()

# The original line read from `genomes_per_country_sra`, which is never defined in this
# notebook and therefore raised NameError on a fresh kernel.
# NOTE(review): assumed to be the SRA table `sra_metadata`, whose 'Country' column already
# holds the map-compatible country names - confirm. The index is renamed so the result
# keeps the 'Map Locations' label the original expression would have produced.
represented_pathogens = (
    sra_metadata.groupby('Country')['IHME_taxa']
    .nunique()
    .rename_axis('Map Locations')
)

In [None]:
# One row per mappable country: the number of IHME taxa with SRA records there,
# or 0 when the country has no entry in `represented_pathogens`.
represented_sequencing = pd.DataFrame(
    {
        'Country': list(all_countries),
        'Sequenced Pathogens (SRA)': [
            represented_pathogens.loc[country] if country in represented_pathogens.index else 0
            for country in all_countries
        ],
    }
)

In [None]:
# Map of how many IHME pathogens have SRA records in each country.
fig = px.choropleth(
    represented_sequencing,
    geojson=countries,
    featureidkey='properties.ADMIN',
    locations='Country',
    color='Sequenced Pathogens (SRA)',
    color_continuous_scale="Blackbody",
    projection='eckert4',
)
fig.show()

# Representation in SRA of each country's top 5 AMR-burden pathogens

In [None]:
# For each location take the five largest 'Value' rows, translate the location to its
# map-compatible name, and collect the distinct pathogens behind those rows.
# NOTE(review): if the burden table holds several rows per pathogen and location, the
# five rows may cover fewer than five distinct pathogens - confirm against the data.
top5_pathogens_per_country_by_burden = (
    burden_df.set_index('Pathogen')
    .groupby('Location')['Value']
    .nlargest(5)
    .reset_index()
    .assign(
        **{
            'Map Locations': lambda frame: frame['Location'].map(
                lambda location: ihme_to_geoloc_countries.get(location, location)
            )
        }
    )
    .groupby('Map Locations')['Pathogen']
    .unique()
)

In [None]:
# The original line read from `genomes_per_country_sra`, which is never defined in this
# notebook and therefore raised NameError on a fresh kernel.
# NOTE(review): assumed to be the SRA table `sra_metadata`, whose 'Country' column already
# holds the map-compatible country names - confirm. The index is renamed so the result
# keeps the 'Map Locations' label the original expression would have produced.
genomes_per_country_sra_sets = (
    sra_metadata.groupby('Country')['IHME_taxa']
    .unique()
    .rename_axis('Map Locations')
)

In [None]:
# Count, per country, how many of its top-burden pathogens have at least one SRA record.
# Countries with no SRA records at all are printed and get a count of 0.
top5_country_names = []
top5_overlap_counts = []

for country, burden_set in top5_pathogens_per_country_by_burden.items():
    if country not in genomes_per_country_sra_sets.index:
        print(country)
        top_burden_sequenced = 0
    else:
        top_burden_sequenced = len(set(burden_set) & set(genomes_per_country_sra_sets.loc[country]))

    top5_country_names.append(country)
    top5_overlap_counts.append(top_burden_sequenced)

top5_sequencing = pd.DataFrame(
    {'Country': top5_country_names, 'Top 5 Pathogens (SRA)': top5_overlap_counts}
)

In [None]:
# Map of how many of each country's top-burden pathogens have SRA records.
fig = px.choropleth(
    top5_sequencing,
    geojson=countries,
    featureidkey='properties.ADMIN',
    locations='Country',
    color='Top 5 Pathogens (SRA)',
    color_continuous_scale="Blackbody",
    projection='eckert4',
)
fig.show()

In [None]:
# Display the per-country count of top-burden pathogens that have SRA records.
top5_sequencing