In [1]:
import os
import pandas as pd
import numpy as np

# Directory holding the CSV inputs read by the cells below (each file is loaded via
# os.path.join(DATAPATH, <file name>)). The path is relative, so it is resolved
# against the kernel's working directory.
DATAPATH = "../data/world_data_2024"

# Data sources

- Original country list and World Bank classification: WHO, including region (countries_list.csv)
- Updated population numbers (2023) from the World Bank (worldbank_worldpop_2023.csv): https://data.worldbank.org/indicator/SP.POP.TOTL
- Researchers per million inhabitants from the World Bank (worldbank_researcherpermillion.csv): https://data.worldbank.org/indicator/SP.POP.SCIE.RD.P6?end=2022&start=2012
- Bulk data download of science metrics from the UIS (UNESCO) (SCN-SDG): https://uis.unesco.org/bdds


In [85]:
# Clean up WHO region names: shorten "Region of the Americas" and give a region
# to the entries whose region is recorded as "Unknown".
df = pd.read_csv(os.path.join(DATAPATH, "countries_list.csv"))
df["who_region"] = df["who_region"].replace("Region of the Americas", "Americas")

# Manual region assignment, applied only to rows whose region is "Unknown".
country2region = {
    "American Samoa": "Western Pacific",
    "Bermuda": "Americas",
    "Greenland": "Americas",
    "Guam": "Western Pacific",
    "Northern Mariana Islands": "Western Pacific",
    "Palestine": "Eastern Mediterranean",
    "Taiwan (Province of China)": "Western Pacific",
    "Turkey": "European",
    "United Kingdom": "European",
    "United States Virgin Islands": "Americas",
}

# Vectorised lookup instead of a row-wise apply: only "Unknown" rows are touched,
# and a country that is absent from the mapping keeps the value "Unknown".
is_unknown = df["who_region"] == "Unknown"
df.loc[is_unknown, "who_region"] = (
    df.loc[is_unknown, "country"].map(country2region).fillna("Unknown")
)

In [86]:
# The World Bank data mixes aggregate regions with the countries. Left-joining it
# onto the original WHO country list keeps only the rows that match a WHO country.
df_wb = pd.read_csv(os.path.join(DATAPATH, "worldbank_worldpop_2023.csv"))

# WHO country names that have no identically spelled World Bank row
# (computed before the World Bank names are corrected below).
missing_countries = set(df["country"].tolist()).difference(df_wb["Country Name"].tolist())

# World Bank spelling -> spelling used in the WHO country list.
country_correction = {
    "Bahamas, The": "Bahamas",
    "Bolivia": "Bolivia (Plurinational State of)",
    "Congo, Rep.": "Congo",
    "Cote d'Ivoire": "Côte d'Ivoire",
    "Korea, Dem. People's Rep.": "Democratic People's Republic of Korea",
    "Congo, Dem. Rep.": "Democratic Republic of the Congo",
    "Egypt, Arab Rep.": "Egypt",
    "Gambia, The": "Gambia",
    "Iran, Islamic Rep.": "Iran (Islamic Republic of)",
    "Kyrgyz Republic": "Kyrgyzstan",
    "Lao PDR": "Lao People's Democratic Republic",
    "Micronesia, Fed. Sts.": "Micronesia (Federated States of)",
    "West Bank and Gaza": "Palestine",
    "Korea, Rep.": "Republic of Korea",
    "Moldova": "Republic of Moldova",
    "St. Kitts and Nevis": "Saint Kitts and Nevis",
    "St. Lucia": "Saint Lucia",
    "St. Vincent and the Grenadines": "Saint Vincent and the Grenadines",
    "Slovak Republic": "Slovakia",
    "Turkiye": "Turkey",
    "Tanzania": "United Republic of Tanzania",
    "Virgin Islands (U.S.)": "United States Virgin Islands",
    "United States": "United States of America",
    "Venezuela, RB": "Venezuela (Bolivarian Republic of)",
    "Yemen, Rep.": "Yemen",
}

# Entries of the WHO list that are dropped because they are not in the World Bank data.
non_existent_wb = ["Cook Islands", "Niue", "Tokelau", "Taiwan (Province of China)"]

# Align the World Bank frame with the WHO naming so both can be joined on "country".
df_wb = df_wb.rename(columns={"Country Name": "country"})
df_wb["country"] = df_wb["country"].replace(country_correction)

# Eliminate the regions/countries not in the World Bank data.
df = df.loc[~df["country"].isin(non_existent_wb)]

# Attach the 2023 World Bank figure and store it as "population"; pop() removes
# the temporary "2023" column in the same step.
df = df.merge(df_wb[["country", "2023"]], how="left", on="country")
df["population"] = df.pop("2023")

In [87]:
# African region: number of countries, then their breakdown by income group.
afr = df.loc[df["who_region"].eq("African")]
print(afr.shape[0])
afr["income"].value_counts()

47


income
LIC     21
LMIC    19
UMIC     5
HIC      2
Name: count, dtype: int64

# Researchers per million
The number of researchers per million inhabitants is one of the tracked measures for progress on SDG target 9.5 ("Enhance scientific research, upgrade the technological capabilities of industrial sectors in all countries, in particular developing countries, including, by 2030, encouraging innovation and substantially increasing the number of research and development workers per 1 million people and public and private research and development spending").

In [94]:
# Load the World Bank "researchers per million" file, keep the country name and
# the 2018-2023 columns, and align the country names with the WHO list.
# Built as a single chain so that every step returns a new frame: assigning into
# the column-subset slice (rpm = rpm[[...]]; rpm["Country Name"] = ...) raises
# SettingWithCopyWarning on pandas < 3.
rpm = (
    pd.read_csv(os.path.join(DATAPATH, "worldbank_researcherpermillion.csv"))
    .loc[:, ["Country Name", "2018", "2019", "2020", "2021", "2022", "2023"]]
    .rename(columns={"Country Name": "country"})
    .assign(country=lambda frame: frame["country"].replace(country_correction))
)

# Left join: every row of df is kept; countries without a World Bank match get NaN.
df_ = pd.merge(df, rpm, on="country", how="left")

In [96]:
# Missing values per column for the African-region rows of the merged frame.
df_afr = df_.loc[df_["who_region"].eq("African")]
df_afr.isnull().sum()

country        0
code           0
population     0
income         0
who_region     0
2018          41
2019          41
2020          44
2021          44
2022          43
2023          47
dtype: int64

# Global Burden of Disease
- Plot overview: https://ourworldindata.org/burden-of-disease
- Source data: 