In [10]:
import os
import sys
import warnings

import pandas as pd
import numpy as np

warnings.filterwarnings("ignore")

In [11]:
# SET YOUR DIRECTORY HERE
# Root folder that holds the `da_data_repo` directory; this is the only line
# that needs editing on another machine.
dirname = "/Users/vigadam/Dropbox/My Mac (MacBook-Air.local)/Documents/work/data_book/"

# Input (raw) and output (clean) folders for this dataset.
# os.path.join inserts the separator only when it is missing, so the paths are
# correct whether or not `dirname` ends with a slash. Both remain plain strings
# because the cells below append "/<file name>" to them.
data_in = os.path.join(dirname, "da_data_repo/worldbank-immunization/raw")
data_out = os.path.join(dirname, "da_data_repo/worldbank-immunization/clean")

In [12]:
# Load the raw country-level export from the input folder.
data = pd.read_csv(f"{data_in}/fac51cfc-e0cd-4938-927b-b82e828f5ca4_Data.csv")

In [13]:
# Build the cleaned country-year panel from the raw export.
# NOTE: this cell rebinds `data`, so it can only be run once per load of the
# raw file (a second run would no longer find the raw column labels).
data = (
    # Drop the last five rows of the file (presumably the export's footer /
    # metadata lines -- confirm against the raw CSV).
    data.iloc[:-5]
    # ".." is replaced by NaN so the indicator columns can be parsed as numbers.
    .replace("..", np.nan)
    .assign(
        year=lambda x: x["Time"].astype(int),
        c=lambda x: x["Country Code"].astype("category"),
        # pd.to_numeric is called on the whole column (vectorised) rather than
        # once per cell through Series.apply; the resulting values are the same.
        # Population is rescaled from persons to millions.
        pop=lambda x: pd.to_numeric(x["Population, total [SP.POP.TOTL]"])
        / 1_000_000,
        mort=lambda x: pd.to_numeric(
            x["Mortality rate, under-5 (per 1,000 live births) [SH.DYN.MORT]"]
        ),
        imm=lambda x: pd.to_numeric(
            x["Immunization, measles (% of children ages 12-23 months) [SH.IMM.MEAS]"]
        ),
        gdppc=lambda x: pd.to_numeric(
            x["GDP per capita, PPP (constant 2011 international $) [NY.GDP.PCAP.PP.KD]"]
        ),
        hexp=lambda x: pd.to_numeric(
            x["Current health expenditure (% of GDP) [SH.XPD.CHEX.GD.ZS]"]
        ),
    )
    .assign(
        # Mortality is per 1,000 live births, so (1000 - mort) / 10 is the
        # share surviving, expressed in percent.
        surv=lambda x: (1000 - x["mort"]) / 10,
        lngdppc=lambda x: np.log(x["gdppc"]),
    )
    .rename(columns={"Country Name": "countryname", "Country Code": "countrycode"})
    # Keep only the analysis variables, in this column order.
    .filter(
        [
            "year",
            "c",
            "countryname",
            "countrycode",
            "pop",
            "mort",
            "surv",
            "imm",
            "gdppc",
            "lngdppc",
            "hexp",
        ]
    )
    .query("year >= 1998")
    # Rows missing immunization, mortality or population are discarded.
    .dropna(subset=["imm", "mort", "pop"])
)

In [14]:
# Write the cleaned panel to the output folder, without the row index.
data.to_csv(f"{data_out}/worldbank-immunization-panel.csv", index=False)

## Continents

In [16]:
# Load the second raw export (aggregate-level rows) from the input folder.
data = pd.read_csv(f"{data_in}/ac7b6203-6a02-4e39-aad1-da9ad65319d5_Data.csv")

In [17]:
# Build a wide, one-row-per-year table of immunization and survival rates with
# one column per (variable, code) pair.
# NOTE: this cell rebinds `data`, so it can only be run once per load of the
# raw file.
data = (
    # Drop the last five rows of the file (presumably the export's footer /
    # metadata lines -- confirm against the raw CSV).
    data.iloc[:-5]
    # ".." is replaced by NaN so the indicator columns can be parsed as numbers.
    .replace("..", np.nan)
    .assign(
        year=lambda x: x["Time"].astype(int),
        c=lambda x: x["Country Code"].astype("category"),
        # Vectorised conversion of the whole column instead of a per-cell
        # Series.apply(pd.to_numeric).
        mort=lambda x: pd.to_numeric(
            x["Mortality rate, under-5 (per 1,000 live births) [SH.DYN.MORT]"]
        ),
        imm=lambda x: pd.to_numeric(
            x["Immunization, measles (% of children ages 12-23 months) [SH.IMM.MEAS]"]
        ),
    )
    # Survival in percent: mortality is per 1,000 live births, hence the / 10.
    # This matches the `surv` definition used for the country panel file, so
    # both output files carry `surv` on the same scale.
    .assign(surv=lambda x: (1000 - x["mort"]) / 10)
    .dropna(subset=["imm"])
    # Keep 1998 onwards and exclude the "World" aggregate row.
    .loc[lambda x: (x["year"] >= 1998) & (x["Country Name"] != "World")]
    .filter(["year", "c", "imm", "surv"])
    .sort_values(by=["year", "c"])
    # Long -> wide: codes move from rows into a second column level.
    .set_index(["year", "c"])
    .unstack("c")
    .reset_index()
)
# Flatten the two-level column index into "<variable>_<code>" names; the first
# column is the year.
data.columns = ["year"] + [f"{var}_{code}" for var, code in data.columns[1:]]

In [18]:
# Write the wide year-level table to the output folder, without the row index.
data.to_csv(f"{data_out}/worldbank-immunization-continents.csv", index=False)