#### Prepared for Gabor's Data Analysis

### Data Analysis for Business, Economics, and Policy
by Gabor Bekes and Gabor Kezdi
 
Cambridge University Press 2021

**[gabors-data-analysis.com](https://gabors-data-analysis.com/)**

 License: Free to share, modify and use for educational purposes. 
 Not to be used for commercial purposes.

### Chapter 02
**CH02C Displaying immunization rates across countries**

using the world-bank-immunization dataset

version 1.0 2021-05-05

In [1]:
import os
import sys
import warnings

import pandas as pd
import numpy as np

warnings.filterwarnings("ignore")

In [2]:
# Repository root: the part of the working directory that precedes
# "da_case_studies" (the whole path if that folder name does not occur).
current_path = os.getcwd()
dirname = current_path.partition("da_case_studies")[0]

# Input data, case-study folder, output folder and shared helper code
data_in = f"{dirname}da_data_repo/worldbank-immunization/clean/"
data_out = f"{dirname}da_case_studies/ch02-immunization-crosscountry/"
output = f"{dirname}da_case_studies/ch02-immunization-crosscountry/output/"
func = f"{dirname}da_case_studies/ch00-tech-prep/"

# Make the helper module folder importable
sys.path.append(func)

In [3]:
# Import the prewritten helper functions 
from py_helper_functions import *

In [4]:
# Read the clean, tidy immunization panel; this frame is the workfile.
# Alternative source when the local data repo is not available:
# df = pd.read_csv("https://osf.io/download/gk5cn/")
df = pd.read_csv(f"{data_in}worldbank-immunization-panel.csv")

In [5]:
# Cleaning: keep the four analysis columns and only the rows for
# India and Pakistan from 2015 onwards.
df = df.filter(["countryname", "year", "imm", "gdppc"]).loc[
    df["countryname"].isin(["Pakistan", "India"]) & df["year"].ge(2015)
]

In [6]:
# Summary statistics of the numeric columns. The empty percentile list drops
# the quartiles; the output shown below still reports the median as "50%".
df.describe(percentiles=[])

Unnamed: 0,year,imm,gdppc
count,6.0,6.0,6.0
mean,2016.0,81.5,5373.962113
std,0.894427,6.774954,874.322204
min,2015.0,75.0,4459.146517
50%,2016.0,81.5,5257.315866
max,2017.0,88.0,6516.17262


In [7]:
# Display the working sample, restricted to the four analysis columns.
df.filter(["countryname", "year", "imm", "gdppc"])

Unnamed: 0,countryname,year,imm,gdppc
3240,India,2015,87,5743.426497
3370,Pakistan,2015,75,4459.146517
3431,India,2016,88,6145.294595
3561,Pakistan,2016,75,4608.527214
3622,India,2017,88,6516.17262
3752,Pakistan,2017,76,4771.205236


In [8]:
# Table 2.5: long layout with one row per country-year,
# ordered by country and then by year (display only; df is not modified).
df.sort_values(["countryname", "year"])

Unnamed: 0,countryname,year,imm,gdppc
3240,India,2015,87,5743.426497
3431,India,2016,88,6145.294595
3622,India,2017,88,6516.17262
3370,Pakistan,2015,75,4459.146517
3561,Pakistan,2016,75,4608.527214
3752,Pakistan,2017,76,4771.205236


In [9]:
# Reshape long -> wide: one row per country, and one column per
# (variable, year) pair held in a two-level column index.
df = df.pivot(index="countryname", columns="year")
df

Unnamed: 0_level_0,imm,imm,imm,gdppc,gdppc,gdppc
year,2015,2016,2017,2015,2016,2017
countryname,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
India,87,88,88,5743.426497,6145.294595,6516.17262
Pakistan,75,75,76,4459.146517,4608.527214,4771.205236


In [10]:
# Collapse the two-level (variable, year) column index into flat
# "variable_year" labels such as "imm_2015".
df.columns = [f"{variable}_{year}" for variable, year in df.columns]

# Table 2.4
df

Unnamed: 0_level_0,imm_2015,imm_2016,imm_2017,gdppc_2015,gdppc_2016,gdppc_2017
countryname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
India,87,88,88,5743.426497,6145.294595,6516.17262
Pakistan,75,75,76,4459.146517,4608.527214,4771.205236
