# INFO

This notebook contains all the required code to fetch, process and merge data in order to populate the World Bank table.


# World Bank

In [1]:
import numpy as np
import pandas as pd

from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen

In [2]:
def download_WDIdata_to_df(url):
    """Download a zip archive and load its ``WDIData.csv`` member into pandas.

    Parameters
    ----------
    url : str
        Address of a zip archive that contains a member named
        ``WDIData.csv``. Anything ``urllib.request.urlopen`` accepts works.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of ``WDIData.csv``.

    Raises
    ------
    urllib.error.URLError
        If the archive cannot be fetched.
    zipfile.BadZipFile
        If the downloaded payload is not a valid zip archive.
    KeyError
        If the archive has no ``WDIData.csv`` member.
    """
    print("Dowloading")
    # Read the whole payload into memory and release the connection right away.
    with urlopen(url) as resp:
        payload = BytesIO(resp.read())
    print("Downloaded")

    print("Loading into a pandas dataframe")
    # Context managers make sure both the archive and the member are closed,
    # even when parsing fails.
    with ZipFile(payload) as archive, archive.open('WDIData.csv') as csv_file:
        data = pd.read_csv(csv_file)
    print("Dataframe created")

    return data

In [3]:
def load_local_WDIdata(zip_path):
    """Load the ``WDIData.csv`` member of a zip archive stored on disk.

    Parameters
    ----------
    zip_path : str or path-like
        Location of a zip archive that contains a member named
        ``WDIData.csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of ``WDIData.csv``.

    Raises
    ------
    FileNotFoundError
        If ``zip_path`` does not exist.
    KeyError
        If the archive has no ``WDIData.csv`` member.
    """
    print("loading data from ", zip_path)
    # Close the archive and the member handle once the CSV has been parsed,
    # so the file is not kept open for the lifetime of the kernel.
    with ZipFile(zip_path) as archive, archive.open('WDIData.csv') as csv_file:
        data = pd.read_csv(csv_file)
    print("Dataframe created")

    return data

### Loads the data

In [3]:
# True: fetch the zip archive from the World Bank URL; False: read a local copy of the zip.
DOWNLOAD_ZIP = True

In [5]:
# Build wb_df from the source selected by the DOWNLOAD_ZIP switch.
if not DOWNLOAD_ZIP:
    # use the archive that is already on disk
    ZIP_PATH = 'input/wb/WDI_csv.zip'
    wb_df = load_local_WDIdata(ZIP_PATH)
else:
    # fetching the archive can take some minutes
    ZIP_URL = "http://databank.worldbank.org/data/download/WDI_csv.zip"
    wb_df = download_WDIdata_to_df(ZIP_URL)

Dowloading
Downloaded
Loading into a pandas dataframe
Dataframe created


In [6]:
# Remove the empty placeholder column(s) that pandas labels "Unnamed: <position>".
# The position in the label changes with the number of year columns in the
# download, so match on the prefix instead of hard-coding "Unnamed: 64", and
# only remove columns that hold no value at all. Re-running this cell is a no-op.
empty_unnamed_cols = [
    col for col in wb_df.columns
    if str(col).startswith("Unnamed") and wb_df[col].isna().all()
]
wb_df.drop(columns=empty_unnamed_cols, inplace=True)

In [9]:
# Constant metadata columns appended to every row; the dict order is the
# order in which the columns are added to the dataframe.
metadata_values = {
    "source": "World Bank",
    "date": "2020-07-01",
    "adm_area_1": None,
    "adm_area_2": None,
    "adm_area_3": None,
}
for column_name, constant in metadata_values.items():
    wb_df[column_name] = constant

In [10]:
# preview the first rows, including the metadata columns added above
wb_df.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2015,2016,2017,2018,2019,source,date,adm_area_1,adm_area_2,adm_area_3
0,Arab World,ARB,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.ZS,,,,,,,...,84.171599,84.510171,,,,World Bank,2020-07-01,,,
1,Arab World,ARB,Access to electricity (% of population),EG.ELC.ACCS.ZS,,,,,,,...,88.720097,89.308602,90.283638,89.286856,,World Bank,2020-07-01,,,
2,Arab World,ARB,"Access to electricity, rural (% of rural popul...",EG.ELC.ACCS.RU.ZS,,,,,,,...,78.211,79.065508,81.102134,79.2481,,World Bank,2020-07-01,,,
3,Arab World,ARB,"Access to electricity, urban (% of urban popul...",EG.ELC.ACCS.UR.ZS,,,,,,,...,96.936319,97.290083,97.467915,97.063959,,World Bank,2020-07-01,,,
4,Arab World,ARB,Account ownership at a financial institution o...,FX.OWN.TOTL.ZS,,,,,,,...,,,37.165211,,,World Bank,2020-07-01,,,


### Select only countries with valid ISO (drop things like Arab World and EU) and add GID

In [15]:
# Reference table listing the countries that have a GID.
gid_list = pd.read_csv("input/gid/admn_0.csv")

# Keep only the rows whose country code appears in the reference table.
has_gid = wb_df["Country Code"].isin(gid_list["countrycode"])
wb_GID = wb_df.loc[has_gid].copy()

# The GID column is a copy of the country code, appended as the last column.
wb_GID = wb_GID.assign(GID=wb_GID["Country Code"])

In [24]:
# Rearrange the columns: descriptive columns first, then every remaining
# column (the yearly values) in its current order.
# The order is spelled out by name rather than by position, so the result does
# not depend on how many columns were added before this cell and running the
# cell a second time leaves the dataframe unchanged.
leading_cols = [
    "source", "date", "GID",
    "Country Name", "Country Code",
    "adm_area_1", "adm_area_2", "adm_area_3",
    "Indicator Name", "Indicator Code",
]
remaining_cols = [col for col in wb_GID.columns if col not in leading_cols]
wb_GID = wb_GID[leading_cols + remaining_cols]

In [25]:
# Rows whose country code is missing from the GID list are left out of the
# table; show which names they correspond to.
not_in_gid = ~wb_df["Country Code"].isin(gid_list["countrycode"])
wb_NOT = wb_df.loc[not_in_gid].copy()
wb_NOT["Country Name"].unique()

array(['Arab World', 'Caribbean small states',
       'Central Europe and the Baltics', 'Early-demographic dividend',
       'East Asia & Pacific',
       'East Asia & Pacific (excluding high income)',
       'East Asia & Pacific (IDA & IBRD countries)', 'Euro area',
       'Europe & Central Asia',
       'Europe & Central Asia (excluding high income)',
       'Europe & Central Asia (IDA & IBRD countries)', 'European Union',
       'Fragile and conflict affected situations',
       'Heavily indebted poor countries (HIPC)', 'High income',
       'IBRD only', 'IDA & IBRD total', 'IDA blend', 'IDA only',
       'IDA total', 'Late-demographic dividend',
       'Latin America & Caribbean',
       'Latin America & Caribbean (excluding high income)',
       'Latin America & the Caribbean (IDA & IBRD countries)',
       'Least developed countries: UN classification',
       'Low & middle income', 'Low income', 'Lower middle income',
       'Middle East & North Africa',
       'Middle East & No

In [26]:
# preview the filtered table after the column rearrangement
wb_GID.head()

Unnamed: 0,source,date,GID,Country Name,Country Code,adm_area_1,adm_area_2,adm_area_3,Indicator Name,Indicator Code,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
67257,World Bank,2020-07-01,AFG,Afghanistan,AFG,,,,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.ZS,...,20.68,22.33,24.08,26.17,27.99,30.1,32.44,,,
67258,World Bank,2020-07-01,AFG,Afghanistan,AFG,,,,Access to electricity (% of population),EG.ELC.ACCS.ZS,...,42.7,43.222019,69.1,68.933266,89.5,71.5,97.7,97.7,98.713203,
67259,World Bank,2020-07-01,AFG,Afghanistan,AFG,,,,"Access to electricity, rural (% of rural popul...",EG.ELC.ACCS.RU.ZS,...,30.2188,29.572881,60.849157,61.282199,86.500512,64.573354,97.09936,97.091973,98.272872,
67260,World Bank,2020-07-01,AFG,Afghanistan,AFG,,,,"Access to electricity, urban (% of urban popul...",EG.ELC.ACCS.UR.ZS,...,82.8,86.567779,95.0,92.673767,98.7,92.5,99.5,99.5,100.0,
67261,World Bank,2020-07-01,AFG,Afghanistan,AFG,,,,Account ownership at a financial institution o...,FX.OWN.TOTL.ZS,...,,9.005013,,,9.961,,,14.893312,,


### Save to CSV

In [27]:
# write the full table (all yearly columns) to the working directory, without the index
wb_GID.to_csv("WB_full_table.csv", index = False)

## Parse the data, for each row keep only the most recent value

In [28]:
wb_data = wb_GID.copy()

# load the indicators we are interested in
# CODES_PATH = 'input/wb/interesting_indicators.csv'
# interesting_indicators = pd.read_csv(CODES_PATH)
# wb_data = wb_df.loc[wb_df["Indicator Code"].isin(interesting_indicators["code"])].copy()

# Year columns are the ones whose name consists only of digits ("1960", ...).
# Detecting them by name avoids relying on the column order or on a
# hard-coded first year.
year_cols = sorted((col for col in wb_data.columns if str(col).isdigit()), key=int)
most_recent_year = int(year_cols[-1])

# creates two additional columns
wb_data["Most Recent Value"] = np.nan
wb_data["Year"] = np.nan

# For each row, find the most recent non NaN measure, walking the years from
# the newest to the oldest.
# Values are written with a boolean mask through .loc on the dataframe itself;
# an in-place fillna on a column selection acts on a temporary object and is
# not guaranteed to update the dataframe (it does nothing under pandas
# Copy-on-Write).
# "Year" is only set where a value was actually found, so rows without any
# measure keep NaN in both columns.
for year_col in reversed(year_cols):
    still_missing = wb_data["Most Recent Value"].isna()
    if not still_missing.any():
        break
    newly_found = still_missing & wb_data[year_col].notna()
    wb_data.loc[newly_found, "Most Recent Value"] = wb_data.loc[newly_found, year_col]
    wb_data.loc[newly_found, "Year"] = int(year_col)

# drop all the columns with years
cols_to_drop = year_cols

wb_data_recent = wb_data.drop(columns=cols_to_drop)

In [29]:
# preview the table that keeps one value and its year per row
wb_data_recent.head()

Unnamed: 0,source,date,GID,Country Name,Country Code,adm_area_1,adm_area_2,adm_area_3,Indicator Name,Indicator Code,Most Recent Value,Year
67257,World Bank,2020-07-01,AFG,Afghanistan,AFG,,,,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.ZS,32.44,2016.0
67258,World Bank,2020-07-01,AFG,Afghanistan,AFG,,,,Access to electricity (% of population),EG.ELC.ACCS.ZS,98.713203,2018.0
67259,World Bank,2020-07-01,AFG,Afghanistan,AFG,,,,"Access to electricity, rural (% of rural popul...",EG.ELC.ACCS.RU.ZS,98.272872,2018.0
67260,World Bank,2020-07-01,AFG,Afghanistan,AFG,,,,"Access to electricity, urban (% of urban popul...",EG.ELC.ACCS.UR.ZS,100.0,2018.0
67261,World Bank,2020-07-01,AFG,Afghanistan,AFG,,,,Account ownership at a financial institution o...,FX.OWN.TOTL.ZS,14.893312,2017.0


### Save to CSV

In [30]:
# write the most-recent-value table to the working directory, without the index
wb_data_recent.to_csv("WB_most_recent_values.csv", index=False)

<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>
<br>