In [1]:
import numpy as np
import pandas as pd

from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen

In [2]:
# Zipped CSV export hosted on the World Bank databank; download_WDIdata_to_df
# below reads the 'WDIData.csv' member out of this archive.
zip_url = "http://databank.worldbank.org/data/download/WDI_csv.zip"

In [3]:
def download_WDIdata_to_df(url):
    """Download a WDI zip archive and load its ``WDIData.csv`` member.

    Parameters
    ----------
    url : str
        Location of the zip archive; anything ``urllib.request.urlopen``
        accepts.

    Returns
    -------
    pandas.DataFrame
        Parsed contents of the ``WDIData.csv`` file inside the archive.

    Raises
    ------
    urllib.error.URLError
        If the archive cannot be fetched.
    KeyError
        If the archive has no ``WDIData.csv`` member.
    """
    print("Dowloading")
    # Fetch from the ``url`` argument (not the notebook-level ``zip_url``) and
    # release the connection as soon as the payload is in memory.
    with urlopen(url) as resp:
        payload = BytesIO(resp.read())

    with ZipFile(payload) as archive:
        print("Downloaded")

        print("Loading into a pandas dataframe")
        # Close the archive member once pandas has consumed it.
        with archive.open('WDIData.csv') as csv_member:
            data = pd.read_csv(csv_member)
    print("Dataframe created")

    return data
    

### The server does not seem very reliable; sometimes the connection drops.
### It may be better to download the zip manually.

In [4]:
# def load_local_WDIdata(zip_path):
#     zipfile = ZipFile(zip_path)
#     data = pd.read_csv(zipfile.open('WDIData.csv'))
    
#     return data

# Download and load the data into a dataframe

In [5]:
# The whole archive is fetched over the network and then parsed,
# so this cell can take a few minutes.
wb_df = download_WDIdata_to_df(zip_url)

Dowloading
Downloaded
Loading into a pandas dataframe
Dataframe created


In [6]:
wb_df

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,Unnamed: 64
0,Arab World,ARB,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.ZS,,,,,,,...,82.783289,83.120303,83.533457,83.897596,84.171599,84.510171,,,,
1,Arab World,ARB,Access to electricity (% of population),EG.ELC.ACCS.ZS,,,,,,,...,86.428272,87.070576,88.176836,87.342739,89.130121,89.678685,90.273687,,,
2,Arab World,ARB,"Access to electricity, rural (% of rural popul...",EG.ELC.ACCS.RU.ZS,,,,,,,...,73.942103,75.244104,77.162305,75.538976,78.741152,79.665635,80.749293,,,
3,Arab World,ARB,"Access to electricity, urban (% of urban popul...",EG.ELC.ACCS.UR.ZS,,,,,,,...,95.939242,95.962166,96.352930,95.997833,96.649916,96.834184,97.003974,,,
4,Arab World,ARB,Account ownership at a financial institution o...,FX.OWN.TOTL.ZS,,,,,,,...,22.260538,,,30.277130,,,37.165211,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
377251,Zimbabwe,ZWE,Women who believe a husband is justified in be...,SG.VAW.NEGL.ZS,,,,,,,...,21.400000,,,,21.400000,,,,,
377252,Zimbabwe,ZWE,Women who believe a husband is justified in be...,SG.VAW.REFU.ZS,,,,,,,...,16.900000,,,,14.500000,,,,,
377253,Zimbabwe,ZWE,Women who were first married by age 15 (% of w...,SP.M15.2024.FE.ZS,,,,,,,...,3.900000,,,,3.700000,,,,,
377254,Zimbabwe,ZWE,Women who were first married by age 18 (% of w...,SP.M18.2024.FE.ZS,,,,,,,...,30.500000,,,33.500000,32.400000,,,,,


### last column is empty, remove it

In [7]:
# Show every row that has a value in the trailing "Unnamed: 64" column.
wb_df.loc[wb_df["Unnamed: 64"].notna()]

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,Unnamed: 64


In [8]:
# Rebind instead of mutating in place; errors="ignore" keeps this cell
# re-runnable once the column has already been removed.
wb_df = wb_df.drop(columns="Unnamed: 64", errors="ignore")

In [9]:
wb_df.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Arab World,ARB,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.ZS,,,,,,,...,82.368101,82.783289,83.120303,83.533457,83.897596,84.171599,84.510171,,,
1,Arab World,ARB,Access to electricity (% of population),EG.ELC.ACCS.ZS,,,,,,,...,86.00762,86.428272,87.070576,88.176836,87.342739,89.130121,89.678685,90.273687,,
2,Arab World,ARB,"Access to electricity, rural (% of rural popul...",EG.ELC.ACCS.RU.ZS,,,,,,,...,73.466653,73.942103,75.244104,77.162305,75.538976,78.741152,79.665635,80.749293,,
3,Arab World,ARB,"Access to electricity, urban (% of urban popul...",EG.ELC.ACCS.UR.ZS,,,,,,,...,95.704618,95.939242,95.962166,96.35293,95.997833,96.649916,96.834184,97.003974,,
4,Arab World,ARB,Account ownership at a financial institution o...,FX.OWN.TOTL.ZS,,,,,,,...,,22.260538,,,30.27713,,,37.165211,,


## NOTE: the rows cover not only individual countries but also groups of countries
## For the complete list, see WDICountry.csv in the zip file

In [10]:
# Number of distinct entries in "Country Name" (a missing value, if any, counts once).
wb_df["Country Name"].nunique(dropna=False)

264

# Load the interesting indicators from the csv file

In [11]:
# Read relative to the working directory; the "code" column is used below
# to select the matching "Indicator Code" rows from wb_df.
interesting_indicators = pd.read_csv("interesting_indicators.csv")
interesting_indicators.head()

Unnamed: 0,code,description,type
0,SP.POP.TOTL,Population total,Climate Change
1,AG.SRF.TOTL.K2,Surface area (sq. km),Agriculture & Rural Development
2,EN.URB.MCTY.TL.ZS,Population in urban agglomerations of more tha...,Climate Change
3,SP.POP.GROW,Population growth (annual %),Climate Change
4,NY.GDP.MKTP.KN,GDP (constant LCU),Economy & Growth


## Slice the dataframe

In [12]:
# Keep only the rows whose indicator code appears in the hand-picked list;
# copy so that later column additions do not touch wb_df.
is_interesting = wb_df["Indicator Code"].isin(interesting_indicators["code"])
data = wb_df[is_interesting].copy()
data.shape

(10560, 64)

### For each row we keep track of the most recent non-NaN value and the year it refers to

In [13]:
# Year columns are the ones whose label is all digits ("1960", "1961", ...).
# Taking their maximum, rather than trusting the last column, keeps this cell
# correct when it is re-run after non-year helper columns have been appended.
year_columns = [col for col in data.columns if str(col).isdigit()]
most_recent_year = max(int(col) for col in year_columns)

In [14]:
# Placeholder columns, filled in by the backward scan over the year columns.
data["most_recent_measure"] = np.nan
data["year"] = np.nan

In [15]:
data.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2012,2013,2014,2015,2016,2017,2018,2019,most_recent_measure,year
1,Arab World,ARB,Access to electricity (% of population),EG.ELC.ACCS.ZS,,,,,,,...,87.07058,88.17684,87.34274,89.13012,89.67869,90.27369,,,,
20,Arab World,ARB,Adjusted net national income (annual % growth),NY.ADJ.NNTY.KD.ZG,,,,,,,...,6.03267,3.090463,1.504003,-5.557763,0.1480371,2.554314,,,,
21,Arab World,ARB,Adjusted net national income (constant 2010 US$),NY.ADJ.NNTY.KD,,,,,,,...,2018494000000.0,2080874000000.0,2112171000000.0,1994781000000.0,1997734000000.0,2048763000000.0,,,,
22,Arab World,ARB,Adjusted net national income (current US$),NY.ADJ.NNTY.CD,,,,,,,...,2193645000000.0,2259630000000.0,2341306000000.0,2156642000000.0,2139182000000.0,2161640000000.0,,,,
23,Arab World,ARB,Adjusted net national income per capita (annua...,NY.ADJ.NNTY.PC.KD.ZG,,,,,,,...,3.66767,0.8472754,-0.6422262,-7.494294,-1.834019,0.593719,,,,


In [16]:
# For every row, record the latest non-NaN yearly value and the year it comes
# from, scanning the year columns from the most recent one back to 1960.
for year in range(most_recent_year, 1959, -1):
    still_missing = data["most_recent_measure"].isnull()
    if not still_missing.any():
        break  # every row already has a value; older years cannot change anything

    # Only rows that actually have a value for this year are filled and stamped.
    # Explicit .loc assignment replaces the chained `fillna(..., inplace=True)`,
    # which operated on an intermediate object and is not guaranteed to write
    # back into `data`. Rows with no data in any year keep NaN in both columns
    # instead of being stamped with 1960.
    fill_now = still_missing & data[str(year)].notnull()
    data.loc[fill_now, "most_recent_measure"] = data.loc[fill_now, str(year)]
    data.loc[fill_now, "year"] = year
        

In [17]:
data.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2012,2013,2014,2015,2016,2017,2018,2019,most_recent_measure,year
1,Arab World,ARB,Access to electricity (% of population),EG.ELC.ACCS.ZS,,,,,,,...,87.07058,88.17684,87.34274,89.13012,89.67869,90.27369,,,90.27369,2017.0
20,Arab World,ARB,Adjusted net national income (annual % growth),NY.ADJ.NNTY.KD.ZG,,,,,,,...,6.03267,3.090463,1.504003,-5.557763,0.1480371,2.554314,,,2.554314,2017.0
21,Arab World,ARB,Adjusted net national income (constant 2010 US$),NY.ADJ.NNTY.KD,,,,,,,...,2018494000000.0,2080874000000.0,2112171000000.0,1994781000000.0,1997734000000.0,2048763000000.0,,,2048763000000.0,2017.0
22,Arab World,ARB,Adjusted net national income (current US$),NY.ADJ.NNTY.CD,,,,,,,...,2193645000000.0,2259630000000.0,2341306000000.0,2156642000000.0,2139182000000.0,2161640000000.0,,,2161640000000.0,2017.0
23,Arab World,ARB,Adjusted net national income per capita (annua...,NY.ADJ.NNTY.PC.KD.ZG,,,,,,,...,3.66767,0.8472754,-0.6422262,-7.494294,-1.834019,0.593719,,,0.593719,2017.0


### drop the single year columns

In [18]:
# Labels of all per-year columns, "1960" through the most recent year inclusive.
cols_to_drop = list(map(str, range(1960, most_recent_year + 1)))

In [19]:
# New frame without the per-year columns; `data` itself is left untouched.
new_data = data.drop(columns=cols_to_drop)

In [20]:
new_data.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,most_recent_measure,year
1,Arab World,ARB,Access to electricity (% of population),EG.ELC.ACCS.ZS,90.27369,2017.0
20,Arab World,ARB,Adjusted net national income (annual % growth),NY.ADJ.NNTY.KD.ZG,2.554314,2017.0
21,Arab World,ARB,Adjusted net national income (constant 2010 US$),NY.ADJ.NNTY.KD,2048763000000.0,2017.0
22,Arab World,ARB,Adjusted net national income (current US$),NY.ADJ.NNTY.CD,2161640000000.0,2017.0
23,Arab World,ARB,Adjusted net national income per capita (annua...,NY.ADJ.NNTY.PC.KD.ZG,0.593719,2017.0


In [29]:
import wbdata

In [30]:
# Query the indicator "EG.ELC.ACCS.ZS" (access to electricity, % of population)
# through wbdata; the dict maps the indicator code to the column name "acc".
test = wbdata.get_dataframe({"EG.ELC.ACCS.ZS" : "acc"})

In [31]:
test

Unnamed: 0_level_0,Unnamed: 1_level_0,acc
country,date,Unnamed: 2_level_1
Arab World,2019,
Arab World,2018,
Arab World,2017,90.273687
Arab World,2016,89.678685
Arab World,2015,89.130121
...,...,...
Zimbabwe,1964,
Zimbabwe,1963,
Zimbabwe,1962,
Zimbabwe,1961,
