# Basics

Import libraries and functions.

In [1]:
import pandas as pd
import numpy as np
import glob
import os
from pyspark.sql.functions import concat, col, lit, split

Loading database from WDB (https://databank.worldbank.org/home.aspx)

In [2]:
df= pd.read_csv (r'C:\Users\mferna38\Desktop\wdi\WDIData.csv')
df

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,Unnamed: 66
0,Africa Eastern and Southern,AFE,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.ZS,,,,,,,...,16.936004,17.337896,17.687093,18.140971,18.491344,18.825520,19.272212,19.628009,,
1,Africa Eastern and Southern,AFE,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.RU.ZS,,,,,,,...,6.499471,6.680066,6.859110,7.016238,7.180364,7.322294,7.517191,7.651598,,
2,Africa Eastern and Southern,AFE,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.UR.ZS,,,,,,,...,37.855399,38.046781,38.326255,38.468426,38.670044,38.722783,38.927016,39.042839,,
3,Africa Eastern and Southern,AFE,Access to electricity (% of population),EG.ELC.ACCS.ZS,,,,,,,...,31.794160,32.001027,33.871910,38.880173,40.261358,43.061877,44.270860,45.803485,,
4,Africa Eastern and Southern,AFE,"Access to electricity, rural (% of rural popul...",EG.ELC.ACCS.RU.ZS,,,,,,,...,18.663502,17.633986,16.464681,24.531436,25.345111,27.449908,29.641760,30.404935,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
384365,Zimbabwe,ZWE,Women who believe a husband is justified in be...,SG.VAW.REFU.ZS,,,,,,,...,,,14.500000,,,,,,,
384366,Zimbabwe,ZWE,Women who were first married by age 15 (% of w...,SP.M15.2024.FE.ZS,,,,,,,...,,,3.700000,,,,5.418352,,,
384367,Zimbabwe,ZWE,Women who were first married by age 18 (% of w...,SP.M18.2024.FE.ZS,,,,,,,...,,33.500000,32.400000,,,,33.658057,,,
384368,Zimbabwe,ZWE,Women's share of population ages 15+ living wi...,SH.DYN.AIDS.FE.ZS,,,,,,,...,59.200000,59.400000,59.500000,59.700000,59.900000,60.000000,60.200000,60.400000,,


To work more comfortably, we remove those columns not useful for us, as country name and indicator code, since with the country code and the indicator name we have the information we need.

In [3]:
df.drop(columns=["Country Name","Indicator Code"], axis=1, inplace=True)
df

Unnamed: 0,Country Code,Indicator Name,1960,1961,1962,1963,1964,1965,1966,1967,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,Unnamed: 66
0,AFE,Access to clean fuels and technologies for coo...,,,,,,,,,...,16.936004,17.337896,17.687093,18.140971,18.491344,18.825520,19.272212,19.628009,,
1,AFE,Access to clean fuels and technologies for coo...,,,,,,,,,...,6.499471,6.680066,6.859110,7.016238,7.180364,7.322294,7.517191,7.651598,,
2,AFE,Access to clean fuels and technologies for coo...,,,,,,,,,...,37.855399,38.046781,38.326255,38.468426,38.670044,38.722783,38.927016,39.042839,,
3,AFE,Access to electricity (% of population),,,,,,,,,...,31.794160,32.001027,33.871910,38.880173,40.261358,43.061877,44.270860,45.803485,,
4,AFE,"Access to electricity, rural (% of rural popul...",,,,,,,,,...,18.663502,17.633986,16.464681,24.531436,25.345111,27.449908,29.641760,30.404935,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
384365,ZWE,Women who believe a husband is justified in be...,,,,,,,,,...,,,14.500000,,,,,,,
384366,ZWE,Women who were first married by age 15 (% of w...,,,,,,,,,...,,,3.700000,,,,5.418352,,,
384367,ZWE,Women who were first married by age 18 (% of w...,,,,,,,,,...,,33.500000,32.400000,,,,33.658057,,,
384368,ZWE,Women's share of population ages 15+ living wi...,,,,,,,,,...,59.200000,59.400000,59.500000,59.700000,59.900000,60.000000,60.200000,60.400000,,


From the thousand countries we have information about in the worlwide database, we have decided to study 50 of them, groupin gthem by geographical and economical similiarity. By this way, we keep in our df just those countries selected.

In [4]:
europe_list=['DEU','FRA','SWE','GBR','ESP','HRV','POL','GRC','AUT','NLD']
persian_list=['IRQ','QAT','ARE','SAU','AZE','YEM','YDR','OMN']
naf_list=['DZA','EGY','LBY','ISR','TUR','MAR']
saf_list=['SEN','ZAF','LBR','MOZ','CMR','NGA','GHA']
asia_list=['BGD','IND','VNM','THA','IDN','PHL','KOR']
latam_list=['MEX','BRA','ARG','PER','VEN','COL','CHL','PCZ','CRI']
two_list=['USA','CHN']
country_list=europe_list+persian_list+naf_list+saf_list+asia_list+latam_list+two_list

In [5]:
dfgrouped=df.loc[df['Country Code'].isin(country_list)]
dfgrouped

Unnamed: 0,Country Code,Indicator Name,1960,1961,1962,1963,1964,1965,1966,1967,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,Unnamed: 66
73695,DZA,Access to clean fuels and technologies for coo...,,,,,,,,,...,99.400000,99.500000,99.500000,99.600000,99.600000,99.600000,99.700000,99.700000,,
73696,DZA,Access to clean fuels and technologies for coo...,,,,,,,,,...,98.300000,98.400000,98.500000,98.600000,98.600000,98.700000,98.700000,98.800000,,
73697,DZA,Access to clean fuels and technologies for coo...,,,,,,,,,...,100.000000,100.000000,100.000000,100.000000,100.000000,100.000000,100.000000,99.900000,,
73698,DZA,Access to electricity (% of population),,,,,,,,,...,99.002205,99.087013,99.186661,99.350250,99.635490,99.697838,99.500000,99.804131,,
73699,DZA,"Access to electricity, rural (% of rural popul...",,,,,,,,,...,97.288933,97.430611,97.652374,98.126846,99.072006,99.071304,98.681053,99.579903,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
381475,YEM,Women who believe a husband is justified in be...,,,,,,,,,...,32.400000,,,,,,,,,
381476,YEM,Women who were first married by age 15 (% of w...,,,,,,,,,...,9.400000,,,,,,,,,
381477,YEM,Women who were first married by age 18 (% of w...,,,,,,,,,...,31.900000,,,,,,,,,
381478,YEM,Women's share of population ages 15+ living wi...,,,,,,,,,...,38.100000,38.000000,37.800000,37.700000,37.600000,37.500000,37.300000,37.200000,,


Now we transpose the rows of years into the columns.

In [6]:
dfpivot=(dfgrouped.set_index(["Country Code", "Indicator Name"]).stack().reset_index(name='Value').rename(columns={'level_2':'Date'}))
dfpivot

Unnamed: 0,Country Code,Indicator Name,Date,Value
0,DZA,Access to clean fuels and technologies for coo...,2000,97.1
1,DZA,Access to clean fuels and technologies for coo...,2001,97.3
2,DZA,Access to clean fuels and technologies for coo...,2002,97.8
3,DZA,Access to clean fuels and technologies for coo...,2003,98.0
4,DZA,Access to clean fuels and technologies for coo...,2004,98.2
...,...,...,...,...
1729300,YEM,Young people (ages 15-24) newly infected with HIV,2016,200.0
1729301,YEM,Young people (ages 15-24) newly infected with HIV,2017,200.0
1729302,YEM,Young people (ages 15-24) newly infected with HIV,2018,200.0
1729303,YEM,Young people (ages 15-24) newly infected with HIV,2019,200.0


Our time range covers from 1960 to 2021. However, the record is not uniform and complete for all areas and indicators. We can appreaciate that specially in the first years of the last century, so many data is missing, so it makes no sense to study it. Besides, for the year 2021 many data is also lacking. Therefore, we would delimit our study between 1990 and 2020.

In [8]:
dfpivot['Date'].astype(int)

0          2000
1          2001
2          2002
3          2003
4          2004
           ... 
1729300    2016
1729301    2017
1729302    2018
1729303    2019
1729304    2020
Name: Date, Length: 1729305, dtype: int32

In [13]:
df9020=dfpivot.loc[dfpivot['Date']>1989]
df9020

TypeError: '>' not supported between instances of 'str' and 'int'

In [10]:
df9020.to_csv('C:/Users/mferna38/Desktop/wdi/Datafrom90to20.csv',encoding='utf-8')

NEXT STEP: NORMALIZATION

- Select just the indicators we want.
- Group countries by group list name.
- Nan values: replace by 0, by the mean or eliminating.
- Remove outliers.
- For the main variable to compare (GDP): analyse distribution: if not normal, make logarithmic.

In [11]:
df9020["Indicator Name"].drop_duplicates()

0          Access to clean fuels and technologies for coo...
21         Access to clean fuels and technologies for coo...
42         Access to clean fuels and technologies for coo...
63                   Access to electricity (% of population)
84         Access to electricity, rural (% of rural popul...
105        Access to electricity, urban (% of urban popul...
126        Account ownership at a financial institution o...
129        Account ownership at a financial institution o...
132        Account ownership at a financial institution o...
135        Account ownership at a financial institution o...
138        Account ownership at a financial institution o...
141        Account ownership at a financial institution o...
144        Account ownership at a financial institution o...
147        Account ownership at a financial institution o...
150        Account ownership at a financial institution o...
170        Adjusted net enrollment rate, primary (% of pr...
210        Adjusted net 

In [15]:
indicators_list=['Literacy rate, adult total (% of people ages 15 and above)','Net migration', 'Government expenditure on education, total (% of government expenditure)','GDP (current US$)']
selected=df9020.loc[df9020['Indicator Name'].isin(indicators_list)]
selected

Unnamed: 0,Country Code,Indicator Name,Date,Value
10630,DZA,GDP (current US$),1990-01-01,62048560000.0
10631,DZA,GDP (current US$),1991-01-01,45715610000.0
10632,DZA,GDP (current US$),1992-01-01,48003080000.0
10633,DZA,GDP (current US$),1993-01-01,49945600000.0
10634,DZA,GDP (current US$),1994-01-01,42543180000.0
10635,DZA,GDP (current US$),1995-01-01,41764320000.0
10636,DZA,GDP (current US$),1996-01-01,46941580000.0
10637,DZA,GDP (current US$),1997-01-01,48177610000.0
10638,DZA,GDP (current US$),1998-01-01,48187750000.0
10639,DZA,GDP (current US$),1999-01-01,48640650000.0
