# Basics

Import libraries and functions.

In [1]:
import pandas as pd
import numpy as np
import glob
import os
from pyspark.sql.functions import concat, col, lit, split

Loading database from WDB (https://databank.worldbank.org/home.aspx)

In [12]:
df= pd.read_csv (os.getcwd()+'\WDIData.csv')
df

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,Unnamed: 66
0,Africa Eastern and Southern,AFE,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.ZS,,,,,,,...,16.936004,17.337896,17.687093,18.140971,18.491344,18.825520,19.272212,19.628009,,
1,Africa Eastern and Southern,AFE,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.RU.ZS,,,,,,,...,6.499471,6.680066,6.859110,7.016238,7.180364,7.322294,7.517191,7.651598,,
2,Africa Eastern and Southern,AFE,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.UR.ZS,,,,,,,...,37.855399,38.046781,38.326255,38.468426,38.670044,38.722783,38.927016,39.042839,,
3,Africa Eastern and Southern,AFE,Access to electricity (% of population),EG.ELC.ACCS.ZS,,,,,,,...,31.794160,32.001027,33.871910,38.880173,40.261358,43.061877,44.270860,45.803485,,
4,Africa Eastern and Southern,AFE,"Access to electricity, rural (% of rural popul...",EG.ELC.ACCS.RU.ZS,,,,,,,...,18.663502,17.633986,16.464681,24.531436,25.345111,27.449908,29.641760,30.404935,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
384365,Zimbabwe,ZWE,Women who believe a husband is justified in be...,SG.VAW.REFU.ZS,,,,,,,...,,,14.500000,,,,,,,
384366,Zimbabwe,ZWE,Women who were first married by age 15 (% of w...,SP.M15.2024.FE.ZS,,,,,,,...,,,3.700000,,,,5.418352,,,
384367,Zimbabwe,ZWE,Women who were first married by age 18 (% of w...,SP.M18.2024.FE.ZS,,,,,,,...,,33.500000,32.400000,,,,33.658057,,,
384368,Zimbabwe,ZWE,Women's share of population ages 15+ living wi...,SH.DYN.AIDS.FE.ZS,,,,,,,...,59.200000,59.400000,59.500000,59.700000,59.900000,60.000000,60.200000,60.400000,,


To work more comfortably, we remove those columns not useful for us, as country name and indicator code, since with the country code and the indicator name we have the information we need.

In [13]:
df.drop(columns=["Country Name","Indicator Code"], axis=1, inplace=True)

From the thousand countries we have information about in the worlwide database, we have decided to study 50 of them, groupin gthem by geographical and economical similiarity. By this way, we keep in our df just those countries selected.

In [14]:
europe_list=['DEU','FRA','SWE','GBR','ESP','HRV','POL','GRC','AUT','NLD']
persian_list=['IRQ','QAT','ARE','SAU','AZE','YEM','YDR','OMN']
naf_list=['DZA','EGY','LBY','ISR','TUR','MAR']
saf_list=['SEN','ZAF','LBR','MOZ','CMR','NGA','GHA']
asia_list=['BGD','IND','VNM','THA','IDN','PHL','KOR']
latam_list=['MEX','BRA','ARG','PER','VEN','COL','CHL','PCZ','CRI']
two_list=['USA','CHN']
country_list=europe_list+persian_list+naf_list+saf_list+asia_list+latam_list+two_list 

In [15]:
df.loc[df['Country Code'].isin(country_list)]

Unnamed: 0,Country Code,Indicator Name,1960,1961,1962,1963,1964,1965,1966,1967,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,Unnamed: 66
73695,DZA,Access to clean fuels and technologies for coo...,,,,,,,,,...,99.400000,99.500000,99.500000,99.600000,99.600000,99.600000,99.700000,99.700000,,
73696,DZA,Access to clean fuels and technologies for coo...,,,,,,,,,...,98.300000,98.400000,98.500000,98.600000,98.600000,98.700000,98.700000,98.800000,,
73697,DZA,Access to clean fuels and technologies for coo...,,,,,,,,,...,100.000000,100.000000,100.000000,100.000000,100.000000,100.000000,100.000000,99.900000,,
73698,DZA,Access to electricity (% of population),,,,,,,,,...,99.002205,99.087013,99.186661,99.350250,99.635490,99.697838,99.500000,99.804131,,
73699,DZA,"Access to electricity, rural (% of rural popul...",,,,,,,,,...,97.288933,97.430611,97.652374,98.126846,99.072006,99.071304,98.681053,99.579903,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
381475,YEM,Women who believe a husband is justified in be...,,,,,,,,,...,32.400000,,,,,,,,,
381476,YEM,Women who were first married by age 15 (% of w...,,,,,,,,,...,9.400000,,,,,,,,,
381477,YEM,Women who were first married by age 18 (% of w...,,,,,,,,,...,31.900000,,,,,,,,,
381478,YEM,Women's share of population ages 15+ living wi...,,,,,,,,,...,38.100000,38.000000,37.800000,37.700000,37.600000,37.500000,37.300000,37.200000,,


Now we transpose the rows of years into the columns.

In [16]:
dftras=(df.set_index(["Country Code", "Indicator Name"]).stack().reset_index(name='Value').rename(columns={'level_2':'Date'}))
dftras

Unnamed: 0,Country Code,Indicator Name,Date,Value
0,AFE,Access to clean fuels and technologies for coo...,2000,11.435351
1,AFE,Access to clean fuels and technologies for coo...,2001,11.805706
2,AFE,Access to clean fuels and technologies for coo...,2002,12.167502
3,AFE,Access to clean fuels and technologies for coo...,2003,12.533893
4,AFE,Access to clean fuels and technologies for coo...,2004,12.916778
...,...,...,...,...
7864969,ZWE,Young people (ages 15-24) newly infected with HIV,2016,14000.000000
7864970,ZWE,Young people (ages 15-24) newly infected with HIV,2017,12000.000000
7864971,ZWE,Young people (ages 15-24) newly infected with HIV,2018,9700.000000
7864972,ZWE,Young people (ages 15-24) newly infected with HIV,2019,9600.000000


In [17]:
indicators_list=['GDP (current US$)','Literacy rate, adult total (% of people ages 15 and above)', 'Government expenditure on education, total (% of government expenditure)','Net migration','Commercial service exports (current US$)','Exports of goods and services (current US$)','Taxes on international trade (current LCU)','Fertility rate, total (births per woman)','People using at least basic sanitation services (% of population)','Employment in agriculture (% of total employment) (modeled ILO estimate)','Employment in services (% of total employment) (modeled ILO estimate)','Employment in industry (% of total employment) (modeled ILO estimate)','Electricity production from renewable sources, excluding hydroelectric (kWh)','Number of infant deaths','Number of infant deaths','Foreign direct investment, net (BoP, current US$)','Mortality rate attributed to household and ambient air pollution, age-standardized (per 100,000 population)','Total alcohol consumption per capita (liters of pure alcohol, projected estimates, 15+ years of age)','Research and development expenditure (% of GDP)','Labor force with advanced education (% of total working-age population with advanced education)','Suicide mortality rate (per 100,000 population)','CPIA gender equality rating (1=low to 6=high)','Share of youth not in education, employment or training, total (% of youth population)','Government expenditure on education, total (% of government expenditure)']

df2=dftras.loc[dftras['Indicator Name'].isin(indicators_list)]
pd.set_option('display.max_rows', 10)
df2

Unnamed: 0,Country Code,Indicator Name,Date,Value
3549,AFE,Commercial service exports (current US$),1976,2.303046e+09
3550,AFE,Commercial service exports (current US$),1977,2.531549e+09
3551,AFE,Commercial service exports (current US$),1978,2.978281e+09
3552,AFE,Commercial service exports (current US$),1979,3.600502e+09
3553,AFE,Commercial service exports (current US$),1980,4.494631e+09
...,...,...,...,...
7863115,ZWE,Total alcohol consumption per capita (liters o...,2000,2.460000e+00
7863116,ZWE,Total alcohol consumption per capita (liters o...,2005,2.770000e+00
7863117,ZWE,Total alcohol consumption per capita (liters o...,2010,3.930000e+00
7863118,ZWE,Total alcohol consumption per capita (liters o...,2015,4.920000e+00


CONVIERTE A INTEGER PERO NO FILTRA

Our time range covers from 1960 to 2021. However, the record is not uniform and complete for all areas and indicators. We can appreaciate that specially in the first years of the last century, so many data is missing, so it makes no sense to study it. Besides, for the year 2021 many data is also lacking. Therefore, we would delimit our study between 1990 and 2020.

In [18]:
df2[['Date']] = df2[['Date']].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[['Date']] = df2[['Date']].astype(int)


In [19]:
df2.dtypes

Country Code       object
Indicator Name     object
Date                int32
Value             float64
dtype: object

In [20]:
df3 = df2[df2['Date'] > 1989]
df3

Unnamed: 0,Country Code,Indicator Name,Date,Value
3563,AFE,Commercial service exports (current US$),1990,6.298326e+09
3564,AFE,Commercial service exports (current US$),1991,5.966474e+09
3565,AFE,Commercial service exports (current US$),1992,6.378636e+09
3566,AFE,Commercial service exports (current US$),1993,6.849594e+09
3567,AFE,Commercial service exports (current US$),1994,7.769650e+09
...,...,...,...,...
7863115,ZWE,Total alcohol consumption per capita (liters o...,2000,2.460000e+00
7863116,ZWE,Total alcohol consumption per capita (liters o...,2005,2.770000e+00
7863117,ZWE,Total alcohol consumption per capita (liters o...,2010,3.930000e+00
7863118,ZWE,Total alcohol consumption per capita (liters o...,2015,4.920000e+00


In [22]:
df4=df3.set_index(["Country Code", "Date"]).pivot(columns="Indicator Name", values="Value").reset_index()
df4

Indicator Name,Country Code,Date,CPIA gender equality rating (1=low to 6=high),Commercial service exports (current US$),"Electricity production from renewable sources, excluding hydroelectric (kWh)",Employment in agriculture (% of total employment) (modeled ILO estimate),Employment in industry (% of total employment) (modeled ILO estimate),Employment in services (% of total employment) (modeled ILO estimate),Exports of goods and services (current US$),"Fertility rate, total (births per woman)",...,"Literacy rate, adult total (% of people ages 15 and above)","Mortality rate attributed to household and ambient air pollution, age-standardized (per 100,000 population)",Net migration,Number of infant deaths,People using at least basic sanitation services (% of population),Research and development expenditure (% of GDP),"Share of youth not in education, employment or training, total (% of youth population)","Suicide mortality rate (per 100,000 population)",Taxes on international trade (current LCU),"Total alcohol consumption per capita (liters of pure alcohol, projected estimates, 15+ years of age)"
0,ABW,1990,,3.984358e+08,,,,,,2.249,...,,,,,,,,,,
1,ABW,1991,,4.637430e+08,,,,,,2.221,...,,,,,,,,,,
2,ABW,1992,,5.587709e+08,,,,,,2.187,...,,,14218.0,,,,,,,
3,ABW,1993,,5.881564e+08,,,,,,2.149,...,,,,,,,,,,
4,ABW,1994,,6.046927e+08,,,,,,2.108,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8228,ZWE,2016,4.0,3.486285e+08,,66.879997,7.05,26.070000,4.098132e+09,3.804,...,,133.0,,18753.0,37.529193,,,16.8,272855002.0,
8229,ZWE,2017,4.0,4.289693e+08,,66.480003,6.90,26.629999,3.456997e+09,3.707,...,,,-584288.0,18018.0,36.941673,,,15.0,295088115.0,
8230,ZWE,2018,4.0,4.516801e+08,,66.019997,6.75,27.230000,5.081366e+09,3.615,...,,,,17196.0,36.357160,,,14.0,432769378.0,4.67
8231,ZWE,2019,4.0,5.541892e+08,,66.190002,6.57,27.240000,6.026541e+09,3.531,...,,,,16615.0,35.774337,,44.77,14.1,,


In [23]:
list(df4)

['Country Code',
 'Date',
 'CPIA gender equality rating (1=low to 6=high)',
 'Commercial service exports (current US$)',
 'Electricity production from renewable sources, excluding hydroelectric (kWh)',
 'Employment in agriculture (% of total employment) (modeled ILO estimate)',
 'Employment in industry (% of total employment) (modeled ILO estimate)',
 'Employment in services (% of total employment) (modeled ILO estimate)',
 'Exports of goods and services (current US$)',
 'Fertility rate, total (births per woman)',
 'Foreign direct investment, net (BoP, current US$)',
 'GDP (current US$)',
 'Government expenditure on education, total (% of government expenditure)',
 'Labor force with advanced education (% of total working-age population with advanced education)',
 'Literacy rate, adult total (% of people ages 15 and above)',
 'Mortality rate attributed to household and ambient air pollution, age-standardized (per 100,000 population)',
 'Net migration',
 'Number of infant deaths',
 'Peop

In [24]:
df4.columns=['Country','Year','Gender equality','Exports-Commercial services','Renewable electricity','Employment-agriculture','Employment-industry','Employment-services','Exports-G&S','Fertility rate','Foreign investment','GDP','Education GExp','Workers high education','Literacy rate','Mortality-pollution','Net migration','Mortality-infants','Health services use','R&D GExp','Ninis','Suicide','International taxes','Alcohol per capita']

In [26]:
list(df4)
df4

Unnamed: 0,Country,Year,Gender equality,Exports-Commercial services,Renewable electricity,Employment-agriculture,Employment-industry,Employment-services,Exports-G&S,Fertility rate,...,Literacy rate,Mortality-pollution,Net migration,Mortality-infants,Health services use,R&D GExp,Ninis,Suicide,International taxes,Alcohol per capita
0,ABW,1990,,3.984358e+08,,,,,,2.249,...,,,,,,,,,,
1,ABW,1991,,4.637430e+08,,,,,,2.221,...,,,,,,,,,,
2,ABW,1992,,5.587709e+08,,,,,,2.187,...,,,14218.0,,,,,,,
3,ABW,1993,,5.881564e+08,,,,,,2.149,...,,,,,,,,,,
4,ABW,1994,,6.046927e+08,,,,,,2.108,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8228,ZWE,2016,4.0,3.486285e+08,,66.879997,7.05,26.070000,4.098132e+09,3.804,...,,133.0,,18753.0,37.529193,,,16.8,272855002.0,
8229,ZWE,2017,4.0,4.289693e+08,,66.480003,6.90,26.629999,3.456997e+09,3.707,...,,,-584288.0,18018.0,36.941673,,,15.0,295088115.0,
8230,ZWE,2018,4.0,4.516801e+08,,66.019997,6.75,27.230000,5.081366e+09,3.615,...,,,,17196.0,36.357160,,,14.0,432769378.0,4.67
8231,ZWE,2019,4.0,5.541892e+08,,66.190002,6.57,27.240000,6.026541e+09,3.531,...,,,,16615.0,35.774337,,44.77,14.1,,


Get the mean of each column by country.

In [35]:
mean_value=df4.groupby('Country').mean()
mean_value

Unnamed: 0_level_0,Year,Gender equality,Exports-Commercial services,Renewable electricity,Employment-agriculture,Employment-industry,Employment-services,Exports-G&S,Fertility rate,Foreign investment,...,Literacy rate,Mortality-pollution,Net migration,Mortality-infants,Health services use,R&D GExp,Ninis,Suicide,International taxes,Alcohol per capita
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ABW,2005.0,,1.365734e+09,,,,,1.642403e+09,1.905419,-4.953070e+07,...,97.307103,,4870.500000,,97.840466,,16.559999,,,
AFE,2005.0,3.268640,2.283736e+10,2.170577e+09,63.931376,9.264748,26.805276,1.544392e+11,5.279340,,...,,137.502262,-859890.833333,1.200588e+06,26.957594,0.689464,,9.345236,,5.131846
AFG,2005.5,1.733333,1.136812e+09,,58.072069,13.100000,28.828965,,6.445645,-6.489401e+07,...,34.357446,211.100000,352307.000000,7.677177e+04,35.195994,,35.086665,4.460000,2.658521e+10,0.210000
AFW,2005.5,3.151069,9.411114e+09,1.565000e+08,52.273698,11.740215,35.988105,1.007956e+11,5.797355,,...,,260.403292,-902641.333333,1.060598e+06,28.389268,0.151856,,6.251779,,7.421209
AGO,2005.5,3.166667,4.794909e+08,0.000000e+00,43.345862,8.210345,48.446207,3.699712e+10,6.383194,4.132114e+08,...,66.717766,118.500000,193224.333333,8.073706e+04,40.781692,0.032290,23.670000,7.310000,8.185973e+10,6.142000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
XKX,2005.0,3.500000,9.299527e+08,6.250000e+04,,,,1.582165e+09,2.513226,-3.121143e+08,...,,,,,,,31.758889,,,
YEM,2005.0,1.875000,9.239832e+08,0.000000e+00,36.354138,14.085172,49.562069,5.947737e+09,5.702548,-2.245402e+08,...,45.594999,194.200000,17475.500000,4.363123e+04,49.644188,,44.770000,5.815000,,0.283200
ZAF,2005.5,,9.963859e+09,4.568846e+08,7.721379,25.845862,66.433793,6.882880e+10,2.782871,-1.213116e+09,...,91.430060,86.700000,798448.000000,4.431929e+04,68.909984,0.787751,32.881250,24.240000,2.098393e+10,9.754000
ZMB,2005.5,3.187500,4.996178e+08,0.000000e+00,65.074138,8.289655,26.637586,4.957076e+09,5.631323,-6.736132e+08,...,72.172003,127.200000,-90833.333333,3.376677e+04,27.188308,0.051096,36.097499,10.200000,1.010484e+09,5.362000


Fill null values of each indicator of each country with the mean computed previously.

In [36]:
df4.fillna(value=mean_value)

Unnamed: 0,Country,Year,Gender equality,Exports-Commercial services,Renewable electricity,Employment-agriculture,Employment-industry,Employment-services,Exports-G&S,Fertility rate,...,Literacy rate,Mortality-pollution,Net migration,Mortality-infants,Health services use,R&D GExp,Ninis,Suicide,International taxes,Alcohol per capita
0,ABW,1990,,3.984358e+08,,,,,,2.249,...,,,,,,,,,,
1,ABW,1991,,4.637430e+08,,,,,,2.221,...,,,,,,,,,,
2,ABW,1992,,5.587709e+08,,,,,,2.187,...,,,14218.0,,,,,,,
3,ABW,1993,,5.881564e+08,,,,,,2.149,...,,,,,,,,,,
4,ABW,1994,,6.046927e+08,,,,,,2.108,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8228,ZWE,2016,4.0,3.486285e+08,,66.879997,7.05,26.070000,4.098132e+09,3.804,...,,133.0,,18753.0,37.529193,,,16.8,272855002.0,
8229,ZWE,2017,4.0,4.289693e+08,,66.480003,6.90,26.629999,3.456997e+09,3.707,...,,,-584288.0,18018.0,36.941673,,,15.0,295088115.0,
8230,ZWE,2018,4.0,4.516801e+08,,66.019997,6.75,27.230000,5.081366e+09,3.615,...,,,,17196.0,36.357160,,,14.0,432769378.0,4.67
8231,ZWE,2019,4.0,5.541892e+08,,66.190002,6.57,27.240000,6.026541e+09,3.531,...,,,,16615.0,35.774337,,44.77,14.1,,


In [38]:
groups=dict(tuple(df4.groupby('Country')))