# EXTRACTION

Import libraries and functions.

In [1]:
import pandas as pd
import numpy as np
import glob
import os
import warnings
warnings.filterwarnings("ignore")
import functools as ft
from pyspark.sql.functions import concat, col, lit, split
import ipywidgets as widgets
from ipywidgets import Layout
from ipywidgets import interact, interact_manual
import plotly.express as px
from scipy import stats
from scipy.stats import shapiro
from scipy.stats import pearsonr
from scipy.stats import spearmanr
from pandas.api.types import is_numeric_dtype

Firstly we load the database from World Data Bank that has been downloaded and extracted in the *Data extraction* notebook. We acquire it from the predetermined path that is on our computer.

In [2]:
df= pd.read_csv (os.getcwd()+'/Data/'+'WDIData.csv')
df

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,Unnamed: 66
0,Africa Eastern and Southern,AFE,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.ZS,,,,,,,...,16.936004,17.337896,17.687093,18.140971,18.491344,18.825520,19.272212,19.628009,,
1,Africa Eastern and Southern,AFE,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.RU.ZS,,,,,,,...,6.499471,6.680066,6.859110,7.016238,7.180364,7.322294,7.517191,7.651598,,
2,Africa Eastern and Southern,AFE,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.UR.ZS,,,,,,,...,37.855399,38.046781,38.326255,38.468426,38.670044,38.722783,38.927016,39.042839,,
3,Africa Eastern and Southern,AFE,Access to electricity (% of population),EG.ELC.ACCS.ZS,,,,,,,...,31.794160,32.001027,33.871910,38.880173,40.261358,43.061877,44.270860,45.803485,,
4,Africa Eastern and Southern,AFE,"Access to electricity, rural (% of rural popul...",EG.ELC.ACCS.RU.ZS,,,,,,,...,18.663502,17.633986,16.464681,24.531436,25.345111,27.449908,29.641760,30.404935,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
384365,Zimbabwe,ZWE,Women who believe a husband is justified in be...,SG.VAW.REFU.ZS,,,,,,,...,,,14.500000,,,,,,,
384366,Zimbabwe,ZWE,Women who were first married by age 15 (% of w...,SP.M15.2024.FE.ZS,,,,,,,...,,,3.700000,,,,5.418352,,,
384367,Zimbabwe,ZWE,Women who were first married by age 18 (% of w...,SP.M18.2024.FE.ZS,,,,,,,...,,33.500000,32.400000,,,,33.658057,,,
384368,Zimbabwe,ZWE,Women's share of population ages 15+ living wi...,SH.DYN.AIDS.FE.ZS,,,,,,,...,59.200000,59.400000,59.500000,59.700000,59.900000,60.000000,60.200000,60.400000,,


# INTEGRATION

Moreover, to work more comfortably, we remove those columns not useful for us, as *Country Name* and *Indicator Code*, since with the *Country Code*, *Value* and the *Indicator Name* we have the relevant information.

In [3]:
df.drop(columns=["Country Name","Indicator Code"], axis=1, inplace=True)

FILTER 1: BY COUNTRY

From the almost two hundred countries we have information about in the worldwide database, we have decided to study 50 of them, making an initial grouping by geographical and economical similiarities. With this, we can keep in our dataframe the selected countries.

Criteria for grouping:
- Europe: Germany, France, Sweden, United Kingdom, Spain, Croatia, Poland, Greece, Austria and Netherlands.

*Interesting countries of the European continent that can reflect events such as the Brexit process, the 2008 crisis or their historical strength.*
- Persian Gulf: Iraq, Qatar, United Arab Emirates, Arabia Saudita, Azerbayan, Yemen, Yemen Democratic and Oman.

*Countries located in the Persian Gulf, which have a similar economy based mainly on petrol and social structures.*
- North Africa: Algeria, Egiypt, Lybia, Israel, Turkey and Morroco.

*Countries of the african continent that are middle developed and with high mobility of people and goods.*
- South Africa: Senegal, South Africa, Liberia, Mozambique, Cameroon, Nigeria and Ghana.

*Countries of the south and central africa that are mainly subdeveloped and considered some of the poorest countries worldwide; but, on the contrary, one of them is highly developed.*
- Asia: Bangladesh, India, Vietnam, Thailand, Indonesia, Philipines and Korea (South).

*Converted in the last decades in the manufacturing of the world, they are subdeveloped countries with high population and childhood.*
- Latin America: Mexico, Brasil, Argentina, Peru, Venezuela, Colombia, Chile, Panama and Costa Rica.

*Countries located in same continet and some with singular political structures.* 
- Pair: USA and China.

*Although these countries seem to be confronted between them, they have been the top two most growing worlwide, despite the fact that culturally and economically they are completely distant.*


In [4]:
europe_list=['DEU','FRA','SWE','GBR','ESP','HRV','POL','GRC','AUT','NLD']
persian_list=['IRQ','QAT','ARE','SAU','AZE','YEM','YDR','OMN']
naf_list=['DZA','EGY','LBY','ISR','TUR','MAR']
saf_list=['SEN','ZAF','LBR','MOZ','CMR','NGA','GHA']
asia_list=['BGD','IND','VNM','THA','IDN','PHL','KOR']
latam_list=['MEX','BRA','ARG','PER','VEN','COL','CHL','PAN','CRI']
two_list=['USA','CHN']
country_list=europe_list+persian_list+naf_list+saf_list+asia_list+latam_list+two_list 

In [5]:
df1=df.loc[df['Country Code'].isin(country_list)]

Now we transpose the rows of years into the columns.

In [6]:
df2=(df1.set_index(["Country Code", "Indicator Name"]).stack().reset_index(name='Value').rename(columns={'level_2':'Date'}))
df2

Unnamed: 0,Country Code,Indicator Name,Date,Value
0,DZA,Access to clean fuels and technologies for coo...,2000,97.1
1,DZA,Access to clean fuels and technologies for coo...,2001,97.3
2,DZA,Access to clean fuels and technologies for coo...,2002,97.8
3,DZA,Access to clean fuels and technologies for coo...,2003,98.0
4,DZA,Access to clean fuels and technologies for coo...,2004,98.2
...,...,...,...,...
1769874,YEM,Young people (ages 15-24) newly infected with HIV,2016,200.0
1769875,YEM,Young people (ages 15-24) newly infected with HIV,2017,200.0
1769876,YEM,Young people (ages 15-24) newly infected with HIV,2018,200.0
1769877,YEM,Young people (ages 15-24) newly infected with HIV,2019,200.0


FILTER 2: BY YEAR

Our time range covers from 1960 to 2021. However, the record is not uniform and complete for all areas and indicators. We can appreaciate that specially in the first years of the last century, so many data is missing, then it makes no sense to study it. Besides, for the year 2021 many data is also lacking. Therefore, we would delimit our study between 1990 and 2020.

In [7]:
df2[['Date']] = df2[['Date']].astype(int)

In [8]:
df2.dtypes

Country Code       object
Indicator Name     object
Date                int32
Value             float64
dtype: object

In [9]:
df3 = df2[df2['Date'] > 1989]
df3

Unnamed: 0,Country Code,Indicator Name,Date,Value
0,DZA,Access to clean fuels and technologies for coo...,2000,97.1
1,DZA,Access to clean fuels and technologies for coo...,2001,97.3
2,DZA,Access to clean fuels and technologies for coo...,2002,97.8
3,DZA,Access to clean fuels and technologies for coo...,2003,98.0
4,DZA,Access to clean fuels and technologies for coo...,2004,98.2
...,...,...,...,...
1769874,YEM,Young people (ages 15-24) newly infected with HIV,2016,200.0
1769875,YEM,Young people (ages 15-24) newly infected with HIV,2017,200.0
1769876,YEM,Young people (ages 15-24) newly infected with HIV,2018,200.0
1769877,YEM,Young people (ages 15-24) newly infected with HIV,2019,200.0


In [10]:
BronzeDataFrame=df3

-----

# NORMALIZATION

Taking as reference both works of https://www.pluralsight.com/guides/cleaning-up-data-from-outliers and https://careerfoundry.com/en/blog/data-analytics/how-to-find-outliers/, for normalizing our data we need to start computing the outliers and removing them from our dataframe. As there is not a direct function of pandas that performs this step, it´s been step-by-step code, where we begin with the computation of the quartiles, then the IQR (Inter Quartile Range) and finally the upper and lower limit.

##### IQR explanation

The interquartile range (IQR) measures the spread of the middle half of your data. It is the range for the middle 50% of your sample. Use the IQR to assess the variability where most of your values lie. Larger values indicate that the central portion of your data spread out further. Conversely, smaller values show that the middle values cluster more tightly.

To visualize the interquartile range, imagine dividing your data into quarters. Statisticians refer to these quarters as quartiles and label them from low to high as Q1, Q2, Q3, and Q4. The lowest quartile (Q1) covers the smallest quarter of values in your dataset. The upper quartile (Q4) comprises the highest quarter of values. The interquartile range is the middle half of the data that lies between the upper and lower quartiles. In other words, the interquartile range includes the 50% of data points that are above Q1 and below Q4.

When measuring variability, statisticians prefer using the interquartile range instead of the full data range because extreme values and outliers affect it less. Typically, use the IQR with a measure of central tendency, such as the median, to understand your data’s center and spread. This combination creates a fuller picture of your data’s distribution.

Therefore it is being utilized to get rid of all the outliers that may come from errors when creating the data or from unexpected years.

Firstly, we compute the first quartile (Q1=25%) and the third quartile (Q3=75%). For that, we have grouped the data by country code and indicator name, so we get the Q1 and Q3 values for each indicator in each geographical area. 

In [11]:
grouped=BronzeDataFrame.groupby(['Country Code','Indicator Name'])
grouped

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000275FFD8AB30>

In [12]:
Q1=BronzeDataFrame.groupby(['Country Code','Indicator Name']).quantile(0.25)
Q3=BronzeDataFrame.groupby(['Country Code','Indicator Name']).quantile(0.75)
IQR=Q3-Q1
IQR

Unnamed: 0_level_0,Unnamed: 1_level_0,Date,Value
Country Code,Indicator Name,Unnamed: 2_level_1,Unnamed: 3_level_1
ARE,Access to clean fuels and technologies for cooking (% of population),10.0,0.00
ARE,"Access to clean fuels and technologies for cooking, rural (% of rural population)",10.0,0.00
ARE,"Access to clean fuels and technologies for cooking, urban (% of urban population)",10.0,0.00
ARE,Access to electricity (% of population),15.0,0.00
ARE,"Access to electricity, rural (% of rural population)",15.0,0.00
...,...,...,...
ZAF,Women who believe a husband is justified in beating his wife when she refuses sex with him (%),0.0,0.00
ZAF,Women who were first married by age 15 (% of women ages 20-24),9.0,0.15
ZAF,Women who were first married by age 18 (% of women ages 20-24),9.0,2.15
ZAF,Women's share of population ages 15+ living with HIV (%),15.0,4.90


Once we got the quartiles, we compute the upper and lower limit, with a basic mathematical expression.

In [13]:
lower_limit=Q1 - 1.5 * IQR
lower=lower_limit.drop(['Date'],axis=1)
lower.rename(columns={"Value":"Lower limit"})

Unnamed: 0_level_0,Unnamed: 1_level_0,Lower limit
Country Code,Indicator Name,Unnamed: 2_level_1
ARE,Access to clean fuels and technologies for cooking (% of population),100.000
ARE,"Access to clean fuels and technologies for cooking, rural (% of rural population)",100.000
ARE,"Access to clean fuels and technologies for cooking, urban (% of urban population)",100.000
ARE,Access to electricity (% of population),100.000
ARE,"Access to electricity, rural (% of rural population)",100.000
...,...,...
ZAF,Women who believe a husband is justified in beating his wife when she refuses sex with him (%),1.000
ZAF,Women who were first married by age 15 (% of women ages 20-24),0.625
ZAF,Women who were first married by age 18 (% of women ages 20-24),1.375
ZAF,Women's share of population ages 15+ living with HIV (%),49.850


In [14]:
upper_limit=Q3 + 1.5 * IQR
upper=upper_limit.drop(['Date'],axis=1)
upper.rename(columns={"Value":"Upper limit"})

Unnamed: 0_level_0,Unnamed: 1_level_0,Upper limit
Country Code,Indicator Name,Unnamed: 2_level_1
ARE,Access to clean fuels and technologies for cooking (% of population),100.000
ARE,"Access to clean fuels and technologies for cooking, rural (% of rural population)",100.000
ARE,"Access to clean fuels and technologies for cooking, urban (% of urban population)",100.000
ARE,Access to electricity (% of population),100.000
ARE,"Access to electricity, rural (% of rural population)",100.000
...,...,...
ZAF,Women who believe a husband is justified in beating his wife when she refuses sex with him (%),1.000
ZAF,Women who were first married by age 15 (% of women ages 20-24),1.225
ZAF,Women who were first married by age 18 (% of women ages 20-24),9.975
ZAF,Women's share of population ages 15+ living with HIV (%),69.450


Thirdly, we join the three tables we have (main dataframe, upper limit and lower limit) by matching country code and indicator name..

In [15]:
dfs = [BronzeDataFrame,lower,upper]
df_joined = ft.reduce(lambda left, right: pd.merge(left, right, on=['Country Code','Indicator Name']), dfs)
df_joined

Unnamed: 0,Country Code,Indicator Name,Date,Value_x,Value_y,Value
0,DZA,Access to clean fuels and technologies for coo...,2000,97.1,97.0,101.0
1,DZA,Access to clean fuels and technologies for coo...,2001,97.3,97.0,101.0
2,DZA,Access to clean fuels and technologies for coo...,2002,97.8,97.0,101.0
3,DZA,Access to clean fuels and technologies for coo...,2003,98.0,97.0,101.0
4,DZA,Access to clean fuels and technologies for coo...,2004,98.2,97.0,101.0
...,...,...,...,...,...,...
1225413,YEM,Young people (ages 15-24) newly infected with HIV,2016,200.0,-50.0,350.0
1225414,YEM,Young people (ages 15-24) newly infected with HIV,2017,200.0,-50.0,350.0
1225415,YEM,Young people (ages 15-24) newly infected with HIV,2018,200.0,-50.0,350.0
1225416,YEM,Young people (ages 15-24) newly infected with HIV,2019,200.0,-50.0,350.0


In [16]:
list(df_joined)

['Country Code', 'Indicator Name', 'Date', 'Value_x', 'Value_y', 'Value']

We rename the columns of the new table, as the columns headers are not saved after the joining. 

In [17]:
renamed=df_joined.set_axis(['Country','Indicator','Year', 'Real value', 'Lower value', 'Upper value'], axis=1, inplace=False)
renamed

Unnamed: 0,Country,Indicator,Year,Real value,Lower value,Upper value
0,DZA,Access to clean fuels and technologies for coo...,2000,97.1,97.0,101.0
1,DZA,Access to clean fuels and technologies for coo...,2001,97.3,97.0,101.0
2,DZA,Access to clean fuels and technologies for coo...,2002,97.8,97.0,101.0
3,DZA,Access to clean fuels and technologies for coo...,2003,98.0,97.0,101.0
4,DZA,Access to clean fuels and technologies for coo...,2004,98.2,97.0,101.0
...,...,...,...,...,...,...
1225413,YEM,Young people (ages 15-24) newly infected with HIV,2016,200.0,-50.0,350.0
1225414,YEM,Young people (ages 15-24) newly infected with HIV,2017,200.0,-50.0,350.0
1225415,YEM,Young people (ages 15-24) newly infected with HIV,2018,200.0,-50.0,350.0
1225416,YEM,Young people (ages 15-24) newly infected with HIV,2019,200.0,-50.0,350.0


Now that we have the table correctly defined, we remove from our dataframe the values that are outside our range, as it means that they are outliers.

In [18]:
sin_outliers=renamed.loc[~((renamed['Real value']<renamed['Lower value']) | (renamed['Real value']>renamed['Upper value']))]
sin_outliers

Unnamed: 0,Country,Indicator,Year,Real value,Lower value,Upper value
0,DZA,Access to clean fuels and technologies for coo...,2000,97.1,97.0,101.0
1,DZA,Access to clean fuels and technologies for coo...,2001,97.3,97.0,101.0
2,DZA,Access to clean fuels and technologies for coo...,2002,97.8,97.0,101.0
3,DZA,Access to clean fuels and technologies for coo...,2003,98.0,97.0,101.0
4,DZA,Access to clean fuels and technologies for coo...,2004,98.2,97.0,101.0
...,...,...,...,...,...,...
1225413,YEM,Young people (ages 15-24) newly infected with HIV,2016,200.0,-50.0,350.0
1225414,YEM,Young people (ages 15-24) newly infected with HIV,2017,200.0,-50.0,350.0
1225415,YEM,Young people (ages 15-24) newly infected with HIV,2018,200.0,-50.0,350.0
1225416,YEM,Young people (ages 15-24) newly infected with HIV,2019,200.0,-50.0,350.0


From the data above, we can perceive that our data comes down from 19944 rows to 19424, so 500 were outliers. The next steps are to order and display data better, removing those columns that we just do not need and pivoting the rows and columns. 

In [19]:
df_limpio=sin_outliers.drop(['Lower value','Upper value'],axis=1)
df_limpio

Unnamed: 0,Country,Indicator,Year,Real value
0,DZA,Access to clean fuels and technologies for coo...,2000,97.1
1,DZA,Access to clean fuels and technologies for coo...,2001,97.3
2,DZA,Access to clean fuels and technologies for coo...,2002,97.8
3,DZA,Access to clean fuels and technologies for coo...,2003,98.0
4,DZA,Access to clean fuels and technologies for coo...,2004,98.2
...,...,...,...,...
1225413,YEM,Young people (ages 15-24) newly infected with HIV,2016,200.0
1225414,YEM,Young people (ages 15-24) newly infected with HIV,2017,200.0
1225415,YEM,Young people (ages 15-24) newly infected with HIV,2018,200.0
1225416,YEM,Young people (ages 15-24) newly infected with HIV,2019,200.0


In [20]:
cols=df_limpio['Indicator'].unique().tolist()

In [21]:
SilverDataFrame=df_limpio.set_index(["Country", "Year"]).pivot(columns="Indicator", values="Real value").reset_index()
SilverDataFrame

Indicator,Country,Year,ARI treatment (% of children under 5 taken to a health provider),Access to clean fuels and technologies for cooking (% of population),"Access to clean fuels and technologies for cooking, rural (% of rural population)","Access to clean fuels and technologies for cooking, urban (% of urban population)",Access to electricity (% of population),"Access to electricity, rural (% of rural population)","Access to electricity, urban (% of urban population)",Account ownership at a financial institution or with a mobile-money-service provider (% of population ages 15+),...,Women who believe a husband is justified in beating his wife (any of five reasons) (%),Women who believe a husband is justified in beating his wife when she argues with him (%),Women who believe a husband is justified in beating his wife when she burns the food (%),Women who believe a husband is justified in beating his wife when she goes out without telling him (%),Women who believe a husband is justified in beating his wife when she neglects the children (%),Women who believe a husband is justified in beating his wife when she refuses sex with him (%),Women who were first married by age 15 (% of women ages 20-24),Women who were first married by age 18 (% of women ages 20-24),Women's share of population ages 15+ living with HIV (%),Young people (ages 15-24) newly infected with HIV
0,ARE,1990,,,,,100.000000,100.000000,100.000000,,...,,,,,,,,,18.8,100.0
1,ARE,1991,,,,,100.000000,100.000000,100.000000,,...,,,,,,,,,18.2,100.0
2,ARE,1992,,,,,100.000000,100.000000,100.000000,,...,,,,,,,,,19.4,100.0
3,ARE,1993,,,,,100.000000,100.000000,100.000000,,...,,,,,,,,,20.0,100.0
4,ARE,1994,,,,,100.000000,100.000000,100.000000,,...,,,,,,,,,20.0,100.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1531,ZAF,2017,,85.2,64.6,94.20,84.400002,76.738983,88.373024,69.218491,...,,,,,,,,,63.3,100000.0
1532,ZAF,2018,,85.7,65.5,94.65,84.699997,77.168495,88.518814,,...,,,,,,,,,63.7,92000.0
1533,ZAF,2019,,86.3,65.5,94.90,85.000000,77.611824,88.662704,,...,,,,,,,,,64.1,85000.0
1534,ZAF,2020,,86.8,65.9,95.20,84.385536,75.264854,88.806267,,...,,,,,,,,,64.4,79000.0


On the other hand, another big stone of normalizations is to nan/null values, which we have in all variables.

In [22]:
SilverDataFrame.isna().sum().sum()

1016628

As we can observe, we have lots of missing data, and as there is no optimal way to fullfill these values, thus, we will test some to arrive to the optimal method for our data set.

First, we need to create some lists so our loops work.

In [23]:
df=SilverDataFrame
europe_list=['DEU','FRA','SWE','GBR','ESP','HRV','POL','GRC','AUT','NLD']
persian_list=['IRQ','QAT','ARE','SAU','AZE','YEM','OMN']
naf_list=['DZA','EGY','LBY','ISR','TUR','MAR']
saf_list=['SEN','ZAF','LBR','MOZ','CMR','NGA','GHA']
asia_list=['BGD','IND','VNM','THA','IDN','PHL','KOR']
latam_list=['MEX','BRA','ARG','PER','VEN','COL','CHL','PAN','CRI']
two_list=['USA','CHN']
country_list=europe_list+persian_list+naf_list+saf_list+asia_list+latam_list+two_list


We are attempting the linear interpolation, which is achieved by geometrically rendering a straight line between two adjacent points on a graph or plane.

In [24]:
dat=df.loc[df.loc[:, 'Country'] == country_list[0]]
datc=dat.interpolate(method="linear")
data=datc

for i in range(1,len(country_list)):
    dat=df.loc[df.loc[:, 'Country'] == country_list[i]]
    datc=dat.interpolate(method="linear")
    data=pd.concat((data, datc), axis = 0)
data.isna().sum().sum()

685787

Here we attempt the backward filling. (Filling the previous cell with future values)

In [25]:
dat=df.loc[df.loc[:, 'Country'] == country_list[0]]
datc=dat.fillna(method='bfill')
data=datc

for i in range(1,len(country_list)):
    dat=df.loc[df.loc[:, 'Country'] == country_list[i]]
    datc=dat.fillna(method='bfill')
    data=pd.concat((data, datc), axis = 0)
data.isna().sum().sum()

498648

Here we will attempt the forward filling. (Filling the next cell with previous values)

In [26]:
dat=df.loc[df.loc[:, 'Country'] == country_list[0]]
datc=dat.fillna(method='ffill')
data=datc

for i in range(1,len(country_list)):
    dat=df.loc[df.loc[:, 'Country'] == country_list[i]]
    datc=dat.fillna(method='ffill')
    data=pd.concat((data, datc), axis = 0)
data.isna().sum().sum()

685787

And as none of the methods have worked out correctly, independently, we are going to mix them, to achieve a better result.

In [27]:
dat=df.loc[df.loc[:, 'Country'] == country_list[0]]
datc=dat.interpolate(method="linear")
datc=datc.fillna(method='ffill')
data=datc

for i in range(1,len(country_list)):
    dat=df.loc[df.loc[:, 'Country'] == country_list[i]]
    datc=dat.interpolate(method="linear")
    datc=datc.fillna(method='ffill')
    data=pd.concat((data, datc), axis = 0)
data.isna().sum().sum()

685787

In [28]:
dat=df.loc[df.loc[:, 'Country'] == country_list[0]]
datc=dat.interpolate(method="linear")
datc=datc.fillna(method='bfill')
data=datc

for i in range(1,len(country_list)):
    dat=df.loc[df.loc[:, 'Country'] == country_list[i]]
    datc=dat.interpolate(method="linear")
    datc=datc.fillna(method='bfill')
    data=pd.concat((data, datc), axis = 0)
data.isna().sum().sum()

310048

And finally, mixing the three methods all together.

In [29]:
dat=df.loc[df.loc[:, 'Country'] == country_list[0]]
datc=dat.interpolate(method="linear")
datf=datc.fillna(method='bfill')
datr=datf.fillna(method='ffill')
data=datr

for i in range(1,len(country_list)):
    dat=df.loc[df.loc[:, 'Country'] == country_list[i]]
    datc=dat.interpolate(method="linear")
    datc=datc.fillna(method='bfill')
    datc=datc.fillna(method='ffill')
    data=pd.concat((data, datc), axis = 0)
data.isna().sum().sum()

310048

Fix explanation

Therefore, the preferred method for the Nan values´ treatment that we are going to develop is a mix, between the linear interpolation and backwards filling. The linear interpolation a form of interpolation, which involves the generation of new values based on an existing set of values. Linear interpolation is achieved by geometrically rendering a straight line between two adjacent points on a graph or plane. Whereas the backwards filling, will help us to arrive to those values which have not been fullfilled with the linear interpolation.

Moreover, we are also going to scale all the values between the max and min of each country for each variable.

In [30]:
dat=df.loc[df.loc[:, 'Country'] == country_list[0]]
datc=dat.interpolate(method="linear")
datf=datc.fillna(method='bfill')
datr=datf.fillna(method='ffill')
data=datr

for i in range(1,len(country_list)):
    dat=df.loc[df.loc[:, 'Country'] == country_list[i]]
    datc=dat.interpolate(method="linear")
    datc=datc.fillna(method='bfill')
    datc=datc.fillna(method='ffill')
    data=pd.concat((data, datc), axis = 0)
data

Indicator,Country,Year,ARI treatment (% of children under 5 taken to a health provider),Access to clean fuels and technologies for cooking (% of population),"Access to clean fuels and technologies for cooking, rural (% of rural population)","Access to clean fuels and technologies for cooking, urban (% of urban population)",Access to electricity (% of population),"Access to electricity, rural (% of rural population)","Access to electricity, urban (% of urban population)",Account ownership at a financial institution or with a mobile-money-service provider (% of population ages 15+),...,Women who believe a husband is justified in beating his wife (any of five reasons) (%),Women who believe a husband is justified in beating his wife when she argues with him (%),Women who believe a husband is justified in beating his wife when she burns the food (%),Women who believe a husband is justified in beating his wife when she goes out without telling him (%),Women who believe a husband is justified in beating his wife when she neglects the children (%),Women who believe a husband is justified in beating his wife when she refuses sex with him (%),Women who were first married by age 15 (% of women ages 20-24),Women who were first married by age 18 (% of women ages 20-24),Women's share of population ages 15+ living with HIV (%),Young people (ages 15-24) newly infected with HIV
352,DEU,1990,,100.0,100.0,100.0,100.0,100.0,100.0,98.133621,...,,,,,,,,,19.1,500.0
353,DEU,1991,,100.0,100.0,100.0,100.0,100.0,100.0,98.133621,...,,,,,,,,,19.1,500.0
354,DEU,1992,,100.0,100.0,100.0,100.0,100.0,100.0,98.133621,...,,,,,,,,,19.1,500.0
355,DEU,1993,,100.0,100.0,100.0,100.0,100.0,100.0,98.133621,...,,,,,,,,,19.1,500.0
356,DEU,1994,,100.0,100.0,100.0,100.0,100.0,100.0,98.133621,...,,,,,,,,,19.1,500.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251,CHN,2017,,73.2,55.2,86.2,100.0,100.0,100.0,80.229118,...,,,,,,,,,,
252,CHN,2018,,75.6,59.0,87.4,100.0,100.0,100.0,80.229118,...,,,,,,,,,,
253,CHN,2019,,77.6,61.9,88.4,100.0,100.0,100.0,80.229118,...,,,,,,,,,,
254,CHN,2020,,79.4,65.2,89.4,100.0,100.0,100.0,80.229118,...,,,,,,,,,,


Now, we will drop the columns which have over 308 missing values (20%), because the absence of data creates an unreliable source.

In [31]:
for i in range(0, len(cols)):
    if data[cols[i]].isna().sum()>308:
        del(data[cols[i]])
        print(cols[i])
data

Adults (ages 15+) and children (ages 0-14) newly infected with HIV
Adults (ages 15-49) newly infected with HIV
Antiretroviral therapy coverage (% of people living with HIV)
Antiretroviral therapy coverage for PMTCT (% of pregnant women living with HIV)
ARI treatment (% of children under 5 taken to a health provider)
Average transaction cost of sending remittances to a specific country (%)
Average working hours of children, study and work, ages 7-14 (hours per week)
Average working hours of children, study and work, female, ages 7-14 (hours per week)
Average working hours of children, study and work, male, ages 7-14 (hours per week)
Average working hours of children, working only, ages 7-14 (hours per week)
Average working hours of children, working only, female, ages 7-14 (hours per week)
Average working hours of children, working only, male, ages 7-14 (hours per week)
Bank capital to assets ratio (%)
Bank liquid reserves to bank assets ratio (%)
Bank nonperforming loans to total gross

Indicator,Country,Year,Access to clean fuels and technologies for cooking (% of population),"Access to clean fuels and technologies for cooking, rural (% of rural population)","Access to clean fuels and technologies for cooking, urban (% of urban population)",Access to electricity (% of population),"Access to electricity, rural (% of rural population)","Access to electricity, urban (% of urban population)",Account ownership at a financial institution or with a mobile-money-service provider (% of population ages 15+),"Account ownership at a financial institution or with a mobile-money-service provider, female (% of population ages 15+)",...,Urban population growth (annual %),Urban population living in areas where elevation is below 5 meters (% of total population),"Vulnerable employment, female (% of female employment) (modeled ILO estimate)","Vulnerable employment, male (% of male employment) (modeled ILO estimate)","Vulnerable employment, total (% of total employment) (modeled ILO estimate)","Wage and salaried workers, female (% of female employment) (modeled ILO estimate)","Wage and salaried workers, male (% of male employment) (modeled ILO estimate)","Wage and salaried workers, total (% of total employment) (modeled ILO estimate)","Water productivity, total (constant 2015 US$ GDP per cubic meter of total freshwater withdrawal)",Women Business and the Law Index Score (scale 1-100)
352,DEU,1990,100.0,100.0,100.0,100.0,100.0,100.0,98.133621,98.704536,...,1.056365,3.031776,5.700000,4.800000,5.170000,92.050003,89.110001,90.330002,54.519497,71.250
353,DEU,1991,100.0,100.0,100.0,100.0,100.0,100.0,98.133621,98.704536,...,0.934908,3.029378,5.700000,4.800000,5.170000,92.050003,89.110001,90.330002,54.519497,71.250
354,DEU,1992,100.0,100.0,100.0,100.0,100.0,100.0,98.133621,98.704536,...,0.884470,3.026980,5.740000,4.930000,5.270000,91.910004,88.589996,89.970001,54.519497,71.250
355,DEU,1993,100.0,100.0,100.0,100.0,100.0,100.0,98.133621,98.704536,...,0.843967,3.024581,5.850000,5.040000,5.380000,91.669998,88.250000,89.669998,56.039631,71.250
356,DEU,1994,100.0,100.0,100.0,100.0,100.0,100.0,98.133621,98.704536,...,0.636245,3.022183,5.610000,5.200000,5.370000,91.629997,87.739998,89.370003,57.559764,71.250
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251,CHN,2017,73.2,55.2,86.2,100.0,100.0,100.0,80.229118,76.364731,...,2.739664,4.203002,46.530001,42.949999,44.530002,52.430000,54.169998,53.400002,21.358958,75.625
252,CHN,2018,75.6,59.0,87.4,100.0,100.0,100.0,80.229118,76.364731,...,2.503401,4.203002,45.720001,41.940001,43.609999,53.209999,55.139999,54.290001,21.358958,75.625
253,CHN,2019,77.6,61.9,88.4,100.0,100.0,100.0,80.229118,76.364731,...,2.290177,4.203002,44.760000,40.819999,42.540000,54.150002,56.279999,55.340000,21.358958,75.625
254,CHN,2020,79.4,65.2,89.4,100.0,100.0,100.0,80.229118,76.364731,...,2.066047,4.203002,44.760000,40.819999,42.540000,54.150002,56.279999,55.340000,21.358958,75.625


As a result we have dropped the *Gender equality* and *Mortality-pollution* variables.

In [32]:
columns=data.columns.values.tolist()

In [33]:
datae=data.loc[data.loc[:, 'Country'] == country_list[0]]
for i in range(2,len(columns)):
    a=columns[i]
    datae[a]=datae[a]/datae.iloc[0,i]
datau=datae

In [34]:
for u in range(1,len(country_list)):
    datae=data.loc[data.loc[:, 'Country'] == country_list[u]]   
    for i in range(2,len(columns)):
        a=columns[i]
        datae[a]=datae[a]/datae.iloc[0,i]
    datau=pd.concat((datau, datae), axis = 0)
data=datau
data

Indicator,Country,Year,Access to clean fuels and technologies for cooking (% of population),"Access to clean fuels and technologies for cooking, rural (% of rural population)","Access to clean fuels and technologies for cooking, urban (% of urban population)",Access to electricity (% of population),"Access to electricity, rural (% of rural population)","Access to electricity, urban (% of urban population)",Account ownership at a financial institution or with a mobile-money-service provider (% of population ages 15+),"Account ownership at a financial institution or with a mobile-money-service provider, female (% of population ages 15+)",...,Urban population growth (annual %),Urban population living in areas where elevation is below 5 meters (% of total population),"Vulnerable employment, female (% of female employment) (modeled ILO estimate)","Vulnerable employment, male (% of male employment) (modeled ILO estimate)","Vulnerable employment, total (% of total employment) (modeled ILO estimate)","Wage and salaried workers, female (% of female employment) (modeled ILO estimate)","Wage and salaried workers, male (% of male employment) (modeled ILO estimate)","Wage and salaried workers, total (% of total employment) (modeled ILO estimate)","Water productivity, total (constant 2015 US$ GDP per cubic meter of total freshwater withdrawal)",Women Business and the Law Index Score (scale 1-100)
352,DEU,1990,1.000000,1.000000,1.000000,1.000000,1.000000,1.0,1.000000,1.000000,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
353,DEU,1991,1.000000,1.000000,1.000000,1.000000,1.000000,1.0,1.000000,1.000000,...,0.885023,0.999209,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
354,DEU,1992,1.000000,1.000000,1.000000,1.000000,1.000000,1.0,1.000000,1.000000,...,0.837277,0.998418,1.007018,1.027083,1.019342,0.998479,0.994164,0.996015,1.000000,1.000000
355,DEU,1993,1.000000,1.000000,1.000000,1.000000,1.000000,1.0,1.000000,1.000000,...,0.798935,0.997627,1.026316,1.050000,1.040619,0.995872,0.990349,0.992693,1.027882,1.000000
356,DEU,1994,1.000000,1.000000,1.000000,1.000000,1.000000,1.0,1.000000,1.000000,...,0.602296,0.996836,0.984210,1.083333,1.038685,0.995437,0.984626,0.989372,1.055765,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251,CHN,2017,1.742857,2.348936,1.261156,1.030696,1.048707,1.0,1.257169,1.272562,...,0.635700,1.362819,0.707250,0.617275,0.656204,1.555786,1.984976,1.768798,8.610987,1.273684
252,CHN,2018,1.800000,2.510638,1.278713,1.030696,1.048707,1.0,1.257169,1.272562,...,0.580879,1.362819,0.694938,0.602759,0.642647,1.578932,2.020520,1.798278,8.610987,1.273684
253,CHN,2019,1.847619,2.634043,1.293343,1.030696,1.048707,1.0,1.257169,1.272562,...,0.531403,1.362819,0.680347,0.586663,0.626879,1.606825,2.062294,1.833057,8.610987,1.273684
254,CHN,2020,1.890476,2.774468,1.307974,1.030696,1.048707,1.0,1.257169,1.272562,...,0.479397,1.362819,0.680347,0.586663,0.626879,1.606825,2.062294,1.833057,8.610987,1.273684


For the next part of analyzing this data, we think it is gonna be interesting to have it classify by the categories of the Country groups defined before, to which we call "Continent". This category is useful as it groups the nations with similar economies or geographical proximity, so we can extract common conclusions from them.

We create a dictionary with the regions and the countries included in each one. Where we will relate the countries and regions so then we can apply the .map function and arrive to the final dataframe.

In [139]:
countries_by_region = {
    "Europe": ('DEU','FRA','SWE','GBR','ESP','HRV','POL','GRC','AUT','NLD'),
    'Persian Gulf': ('IRQ','QAT','ARE','SAU','AZE','YEM','YDR','OMN'),
    'North Africa':('DZA','EGY','LBY','ISR','TUR','MAR'),
    'South Africa':('SEN','ZAF','LBR','MOZ','CMR','NGA','GHA'),
    'Asia':('BGD','IND','VNM','THA','IDN','PHL','KOR'),
    'Latam':('MEX','BRA','ARG','PER','VEN','COL','CHL','PAN','CRI'),
    'Pair':('USA','CHN')
    }

all_countries = {}
for region in countries_by_region.keys():
  for country in countries_by_region[region]:
    all_countries[country] = region

print(all_countries)

{'DEU': 'Europe', 'FRA': 'Europe', 'SWE': 'Europe', 'GBR': 'Europe', 'ESP': 'Europe', 'HRV': 'Europe', 'POL': 'Europe', 'GRC': 'Europe', 'AUT': 'Europe', 'NLD': 'Europe', 'IRQ': 'Persian Gulf', 'QAT': 'Persian Gulf', 'ARE': 'Persian Gulf', 'SAU': 'Persian Gulf', 'AZE': 'Persian Gulf', 'YEM': 'Persian Gulf', 'YDR': 'Persian Gulf', 'OMN': 'Persian Gulf', 'DZA': 'North Africa', 'EGY': 'North Africa', 'LBY': 'North Africa', 'ISR': 'North Africa', 'TUR': 'North Africa', 'MAR': 'North Africa', 'SEN': 'South Africa', 'ZAF': 'South Africa', 'LBR': 'South Africa', 'MOZ': 'South Africa', 'CMR': 'South Africa', 'NGA': 'South Africa', 'GHA': 'South Africa', 'BGD': 'Asia', 'IND': 'Asia', 'VNM': 'Asia', 'THA': 'Asia', 'IDN': 'Asia', 'PHL': 'Asia', 'KOR': 'Asia', 'MEX': 'Latam', 'BRA': 'Latam', 'ARG': 'Latam', 'PER': 'Latam', 'VEN': 'Latam', 'COL': 'Latam', 'CHL': 'Latam', 'PAN': 'Latam', 'CRI': 'Latam', 'USA': 'Pair', 'CHN': 'Pair'}


In [140]:
data['Continent']=data['Country'].map(all_countries)
Goldendataframe=data
Goldendataframe

Indicator,Country,Year,Access to clean fuels and technologies for cooking (% of population),"Access to clean fuels and technologies for cooking, rural (% of rural population)","Access to clean fuels and technologies for cooking, urban (% of urban population)",Access to electricity (% of population),"Access to electricity, rural (% of rural population)","Access to electricity, urban (% of urban population)",Account ownership at a financial institution or with a mobile-money-service provider (% of population ages 15+),"Account ownership at a financial institution or with a mobile-money-service provider, female (% of population ages 15+)",...,Urban population living in areas where elevation is below 5 meters (% of total population),"Vulnerable employment, female (% of female employment) (modeled ILO estimate)","Vulnerable employment, male (% of male employment) (modeled ILO estimate)","Vulnerable employment, total (% of total employment) (modeled ILO estimate)","Wage and salaried workers, female (% of female employment) (modeled ILO estimate)","Wage and salaried workers, male (% of male employment) (modeled ILO estimate)","Wage and salaried workers, total (% of total employment) (modeled ILO estimate)","Water productivity, total (constant 2015 US$ GDP per cubic meter of total freshwater withdrawal)",Women Business and the Law Index Score (scale 1-100),Continent
352,DEU,1990,1.000,1.000,1.000,1.0,1.0,1.0,1.00000,1.00000,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,Europe
353,DEU,1991,1.000,1.000,1.000,1.0,1.0,1.0,1.00000,1.00000,...,0.999209,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,Europe
354,DEU,1992,1.000,1.000,1.000,1.0,1.0,1.0,1.00000,1.00000,...,0.998418,1.007018,1.027083,1.019342,0.998479,0.994164,0.996015,1.000000,1.000000,Europe
355,DEU,1993,1.000,1.000,1.000,1.0,1.0,1.0,1.00000,1.00000,...,0.997627,1.026316,1.050000,1.040619,0.995872,0.990349,0.992693,1.027882,1.000000,Europe
356,DEU,1994,1.000,1.000,1.000,1.0,1.0,1.0,1.00000,1.00000,...,0.996836,0.984210,1.083333,1.038685,0.995437,0.984626,0.989372,1.055765,1.000000,Europe
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251,CHN,2017,0.732,0.552,0.862,1.0,1.0,1.0,0.81755,0.77367,...,1.386317,8.163158,8.947917,8.613153,0.569582,0.607900,0.591166,0.391767,1.061404,Pair
252,CHN,2018,0.756,0.590,0.874,1.0,1.0,1.0,0.81755,0.77367,...,1.386317,8.021053,8.737500,8.435203,0.578055,0.618786,0.601018,0.391767,1.061404,Pair
253,CHN,2019,0.776,0.619,0.884,1.0,1.0,1.0,0.81755,0.77367,...,1.386317,7.852632,8.504167,8.228240,0.588267,0.631579,0.612643,0.391767,1.061404,Pair
254,CHN,2020,0.794,0.652,0.894,1.0,1.0,1.0,0.81755,0.77367,...,1.386317,7.852632,8.504167,8.228240,0.588267,0.631579,0.612643,0.391767,1.061404,Pair


With that all, we export our dataframe all-in-one and by the continent category.

In [141]:
Goldendataframe.to_csv(os.getcwd()+'/Data/GoldenDataFrame.csv')

In [142]:
for region, data in Goldendataframe.groupby('Continent'):
   data.to_csv(os.getcwd()+'/Data/{}.csv'.format(region))

CATEGORISATION OF VARIABLES FOR A DEEPER STUDY

In [143]:
#Pivot dataframe for having all variables inside the same column for better treatment.
columns_golden=list(Goldendataframe.columns)
del columns_golden[0:2]

In [144]:
Categorization=Goldendataframe.set_index(['Country','Year', 'Continent']).stack().reset_index()
Categorization.rename(columns={'Indicator':'Indicator Name','0':'Value'},inplace=True)
Categorization

Unnamed: 0,Country,Year,Continent,Indicator Name,0
0,DEU,1990,Europe,Access to clean fuels and technologies for coo...,1.000000
1,DEU,1990,Europe,Access to clean fuels and technologies for coo...,1.000000
2,DEU,1990,Europe,Access to clean fuels and technologies for coo...,1.000000
3,DEU,1990,Europe,Access to electricity (% of population),1.000000
4,DEU,1990,Europe,"Access to electricity, rural (% of rural popul...",1.000000
...,...,...,...,...,...
1511898,CHN,2021,Pair,"Wage and salaried workers, female (% of female...",0.588267
1511899,CHN,2021,Pair,"Wage and salaried workers, male (% of male emp...",0.631579
1511900,CHN,2021,Pair,"Wage and salaried workers, total (% of total e...",0.612643
1511901,CHN,2021,Pair,"Water productivity, total (constant 2015 US$ G...",0.391767


In [145]:
#For indicators with many units, just keeping the US$.
discard=["annual % growth","constant 2015 US$","% of GNI","constant LCU","current LCU"]
Categorization[~Categorization['Indicator Name'].str.contains('|'.join(discard))]

Unnamed: 0,Country,Year,Continent,Indicator Name,0
0,DEU,1990,Europe,Access to clean fuels and technologies for coo...,1.000000
1,DEU,1990,Europe,Access to clean fuels and technologies for coo...,1.000000
2,DEU,1990,Europe,Access to clean fuels and technologies for coo...,1.000000
3,DEU,1990,Europe,Access to electricity (% of population),1.000000
4,DEU,1990,Europe,"Access to electricity, rural (% of rural popul...",1.000000
...,...,...,...,...,...
1511898,CHN,2021,Pair,"Wage and salaried workers, female (% of female...",0.588267
1511899,CHN,2021,Pair,"Wage and salaried workers, male (% of male emp...",0.631579
1511900,CHN,2021,Pair,"Wage and salaried workers, total (% of total e...",0.612643
1511901,CHN,2021,Pair,"Water productivity, total (constant 2015 US$ G...",0.391767


In [146]:
#To check previous step:
#BronzeDataFrame.apply(lambda row: row.astype(str).str.contains('LCU').any(), axis=1)

In [147]:
#Make a new column with the units, that correspond to the string inside the first parenthesis of the indicator name column.
Categorization['Units']=Categorization['Indicator Name'].str.extract('(\(.*\))')
Categorization

Unnamed: 0,Country,Year,Continent,Indicator Name,0,Units
0,DEU,1990,Europe,Access to clean fuels and technologies for coo...,1.000000,(% of population)
1,DEU,1990,Europe,Access to clean fuels and technologies for coo...,1.000000,(% of rural population)
2,DEU,1990,Europe,Access to clean fuels and technologies for coo...,1.000000,(% of urban population)
3,DEU,1990,Europe,Access to electricity (% of population),1.000000,(% of population)
4,DEU,1990,Europe,"Access to electricity, rural (% of rural popul...",1.000000,(% of rural population)
...,...,...,...,...,...,...
1511898,CHN,2021,Pair,"Wage and salaried workers, female (% of female...",0.588267,(% of female employment) (modeled ILO estimate)
1511899,CHN,2021,Pair,"Wage and salaried workers, male (% of male emp...",0.631579,(% of male employment) (modeled ILO estimate)
1511900,CHN,2021,Pair,"Wage and salaried workers, total (% of total e...",0.612643,(% of total employment) (modeled ILO estimate)
1511901,CHN,2021,Pair,"Water productivity, total (constant 2015 US$ G...",0.391767,(constant 2015 US$ GDP per cubic meter of tota...


In [148]:
#Delete the extracted information from origin column. 
Categorization['Indicator Name']=Categorization['Indicator Name'].str.replace(r"\(.*\)","")
Categorization

Unnamed: 0,Country,Year,Continent,Indicator Name,0,Units
0,DEU,1990,Europe,Access to clean fuels and technologies for coo...,1.000000,(% of population)
1,DEU,1990,Europe,Access to clean fuels and technologies for coo...,1.000000,(% of rural population)
2,DEU,1990,Europe,Access to clean fuels and technologies for coo...,1.000000,(% of urban population)
3,DEU,1990,Europe,Access to electricity,1.000000,(% of population)
4,DEU,1990,Europe,"Access to electricity, rural",1.000000,(% of rural population)
...,...,...,...,...,...,...
1511898,CHN,2021,Pair,"Wage and salaried workers, female",0.588267,(% of female employment) (modeled ILO estimate)
1511899,CHN,2021,Pair,"Wage and salaried workers, male",0.631579,(% of male employment) (modeled ILO estimate)
1511900,CHN,2021,Pair,"Wage and salaried workers, total",0.612643,(% of total employment) (modeled ILO estimate)
1511901,CHN,2021,Pair,"Water productivity, total",0.391767,(constant 2015 US$ GDP per cubic meter of tota...


In [149]:
#Checking cases of two parenthesis
two_parent=Categorization[Categorization['Indicator Name'].str.contains('Contributing family workers')]
two_parent

Unnamed: 0,Country,Year,Continent,Indicator Name,0,Units
151,DEU,1990,Europe,"Contributing family workers, female",1.000000,(% of female employment) (modeled ILO estimate)
152,DEU,1990,Europe,"Contributing family workers, male",1.000000,(% of male employment) (modeled ILO estimate)
153,DEU,1990,Europe,"Contributing family workers, total",1.000000,(% of total employment) (modeled ILO estimate)
1151,DEU,1991,Europe,"Contributing family workers, female",1.000000,(% of female employment) (modeled ILO estimate)
1152,DEU,1991,Europe,"Contributing family workers, male",1.000000,(% of male employment) (modeled ILO estimate)
...,...,...,...,...,...,...
1510141,CHN,2020,Pair,"Contributing family workers, male",14.564103,(% of male employment) (modeled ILO estimate)
1510142,CHN,2020,Pair,"Contributing family workers, total",7.096551,(% of total employment) (modeled ILO estimate)
1511094,CHN,2021,Pair,"Contributing family workers, female",5.481356,(% of female employment) (modeled ILO estimate)
1511095,CHN,2021,Pair,"Contributing family workers, male",14.564103,(% of male employment) (modeled ILO estimate)


In [150]:
#As there were variables with more than one parentheses element, we are going to separate them since they are not related to units. They go into Other specification column, although it is not relevant for our study.
Categorization[['Units','Other specification']]=Categorization['Units'].str.split("\) ", n=1,expand=True)
Categorization

Unnamed: 0,Country,Year,Continent,Indicator Name,0,Units,Other specification
0,DEU,1990,Europe,Access to clean fuels and technologies for coo...,1.000000,(% of population),
1,DEU,1990,Europe,Access to clean fuels and technologies for coo...,1.000000,(% of rural population),
2,DEU,1990,Europe,Access to clean fuels and technologies for coo...,1.000000,(% of urban population),
3,DEU,1990,Europe,Access to electricity,1.000000,(% of population),
4,DEU,1990,Europe,"Access to electricity, rural",1.000000,(% of rural population),
...,...,...,...,...,...,...,...
1511898,CHN,2021,Pair,"Wage and salaried workers, female",0.588267,(% of female employment,(modeled ILO estimate)
1511899,CHN,2021,Pair,"Wage and salaried workers, male",0.631579,(% of male employment,(modeled ILO estimate)
1511900,CHN,2021,Pair,"Wage and salaried workers, total",0.612643,(% of total employment,(modeled ILO estimate)
1511901,CHN,2021,Pair,"Water productivity, total",0.391767,(constant 2015 US$ GDP per cubic meter of tota...,


In [151]:
#Checking of previous step, as not all of them have this second element.
Categorization['Other specification'].unique()

array([None, nan, '(per 100,000 adults)', '(modeled ILO estimate)',
       '(cumulative)', '(national estimate)',
       'per $1,000 GDP (constant 2017 PPP)', '(constant 2015 US$)',
       '(constant LCU)', '(current LCU)', '(current US$)', '(scale 0-1)',
       'poverty line due to out-of-pocket health care expenditure (% of poverty line)',
       'poverty line due to out-of-pocket health care expenditure (USD)',
       '(%)', '(% of population)',
       'poverty line by out-of-pocket health care expenditure (%)'],
      dtype=object)

In [152]:
#Getting the subclassification of some variables that have information divided for small groups. These categorization is specified at the end of the variable separated by a ",".
Categorization[['Subgroup']]=Categorization['Indicator Name'].str.extract(',(?P<field>[^,]*?)$')
Categorization

Unnamed: 0,Country,Year,Continent,Indicator Name,0,Units,Other specification,Subgroup
0,DEU,1990,Europe,Access to clean fuels and technologies for coo...,1.000000,(% of population),,
1,DEU,1990,Europe,Access to clean fuels and technologies for coo...,1.000000,(% of rural population),,rural
2,DEU,1990,Europe,Access to clean fuels and technologies for coo...,1.000000,(% of urban population),,urban
3,DEU,1990,Europe,Access to electricity,1.000000,(% of population),,
4,DEU,1990,Europe,"Access to electricity, rural",1.000000,(% of rural population),,rural
...,...,...,...,...,...,...,...,...
1511898,CHN,2021,Pair,"Wage and salaried workers, female",0.588267,(% of female employment,(modeled ILO estimate),female
1511899,CHN,2021,Pair,"Wage and salaried workers, male",0.631579,(% of male employment,(modeled ILO estimate),male
1511900,CHN,2021,Pair,"Wage and salaried workers, total",0.612643,(% of total employment,(modeled ILO estimate),total
1511901,CHN,2021,Pair,"Water productivity, total",0.391767,(constant 2015 US$ GDP per cubic meter of tota...,,total


In [153]:
#Delete the extracted information from origin column. 
Categorization['Indicator Name']=Categorization['Indicator Name'].str.replace(',(?P<field>[^,]*?)$',"")
Categorization

Unnamed: 0,Country,Year,Continent,Indicator Name,0,Units,Other specification,Subgroup
0,DEU,1990,Europe,Access to clean fuels and technologies for coo...,1.000000,(% of population),,
1,DEU,1990,Europe,Access to clean fuels and technologies for coo...,1.000000,(% of rural population),,rural
2,DEU,1990,Europe,Access to clean fuels and technologies for coo...,1.000000,(% of urban population),,urban
3,DEU,1990,Europe,Access to electricity,1.000000,(% of population),,
4,DEU,1990,Europe,Access to electricity,1.000000,(% of rural population),,rural
...,...,...,...,...,...,...,...,...
1511898,CHN,2021,Pair,Wage and salaried workers,0.588267,(% of female employment,(modeled ILO estimate),female
1511899,CHN,2021,Pair,Wage and salaried workers,0.631579,(% of male employment,(modeled ILO estimate),male
1511900,CHN,2021,Pair,Wage and salaried workers,0.612643,(% of total employment,(modeled ILO estimate),total
1511901,CHN,2021,Pair,Water productivity,0.391767,(constant 2015 US$ GDP per cubic meter of tota...,,total


In [154]:
#Checking of previous step, as not all of them have this second element.
Categorization['Subgroup']=Categorization['Subgroup'].replace(['None'],['total'])
Categorization['Subgroup']=Categorization['Subgroup'].fillna('total')
Categorization.drop(columns='Other specification', inplace=True)

In [155]:
Categorization['Indicator Name'].unique()

array(['Access to clean fuels and technologies for cooking ',
       'Access to clean fuels and technologies for cooking',
       'Access to electricity ', 'Access to electricity',
       'Account ownership at a financial institution or with a mobile-money-service provider ',
       'Account ownership at a financial institution or with a mobile-money-service provider',
       'Adjusted net enrollment rate',
       'Adjusted net enrollment rate, primary',
       'Adjusted net national income ',
       'Adjusted net national income per capita ', 'Adjusted net savings',
       'Adjusted savings: carbon dioxide damage ',
       'Adjusted savings: consumption of fixed capital ',
       'Adjusted savings: education expenditure ',
       'Adjusted savings: energy depletion ',
       'Adjusted savings: gross savings ',
       'Adjusted savings: mineral depletion ',
       'Adjusted savings: natural resources depletion ',
       'Adjusted savings: net national savings ',
       'Adjusted saving

In [156]:
Categorization

Unnamed: 0,Country,Year,Continent,Indicator Name,0,Units,Subgroup
0,DEU,1990,Europe,Access to clean fuels and technologies for coo...,1.000000,(% of population),total
1,DEU,1990,Europe,Access to clean fuels and technologies for coo...,1.000000,(% of rural population),rural
2,DEU,1990,Europe,Access to clean fuels and technologies for coo...,1.000000,(% of urban population),urban
3,DEU,1990,Europe,Access to electricity,1.000000,(% of population),total
4,DEU,1990,Europe,Access to electricity,1.000000,(% of rural population),rural
...,...,...,...,...,...,...,...
1511898,CHN,2021,Pair,Wage and salaried workers,0.588267,(% of female employment,female
1511899,CHN,2021,Pair,Wage and salaried workers,0.631579,(% of male employment,male
1511900,CHN,2021,Pair,Wage and salaried workers,0.612643,(% of total employment,total
1511901,CHN,2021,Pair,Water productivity,0.391767,(constant 2015 US$ GDP per cubic meter of tota...,total


------------------------------

FROM HERE ON, ADAPT FOR MAKING CORRELATIONS JUST FOR SUBGROUP=TOTAL.

In [136]:
columns=Goldendataframe.columns.values.tolist()
clist=Goldendataframe['Country'].unique()
common=['Unnamed: 0','Country','Year']

In the following cell, we have defined a function that will allow us to calculate the different posibilities of relations: cuadratic, cubic and logaritmic.

In [137]:
def multcolumn(frame):
    for u in range(2, len(columns)-1):
        name=columns[u]+'.l'
        name2=columns[u]+'.^2'
        name3=columns[u]+'.^3'
        namelog=columns[u]+'.log'
        frame.loc[:,name2] = frame[columns[u]]**2
        frame.loc[:,name3] = frame[columns[u]]**3
        frame.loc[:,namelog] = np.log(frame[columns[u]])
        frame.rename(columns={columns[u]:name}, inplace=True)

Moreover, we want to know the correlation between all the variables, so to acomplish this, we have created the following loop, which will help us create a new dataframe where we will have: the *Indicator*, the *Type* of relation, the value of the *R^2*, its *Behaviour*, the *Country* and the *Continent*.

In [138]:
df= pd.read_csv (os.getcwd()+'/Data/'+'GoldenDataFrame.csv')
multcolumn(df)


Firstly we are going to create two lists for the variables, which their p-value is under 0.05 for each correlation, so later on, we can calculate only the correlations of those variables.

In [139]:
df

Unnamed: 0.1,Unnamed: 0,Country,Year,Access to clean fuels and technologies for cooking (% of population).l,"Access to clean fuels and technologies for cooking, rural (% of rural population).l","Access to clean fuels and technologies for cooking, urban (% of urban population).l",Access to electricity (% of population).l,"Access to electricity, rural (% of rural population).l","Access to electricity, urban (% of urban population).l",Account ownership at a financial institution or with a mobile-money-service provider (% of population ages 15+).l,...,"Wage and salaried workers, male (% of male employment) (modeled ILO estimate).log","Wage and salaried workers, total (% of total employment) (modeled ILO estimate).^2","Wage and salaried workers, total (% of total employment) (modeled ILO estimate).^3","Wage and salaried workers, total (% of total employment) (modeled ILO estimate).log","Water productivity, total (constant 2015 US$ GDP per cubic meter of total freshwater withdrawal).^2","Water productivity, total (constant 2015 US$ GDP per cubic meter of total freshwater withdrawal).^3","Water productivity, total (constant 2015 US$ GDP per cubic meter of total freshwater withdrawal).log",Women Business and the Law Index Score (scale 1-100).^2,Women Business and the Law Index Score (scale 1-100).^3,Women Business and the Law Index Score (scale 1-100).log
0,352,DEU,1990,1.000,1.000,1.000,1.0,1.0,1.0,1.00000,...,0.000000,1.000000,1.000000,0.000000,1.000000,1.000000,0.000000,1.000000,1.000000,0.000000
1,353,DEU,1991,1.000,1.000,1.000,1.0,1.0,1.0,1.00000,...,0.000000,1.000000,1.000000,0.000000,1.000000,1.000000,0.000000,1.000000,1.000000,0.000000
2,354,DEU,1992,1.000,1.000,1.000,1.0,1.0,1.0,1.00000,...,-0.005853,0.992045,0.988091,-0.003993,1.000000,1.000000,0.000000,1.000000,1.000000,0.000000
3,355,DEU,1993,1.000,1.000,1.000,1.0,1.0,1.0,1.00000,...,-0.009698,0.985440,0.978240,-0.007333,1.056542,1.086001,0.027501,1.000000,1.000000,0.000000
4,356,DEU,1994,1.000,1.000,1.000,1.0,1.0,1.0,1.00000,...,-0.015494,0.978858,0.968455,-0.010685,1.114639,1.176797,0.054265,1.000000,1.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1531,251,CHN,2017,0.732,0.552,0.862,1.0,1.0,1.0,0.81755,...,-0.497744,0.349477,0.206599,-0.525659,0.153482,0.060129,-0.937087,1.126577,1.195753,0.059592
1532,252,CHN,2018,0.756,0.590,0.874,1.0,1.0,1.0,0.81755,...,-0.479996,0.361223,0.217102,-0.509130,0.153482,0.060129,-0.937087,1.126577,1.195753,0.059592
1533,253,CHN,2019,0.776,0.619,0.884,1.0,1.0,1.0,0.81755,...,-0.459532,0.375331,0.229944,-0.489974,0.153482,0.060129,-0.937087,1.126577,1.195753,0.059592
1534,254,CHN,2020,0.794,0.652,0.894,1.0,1.0,1.0,0.81755,...,-0.459532,0.375331,0.229944,-0.489974,0.153482,0.060129,-0.937087,1.126577,1.195753,0.059592


In [85]:
dat=df.loc[df.loc[:, 'Country'] == clist[0]]
listacorpe=[]
listacorsp=[]
clmns=dat.columns.values.tolist()
dat.replace([np.inf, -np.inf], np.nan, inplace=True)
for c in range(0, len(clmns)):
    if dat[clmns[c]].isna().sum()>=1:
        del(dat[clmns[c]])
pilares=dat.columns.values.tolist()
for u in range(0,len(pilares)):
    if is_numeric_dtype(dat[pilares[u]]):
        correlation, pvalue=pearsonr(dat[pilares[u]], dat['GDP (current US$).l'])
        if pvalue<=0.05:
            listacorpe.append(pilares[u])
        else:
            pass
        correlation, pvalue=spearmanr(dat[pilares[u]], dat['GDP (current US$).l'])
        if pvalue<=0.05:
            listacorsp.append(pilares[u])
        else:
            pass
    else:
        pass

Secondly, we need to calculate the correlation table for each country, therefore we use the basic function `corr()` which provides either the Pearson correlation table or the Spearman correlation table, as well as a filter for the countries.

In [140]:
dat=df.loc[df.loc[:, 'Country'] == clist[0]]

datp=dat[dat.columns[dat.columns.isin(listacorpe)]]
corp=datp.corr('pearson')

datsp=dat[dat.columns[dat.columns.isin(listacorsp)]]
cors=datsp.corr('spearman')

Then we calculate the coefficient of determination which is the correlation squared.

In [141]:
corp.loc[:,'R^2 Pearson'] = corp['GDP (current US$).l']**2

cors.loc[:,'R^2 Spearman'] = cors['GDP (current US$).l']**2

Moreover, we are going to create new columns to know which *Indicator* are we talking about, and the *Type* of correlation that is being analyzed (linear, cuadratic, cubic or logarithmic)

In [142]:
corp.loc[:,'Indicator']=corp.index
corp[['Indicator','Type']]=corp.Indicator.str.split('.',1, expand=True)

cors.loc[:,'Indicator']=cors.index
cors[['Indicator','Type']]=cors.Indicator.str.split('.',1, expand=True)

Now, we can apply the filter we have consider that is enough, R^2>=0.75 to filter the correlations.

In [143]:
corpcolumn=corp[['Indicator','R^2 Pearson','Type','GDP (current US$).l']]
corpcolumn=corpcolumn.loc[corpcolumn.loc[:, 'R^2 Pearson'] >= 0.75]

corscolumn=cors[['Indicator','R^2 Spearman','Type','GDP (current US$).l']]
corscolumn=corscolumn.loc[corscolumn.loc[:, 'R^2 Spearman'] >= 0.75]

Furthermore, we add all the columns that we have created into a data frame, thanks to the following cell.

In [144]:
idp=corpcolumn.groupby('Indicator')['R^2 Pearson'].transform(max)==corpcolumn['R^2 Pearson']
corpcolumn[idp]
maxp_df=pd.DataFrame(corpcolumn[idp])

ids=corscolumn.groupby('Indicator')['R^2 Spearman'].transform(max)==corscolumn['R^2 Spearman']
corscolumn[ids]
maxs_df=pd.DataFrame(corscolumn[ids])

Here, we conmute the values, by expressions. For example if the correlation is positive, we want in the new column called *Behaviour* the word Positive. Or for the *Type* column if the greatest correlation is cuadratic we want to put, Cuadratic. We also add the country.

In [145]:
maxp_df['Behaviour']=np.where(maxp_df['GDP (current US$).l']>0, 'Positive', 'Negative')
maxp_df['Type']=maxp_df['Type'].replace(['l','^2','^3','log'],['Linear','Cuadratic','Cubic','Logarithmic'])
maxp_df['Country']= clist[0]

maxs_df['Behaviour']=np.where(maxs_df['GDP (current US$).l']>0, 'Positive', 'Negative')
maxs_df['Type']=maxs_df['Type'].replace(['l','^2','^3','log'],['Linear','Cuadratic','Cubic','Logarithmic'])
maxs_df['Country']= clist[0]

In addition, we also drop the columns which do not add any value, as *GDP*, *Year*, and *Unnamed:0*.

In [146]:
maxp_df.drop("GDP (current US$).l",axis=1,inplace=True)
maxp_df=maxp_df.reset_index(drop=True)
maxp_df = maxp_df.drop(maxp_df[maxp_df['Indicator']=='Year'].index)
maxp_df = maxp_df.drop(maxp_df[maxp_df['Indicator']=='GDP (current US$)'].index)
maxp_df = maxp_df.drop(maxp_df[maxp_df['Indicator']=='Unnamed: 0'].index)

maxs_df.drop("GDP (current US$).l",axis=1,inplace=True)
maxs_df=maxs_df.reset_index(drop=True)
maxs_df = maxs_df.drop(maxs_df[maxs_df['Indicator']=='Year'].index)
maxs_df = maxs_df.drop(maxs_df[maxs_df['Indicator']=='GDP (current US$)'].index)
maxs_df = maxs_df.drop(maxs_df[maxs_df['Indicator']=='Unnamed: 0'].index)
maxs_df=maxs_df.sort_values(by = 'R^2 Spearman',ascending = False)

And finally we sort the values in descending order by the column *R^2 Pearson*.

In [147]:
maxp_df_deu=maxp_df.sort_values(by = 'R^2 Pearson',ascending = False)
pearsondf= maxp_df_deu
spearmandf=maxs_df

So, we can do it with all the countries and create just one dataframe.

In [149]:
pearsondf
spearmandf
for i in range(1,len(clist)):
    dat=df.loc[df.loc[:, 'Country'] == clist[i]]
    listacorpe=[]
    listacorsp=[]
    clmns=dat.columns.values.tolist()
    dat.replace([np.inf, -np.inf], np.nan, inplace=True)
    for c in range(0, len(clmns)):
        if dat[clmns[c]].isna().sum()>=1:
            del(dat[clmns[c]])
    pilares=dat.columns.values.tolist()
    for u in range(0,len(pilares)):
        if is_numeric_dtype(dat[pilares[u]]):
            correlation, pvalue=pearsonr(dat[pilares[u]], dat['GDP (current US$).l'])
            if pvalue<=0.05:
                listacorpe.append(pilares[u])
            else:
                pass
            correlation, pvalue=spearmanr(dat[pilares[u]], dat['GDP (current US$).l'])
            if pvalue<=0.05:
                listacorsp.append(pilares[u])
            else:
                pass
        else:
            pass
    
    dat=df.loc[df.loc[:, 'Country'] == clist[i]]

    datp=dat[dat.columns[dat.columns.isin(listacorpe)]]
    corp=datp.corr('pearson')

    datsp=dat[dat.columns[dat.columns.isin(listacorsp)]]
    cors=datsp.corr('spearman')


    corp.loc[:,'R^2 Pearson'] = corp['GDP (current US$).l']**2

    cors.loc[:,'R^2 Spearman'] = cors['GDP (current US$).l']**2


    corp.loc[:,'Indicator']=corp.index
    corp[['Indicator','Type']]=corp.Indicator.str.split('.',1, expand=True)

    cors.loc[:,'Indicator']=cors.index
    cors[['Indicator','Type']]=cors.Indicator.str.split('.',1, expand=True)


    corpcolumn=corp[['Indicator','R^2 Pearson','Type','GDP (current US$).l']]
    corpcolumn=corpcolumn.loc[corpcolumn.loc[:, 'R^2 Pearson'] >= 0.75]
    
    corscolumn=cors[['Indicator','R^2 Spearman','Type','GDP (current US$).l']]
    corscolumn=corscolumn.loc[corscolumn.loc[:, 'R^2 Spearman'] >= 0.75]


    idp=corpcolumn.groupby('Indicator')['R^2 Pearson'].transform(max)==corpcolumn['R^2 Pearson']
    corpcolumn[idp]
    maxp_df=pd.DataFrame(corpcolumn[idp])

    ids=corscolumn.groupby('Indicator')['R^2 Spearman'].transform(max)==corscolumn['R^2 Spearman']
    corscolumn[ids]
    maxs_df=pd.DataFrame(corscolumn[ids])


    maxp_df['Behaviour']=np.where(maxp_df['GDP (current US$).l']>0, 'Positive', 'Negative')
    maxp_df['Type']=maxp_df['Type'].replace(['l','^2','^3','log'],['Linear','Cuadratic','Cubic','Logarithmic'])
    maxp_df['Country']= clist[i]

    maxs_df['Behaviour']=np.where(maxs_df['GDP (current US$).l']>0, 'Positive', 'Negative')
    maxs_df['Type']=maxs_df['Type'].replace(['l','^2','^3','log'],['Linear','Cuadratic','Cubic','Logarithmic'])
    maxs_df['Country']= clist[i]


    maxp_df.drop("GDP (current US$).l",axis=1,inplace=True)
    maxp_df=maxp_df.reset_index(drop=True)
    maxp_df = maxp_df.drop(maxp_df[maxp_df['Indicator']=='Year'].index)
    maxp_df = maxp_df.drop(maxp_df[maxp_df['Indicator']=='GDP (current US$)'].index)
    maxp_df = maxp_df.drop(maxp_df[maxp_df['Indicator']=='Unnamed: 0'].index)

    maxs_df.drop("GDP (current US$).l",axis=1,inplace=True)
    maxs_df=maxs_df.reset_index(drop=True)
    maxs_df = maxs_df.drop(maxs_df[maxs_df['Indicator']=='Year'].index)
    maxs_df = maxs_df.drop(maxs_df[maxs_df['Indicator']=='GDP (current US$)'].index)
    maxs_df = maxs_df.drop(maxs_df[maxs_df['Indicator']=='Unnamed: 0'].index)
    maxs_df=maxs_df.sort_values(by = 'R^2 Spearman',ascending = False)


    maxp_df=maxp_df.sort_values(by = 'R^2 Pearson',ascending = False)
    pearsondf=pd.concat((pearsondf, maxp_df), axis = 0)
    spearmandf=pd.concat((spearmandf, maxs_df), axis = 0)

corrtable=spearmandf.merge(pearsondf, left_on=('Indicator', 'Country','Type','Behaviour'), right_on=('Indicator', 'Country','Type','Behaviour'))
display(corrtable)

Unnamed: 0,Indicator,R^2 Spearman,Type,Behaviour,Country,R^2 Pearson
0,Adjusted net national income (current US$),0.996519,Linear,Positive,DEU,0.999141
1,Gross value added at basic prices (GVA) (curre...,0.996519,Linear,Positive,DEU,0.999851
2,GNI (current US$),0.996337,Linear,Positive,DEU,0.999362
3,Gross national expenditure (current US$),0.990490,Linear,Positive,DEU,0.997181
4,Final consumption expenditure (current US$),0.988302,Linear,Positive,DEU,0.996540
...,...,...,...,...,...,...
14012,Out-of-pocket expenditure (% of current health...,0.782573,Logarithmic,Negative,CHN,0.929883
14013,"Revenue, excluding grants (% of GDP)",0.778198,Logarithmic,Positive,CHN,0.795803
14014,Taxes on goods and services (current LCU),0.776123,Linear,Positive,CHN,0.978917
14015,Access to clean fuels and technologies for coo...,0.772788,Logarithmic,Positive,CHN,0.990567


In [150]:
columnssf=corrtable.Indicator.to_list()
columnsf=np.unique(columnssf)

In [151]:
powerind=[]
for i in range(0, len(columnsf)):
    powerind.append(columnssf.count(columnsf[i]))

df_indicators = pd.DataFrame(list(zip(columnsf,powerind)), columns = ['Indicator','Number of times repeated'])
df_indicators=df_indicators.sort_values(by = 'Number of times repeated',ascending = False)
df_indicators

Unnamed: 0,Indicator,Number of times repeated
277,"GNI per capita, Atlas method (current US$)",46
373,"Industry (including construction), value added...",46
260,GDP per capita (current US$),46
273,GNI (current US$),46
280,"GNI, Atlas method (current US$)",46
...,...,...
463,Merchandise exports to economies in the Arab W...,1
299,Gross capital formation (% of GDP),1
25,Adjusted savings: net national savings (% of GNI),1
787,Surface area (sq,1


Review widgets, obsoleted

Now that we’ve loaded the data, we can start right away to create widgets. These widgets are essentials to add interactivity to our visualizations. We’re going to use two widgets: both, multiple selection widgets. To create these widgets, we can use `ipywidgets` library that is available for Jupyter Notebook.

The first widget that we are going to create is the multiple selection widget. We can do this by using `SelectMultiple()attribute` from `ipywidgets`. With this widget, we have the option to visualize the R^2 only in particular selection of indicators instead of all.

The first argument that we should specify is `options` , which should contain the list of available options of our variable (in our case different indicators). The next one is `value` , which should contain the variable values that we want to display as default, and then `description` is for the text field to describe the name of the widget.The rest of options are just visual details.

In [71]:
unique_tri = demo2['Indicator'].unique()
tri = widgets.SelectMultiple(
    options = unique_tri.tolist(),
    value = ['Exports-G&S'],
    description='Indicator',
    disabled=False,
    layout = Layout(width='50%', height='80px')
)

def graf1(tri):
    dat=demo2.loc[demo2.loc[:, 'Indicator'].isin(np.array(tri))]
    a=px.choropleth(dat, locations="Country", locationmode='ISO-3', 
                     color="R^2 Pearson", hover_name="Country",hover_data = [dat.Type, dat.Behaviour],projection="natural earth",
                     color_continuous_scale='Reds', width=700, height=500, title= dat.Indicator.unique().tolist()[0])
    print(tri)
    a.show()
widgets.interactive(graf1, tri=tri)


NameError: name 'demo2' is not defined

To wrap up, we can create the second widget that is exactly the same as the previous multiple selection widget. The purpose of this widget is to enable us to choose which Continent we want to visualize. Below is the code implementation of this widget.

In [None]:
unique_tric = demo2['Continent'].unique()
tric = widgets.SelectMultiple(
    options = unique_tric.tolist(),
    value = ['North Africa'],
    description='Continent',
    disabled=False,
    layout = Layout(width='50%', height='80px')
)

def graf1(tric):
    dat=demo2.loc[demo2.loc[:, 'Continent'].isin(np.array(tric))]
    a=px.scatter(dat, x="R^2 Pearson", y='Indicator',
                     color="R^2 Pearson", hover_name="Country",hover_data = [dat.Type, dat.Behaviour],
                     color_continuous_scale='Blues', width=700, height=500, title= dat.Continent.unique().tolist()[0])
    a.show()
widgets.interactive(graf1, tric=tric)

interactive(children=(SelectMultiple(description='Continent', index=(2,), layout=Layout(height='80px', width='…

Now, if we execute the following loop, it will provide with the variables that follow a normal distribution.

In [None]:
for i in range(0,len(clist)):
    dat=df.loc[df.loc[:, 'Country'] == clist[i]]
    for e in range(2,len(columns)):
        data=dat.iloc[:, e]
        stat, p = shapiro(data)
        print(clist[i] +"-"+ columns[e])
        print('Statistical=%.3f, p=%.3f' % (stat, p))
        alpha = 0.05
        if p > alpha:
            print('Data is NORMAL ( H0 not denied )')
        else:
            pass

DEU-Exports-Commercial services
Statistical=0.957, p=0.229
Data is NORMAL ( H0 not denied )
DEU-Renewable electricity
Statistical=0.806, p=0.000
DEU-Employment-agriculture
Statistical=0.888, p=0.003
DEU-Employment-industry
Statistical=0.906, p=0.009
DEU-Employment-services
Statistical=0.877, p=0.002
DEU-Exports-G&S
Statistical=0.887, p=0.003
DEU-Fertility rate
Statistical=0.893, p=0.004
DEU-Foreign investment
Statistical=0.851, p=0.000
DEU-GDP
Statistical=0.927, p=0.031
DEU-Education GExp
Statistical=0.972, p=0.548
Data is NORMAL ( H0 not denied )
DEU-Workers high education
Statistical=0.887, p=0.003
DEU-Literacy rate
Statistical=0.809, p=0.000
DEU-Net migration
Statistical=nan, p=1.000
Data is NORMAL ( H0 not denied )
DEU-Mortality-infants
Statistical=nan, p=1.000
Data is NORMAL ( H0 not denied )
DEU-Health services use
Statistical=0.804, p=0.000
DEU-R&D GExp
Statistical=0.879, p=0.002
DEU-Ninis
Statistical=0.819, p=0.000
DEU-Suicide
Statistical=0.892, p=0.004
DEU-International taxes
