# EXTRACTION

Import libraries and functions.

In [1]:
import pandas as pd
import numpy as np
import glob
import os
import warnings
warnings.filterwarnings("ignore")
import functools as ft
from pyspark.sql.functions import concat, col, lit, split
import ipywidgets as widgets
from ipywidgets import Layout
from ipywidgets import interact, interact_manual
import plotly.express as px
from scipy import stats
from scipy.stats import shapiro
from scipy.stats import pearsonr
from scipy.stats import spearmanr
from pandas.api.types import is_numeric_dtype

Firstly we load the database from World Data Bank that has been downloaded and extracted in the *Data extraction* notebook. We acquire it from the predetermined path that is on our computer.

In [2]:
df= pd.read_csv (os.getcwd()+'/Data/'+'WDIData.csv')
df

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,Unnamed: 66
0,Africa Eastern and Southern,AFE,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.ZS,,,,,,,...,16.936004,17.337896,17.687093,18.140971,18.491344,18.825520,19.272212,19.628009,,
1,Africa Eastern and Southern,AFE,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.RU.ZS,,,,,,,...,6.499471,6.680066,6.859110,7.016238,7.180364,7.322294,7.517191,7.651598,,
2,Africa Eastern and Southern,AFE,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.UR.ZS,,,,,,,...,37.855399,38.046781,38.326255,38.468426,38.670044,38.722783,38.927016,39.042839,,
3,Africa Eastern and Southern,AFE,Access to electricity (% of population),EG.ELC.ACCS.ZS,,,,,,,...,31.794160,32.001027,33.871910,38.880173,40.261358,43.061877,44.270860,45.803485,,
4,Africa Eastern and Southern,AFE,"Access to electricity, rural (% of rural popul...",EG.ELC.ACCS.RU.ZS,,,,,,,...,18.663502,17.633986,16.464681,24.531436,25.345111,27.449908,29.641760,30.404935,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
384365,Zimbabwe,ZWE,Women who believe a husband is justified in be...,SG.VAW.REFU.ZS,,,,,,,...,,,14.500000,,,,,,,
384366,Zimbabwe,ZWE,Women who were first married by age 15 (% of w...,SP.M15.2024.FE.ZS,,,,,,,...,,,3.700000,,,,5.418352,,,
384367,Zimbabwe,ZWE,Women who were first married by age 18 (% of w...,SP.M18.2024.FE.ZS,,,,,,,...,,33.500000,32.400000,,,,33.658057,,,
384368,Zimbabwe,ZWE,Women's share of population ages 15+ living wi...,SH.DYN.AIDS.FE.ZS,,,,,,,...,59.200000,59.400000,59.500000,59.700000,59.900000,60.000000,60.200000,60.400000,,


# INTEGRATION

Moreover, to work more comfortably, we remove those columns not useful for us, as *Country Name* and *Indicator Code*, since with the *Country Code*, *Value* and the *Indicator Name* we have the relevant information.

In [3]:
df.drop(columns=["Country Name","Indicator Code"], axis=1, inplace=True)

FILTER 1: BY COUNTRY

From the almost two hundred countries we have information about in the worldwide database, we have decided to study 50 of them, grouping them by geographical and economical similiarities. With this, we can keep in our dataframe the selected countries.

Criteria for grouping:
- Europe: Germany, France, Sweden, United Kingdom, Spain, Croatia, Poland, Greece, Austria and Netherlands.

*Interesting countries of the European continent that can reflect events such as the Brexit process, the 2008 crisis or their historical strength.*
- Persian Gulf: Iraq, Qatar, United Arab Emirates, Arabia Saudita, Azerbayan, Yemen, Yemen Democratic and Oman.

*Countries located in the Persian Gulf, which have a similar economy based mainly on petrol and social structures.*
- North Africa: Algeria, Egiypt, Lybia, Israel, Turkey and Morroco.

*Countries of the african continent that are middle developed and with high mobility of people and goods.*
- South Africa: Senegal, South Africa, Liberia, Mozambique, Cameroon, Nigeria and Ghana.

*Countries of the south and central africa that are mainly subdeveloped and considered some of the poorest countries worldwide; but, on the contrary, one of them is highly developed.*
- Asia: Bangladesh, India, Vietnam, Thailand, Indonesia, Philipines and Korea (South).

*Converted in the last decades in the manufacturing of the world, they are subdeveloped countries with high population and childhood.*
- Latin America: Mexico, Brasil, Argentina, Peru, Venezuela, Colombia, Chile, Panama and Costa Rica.

*Countries located in same continet and some with singular political structures.* 
- Pair: USA and China.

*Although these countries seem to be confronted between them, they have been the top two most growing worlwide, despite the fact that culturally and economically they are completely distant.*


In [4]:
europe_list=['DEU','FRA','SWE','GBR','ESP','HRV','POL','GRC','AUT','NLD']
persian_list=['IRQ','QAT','ARE','SAU','AZE','YEM','YDR','OMN']
naf_list=['DZA','EGY','LBY','ISR','TUR','MAR']
saf_list=['SEN','ZAF','LBR','MOZ','CMR','NGA','GHA']
asia_list=['BGD','IND','VNM','THA','IDN','PHL','KOR']
latam_list=['MEX','BRA','ARG','PER','VEN','COL','CHL','PAN','CRI']
two_list=['USA','CHN']
country_list=europe_list+persian_list+naf_list+saf_list+asia_list+latam_list+two_list 

In [5]:
df1=df.loc[df['Country Code'].isin(country_list)]

Now we transpose the rows of years into the columns.

In [6]:
df2=(df1.set_index(["Country Code", "Indicator Name"]).stack().reset_index(name='Value').rename(columns={'level_2':'Date'}))
df2

Unnamed: 0,Country Code,Indicator Name,Date,Value
0,DZA,Access to clean fuels and technologies for coo...,2000,97.1
1,DZA,Access to clean fuels and technologies for coo...,2001,97.3
2,DZA,Access to clean fuels and technologies for coo...,2002,97.8
3,DZA,Access to clean fuels and technologies for coo...,2003,98.0
4,DZA,Access to clean fuels and technologies for coo...,2004,98.2
...,...,...,...,...
1769874,YEM,Young people (ages 15-24) newly infected with HIV,2016,200.0
1769875,YEM,Young people (ages 15-24) newly infected with HIV,2017,200.0
1769876,YEM,Young people (ages 15-24) newly infected with HIV,2018,200.0
1769877,YEM,Young people (ages 15-24) newly infected with HIV,2019,200.0


FILTER 2: BY YEAR

Our time range covers from 1960 to 2021. However, the record is not uniform and complete for all areas and indicators. We can appreaciate that specially in the first years of the last century, so many data is missing, then it makes no sense to study it. Besides, for the year 2021 many data is also lacking. Therefore, we would delimit our study between 1990 and 2020.

In [7]:
df2[['Date']] = df2[['Date']].astype(int)

In [8]:
df2.dtypes

Country Code       object
Indicator Name     object
Date                int32
Value             float64
dtype: object

In [9]:
df3 = df2[df2['Date'] > 1989]
df3

Unnamed: 0,Country Code,Indicator Name,Date,Value
0,DZA,Access to clean fuels and technologies for coo...,2000,97.1
1,DZA,Access to clean fuels and technologies for coo...,2001,97.3
2,DZA,Access to clean fuels and technologies for coo...,2002,97.8
3,DZA,Access to clean fuels and technologies for coo...,2003,98.0
4,DZA,Access to clean fuels and technologies for coo...,2004,98.2
...,...,...,...,...
1769874,YEM,Young people (ages 15-24) newly infected with HIV,2016,200.0
1769875,YEM,Young people (ages 15-24) newly infected with HIV,2017,200.0
1769876,YEM,Young people (ages 15-24) newly infected with HIV,2018,200.0
1769877,YEM,Young people (ages 15-24) newly infected with HIV,2019,200.0


In [None]:
BronzeDataFrame=df3

---- 

FILTER 3: BY INDICATOR

As there are lots of indicators that have very similar meaning we have decided to select some indicators to perform the study (**Indicator group** = *Name of the selected indicator*):
- **GDP** = *GDP (current US$), measures the monetary value of final goods and services produced in a country at a given period of time.*
- **Literacy** = *Literacy rate, % of people ages 15 and above which are able to expand one's knowledge of reading and writing in order to develop one's thinking and learning for the purpose of understanding oneself and the world. Government expenditure on education, total % of government expenditure incurred on education service.*
- **Migration** = *Net migration, difference between the number of immigrants (people coming into an area) and the number of emigrants (people leaving an area) throughout the year.*
- **Exports** = *Commercial service exports (current US$)* and *Exports of goods and services (current US$). Exports term is referred to the goods and services which are produced in a country and sold to buyers in another one.*
- **International trading** = *Taxes on international trade. This one reflects the amount of money that a government collects thanks to all kinds of taxes (on products that enter and leave, customs...)..*
- **Fertility** = *Fertility rate, mean of total births per woman. How many childs have born during a year per women.*
- **Healthcare** = *% of people using at least basic sanitation services. Amount of children covert by sanitation.*
- **Employment** = *Employment in agriculture (% of total employment), *Employment in services (% of total employment), and *Employment in industry (% of total employment). Amount of people employed in these three relevant sectors.*
- **Renewable energy** = *Electricity production from renewable sources, excluding hydroelectric. The units are KWh.*
- **Mortality** = *Number of infant deaths.*
- **Outside investment** = *Foreign direct investment, which is the net inflow of investment to acquire a lasting management interest  (BoP, current US$).*
- **Pollution** = *Mortality rate over 100,000 population attributed to household and ambient air pollution and age-standardized.*
- **Alcoholism** = *Total alcohol consumed per capita measure in liters of pure alcohol, taking into account people who are 15 or more years of age.*
- **Tech adoption** = *% of GDP which goes to the research and development expenditure.*
- **Workers high education** = *Labor force with advanced education. % of total working-age population with high level education. It measures the probability of having a good job according to the studies.*
- **Optimisim and pessimisim** = *Suicide mortality rate per 100,000 population.*
- **Gender equality** = *Rate of gender equality in a country between  (**1=low to **6=high). It assesses the extent to which the country has installed institutions and programs to enforce laws and policies that promote equal access for men and women in education, health, the economy, and protection under law.*
- **Education** = *Share of youth not in education, employment or training, total, Total number of young people.* and *Government expenditure on education of total. *

To acomplish this, we use the function `isin` that will allow us to only select the the indicators afromentioned, that have been compilied in the list called *indicators_list*

-----

# NORMALIZATION

Taking as reference both works of https://www.pluralsight.com/guides/cleaning-up-data-from-outliers and https://careerfoundry.com/en/blog/data-analytics/how-to-find-outliers/, for normalizing our data we need to start computing the outliers and removing them from our dataframe. As there is not a direct function of pandas that performs this step, it´s been step-by-step code, where we begin with the computation of the quartiles, then the IQR (Inter Quartile Range) and finally the upper and lower limit.

##### IQR explanation

The interquartile range (IQR) measures the spread of the middle half of your data. It is the range for the middle 50% of your sample. Use the IQR to assess the variability where most of your values lie. Larger values indicate that the central portion of your data spread out further. Conversely, smaller values show that the middle values cluster more tightly.

To visualize the interquartile range, imagine dividing your data into quarters. Statisticians refer to these quarters as quartiles and label them from low to high as Q1, Q2, Q3, and Q4. The lowest quartile (Q1) covers the smallest quarter of values in your dataset. The upper quartile (Q4) comprises the highest quarter of values. The interquartile range is the middle half of the data that lies between the upper and lower quartiles. In other words, the interquartile range includes the 50% of data points that are above Q1 and below Q4.

When measuring variability, statisticians prefer using the interquartile range instead of the full data range because extreme values and outliers affect it less. Typically, use the IQR with a measure of central tendency, such as the median, to understand your data’s center and spread. This combination creates a fuller picture of your data’s distribution.

Therefore it is being utilized to get rid of all the outliers that may come from errors when creating the data or from unexpected years.

Firstly, we compute the first quartile (Q1=25%) and the third quartile (Q3=75%). For that, we have grouped the data by country code and indicator name, so we get the Q1 and Q3 values for each indicator in each geographical area. 

In [13]:
grouped=BronzeDataFrame.groupby(['Country Code','Indicator Name'])
grouped

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001AE50FA37F0>

In [14]:
Q1=BronzeDataFrame.groupby(['Country Code','Indicator Name']).quantile(0.25)
Q3=BronzeDataFrame.groupby(['Country Code','Indicator Name']).quantile(0.75)
IQR=Q3-Q1
IQR

Unnamed: 0_level_0,Unnamed: 1_level_0,Date,Value
Country Code,Indicator Name,Unnamed: 2_level_1,Unnamed: 3_level_1
ARE,Alcohol per capita,10.0,6.400000e-01
ARE,Education GExp,0.0,0.000000e+00
ARE,Employment-agriculture,14.0,5.130000e+00
ARE,Employment-industry,14.0,1.449997e+00
ARE,Employment-services,14.0,3.830002e+00
...,...,...,...
ZAF,Ninis,12.5,2.997499e+00
ZAF,R&D GExp,8.0,1.011100e-01
ZAF,Renewable electricity,12.5,2.292500e+08
ZAF,Suicide,9.5,1.025000e+00


Once we got the quartiles, we compute the upper and lower limit, with a basic mathematical expression.

In [15]:
lower_limit=Q1 - 1.5 * IQR
lower=lower_limit.drop(['Date'],axis=1)
lower.rename(columns={"Value":"Lower limit"})

Unnamed: 0_level_0,Unnamed: 1_level_0,Lower limit
Country Code,Indicator Name,Unnamed: 2_level_1
ARE,Alcohol per capita,2.190000e+00
ARE,Education GExp,1.026766e+01
ARE,Employment-agriculture,-4.905000e+00
ARE,Employment-industry,3.131501e+01
ARE,Employment-services,5.269500e+01
...,...,...
ZAF,Ninis,2.684375e+01
ZAF,R&D GExp,5.828450e-01
ZAF,Renewable electricity,-2.623750e+08
ZAF,Suicide,2.203750e+01


In [16]:
upper_limit=Q3 + 1.5 * IQR
upper=upper_limit.drop(['Date'],axis=1)
upper.rename(columns={"Value":"Upper limit"})

Unnamed: 0_level_0,Unnamed: 1_level_0,Upper limit
Country Code,Indicator Name,Unnamed: 2_level_1
ARE,Alcohol per capita,4.750000e+00
ARE,Education GExp,1.026766e+01
ARE,Employment-agriculture,1.561500e+01
ARE,Employment-industry,3.711499e+01
ARE,Employment-services,6.801500e+01
...,...,...
ZAF,Ninis,3.883375e+01
ZAF,R&D GExp,9.872850e-01
ZAF,Renewable electricity,6.546250e+08
ZAF,Suicide,2.613750e+01


Thirdly, we join the three tables we have (main dataframe, upper limit and lower limit) by matching country code and indicator name..

In [17]:
dfs = [BronzeDataFrame,lower,upper]
df_joined = ft.reduce(lambda left, right: pd.merge(left, right, on=['Country Code','Indicator Name']), dfs)
df_joined

Unnamed: 0,Country Code,Indicator Name,Date,Value_x,Value_y,Value
0,DZA,Exports-Commercial services,1990,4.795977e+08,1.736231e+09,4.453536e+09
1,DZA,Exports-Commercial services,1991,3.747657e+08,1.736231e+09,4.453536e+09
2,DZA,Exports-Commercial services,2005,2.466000e+09,1.736231e+09,4.453536e+09
3,DZA,Exports-Commercial services,2006,2.512000e+09,1.736231e+09,4.453536e+09
4,DZA,Exports-Commercial services,2007,2.786733e+09,1.736231e+09,4.453536e+09
...,...,...,...,...,...,...
20372,YEM,Alcohol per capita,2000,7.900000e-01,-3.725000e-01,7.675000e-01
20373,YEM,Alcohol per capita,2005,3.400000e-01,-3.725000e-01,7.675000e-01
20374,YEM,Alcohol per capita,2010,1.800000e-01,-3.725000e-01,7.675000e-01
20375,YEM,Alcohol per capita,2015,5.500000e-02,-3.725000e-01,7.675000e-01


In [18]:
list(df_joined)

['Country Code', 'Indicator Name', 'Date', 'Value_x', 'Value_y', 'Value']

We rename the columns of the new table, as the columns headers are not saved after the joining. 

In [19]:
renamed=df_joined.set_axis(['Country','Indicator','Year', 'Real value', 'Lower value', 'Upper value'], axis=1, inplace=False)
renamed

Unnamed: 0,Country,Indicator,Year,Real value,Lower value,Upper value
0,DZA,Exports-Commercial services,1990,4.795977e+08,1.736231e+09,4.453536e+09
1,DZA,Exports-Commercial services,1991,3.747657e+08,1.736231e+09,4.453536e+09
2,DZA,Exports-Commercial services,2005,2.466000e+09,1.736231e+09,4.453536e+09
3,DZA,Exports-Commercial services,2006,2.512000e+09,1.736231e+09,4.453536e+09
4,DZA,Exports-Commercial services,2007,2.786733e+09,1.736231e+09,4.453536e+09
...,...,...,...,...,...,...
20372,YEM,Alcohol per capita,2000,7.900000e-01,-3.725000e-01,7.675000e-01
20373,YEM,Alcohol per capita,2005,3.400000e-01,-3.725000e-01,7.675000e-01
20374,YEM,Alcohol per capita,2010,1.800000e-01,-3.725000e-01,7.675000e-01
20375,YEM,Alcohol per capita,2015,5.500000e-02,-3.725000e-01,7.675000e-01


Now that we have the table correctly defined, we remove from our dataframe the values that are outside our range, as it means that they are outliers.

In [20]:
sin_outliers=renamed.loc[~((renamed['Real value']<renamed['Lower value']) | (renamed['Real value']>renamed['Upper value']))]
sin_outliers

Unnamed: 0,Country,Indicator,Year,Real value,Lower value,Upper value
2,DZA,Exports-Commercial services,2005,2.466000e+09,1.736231e+09,4.453536e+09
3,DZA,Exports-Commercial services,2006,2.512000e+09,1.736231e+09,4.453536e+09
4,DZA,Exports-Commercial services,2007,2.786733e+09,1.736231e+09,4.453536e+09
5,DZA,Exports-Commercial services,2008,3.412421e+09,1.736231e+09,4.453536e+09
6,DZA,Exports-Commercial services,2009,2.744716e+09,1.736231e+09,4.453536e+09
...,...,...,...,...,...,...
20371,YEM,Suicide,2019,5.800000e+00,5.400000e+00,6.200000e+00
20373,YEM,Alcohol per capita,2005,3.400000e-01,-3.725000e-01,7.675000e-01
20374,YEM,Alcohol per capita,2010,1.800000e-01,-3.725000e-01,7.675000e-01
20375,YEM,Alcohol per capita,2015,5.500000e-02,-3.725000e-01,7.675000e-01


From the data above, we can perceive that our data comes down from 19944 rows to 19424, so 500 were outliers. The next steps are to order and display data better, removing those columns that we just do not need and pivoting the rows and columns. 

In [21]:
df_limpio=sin_outliers.drop(['Lower value','Upper value'],axis=1)
df_limpio

Unnamed: 0,Country,Indicator,Year,Real value
2,DZA,Exports-Commercial services,2005,2.466000e+09
3,DZA,Exports-Commercial services,2006,2.512000e+09
4,DZA,Exports-Commercial services,2007,2.786733e+09
5,DZA,Exports-Commercial services,2008,3.412421e+09
6,DZA,Exports-Commercial services,2009,2.744716e+09
...,...,...,...,...
20371,YEM,Suicide,2019,5.800000e+00
20373,YEM,Alcohol per capita,2005,3.400000e-01
20374,YEM,Alcohol per capita,2010,1.800000e-01
20375,YEM,Alcohol per capita,2015,5.500000e-02


In [None]:
cols=df_limpio['Indicator'].unique().tolist()

In [22]:
SilverDataFrame=df_limpio.set_index(["Country", "Year"]).pivot(columns="Indicator", values="Real value").reset_index()
SilverDataFrame

Indicator,Country,Year,Alcohol per capita,Education GExp,Employment-agriculture,Employment-industry,Employment-services,Exports-Commercial services,Exports-G&S,Fertility rate,...,International taxes,Literacy rate,Mortality-infants,Mortality-pollution,Net migration,Ninis,R&D GExp,Renewable electricity,Suicide,Workers high education
0,ARE,1990,,,,,,,,4.454,...,,,672.0,,,,,0.0,,
1,ARE,1991,,,8.46,33.330002,58.200001,,,4.253,...,,,645.0,,,,,0.0,,
2,ARE,1992,,,8.37,33.360001,58.279999,,,4.041,...,,,618.0,,368126.0,,,0.0,,
3,ARE,1993,,,8.24,33.470001,58.290001,,,3.827,...,,,592.0,,,,,0.0,,
4,ARE,1994,,,8.13,33.490002,58.380001,,,3.618,...,,,568.0,,,,,0.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1504,ZAF,2017,,18.719290,5.28,23.340000,71.379997,1.614806e+10,1.042884e+11,2.430,...,4.993941e+10,87.046669,32777.0,,727026.0,31.010000,0.83215,,25.2,83.809998
1505,ZAF,2018,9.52,18.901590,5.16,23.129999,71.709999,1.670823e+10,1.112854e+11,2.405,...,5.572291e+10,,31810.0,,,31.559999,,,24.1,82.879997
1506,ZAF,2019,,19.596230,5.28,22.309999,72.410004,1.554886e+10,1.060698e+11,2.381,...,5.522342e+10,95.022972,30937.0,,,32.459999,,,23.5,82.019997
1507,ZAF,2020,,19.527281,,,,8.404204e+09,9.317915e+10,2.358,...,,,30153.0,,,32.400002,,,,


On the other hand, another big stone of normalizations is to nan/null values, which we have in all variables.

In [23]:
SilverDataFrame.isna().sum().sum()

13351

As we can observe, we have lots of missing data, and as there is no optimal way to fullfill these values, thus, we will test some to arrive to the optimal method for our data set.

First, we need to create some lists so our loops work.

In [24]:
df=SilverDataFrame
europe_list=['DEU','FRA','SWE','GBR','ESP','HRV','POL','GRC','AUT','NLD']
persian_list=['IRQ','QAT','ARE','SAU','AZE','YEM','YDR','OMN']
naf_list=['DZA','EGY','LBY','ISR','TUR','MAR']
saf_list=['SEN','ZAF','LBR','MOZ','CMR','NGA','GHA']
asia_list=['BGD','IND','VNM','THA','IDN','PHL','KOR']
latam_list=['MEX','BRA','ARG','PER','VEN','COL','CHL','PAN','CRI']
two_list=['USA','CHN']
country_list=europe_list+persian_list+naf_list+saf_list+asia_list+latam_list+two_list
col_to_scale=['Gender equality','Exports-Commercial services','Renewable electricity','Employment-agriculture','Employment-industry','Employment-services','Exports-G&S','Fertility rate','Foreign investment','GDP','Education GExp','Workers high education','Literacy rate','Mortality-pollution','Net migration','Mortality-infants','Health services use','R&D GExp','Ninis','Suicide','International taxes','Alcohol per capita']


We are attempting the linear interpolation, which is achieved by geometrically rendering a straight line between two adjacent points on a graph or plane.

In [25]:
dat=df.loc[df.loc[:, 'Country'] == country_list[0]]
datc=dat.interpolate(method="linear")
data=datc

for i in range(1,len(country_list)):
    dat=df.loc[df.loc[:, 'Country'] == country_list[i]]
    datc=dat.interpolate(method="linear")
    data=pd.concat((data, datc), axis = 0)
data.isna().sum().sum()

8519

Here we attempt the backward filling. (Filling the previous cell with future values)

In [26]:
dat=df.loc[df.loc[:, 'Country'] == country_list[0]]
datc=dat.fillna(method='bfill')
data=datc

for i in range(1,len(country_list)):
    dat=df.loc[df.loc[:, 'Country'] == country_list[i]]
    datc=dat.fillna(method='bfill')
    data=pd.concat((data, datc), axis = 0)
data.isna().sum().sum()

4500

Here we will attempt the forward filling. (Filling the next cell with previous values)

In [27]:
dat=df.loc[df.loc[:, 'Country'] == country_list[0]]
datc=dat.fillna(method='ffill')
data=datc

for i in range(1,len(country_list)):
    dat=df.loc[df.loc[:, 'Country'] == country_list[i]]
    datc=dat.fillna(method='ffill')
    data=pd.concat((data, datc), axis = 0)
data.isna().sum().sum()

8519

And as none of the methods have worked out correctly, independently, we are going to mix them, to achieve a better result.

In [28]:
dat=df.loc[df.loc[:, 'Country'] == country_list[0]]
datc=dat.interpolate(method="linear")
datc=datc.fillna(method='ffill')
data=datc

for i in range(1,len(country_list)):
    dat=df.loc[df.loc[:, 'Country'] == country_list[i]]
    datc=dat.interpolate(method="linear")
    datc=datc.fillna(method='ffill')
    data=pd.concat((data, datc), axis = 0)
data.isna().sum().sum()

8519

In [29]:
dat=df.loc[df.loc[:, 'Country'] == country_list[0]]
datc=dat.interpolate(method="linear")
datc=datc.fillna(method='bfill')
data=datc

for i in range(1,len(country_list)):
    dat=df.loc[df.loc[:, 'Country'] == country_list[i]]
    datc=dat.interpolate(method="linear")
    datc=datc.fillna(method='bfill')
    data=pd.concat((data, datc), axis = 0)
data.isna().sum().sum()

2447

And finally, mixing the three methods all together.

In [30]:
dat=df.loc[df.loc[:, 'Country'] == country_list[0]]
datc=dat.interpolate(method="linear")
datf=datc.fillna(method='bfill')
datr=datf.fillna(method='ffill')
data=datr

for i in range(1,len(country_list)):
    dat=df.loc[df.loc[:, 'Country'] == country_list[i]]
    datc=dat.interpolate(method="linear")
    datc=datc.fillna(method='bfill')
    datc=datc.fillna(method='ffill')
    data=pd.concat((data, datc), axis = 0)
data.isna().sum().sum()

2447

Fix explanation

Therefore, the preferred method for the Nan values´ treatment that we are going to develop is a mix, between the linear interpolation and backwards filling. The linear interpolation a form of interpolation, which involves the generation of new values based on an existing set of values. Linear interpolation is achieved by geometrically rendering a straight line between two adjacent points on a graph or plane. Whereas the backwards filling, will help us to arrive to those values which have not been fullfilled with the linear interpolation.

Moreover, we are also going to scale all the values between the max and min of each country for each variable.

In [31]:
dat=df.loc[df.loc[:, 'Country'] == country_list[0]]
datc=dat.interpolate(method="linear")
datf=datc.fillna(method='bfill')
datr=datf.fillna(method='ffill')
data=datr

for i in range(1,len(country_list)):
    dat=df.loc[df.loc[:, 'Country'] == country_list[i]]
    datc=dat.interpolate(method="linear")
    datc=datc.fillna(method='bfill')
    datc=datc.fillna(method='ffill')
    data=pd.concat((data, datc), axis = 0)
data

Indicator,Country,Year,Alcohol per capita,Education GExp,Employment-agriculture,Employment-industry,Employment-services,Exports-Commercial services,Exports-G&S,Fertility rate,...,International taxes,Literacy rate,Mortality-infants,Mortality-pollution,Net migration,Ninis,R&D GExp,Renewable electricity,Suicide,Workers high education
347,DEU,1990,1.00000,0.443529,1.000000,1.000000,0.000000,0.002386,0.000000,0.636364,...,,,1.000000,,0.966124,0.77758,0.000000,0.000000,0.956522,1.0
348,DEU,1991,1.00000,0.443529,1.000000,1.000000,0.000000,0.000000,0.025535,0.272727,...,,,1.000000,,0.966124,0.77758,0.000000,0.002525,0.956522,1.0
349,DEU,1992,1.00000,0.443529,0.964758,0.966793,0.034321,0.018415,0.046400,0.151515,...,,,0.875324,,0.966124,0.77758,0.000000,0.004025,0.956522,1.0
350,DEU,1993,1.00000,0.443529,0.942731,0.907021,0.088143,0.012182,0.010957,0.121212,...,,,0.765220,,0.823187,0.77758,0.000000,0.005848,0.956522,1.0
351,DEU,1994,1.00000,0.380220,0.903084,0.876660,0.119345,0.018513,0.041261,0.000000,...,,,0.670984,,0.680250,0.77758,0.000000,0.011012,0.956522,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
247,CHN,2016,0.99002,0.246406,0.068956,0.831461,0.867725,0.849888,0.804316,0.588235,...,0.790308,0.977482,0.042573,,0.264203,,0.974482,1.000000,0.016129,
248,CHN,2017,0.98004,0.212024,0.048007,0.753933,0.917460,0.867490,0.888169,0.647059,...,1.000000,0.988741,0.030745,,0.243061,,0.984436,1.000000,0.000000,
249,CHN,2018,0.97006,0.000000,0.021530,0.777528,0.942152,0.953748,0.974699,0.698529,...,0.920304,1.000000,0.019650,,0.243061,,1.000000,1.000000,0.000000,
250,CHN,2019,0.97006,0.000000,0.000000,0.676405,1.000000,1.000000,0.964730,0.742647,...,0.920304,1.000000,0.009383,,0.243061,,1.000000,1.000000,0.000000,


Now, we will drop the columns which have over 1000 missing values, because the absence of data creates an unreliable source.

In [32]:
data.replace([np.inf, -np.inf], np.nan, inplace=True)
for i in range(0, len(col_to_scale)):
    if data[col_to_scale[i]].isna().sum()>1000:
        del(data[col_to_scale[i]])
        print(col_to_scale[i])
data

Gender equality
Mortality-pollution


Indicator,Country,Year,Alcohol per capita,Education GExp,Employment-agriculture,Employment-industry,Employment-services,Exports-Commercial services,Exports-G&S,Fertility rate,...,Health services use,International taxes,Literacy rate,Mortality-infants,Net migration,Ninis,R&D GExp,Renewable electricity,Suicide,Workers high education
347,DEU,1990,1.00000,0.443529,1.000000,1.000000,0.000000,0.002386,0.000000,0.636364,...,0.000000,,,1.000000,0.966124,0.77758,0.000000,0.000000,0.956522,1.0
348,DEU,1991,1.00000,0.443529,1.000000,1.000000,0.000000,0.000000,0.025535,0.272727,...,0.000000,,,1.000000,0.966124,0.77758,0.000000,0.002525,0.956522,1.0
349,DEU,1992,1.00000,0.443529,0.964758,0.966793,0.034321,0.018415,0.046400,0.151515,...,0.000000,,,0.875324,0.966124,0.77758,0.000000,0.004025,0.956522,1.0
350,DEU,1993,1.00000,0.443529,0.942731,0.907021,0.088143,0.012182,0.010957,0.121212,...,0.000000,,,0.765220,0.823187,0.77758,0.000000,0.005848,0.956522,1.0
351,DEU,1994,1.00000,0.380220,0.903084,0.876660,0.119345,0.018513,0.041261,0.000000,...,0.000000,,,0.670984,0.680250,0.77758,0.000000,0.011012,0.956522,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
247,CHN,2016,0.99002,0.246406,0.068956,0.831461,0.867725,0.849888,0.804316,0.588235,...,0.822241,0.790308,0.977482,0.042573,0.264203,,0.974482,1.000000,0.016129,
248,CHN,2017,0.98004,0.212024,0.048007,0.753933,0.917460,0.867490,0.888169,0.647059,...,0.868064,1.000000,0.988741,0.030745,0.243061,,0.984436,1.000000,0.000000,
249,CHN,2018,0.97006,0.000000,0.021530,0.777528,0.942152,0.953748,0.974699,0.698529,...,0.912945,0.920304,1.000000,0.019650,0.243061,,1.000000,1.000000,0.000000,
250,CHN,2019,0.97006,0.000000,0.000000,0.676405,1.000000,1.000000,0.964730,0.742647,...,0.956910,0.920304,1.000000,0.009383,0.243061,,1.000000,1.000000,0.000000,


As a result we have dropped the *Gender equality* and *Mortality-pollution* variables.

In [None]:
columns=data.columns.values.tolist()

In [None]:
for i in range(2,len(columns)):
    data[columns[i]]=data[columns[i]]/data.iloc[0,i]
data

For the next part of analyzing this data, we think it is gonna be interesting to have it classify by the categories of the Country groups defined before, to which we call "Continent". This category is useful as it groups the nations with similar economies or geographical proximity, so we can extract common conclusions from them.

We create a dictionary with the regions and the countries included in each one. Where we will relate the countries and regions so then we can apply the .map function and arrive to the final dataframe.

In [33]:
countries_by_region = {
    "Europe": ('DEU','FRA','SWE','GBR','ESP','HRV','POL','GRC','AUT','NLD'),
    'Persian Gulf': ('IRQ','QAT','ARE','SAU','AZE','YEM','YDR','OMN'),
    'North Africa':('DZA','EGY','LBY','ISR','TUR','MAR'),
    'South Africa':('SEN','ZAF','LBR','MOZ','CMR','NGA','GHA'),
    'Asia':('BGD','IND','VNM','THA','IDN','PHL','KOR'),
    'Latam':('MEX','BRA','ARG','PER','VEN','COL','CHL','PAN','CRI'),
    'Pair':('USA','CHN')
    }

all_countries = {}
for region in countries_by_region.keys():
  for country in countries_by_region[region]:
    all_countries[country] = region

print(all_countries)

{'DEU': 'Europe', 'FRA': 'Europe', 'SWE': 'Europe', 'GBR': 'Europe', 'ESP': 'Europe', 'HRV': 'Europe', 'POL': 'Europe', 'GRC': 'Europe', 'AUT': 'Europe', 'NLD': 'Europe', 'IRQ': 'Persian Gulf', 'QAT': 'Persian Gulf', 'ARE': 'Persian Gulf', 'SAU': 'Persian Gulf', 'AZE': 'Persian Gulf', 'YEM': 'Persian Gulf', 'YDR': 'Persian Gulf', 'OMN': 'Persian Gulf', 'DZA': 'North Africa', 'EGY': 'North Africa', 'LBY': 'North Africa', 'ISR': 'North Africa', 'TUR': 'North Africa', 'MAR': 'North Africa', 'SEN': 'South Africa', 'ZAF': 'South Africa', 'LBR': 'South Africa', 'MOZ': 'South Africa', 'CMR': 'South Africa', 'NGA': 'South Africa', 'GHA': 'South Africa', 'BGD': 'Asia', 'IND': 'Asia', 'VNM': 'Asia', 'THA': 'Asia', 'IDN': 'Asia', 'PHL': 'Asia', 'KOR': 'Asia', 'MEX': 'Latam', 'BRA': 'Latam', 'ARG': 'Latam', 'PER': 'Latam', 'VEN': 'Latam', 'COL': 'Latam', 'CHL': 'Latam', 'PAN': 'Latam', 'CRI': 'Latam', 'USA': 'Pair', 'CHN': 'Pair'}


In [34]:
data['Continent']=data['Country'].map(all_countries)
Goldendataframe=data
Goldendataframe

Indicator,Country,Year,Alcohol per capita,Education GExp,Employment-agriculture,Employment-industry,Employment-services,Exports-Commercial services,Exports-G&S,Fertility rate,...,International taxes,Literacy rate,Mortality-infants,Net migration,Ninis,R&D GExp,Renewable electricity,Suicide,Workers high education,Continent
347,DEU,1990,1.00000,0.443529,1.000000,1.000000,0.000000,0.002386,0.000000,0.636364,...,,,1.000000,0.966124,0.77758,0.000000,0.000000,0.956522,1.0,Europe
348,DEU,1991,1.00000,0.443529,1.000000,1.000000,0.000000,0.000000,0.025535,0.272727,...,,,1.000000,0.966124,0.77758,0.000000,0.002525,0.956522,1.0,Europe
349,DEU,1992,1.00000,0.443529,0.964758,0.966793,0.034321,0.018415,0.046400,0.151515,...,,,0.875324,0.966124,0.77758,0.000000,0.004025,0.956522,1.0,Europe
350,DEU,1993,1.00000,0.443529,0.942731,0.907021,0.088143,0.012182,0.010957,0.121212,...,,,0.765220,0.823187,0.77758,0.000000,0.005848,0.956522,1.0,Europe
351,DEU,1994,1.00000,0.380220,0.903084,0.876660,0.119345,0.018513,0.041261,0.000000,...,,,0.670984,0.680250,0.77758,0.000000,0.011012,0.956522,1.0,Europe
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
247,CHN,2016,0.99002,0.246406,0.068956,0.831461,0.867725,0.849888,0.804316,0.588235,...,0.790308,0.977482,0.042573,0.264203,,0.974482,1.000000,0.016129,,Pair
248,CHN,2017,0.98004,0.212024,0.048007,0.753933,0.917460,0.867490,0.888169,0.647059,...,1.000000,0.988741,0.030745,0.243061,,0.984436,1.000000,0.000000,,Pair
249,CHN,2018,0.97006,0.000000,0.021530,0.777528,0.942152,0.953748,0.974699,0.698529,...,0.920304,1.000000,0.019650,0.243061,,1.000000,1.000000,0.000000,,Pair
250,CHN,2019,0.97006,0.000000,0.000000,0.676405,1.000000,1.000000,0.964730,0.742647,...,0.920304,1.000000,0.009383,0.243061,,1.000000,1.000000,0.000000,,Pair


With that all, we export our dataframe all-in-one and by the continent category.

In [35]:
Goldendataframe.to_csv(os.getcwd()+'/Data/GoldenDataFrame.csv')

In [36]:
for region, data in Goldendataframe.groupby('Continent'):
   data.to_csv(os.getcwd()+'/Data/{}.csv'.format(region))

In [37]:
columns=Goldendataframe.columns.values.tolist()
clist=Goldendataframe['Country'].unique()
common=['Unnamed: 0','Country','Year']

In the following cell, we have defined a function that will allow us to calculate the different posibilities of relations: cuadratic, cubic and logaritmic.

In [38]:
def multcolumn(frame):
    for u in range(2, len(columns)-1):
        name=columns[u]+'.l'
        name2=columns[u]+'.^2'
        name3=columns[u]+'.^3'
        namelog=columns[u]+'.log'
        frame.loc[:,name2] = frame[columns[u]]**2
        frame.loc[:,name3] = frame[columns[u]]**3
        frame.loc[:,namelog] = np.log(frame[columns[u]])
        frame.rename(columns={columns[u]:name}, inplace=True)

Moreover, we want to know the correlation between all the variables, so to acomplish this, we have created the following loop, which will help us create a new dataframe where we will have: the *Indicator*, the *Type* of relation, the value of the *R^2*, its *Behaviour*, the *Country* and the *Continent*.

In [39]:
df= pd.read_csv (os.getcwd()+'/Data/'+'GoldenDataFrame.csv')
multcolumn(df)

Firstly we are going to create two lists for the variables, which their p-value is under 0.05 for each correlation, so later on, we can calculate only the correlations of those variables.

In [None]:
dat=df.loc[df.loc[:, 'Country'] == clist[0]]
listacorpe=[]
listacorsp=[]
clmns=dat.columns.values.tolist()
for c in range(0, len(clmns)):
    if dat[clmns[c]].isna().sum()>=1:
        del(dat[clmns[c]])
pilares=dat.columns.values.tolist()
for u in range(0,len(pilares)):
    if is_numeric_dtype(dat[pilares[u]]):
        correlation, pvalue=pearsonr(dat[pilares[u]], dat['GDP (current US$).l'])
        if pvalue<=0.05:
            listacorpe.append(pilares[u])
        else:
            pass
        correlation, pvalue=spearmanr(dat[pilares[u]], dat['GDP (current US$).l'])
        if pvalue<=0.05:
            listacorsp.append(pilares[u])
        else:
            pass
    else:
        pass

Secondly, we need to calculate the correlation table for each country, therefore we use the basic function `corr()` which provides either the Pearson correlation table or the Spearman correlation table, as well as a filter for the countries.

In [40]:
dat=df.loc[df.loc[:, 'Country'] == clist[0]]

datp=dat[dat.columns[dat.columns.isin(listacorpe)]]
corp=datp.corr('pearson')

datsp=dat[dat.columns[dat.columns.isin(listacorsp)]]
cors=datsp.corr('spearman')

Then we calculate the coefficient of determination which is the correlation squared.

In [41]:
corp.loc[:,'R^2 Pearson'] = corp['GDP (current US$).l']**2

cors.loc[:,'R^2 Spearman'] = cors['GDP (current US$).l']**2

Moreover, we are going to create new columns to know which *Indicator* are we talking about, and the *Type* of correlation that is being analyzed (linear, cuadratic, cubic or logarithmic)

In [42]:
corp.loc[:,'Indicator']=corp.index
corp[['Indicator','Type']]=corp.Indicator.str.split('.',1, expand=True)

cors.loc[:,'Indicator']=cors.index
cors[['Indicator','Type']]=cors.Indicator.str.split('.',1, expand=True)

Now, we can apply the filter we have consider that is enough, R^2>=0.75 to filter the correlations.

In [43]:
corpcolumn=corp[['Indicator','R^2 Pearson','Type','GDP (current US$).l']]
corpcolumn=corpcolumn.loc[corpcolumn.loc[:, 'R^2 Pearson'] >= 0.75]

corscolumn=cors[['Indicator','R^2 Spearman','Type','GDP (current US$).l']]
corscolumn=corscolumn.loc[corscolumn.loc[:, 'R^2 Spearman'] >= 0.6]

Furthermore, we add all the columns that we have created into a data frame, thanks to the following cell.

In [44]:
idp=corpcolumn.groupby('Indicator')['R^2 Pearson'].transform(max)==corpcolumn['R^2 Pearson']
corpcolumn[idp]
maxp_df=pd.DataFrame(corpcolumn[idp])

ids=corscolumn.groupby('Indicator')['R^2 Spearman'].transform(max)==corscolumn['R^2 Spearman']
corscolumn[ids]
maxs_df=pd.DataFrame(corscolumn[ids])

Here, we conmute the values, by expressions. For example if the correlation is positive, we want in the new column called *Behaviour* the word Positive. Or for the *Type* column if the greatest correlation is cuadratic we want to put, Cuadratic.

In [45]:
maxp_df['Behaviour']=np.where(maxp_df['GDP (current US$).l']>0, 'Positive', 'Negative')
maxp_df['Type']=maxp_df['Type'].replace(['l','^2','^3','log'],['Linear','Cuadratic','Cubic','Logarithmic'])
maxp_df['Country']= clist[0]

maxs_df['Behaviour']=np.where(maxs_df['GDP (current US$).l']>0, 'Positive', 'Negative')
maxs_df['Type']=maxs_df['Type'].replace(['l','^2','^3','log'],['Linear','Cuadratic','Cubic','Logarithmic'])
maxs_df['Country']= clist[0]

In addition, we also drop the columns which do not add any value, as *GDP*, *Year*, and *Unnamed:0*.

In [46]:
maxp_df.drop("GDP (current US$).l",axis=1,inplace=True)
maxp_df=maxp_df.reset_index(drop=True)
maxp_df = maxp_df.drop(maxp_df[maxp_df['Indicator']=='Year'].index)
maxp_df = maxp_df.drop(maxp_df[maxp_df['Indicator']=='GDP (current US$)'].index)
maxp_df = maxp_df.drop(maxp_df[maxp_df['Indicator']=='Unnamed: 0'].index)

maxs_df.drop("GDP (current US$).l",axis=1,inplace=True)
maxs_df=maxs_df.reset_index(drop=True)
maxs_df = maxs_df.drop(maxs_df[maxs_df['Indicator']=='Year'].index)
maxs_df = maxs_df.drop(maxs_df[maxs_df['Indicator']=='GDP (current US$)'].index)
maxs_df = maxs_df.drop(maxs_df[maxs_df['Indicator']=='Unnamed: 0'].index)
maxs_df=maxs_df.sort_values(by = 'R^2 Spearman',ascending = False)

And finally we sort the values in descending order by the column *R^2 Pearson*.

In [47]:
maxp_df=maxp_df.sort_values(by = 'R^2 Pearson',ascending = False)
pearsondf= maxp_df
spearmandf=maxs_df

So, we can do it with all the countries and create just one dataframe.

In [48]:
for i in range(1,len(clist)):
    dat=df.loc[df.loc[:, 'Country'] == clist[i]]
    listacorpe=[]
    listacorsp=[]
    clmns=dat.columns.values.tolist()
    for c in range(0, len(clmns)):
        if dat[clmns[c]].isna().sum()>=1:
            del(dat[clmns[c]])
    pilares=dat.columns.values.tolist()
    for u in range(0,len(pilares)):
        if is_numeric_dtype(dat[pilares[u]]):
            correlation, pvalue=pearsonr(dat[pilares[u]], dat['GDP (current US$).l'])
            if pvalue<=0.05:
                listacorpe.append(pilares[u])
            else:
                pass
            correlation, pvalue=spearmanr(dat[pilares[u]], dat['GDP (current US$).l'])
            if pvalue<=0.05:
                listacorsp.append(pilares[u])
            else:
                pass
        else:
            pass
    
    dat=df.loc[df.loc[:, 'Country'] == clist[i]]

    datp=dat[dat.columns[dat.columns.isin(listacorpe)]]
    corp=datp.corr('pearson')

    datsp=dat[dat.columns[dat.columns.isin(listacorsp)]]
    cors=datsp.corr('spearman')


    corp.loc[:,'R^2 Pearson'] = corp['GDP (current US$).l']**2

    cors.loc[:,'R^2 Spearman'] = cors['GDP (current US$).l']**2


    corp.loc[:,'Indicator']=corp.index
    corp[['Indicator','Type']]=corp.Indicator.str.split('.',1, expand=True)

    cors.loc[:,'Indicator']=cors.index
    cors[['Indicator','Type']]=cors.Indicator.str.split('.',1, expand=True)


    corpcolumn=corp[['Indicator','R^2 Pearson','Type','GDP (current US$).l']]
    corpcolumn=corpcolumn.loc[corpcolumn.loc[:, 'R^2 Pearson'] >= 0.75]
    
    corscolumn=cors[['Indicator','R^2 Spearman','Type','GDP (current US$).l']]
    corscolumn=corscolumn.loc[corscolumn.loc[:, 'R^2 Spearman'] >= 0.6]


    idp=corpcolumn.groupby('Indicator')['R^2 Pearson'].transform(max)==corpcolumn['R^2 Pearson']
    corpcolumn[idp]
    maxp_df=pd.DataFrame(corpcolumn[idp])

    ids=corscolumn.groupby('Indicator')['R^2 Spearman'].transform(max)==corscolumn['R^2 Spearman']
    corscolumn[ids]
    maxs_df=pd.DataFrame(corscolumn[ids])


    maxp_df['Behaviour']=np.where(maxp_df['GDP (current US$).l']>0, 'Positive', 'Negative')
    maxp_df['Type']=maxp_df['Type'].replace(['l','^2','^3','log'],['Linear','Cuadratic','Cubic','Logarithmic'])
    maxp_df['Country']= clist[i]

    maxs_df['Behaviour']=np.where(maxs_df['GDP (current US$).l']>0, 'Positive', 'Negative')
    maxs_df['Type']=maxs_df['Type'].replace(['l','^2','^3','log'],['Linear','Cuadratic','Cubic','Logarithmic'])
    maxs_df['Country']= clist[i]


    maxp_df.drop("GDP (current US$).l",axis=1,inplace=True)
    maxp_df=maxp_df.reset_index(drop=True)
    maxp_df = maxp_df.drop(maxp_df[maxp_df['Indicator']=='Year'].index)
    maxp_df = maxp_df.drop(maxp_df[maxp_df['Indicator']=='GDP (current US$)'].index)
    maxp_df = maxp_df.drop(maxp_df[maxp_df['Indicator']=='Unnamed: 0'].index)

    maxs_df.drop("GDP (current US$).l",axis=1,inplace=True)
    maxs_df=maxs_df.reset_index(drop=True)
    maxs_df = maxs_df.drop(maxs_df[maxs_df['Indicator']=='Year'].index)
    maxs_df = maxs_df.drop(maxs_df[maxs_df['Indicator']=='GDP (current US$)'].index)
    maxs_df = maxs_df.drop(maxs_df[maxs_df['Indicator']=='Unnamed: 0'].index)
    maxs_df=maxs_df.sort_values(by = 'R^2 Spearman',ascending = False)


    maxp_df=maxp_df.sort_values(by = 'R^2 Pearson',ascending = False)
    pearsondf= maxp_df
    spearmandf=maxs_df

display(spearmandf.merge(pearsondf, left_on=('Indicator', 'Country','Type','Behaviour'), right_on=('Indicator', 'Country','Type','Behaviour')))

Unnamed: 0,Indicator,Type,R^2 Pearson,Behaviour,Country,Continent
5,Exports-G&S,,0.954928,Positive,DEU,Europe
7,Health services use,,0.916594,Positive,DEU,Europe
4,Exports-Commercial services,,0.911725,Positive,DEU,Europe
10,Employment-services,Cuadratic,0.883525,Positive,DEU,Europe
12,Alcohol per capita,Cubic,0.875820,Negative,DEU,Europe
...,...,...,...,...,...,...
6,Suicide,,0.922759,Negative,CHN,Pair
5,Renewable electricity,,0.912560,Positive,CHN,Pair
2,Exports-Commercial services,,0.879177,Positive,CHN,Pair
14,Alcohol per capita,Cuadratic,0.867314,Positive,CHN,Pair


Now that we’ve loaded the data, we can start right away to create widgets. These widgets are essentials to add interactivity to our visualizations. We’re going to use two widgets: both, multiple selection widgets. To create these widgets, we can use `ipywidgets` library that is available for Jupyter Notebook.

The first widget that we are going to create is the multiple selection widget. We can do this by using `SelectMultiple()attribute` from `ipywidgets`. With this widget, we have the option to visualize the R^2 only in particular selection of indicators instead of all.

The first argument that we should specify is `options` , which should contain the list of available options of our variable (in our case different indicators). The next one is `value` , which should contain the variable values that we want to display as default, and then `description` is for the text field to describe the name of the widget.The rest of options are just visual details.

In [49]:
unique_tri = demo2['Indicator'].unique()
tri = widgets.SelectMultiple(
    options = unique_tri.tolist(),
    value = ['Exports-G&S'],
    description='Indicator',
    disabled=False,
    layout = Layout(width='50%', height='80px')
)

def graf1(tri):
    dat=demo2.loc[demo2.loc[:, 'Indicator'].isin(np.array(tri))]
    a=px.choropleth(dat, locations="Country", locationmode='ISO-3', 
                     color="R^2 Pearson", hover_name="Country",hover_data = [dat.Type, dat.Behaviour],projection="natural earth",
                     color_continuous_scale='Reds', width=700, height=500, title= dat.Indicator.unique().tolist()[0])
    print(tri)
    a.show()
widgets.interactive(graf1, tri=tri)


interactive(children=(SelectMultiple(description='Indicator', index=(0,), layout=Layout(height='80px', width='…

To wrap up, we can create the second widget that is exactly the same as the previous multiple selection widget. The purpose of this widget is to enable us to choose which Continent we want to visualize. Below is the code implementation of this widget.

In [50]:
unique_tric = demo2['Continent'].unique()
tric = widgets.SelectMultiple(
    options = unique_tric.tolist(),
    value = ['North Africa'],
    description='Continent',
    disabled=False,
    layout = Layout(width='50%', height='80px')
)

def graf1(tric):
    dat=demo2.loc[demo2.loc[:, 'Continent'].isin(np.array(tric))]
    a=px.scatter(dat, x="R^2 Pearson", y='Indicator',
                     color="R^2 Pearson", hover_name="Country",hover_data = [dat.Type, dat.Behaviour],
                     color_continuous_scale='Blues', width=700, height=500, title= dat.Continent.unique().tolist()[0])
    a.show()
widgets.interactive(graf1, tric=tric)

interactive(children=(SelectMultiple(description='Continent', index=(2,), layout=Layout(height='80px', width='…

Now, if we execute the following loop, it will provide with the variables that follow a normal distribution.

In [51]:
for i in range(0,len(clist)):
    dat=df.loc[df.loc[:, 'Country'] == clist[i]]
    for e in range(2,len(columns)):
        data=dat.iloc[:, e]
        stat, p = shapiro(data)
        print(clist[i] +"-"+ columns[e])
        print('Statistical=%.3f, p=%.3f' % (stat, p))
        alpha = 0.05
        if p > alpha:
            print('Data is NORMAL ( H0 not denied )')
        else:
            pass

DEU-Exports-Commercial services
Statistical=0.957, p=0.229
Data is NORMAL ( H0 not denied )
DEU-Renewable electricity
Statistical=0.806, p=0.000
DEU-Employment-agriculture
Statistical=0.888, p=0.003
DEU-Employment-industry
Statistical=0.906, p=0.009
DEU-Employment-services
Statistical=0.877, p=0.002
DEU-Exports-G&S
Statistical=0.887, p=0.003
DEU-Fertility rate
Statistical=0.893, p=0.004
DEU-Foreign investment
Statistical=0.851, p=0.000
DEU-GDP
Statistical=0.927, p=0.031
DEU-Education GExp
Statistical=0.972, p=0.548
Data is NORMAL ( H0 not denied )
DEU-Workers high education
Statistical=0.887, p=0.003
DEU-Literacy rate
Statistical=0.809, p=0.000
DEU-Net migration
Statistical=nan, p=1.000
Data is NORMAL ( H0 not denied )
DEU-Mortality-infants
Statistical=nan, p=1.000
Data is NORMAL ( H0 not denied )
DEU-Health services use
Statistical=0.804, p=0.000
DEU-R&D GExp
Statistical=0.879, p=0.002
DEU-Ninis
Statistical=0.819, p=0.000
DEU-Suicide
Statistical=0.892, p=0.004
DEU-International taxes
