#  <u> 1st step : Load

Open the final merge table of the 'integration part '

In [3]:
# we import the useful librairies 

import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import functools
import os
import plotly.express as px
from scipy.stats import shapiro
import ipywidgets as widgets
from ipywidgets import Layout
from ipywidgets import interact, interact_manual


# we open and read the merge table of all indicators
bronze_dataset = pd.read_csv ('./data/bronze_dataset.csv')
bronze_dataset = bronze_dataset.drop(bronze_dataset.columns[0], axis = 1)
bronze_dataset

Unnamed: 0,Code,Year,Indicator,Value
0,AFG,1966,Deaths,1.616590e+05
1,AFG,1966,LifeExpectancy,3.550000e+01
2,AFG,1966,GDP,5.000002e+08
3,AFG,1966,Fertility,7.320300e+00
4,AFG,1967,Deaths,1.625790e+05
...,...,...,...,...
66664,ZWE,1945,GDP,4.100000e+07
66665,ZWE,1946,GDP,4.800000e+07
66666,ZWE,1947,GDP,5.900000e+07
66667,ZWE,1948,GDP,7.200000e+07


# <u>STEP 2 : Normalization  

We compute the quantiles :  first quartile (Q1=25%) and the third quartile (Q3=75%). Then the interquartile range.

for each country and each indicators 


In [4]:
# 1st quartile 
Q1=bronze_dataset.groupby(['Code','Indicator']).quantile(0.25)

# 3rd quartile 
Q3=bronze_dataset.groupby(['Code','Indicator']).quantile(0.75)

#interquartile range 
IQR=Q3-Q1

IQR

Unnamed: 0_level_0,Unnamed: 1_level_0,Year,Value
Code,Indicator,Unnamed: 2_level_1,Unnamed: 3_level_1
ABW,Fertility,35.5,1.317100e+00
ABW,GDP,8.5,3.090634e+08
ABW,LifeExpectancy,35.5,7.025000e+00
AFG,Deaths,27.0,4.707350e+04
AFG,Fertility,35.5,3.561750e-01
...,...,...,...
ZWE,Fertility,35.5,3.106400e+00
ZWE,GDP,45.5,4.186134e+09
ZWE,GenderInequality,15.5,5.250000e-02
ZWE,LifeExpectancy,35.5,8.150000e+00


Then we compute the upper and lower limit, and we delete the column year and rename the Value column

In [5]:
lower_limit =Q1 - 1.5 * IQR
lower_table =lower_limit.drop(['Year'],axis=1)
lower_table.rename(columns={"Value":"Lower limit"})

Unnamed: 0_level_0,Unnamed: 1_level_0,Lower limit
Code,Indicator,Unnamed: 2_level_1
ABW,Fertility,-1.925000e-02
ABW,GDP,5.567916e+08
ABW,LifeExpectancy,5.616250e+01
AFG,Deaths,3.980225e+04
AFG,Fertility,6.647813e+00
...,...,...
ZWE,Fertility,-6.594250e-01
ZWE,GDP,-6.228451e+09
ZWE,GenderInequality,4.677500e-01
ZWE,LifeExpectancy,3.835000e+01


In [6]:
upper_limit=Q3 + 1.5 * IQR
upper_table=upper_limit.drop(['Year'],axis=1)
upper_table.rename(columns={"Value":"Upper limit"})

Unnamed: 0_level_0,Unnamed: 1_level_0,Upper limit
Code,Indicator,Unnamed: 2_level_1
ABW,Fertility,5.249150e+00
ABW,GDP,1.793045e+09
ABW,LifeExpectancy,8.426250e+01
AFG,Deaths,2.280962e+05
AFG,Fertility,8.072512e+00
...,...,...
ZWE,Fertility,1.176618e+01
ZWE,GDP,1.051608e+10
ZWE,GenderInequality,6.777500e-01
ZWE,LifeExpectancy,7.095000e+01



Then we merge the three tables : the Bronze_dataset, the upper_table and the lower_table
However, we use the functions reduce from functools
It allows to merge the three tables in one command

In [7]:
three_tables = [bronze_dataset,lower_table,upper_table]
tables_joined = functools.reduce(lambda left, right: pd.merge(left, right, on=['Code','Indicator']), three_tables)
tables_joined

Unnamed: 0,Code,Year,Indicator,Value_x,Value_y,Value
0,AFG,1966,Deaths,1.616590e+05,3.980225e+04,2.280962e+05
1,AFG,1967,Deaths,1.625790e+05,3.980225e+04,2.280962e+05
2,AFG,1968,Deaths,1.635730e+05,3.980225e+04,2.280962e+05
3,AFG,1969,Deaths,1.646380e+05,3.980225e+04,2.280962e+05
4,AFG,1970,Deaths,1.654300e+05,3.980225e+04,2.280962e+05
...,...,...,...,...,...,...
66489,OWID_GFR,1986,GDP,7.110545e+11,-5.467304e+11,9.798115e+11
66490,OWID_GFR,1987,GDP,7.913833e+11,-5.467304e+11,9.798115e+11
66491,OWID_GFR,1988,GDP,7.847509e+11,-5.467304e+11,9.798115e+11
66492,OWID_GFR,1989,GDP,8.517760e+11,-5.467304e+11,9.798115e+11


We rename the columns in order to understand better

In [8]:
renamed=tables_joined.set_axis(['Code','Year','Indicator', 'Real value', 'Lower value', 'Upper value'], axis=1)
renamed

Unnamed: 0,Code,Year,Indicator,Real value,Lower value,Upper value
0,AFG,1966,Deaths,1.616590e+05,3.980225e+04,2.280962e+05
1,AFG,1967,Deaths,1.625790e+05,3.980225e+04,2.280962e+05
2,AFG,1968,Deaths,1.635730e+05,3.980225e+04,2.280962e+05
3,AFG,1969,Deaths,1.646380e+05,3.980225e+04,2.280962e+05
4,AFG,1970,Deaths,1.654300e+05,3.980225e+04,2.280962e+05
...,...,...,...,...,...,...
66489,OWID_GFR,1986,GDP,7.110545e+11,-5.467304e+11,9.798115e+11
66490,OWID_GFR,1987,GDP,7.913833e+11,-5.467304e+11,9.798115e+11
66491,OWID_GFR,1988,GDP,7.847509e+11,-5.467304e+11,9.798115e+11
66492,OWID_GFR,1989,GDP,8.517760e+11,-5.467304e+11,9.798115e+11


In [10]:
renamed.to_csv('./data/bronze_dataset_with_outliers.csv')