# Visualize Notebook
In this notebook the Gold Dataframe will be read and extract information of it. The objective is to see the correlations between the variables and the GDP and also what countries have the highest correlation value.


## Imports
Start importing all the libraries and also the methods of pvalue and search indicators that will be used later in the notebook.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from scipy.stats import shapiro, pearsonr, spearmanr
import os
import statistics
import seaborn as sns
from scipy.stats import norm
import plotly.express as px
from IPython.display import display_html
import warnings
warnings.filterwarnings("ignore")
import ipywidgets as widgets
from ipywidgets import Layout
from ipywidgets import AppLayout, Button, GridspecLayout
from ipywidgets import interact, interact_manual
from Project.Utils.visualize import  search


### Variables that can be changed
The p-value can be changed to a desired value. For the default value is recommnended 0.05. If you desire a higher confidence level lower this variable. This variable reffers directly to significance level, but for porpouses of clarification is set to this name:

In [2]:
PVALUE_VAR = 0.01

## Correlation dataframe.
This dataframe is the main piece of the notebook. Consists in generating for every country the correlation matrix for it and saving only the correlation value of the different variables with the GDP. 
This codeblock also calculates the p-value of Spearman and Pearson, if the value > PVALUE_VAR the correlation will be deleted due to not having statiscal significance.

Later on is concatenated and generates the following result:

In [3]:
#One dataframe per country

write_path = os.getcwd() + '/Output/'

col_country = 'Country'
col_year = 'Year'
col_region = 'Region'
col_gdp = 'GDP'

df= pd.read_csv (write_path + 'GoldDataframe.csv')
corr_df = pd.DataFrame()
corr_df.index.names = [col_country]

#List all the countries, none repeated
countries = set(df[col_country].to_list())

country_dict = {}
corr_dict = {}

for country in countries:
    #Get the DataFrame for a given country
    country_df = df.loc[df[col_country] == country]

    #Correlation matrix for that country
    country_corr_df = country_df.corr()

    #Significance for the correlations
    pval = country_df.corr(method = lambda x, y: pearsonr(x, y)[1]) - np.eye(*country_corr_df.shape)
    p = pval.applymap(lambda x: 1 if x < PVALUE_VAR else np.NaN)
    country_corr_df = country_corr_df * p

    #Trim it into a single row
    country_corr_df = country_corr_df.rename(columns = {col_gdp: country}).drop(index = [col_year, col_gdp])

    #Add the row to a new DataFrame with the correlations for each country
    corr_df = pd.concat([corr_df, country_corr_df[country]], axis = 1)

#Transpose the resulting DataFrame to have the desired format, save it and show it
corr_df = corr_df.transpose()
corr_df.to_csv(os.getcwd()+'/Output/Corr_DF.csv')
corr_df


Unnamed: 0,AgriShareGDP,CreditToAgriFishForest,EmploymentRural,TotalAgri,% Soldiers,Birth Rate,Death Rate,Homicides,Life Expectancy,Maternal Death Risk,...,% Population Growth,% Rural Population,Civil Liberties,Freedom of Expression,% Healthcare Investment,% Employment Industry,% Education Expenditure,% Men Employment,% Women Employment,Population
Russia,-0.930642,,,0.979737,,0.775658,-0.804394,-0.885042,0.778920,-0.859358,...,0.878687,-0.572207,,-0.697507,,-0.704384,,0.856897,0.877261,
Ireland,-0.721866,0.757281,,0.866601,-0.917973,-0.627465,-0.741050,,0.918794,-0.649948,...,,-0.930707,0.776979,,0.775556,-0.724210,,,0.652968,0.907705
Georgia,-0.950866,0.684907,,,,0.892911,0.906141,,0.889794,,...,0.757804,-0.920270,0.854252,0.702861,,0.902297,,,,-0.955508
Qatar,,,,0.702969,-0.909210,-0.937375,-0.937474,,0.914261,-0.931513,...,,-0.937258,,,,0.793261,,,,0.932368
Kiribati,0.644179,,,,,-0.672807,-0.921030,,0.923800,-0.924940,...,,-0.875490,,,-0.811530,,,,,0.941253
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Yemen,,,,0.675176,,,,,,,...,,,,,-0.784325,0.837351,,,,0.595893
Nicaragua,,-0.827828,,0.976583,-0.920160,-0.960346,-0.720096,-0.700032,0.970542,-0.926781,...,-0.891714,-0.970781,-0.608949,-0.912412,0.931079,-0.849290,,,,0.973662
Lebanon,,,,0.891247,-0.939681,,,,0.888425,,...,,-0.985057,0.890831,0.634427,0.896726,0.899102,,,,0.958025
Morocco,,,,0.864131,-0.984657,-0.778574,-0.977643,,0.988583,-0.976531,...,0.696034,-0.946684,,,0.843802,0.811457,,,,0.937438


In [4]:
#One dataframe per country

write_path = os.getcwd() + '/Output/'

col_country = 'Country'
col_year = 'Year'
col_region = 'Region'
col_gdp = 'GDP'

df= pd.read_csv (write_path + 'GoldDataframe.csv')
corr_df_spear = pd.DataFrame()
corr_df_spear.index.names = [col_country]

#List all the countries, none repeated
countries = set(df[col_country].to_list())

country_dict = {}
corr_dict = {}

for country in countries:
    #Get the DataFrame for a given country
    country_df = df.loc[df[col_country] == country]

    #Correlation matrix for that country
    country_corr_df = country_df.corr(method='spearman')

    #Significance for the correlations
    pval = country_df.corr(method = lambda x, y: spearmanr(x, y)[1]) - np.eye(*country_corr_df.shape)
    p = pval.applymap(lambda x: 1 if x < PVALUE_VAR else np.NaN)
    country_corr_df = country_corr_df * p

    #Trim it into a single row
    country_corr_df = country_corr_df.rename(columns = {col_gdp: country}).drop(index = [col_year, col_gdp])

    #Add the row to a new DataFrame with the correlations for each country
    corr_df_spear = pd.concat([corr_df_spear, country_corr_df[country]], axis = 1)

#Transpose the resulting DataFrame to have the desired format, save it and show it
corr_df_spear = corr_df_spear.transpose()
corr_df_spear.to_csv(os.getcwd()+'/Output/Corr_DF.csv')
corr_df_spear

Unnamed: 0,AgriShareGDP,CreditToAgriFishForest,EmploymentRural,TotalAgri,% Soldiers,Birth Rate,Death Rate,Homicides,Life Expectancy,Maternal Death Risk,...,% Population Growth,% Rural Population,Civil Liberties,Freedom of Expression,% Healthcare Investment,% Employment Industry,% Education Expenditure,% Men Employment,% Women Employment,Population
Russia,-0.926924,,,0.976623,,0.694635,-0.739441,-0.765930,0.716883,-0.730949,...,0.794805,-0.720779,,-0.702176,,-0.571990,,0.729112,0.763269,
Ireland,-0.659519,,,0.854173,-0.950895,,-0.553907,,0.914555,,...,,-0.915232,0.783265,,0.796880,-0.845882,,,0.715280,0.915232
Georgia,-0.877558,0.706424,,,,0.783371,0.760637,,0.907792,,...,0.678792,-0.903896,0.818781,0.644575,,0.809125,,,,-0.907792
Qatar,-0.594349,,,0.769730,-0.892068,-0.836364,-0.924675,,0.836364,-0.843653,...,,-0.836364,,,,,,,,0.836364
Kiribati,0.726210,,,,,-0.744398,-0.885714,,0.885714,-0.897073,...,,-0.885714,,,-0.798686,,,,,0.885714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Yemen,,,,0.657143,,,,,,,...,,,,,-0.757143,0.836364,,,,
Nicaragua,,-0.854355,,0.951948,-0.963590,-0.959740,-0.815200,-0.602627,0.959740,-0.972644,...,-0.920677,-0.959740,-0.835395,-0.940145,0.904839,-0.605215,,,,0.959740
Lebanon,,,,0.927273,-0.986997,,,,0.998701,,...,,-0.998701,0.874272,,0.929780,0.814337,,,,0.994805
Morocco,,,,0.667532,-0.972693,-0.905195,-0.979221,,0.975325,-0.973947,...,0.698701,-0.975325,0.563959,,0.617934,0.876226,,,,0.975325


## Cleaned GoldDataframe 
Before the correlation dataframe was generated and it detected all the correlations with a high p-value. To perform all the following analysis a clean GoldDataframe is needed. It simply detects if the correlation for the indicator and country is Nan and procedes to delete the whole column indicator for the country.

In [5]:

df = df.copy()
for country in countries:
    for ind in corr_df.columns:
        if np.isnan(corr_df[ind][country]):
            df.loc[df[col_country] == country, [ind]] = np.NaN

df.set_index([col_country, col_region, col_year]).to_csv(write_path + 'GoldDataframe_Clean.csv')
df.set_index([col_country, col_region, col_year])



Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,AgriShareGDP,CreditToAgriFishForest,EmploymentRural,GDP,TotalAgri,% Soldiers,Birth Rate,Death Rate,Homicides,Life Expectancy,...,% Population Growth,% Rural Population,Civil Liberties,Freedom of Expression,% Healthcare Investment,% Employment Industry,% Education Expenditure,% Men Employment,% Women Employment,Population
Country,Region,Year,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
Afghanistan,South Asia,2000,54.06300,,,3342.034168,,,48.021,11.718,,55.841,...,,77.922,,0.625,,11.014,,,,20779957.0
Afghanistan,South Asia,2001,54.06300,,,3598.470576,,,47.505,11.387,,56.308,...,,77.831,,0.625,,12.037,,,,21606992.0
Afghanistan,South Asia,2002,45.13440,,,4141.523943,,,46.901,11.048,,56.784,...,,77.739,,0.625,,10.048,,,,22600774.0
Afghanistan,South Asia,2003,41.90340,,,4729.042179,,,46.231,10.704,,57.271,...,,77.647,,0.687,,10.227,,,,23680871.0
Afghanistan,South Asia,2004,35.61280,,,5388.482107,,,45.507,10.356,,57.772,...,,77.500,,0.677,,11.414,,,,24726689.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zimbabwe,Sub-Saharan Africa,2016,7.87399,,,20548.678073,,0.75975,,8.286,,60.294,...,1.549294,67.704,0.430,0.389,,7.425,,,,14030338.0
Zimbabwe,Sub-Saharan Africa,2017,8.34095,,,22040.902301,,0.75072,,8.044,,60.812,...,1.459406,67.763,0.488,0.431,,7.296,,,,14236599.0
Zimbabwe,Sub-Saharan Africa,2018,8.30469,,,24311.560545,,0.73821,,7.883,,61.195,...,1.410382,67.791,0.447,0.471,,7.296,,,,14438812.0
Zimbabwe,Sub-Saharan Africa,2019,8.17322,,,21935.075306,,0.73821,,7.773,,61.490,...,1.421142,67.790,0.403,0.434,,7.296,,,,14645473.0


# Analysis
From now on all the data will be analysed with the Cleaned dataframes.

## Choropleth map
This block uses the IppyWidget for the indicator list and the Choroplet for the Map. 

In this section a map is generated and painted with the values of the indicator. You can salect the indicator from the list and an new map is generated, also along 2 lists of the highest and lowest countries for the selected indicator. 




In [6]:
indicator = widgets.SelectMultiple(
    options = corr_df.columns.tolist(),
    value = ['AgriShareGDP'],
    description='Indicator',
    disabled=False,
    layout = Layout(width='50%', height='80px')
)



def globalGrapgh(indicator):
    ind = indicator[0]
    N = 10
    fig = px.choropleth(corr_df, locations = corr_df.index, locationmode='country names', 
                        color= indicator[0],projection="natural earth",
                        color_continuous_scale='RdBu',
                        width=700, height=500)

    pos_corr = corr_df.drop(corr_df.columns.difference([ind]), axis = 1).sort_values(by = ind, axis = 0, ascending = False).head(n = N)
    neg_corr = corr_df.drop(corr_df.columns.difference([ind]), axis = 1).sort_values(by = ind, axis = 0, ascending = True).head(n = N)

    fig.update(layout_coloraxis_showscale=True)
    fig.show()
    

    pos_styler = pos_corr.style.set_table_attributes("style='display:inline'").set_caption('Direct correlation')
    neg_styler = neg_corr.style.set_table_attributes("style='display:inline'").set_caption('Inverse correlation')

    space = "\xa0" * 10
    display_html(pos_styler._repr_html_()+ space  + neg_styler._repr_html_(), raw=True)


widgets.interactive(globalGrapgh, indicator = indicator)

interactive(children=(SelectMultiple(description='Indicator', index=(0,), layout=Layout(height='80px', width='…

### Choropleth for population

This map represents the population of the world. It is usefull for analysing the data if the aggregation is calculated later by the population, because the countries with a higher population have more impact in the global indicators.

In [7]:
df_gold = pd.read_csv(write_path + '/GoldDataframe.csv')
fig = px.choropleth(df_gold, locations="Country", locationmode='country names', 
                     color="Population", hover_name="Country",projection="natural earth",
                     animation_frame="Year",width=800, height=500,
                     color_continuous_scale='Reds',
                     range_color=[1000,340000000])
fig.update(layout_coloraxis_showscale=True)
fig.show()

## Country Indicators
In this codeblock a widged is implmented to generate a table with the indicators. In order to make it work just select the country from the dropdown and procede to establasih the threshold. The default value for the threshold is 0.7, which we consider to be the minimum to consider an indicator correlated to the GDP. To analyze the results:
- H0: the indicator and the GDP are uncorrelated.​
- H1: the indicator and the GDP are correlated.​

P-value: is the probability of obtaining  test results at least as extreme as the result actually observed.​
Confidence level: probability that a population parameter will fall between a set of values for a certain proportion of times. ​
Significance level:  probability of the study rejecting the null hypothesis when it is actually true.​


Confidence level is set to 1 - PVALUE_VAR% .  Significance level α = PVALUE_VAR

If p-value < α then reject  H0 and accept H1.​

The code consists in a IppyWidget with a slider and a dropdown. Once this parameters have been set it calls the method 'search' and applies a style format of the returned Dataframe.

In [8]:
def tableOut(Threshold, Country):

    df = search(Threshold, 'Country', Country)
    if df.empty:
        return print("No indicators have been found.")

    left = pd.Series([PVALUE_VAR, PVALUE_VAR], index=['P-value Pearson', 'P-value Spearman'])
    left2 = pd.Series([-1, -1], index=['GDP Pearson Corr', 'GDP Spearman Corr'])
    left3 = pd.Series([0, 0], index=['GDP Pearson Corr', 'GDP Spearman Corr'])
    df =df.style.highlight_between(left=left, right=1.5, axis=1, props='color:white; background-color:red;')\
        .highlight_between(left=left2, right=1.5, axis=1, props='color:white; background-color:#929bfc;')\
        .highlight_between(left=left3, right=1.5, axis=1, props='color:white; background-color:#b3b9ff;')\
        .format('{:,.4f}', subset = ['GDP Pearson Corr', 'GDP Spearman Corr'])\
        .format('{:,.12f}', subset = ['P-value Pearson', 'P-value Spearman']) 
    
    display(df)

    

@interact(
    Country = sorted(corr_df.index.tolist()),
    Threshold = (0, 1, 0.05))
def g(Country = 'Afghanistan', Threshold = 0.7):
    return tableOut(Threshold,Country)

    
        

interactive(children=(Dropdown(description='Country', options=('Afghanistan', 'Albania', 'Algeria', 'Andorra',…

## Region Indicators
It's almost the same codeblock as before, but for the regions. The groups have been formed according to the Dataframe of World Data Bank.


In [9]:
def tableOut2(Threshold, Region):

    df = search(Threshold, 'Region', Region)
    if df.empty:
        return print("No indicators have been found.")

    left = pd.Series([PVALUE_VAR, PVALUE_VAR], index=['P-value Pearson', 'P-value Spearman'])
    left2 = pd.Series([-1, -1], index=['GDP Pearson Corr', 'GDP Spearman Corr'])
    left3 = pd.Series([0, 0], index=['GDP Pearson Corr', 'GDP Spearman Corr'])
    df = df.style.highlight_between(left=left, right=1.5, axis=1, props='color:white; background-color:red;')\
        .highlight_between(left=left2, right=1.5, axis=1, props='color:white; background-color:#929bfc;')\
        .highlight_between(left=left3, right=1.5, axis=1, props='color:white; background-color:#b3b9ff;')\
        .format('{:,.4f}', subset = ['GDP Pearson Corr', 'GDP Spearman Corr'])\
        .format('{:,.12f}', subset = ['P-value Pearson', 'P-value Spearman']) 
    
    display(df)



@interact(
    Region = set(df['Region'].to_list()),
    Threshold = (0, 1, 0.05))
def g(Region = 'East Asia and Pacific', Threshold = 0.7):
    return tableOut2(Threshold, Region)

    

interactive(children=(Dropdown(description='Region', index=3, options=('Europe and Central Asia', 'Middle East…

## Global Indicators
This section reperesents the final analysis of the indicators. It shows in the indicators table with the highest correlations. With this results we can give an answer to the hypothesis of the project and establish the indicators with a high GDP relation.

In [10]:
def tableOut3(Threshold):

    df = search(Threshold, 'Global')
    if df.empty:
        return print("No indicators have been found.")

    left = pd.Series([PVALUE_VAR, PVALUE_VAR], index=['P-value Pearson', 'P-value Spearman'])
    left2 = pd.Series([-1, -1], index=['GDP Pearson Corr', 'GDP Spearman Corr'])
    left3 = pd.Series([0, 0], index=['GDP Pearson Corr', 'GDP Spearman Corr'])
    df =df.style.highlight_between(left=left, right=1.5, axis=1, props='color:white; background-color:red;')\
        .highlight_between(left=left2, right=1.5, axis=1, props='color:white; background-color:#929bfc;')\
        .highlight_between(left=left3, right=1.5, axis=1, props='color:white; background-color:#b3b9ff;')\
        .format('{:,.4f}', subset = ['GDP Pearson Corr', 'GDP Spearman Corr'])\
        .format('{:,.12f}', subset = ['P-value Pearson', 'P-value Spearman']) 
    
    display(df)



@interact(
    Threshold = (0, 1, 0.05))
def g(Threshold = 0.7):
    return tableOut3(Threshold)


interactive(children=(FloatSlider(value=0.7, description='Threshold', max=1.0, step=0.05), Output()), _dom_cla…

## Median Global Indicators
Reading the corr_df generated at the beggining of this notebook it calculates the median of each column. This approximation is very simple and might give an idea if the aggregation method is correct. Obviously the result won't be the same but the tendency has to be similar.

Having this in mind for each column in the corr_df import the name of the indicator and compute the median. Save this new dataframe in result_df.

In [11]:
result_df = pd.DataFrame()
for column in corr_df.columns:
    aux = pd.DataFrame({'Indicator': [column],
                        'GDP Pearson Corr': [corr_df[column].median()],
                        'GDP Spearman Corr': [corr_df_spear[column].median()]})
    result_df = pd.concat([result_df, aux], ignore_index=False, axis = 0)
    result_df = result_df.sort_values(by=["GDP Pearson Corr"], ascending = False)
    

result_df.set_index(['Indicator'], inplace=True)

Using a IppyWidget display the previously generated result_df. Lowering the Threshold more than 0.7 will show indicators that don't have a real correlation with the GDP, procede with caution extracting conclusions of them.

In [12]:
def tableOutMedian(Threshold):
    df = pd.concat([result_df.loc[(result_df['GDP Pearson Corr'] >= Threshold) & (result_df['GDP Spearman Corr'] >= Threshold)], result_df.loc[(result_df['GDP Pearson Corr'] <= -Threshold) & (result_df['GDP Pearson Corr'] <= -Threshold)]], axis = 0)

    if df.empty:
        return print("No indicators have been found.")

    left2 = pd.Series([-1, -1], index=['GDP Pearson Corr', 'GDP Spearman Corr'])
    left3 = pd.Series([0, 0], index=['GDP Pearson Corr', 'GDP Spearman Corr'])
    df =df.style.highlight_between(left=left2, right=1.5, axis=1, props='color:white; background-color:#929bfc;')\
        .highlight_between(left=left3, right=1.5, axis=1, props='color:white; background-color:#b3b9ff;')\
        .format('{:,.4f}', subset = ['GDP Pearson Corr', 'GDP Spearman Corr'])\

    display(df)



@interact(
    Threshold = (0, 1, 0.05))
def g(Threshold = 0.7):
    return tableOutMedian(Threshold)

interactive(children=(FloatSlider(value=0.7, description='Threshold', max=1.0, step=0.05), Output()), _dom_cla…