# Visualize Notebook
In this notebook the Gold Dataframe will be read and extract information of it. The objective is to see the correlations between the variables and the GDP and also what countries have the highest correlation value.


## Imports
Start importing all the libraries and also the methods of pvalue and search indicators that will be used later in the notebook.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from scipy.stats import shapiro
import os
import statistics
import seaborn as sns
from scipy.stats import norm

import ipywidgets as widgets
from ipywidgets import Layout
from ipywidgets import interact, interact_manual
from Project.Utils.visualize import  search


## Correlation dataframe.
This dataframe is the main piece of the notebook. Consists in generating for every country the correlation matrix for it and saving only the correlation value of the different variables with the GDP. Later on is concatenated and generates the following result:

In [2]:
#One dataframe per country

df= pd.read_csv (os.getcwd()+'/Output/GoldDataframe.csv')
corr_df = pd.DataFrame()
corr_df.index.names = ['Country']
aux_df = pd.DataFrame()

#List all the countries, none repeated
countries = set(df['Country'].to_list())

country_dict = {}
corr_dict = {}

for country in countries:

    #Get the DataFrame for a given country
    country_df = df.loc[df['Country'] == country]

    #Correlation matrix for that country
    country_corr_df = country_df.corr()

    #Trim it into a single row
    country_corr_df = country_corr_df.rename(columns = {'GDP': country}).drop(index = ['Year', 'GDP'])

    #Add the row to a new DataFrame with the correlations for each country
    corr_df = pd.concat([corr_df, country_corr_df[country]], axis = 1)

#Transpose the resulting DataFrame to have the desired format and show it
corr_df = corr_df.transpose()
corr_df


Unnamed: 0,AgriShareGDP,CreditToAgriFishForest,EmploymentRural,TotalAgri,% Soldiers,Birth Rate,Death Rate,Homicides,Life Expectancy,Maternal Death Risk,...,% Population Growth,% Rural Population,Civil Liberties,Freedom of Expression,% Healthcare Investment,% Employment Industry,% Education Expenditure,% Men Employment,% Women Employment,Population
Guatemala,-0.910784,-0.330781,,,-0.780193,-0.958764,-0.948101,-0.493473,0.988392,-0.965211,...,-0.953367,-0.993836,0.755812,0.093338,0.800016,-0.824666,,,,0.994983
Guinea,0.498159,,,0.174019,-0.694951,-0.939092,-0.914487,,0.935467,-0.879412,...,0.907336,-0.954342,0.293413,0.625618,0.694268,0.873440,0.419932,,,0.968902
Honduras,-0.162653,,,0.825982,-0.678494,-0.973299,-0.903542,0.002044,0.991914,-0.967663,...,-0.984283,-0.993126,-0.441828,-0.770195,-0.008528,0.125164,,,,0.991977
Nicaragua,-0.314105,-0.827828,,0.976583,-0.920160,-0.960346,-0.720096,-0.700032,0.970542,-0.926781,...,-0.891714,-0.970781,-0.608949,-0.912412,0.931079,-0.849290,,,,0.973662
Saint Lucia,-0.843703,-0.909062,,0.796964,,,,,,,...,,,,,0.522356,-0.545707,,,,0.957435
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Niger,-0.440685,-0.259552,,0.990417,-0.953210,-0.989109,-0.983832,,0.988585,-0.979136,...,0.756457,-0.632398,0.307843,0.756975,-0.241000,0.919169,0.154794,,,0.984857
Sao Tome and Principe,-0.729275,,,,,-0.990529,-0.915989,,0.947938,-0.938216,...,-0.299480,-0.975995,0.923671,0.603643,-0.085872,-0.876400,0.688839,,,0.985990
Lithuania,-0.835335,,-0.788530,0.912241,0.706347,0.574701,0.867250,-0.837935,0.746119,-0.812293,...,0.071064,-0.633390,0.197869,0.077926,0.350933,-0.447208,-0.940626,0.331796,0.637510,-0.922340
Greece,-0.854153,0.114222,0.408677,0.862916,-0.425234,0.588829,-0.253547,0.524593,0.171029,0.428454,...,0.048550,-0.072185,0.292634,0.380068,0.513542,0.163758,,0.236961,0.633141,0.155726


In [3]:
import ipywidgets as widgets
import plotly.express as px


indicator = widgets.SelectMultiple(
    options = corr_df.columns.tolist(),
    value = ['AgriShareGDP'],
    description='Indicator',
    disabled=False,
    layout = Layout(width='50%', height='80px')
)


def globalGrapgh(indicator):
    fig = px.choropleth(corr_df, locations = corr_df.index, locationmode='country names', 
                        color= indicator[0],projection="natural earth",
                        color_continuous_scale='Reds',
                    width=700, height=500)
    fig.update(layout_coloraxis_showscale=True)
    fig.show()
    
    
widgets.interactive(globalGrapgh, indicator = indicator) 

interactive(children=(SelectMultiple(description='Indicator', index=(0,), layout=Layout(height='80px', width='…

## Country Indicators
In this codeblock a widged is implmented to generate a table with the indicators. In order to make it work just select the country from the dropdown and procede to establasih the threshold. The default value for the threshold is 0.7, which we consider to be the minimum to consider an indicator correlated to the GDP. To analyze the results:
- H0: the indicator and the GDP are uncorrelated.
- H1: the indicator and the GDP are correlated.

The p-value represents the probability that your data would have arisen if the null hypothesis were true. If it's higher than 0.05 then it has no statistical signifcance. If an indicator is lower than 0.05 it has no significance and would be painted the cell in red.

If the both correlations are  >0.7 or <-0.7 and the p-values are < 0.05 then we have a high correlation that can be applied to the whole population.

The code consists in a IppyWidget with a slider and a dropdown. Once this parameters have been set it calls the method 'search' and applies a style format of the returned Dataframe.

In [4]:
def tableOut(Threshold, Country):

    df = search(Threshold, 'Country', Country)
    if df.empty:
        return print("No indicators have been found.")

    left = pd.Series([0.05, 0.05], index=['P-value Pearson', 'P-value Spearman'])
    left2 = pd.Series([-1, -1], index=['GDP Pearson Corr', 'GDP Spearman Corr'])
    left3 = pd.Series([0, 0], index=['GDP Pearson Corr', 'GDP Spearman Corr'])
    df =df.style.highlight_between(left=left, right=1.5, axis=1, props='color:white; background-color:red;')\
        .highlight_between(left=left2, right=1.5, axis=1, props='color:white; background-color:#929bfc;')\
        .highlight_between(left=left3, right=1.5, axis=1, props='color:white; background-color:#b3b9ff;')\
        .format('{:,.4f}', subset = ['GDP Pearson Corr', 'GDP Spearman Corr'])\
        .format('{:,.12f}', subset = ['P-value Pearson', 'P-value Spearman']) 
    
    display(df)

    

@interact(
    Country = sorted(corr_df.index.tolist()),
    Threshold = (0, 1, 0.05))
def g(Country = 'Afghanistan', Threshold = 0.7):
    return tableOut(Threshold,Country)

    
        

interactive(children=(Dropdown(description='Country', options=('Afghanistan', 'Albania', 'Algeria', 'Andorra',…

## Region Indicators
It's almost the same codeblock as before, but for the regions. The groups have been formed according to the Dataframe of World Data Bank.


In [5]:
def tableOut2(Threshold, Region):

    df = search(Threshold, 'Region', Region)
    if df.empty:
        return print("No indicators have been found.")

    left = pd.Series([0.05, 0.05], index=['P-value Pearson', 'P-value Spearman'])
    left2 = pd.Series([-1, -1], index=['GDP Pearson Corr', 'GDP Spearman Corr'])
    left3 = pd.Series([0, 0], index=['GDP Pearson Corr', 'GDP Spearman Corr'])
    df =df.style.highlight_between(left=left, right=1.5, axis=1, props='color:white; background-color:red;')\
        .highlight_between(left=left2, right=1.5, axis=1, props='color:white; background-color:#929bfc;')\
        .highlight_between(left=left3, right=1.5, axis=1, props='color:white; background-color:#b3b9ff;')\
        .format('{:,.4f}', subset = ['GDP Pearson Corr', 'GDP Spearman Corr'])\
        .format('{:,.12f}', subset = ['P-value Pearson', 'P-value Spearman']) 
    
    display(df)



@interact(
    Region = set(df['Region'].to_list()),
    Threshold = (0, 1, 0.05))
def g(Region = 'East Asia and Pacific', Threshold = 0.7):
    return tableOut2(Threshold, Region)

    

interactive(children=(Dropdown(description='Region', index=4, options=('Sub-Saharan Africa', 'South Asia', 'Mi…

## Global Indicators
This section reperesents the final analysis of the indicators. It shows in the indicators table with the highest correlations. With this results we can give an answer to the hypothesis of the project and establish the indicators with a high GDP relation.

In [6]:
def tableOut3(Threshold):

    df = search(Threshold, 'Global')
    if df.empty:
        return print("No indicators have been found.")

    left = pd.Series([0.05, 0.05], index=['P-value Pearson', 'P-value Spearman'])
    left2 = pd.Series([-1, -1], index=['GDP Pearson Corr', 'GDP Spearman Corr'])
    left3 = pd.Series([0, 0], index=['GDP Pearson Corr', 'GDP Spearman Corr'])
    df =df.style.highlight_between(left=left, right=1.5, axis=1, props='color:white; background-color:red;')\
        .highlight_between(left=left2, right=1.5, axis=1, props='color:white; background-color:#929bfc;')\
        .highlight_between(left=left3, right=1.5, axis=1, props='color:white; background-color:#b3b9ff;')\
        .format('{:,.4f}', subset = ['GDP Pearson Corr', 'GDP Spearman Corr'])\
        .format('{:,.12f}', subset = ['P-value Pearson', 'P-value Spearman']) 
    
    display(df)



@interact(
    Threshold = (0, 1, 0.05))
def g(Threshold = 0.7):
    return tableOut3(Threshold)


interactive(children=(FloatSlider(value=0.7, description='Threshold', max=1.0, step=0.05), Output()), _dom_cla…

In [26]:
result_df = pd.DataFrame()
for column in corr_df.columns:
    aux = pd.DataFrame({'Indicator': [column],
                        'GDP Pearson Corr': [corr_df[column].median()]})
    result_df = pd.concat([result_df, aux], ignore_index=False, axis = 0)
    result_df = result_df.sort_values(by=["GDP Pearson Corr"], ascending = False)
    

result_df.set_index(['Indicator'], inplace=True)
print(result_df)

                         GDP Pearson Corr
Indicator                                
Life Expectancy                  0.923128
TotalAgri                        0.903238
Population                       0.901123
% Women Employment               0.652968
% Healthcare Investment          0.350933
% Education Expenditure          0.183463
Civil Liberties                  0.007080
% Employment Industry           -0.030818
% Population Growth             -0.030905
Freedom of Expression           -0.045268
EmploymentRural                 -0.083709
CreditToAgriFishForest          -0.197254
% Men Employment                -0.359601
Homicides                       -0.480969
Death Rate                      -0.567266
% Soldiers                      -0.708804
AgriShareGDP                    -0.714992
Birth Rate                      -0.833254
% Rural Population              -0.886077
Maternal Death Risk             -0.899635
Infant Mortality                -0.913824


In [34]:
def tableOutMedian(Threshold):
    df = pd.concat([result_df.loc[result_df['GDP Pearson Corr'] >= Threshold], result_df.loc[result_df['GDP Pearson Corr'] <= -Threshold]], axis = 0)

    left2 = pd.Series([-1], index=['GDP Pearson Corr'])
    left3 = pd.Series([0], index=['GDP Pearson Corr'])
    df = df.style.highlight_between(left=left2, right=1.5, axis=1, props='color:white; background-color:#929bfc;')\
        .highlight_between(left=left3, right=1.5, axis=1, props='color:white; background-color:#b3b9ff;')\
        .format('{:,.4f}', subset = ['GDP Pearson Corr'])
    display(df)



@interact(
    Threshold = (0, 1, 0.05))
def g(Threshold = 0.7):
    return tableOutMedian(Threshold)

interactive(children=(FloatSlider(value=0.7, description='Threshold', max=1.0, step=0.05), Output()), _dom_cla…