# Visualize Notebook
This notebook reads the Gold Dataframe and extracts information from it. The objective is to see how each variable correlates with the GDP and which countries show the highest correlation values.


## Imports
We start by importing all the libraries, together with the p-value variable (`PVALUE_VAR`) and the `search` method for indicators, which are used later in the notebook.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from scipy.stats import shapiro, pearsonr, spearmanr
import os
import statistics
import seaborn as sns
from scipy.stats import norm
import warnings
warnings.filterwarnings("ignore")
import plotly.express as px
from IPython.display import display_html
import ipywidgets as widgets
from ipywidgets import Layout
from ipywidgets import AppLayout, Button, GridspecLayout
from ipywidgets import interact, interact_manual
from Project.Utils.visualize import  search
%store -r PVALUE_VAR

# Column labels shared by the dataframes used throughout the notebook.
col_country, col_year, col_region = 'Country', 'Year', 'Region'

# Input and output locations, both relative to the current working directory.
_cwd = os.getcwd()
region_url = _cwd + '/Databases/AuxiliarData/world-regions-mod.csv'
output_path = _cwd + '/Output/'


## LOADING DATAFRAMES

In [2]:
# Gold dataframe; the population map further down reads its Country, Year and
# Population columns.
df = pd.read_csv(output_path + 'GoldDataframe.csv')

# Per-country correlations with the GDP, indexed by country: one file per method.
corr_df_pearson = pd.read_csv(output_path + 'Corr_DF_pearson.csv', index_col = col_country)
# The Spearman table must come from its own file; reading the Pearson file here
# would make every "Spearman" median below a copy of the Pearson one.
# NOTE(review): file name assumed to mirror the Pearson one - confirm it matches
# what the notebook producing the correlation files writes.
corr_df_spearman = pd.read_csv(output_path + 'Corr_DF_spearman.csv', index_col = col_country)

# Region lookup, indexed by (Region, Country).
region_df = pd.read_csv(region_url, index_col = [col_region, col_country])

# De-duplicated, alphabetically sorted region names (options of the region dropdowns).
region_list = sorted(set(region_df.index.get_level_values(col_region)))

## Correlation dataframe.
This dataframe is the main piece of the notebook. It is built by generating the correlation matrix of every country and keeping only the correlation of each variable with the GDP.

The per-country results are then concatenated, which gives the following result:

In [3]:
# The Pearson table is the reference for the overview below and for the map;
# work on a copy so the table loaded from disk stays untouched.
corr_df = corr_df_pearson.copy()
# Correlations live in [-1, 1]: pin the diverging palette to that range so the
# neutral midpoint of 'RdBu' falls on 0 and a colour means the same value in
# every column (without vmin/vmax each column is scaled to its own min/max).
corr_df.style.background_gradient(cmap='RdBu', vmin=-1, vmax=1)

Unnamed: 0_level_0,AgriShareGDP,CreditToAgriFishForest,EmploymentRural,TotalAgri,% Soldiers,Birth Rate,Death Rate,Homicides,Life Expectancy,Maternal Death Risk,Infant Mortality,% Population Growth,% Rural Population,Civil Liberties,Freedom of Expression,% Healthcare Investment,% Employment Industry,% Education Expenditure,% Men Employment,% Women Employment,Population
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,-0.870233,,,,,-0.953124,-0.968941,,0.961018,-0.953193,-0.960322,,-0.931713,,0.680533,,-0.733667,,,,0.929911
Albania,-0.721376,,,0.955863,-0.809293,-0.923325,0.893681,-0.701728,0.903398,-0.862758,-0.945807,,-0.916568,0.862694,,,0.786991,,,,-0.925309
Algeria,,,,0.91197,-0.804748,0.951733,,,0.863359,,-0.892046,0.905385,-0.813765,0.808796,,0.656106,0.906667,,,,0.716609
Andorra,,,,,,,,,,,,,,,,,,,,,0.935892
Angola,,,,,-0.81089,-0.675961,-0.846921,,0.837416,-0.880651,-0.84646,,-0.79767,0.610423,0.528551,,,,,,0.731495
Antigua and Barbuda,,-0.889263,,0.70276,,-0.910879,0.594736,,0.90751,-0.608127,-0.893963,-0.658554,0.879513,,,,,,,,0.890888
Argentina,-0.503461,0.656928,,0.950021,-0.696227,-0.715829,-0.800463,,0.772551,-0.830826,-0.740793,,-0.780874,,-0.597334,,0.726257,-0.640218,,,0.763368
Armenia,-0.859478,,,0.936666,,,0.962825,,0.873227,-0.743241,-0.930687,0.614302,0.914839,0.781947,0.446507,,,,,,-0.787782
Aruba,,,,,,-0.781544,0.830859,,0.854511,,,-0.802325,0.818422,,,,,,,,0.903113
Australia,-0.781236,,,0.944949,-0.863665,,,-0.91139,0.924723,0.59839,-0.915224,,-0.895873,-0.611746,-0.576652,0.702196,,,,0.891998,0.888273


# Analysis
From now on all the data will be analysed with the Cleaned dataframes.

## Choropleth map
This block uses an ipywidgets selection list for the indicators and a Plotly choropleth for the map.

In this section a map is generated and coloured with the values of the selected indicator. You can select an indicator from the list and a new map is generated, along with two lists of the countries with the highest and lowest values for that indicator.




In [4]:
# Selection list offering every column of corr_df; the first column starts selected.
indicator = widgets.SelectMultiple(
    description='Indicator',
    options=list(corr_df.columns),
    value=[corr_df.columns[0]],
    layout=Layout(width='50%', height='80px'),
    disabled=False,
)



def globalGrapgh(indicator):
    """Draw the world map of one indicator's GDP correlation and rank the countries.

    Parameters
    ----------
    indicator : sequence of str
        Current selection of the indicator list; only its first entry is used
        and it must be a column of ``corr_df``.

    Shows a choropleth of ``corr_df[ind]`` and, side by side, two tables with at
    most 10 countries each: the largest positive values ("Direct correlation")
    and the most negative ones ("Inverse correlation"). Returns None.
    """
    # A SelectMultiple selection can be empty (everything deselected); indexing
    # it would raise IndexError inside the widget callback.
    if not indicator:
        print("Select an indicator to draw the map.")
        return

    ind = indicator[0]
    N = 10  # maximum number of rows in each ranking table

    # range_color pins the diverging scale to the correlation domain, so the
    # neutral colour always means "no correlation" whichever indicator is shown.
    fig = px.choropleth(corr_df, locations = corr_df.index, locationmode = 'country names',
                        color = ind, projection = "natural earth",
                        color_continuous_scale = 'RdBu',
                        range_color = [-1, 1],
                        width = 700, height = 500)

    # Single-column frame of the selected indicator, ranked in both directions.
    ranked = corr_df[[ind]]
    pos_corr = ranked.sort_values(by = ind, axis = 0, ascending = False).head(n = N)
    neg_corr = ranked.sort_values(by = ind, axis = 0, ascending = True).head(n = N)

    # Keep only values with the expected sign (this also drops NaN rows).
    pos_corr = pos_corr.loc[pos_corr[ind] > 0]
    neg_corr = neg_corr.loc[neg_corr[ind] < 0]

    fig.update(layout_coloraxis_showscale = True)
    fig.show()

    pos_styler = pos_corr.style.set_table_attributes("style='display:inline'").set_caption('Direct correlation')
    neg_styler = neg_corr.style.set_table_attributes("style='display:inline'").set_caption('Inverse correlation')

    # Non-breaking spaces separate the two inline tables.
    space = "\xa0" * 10
    display_html(pos_styler._repr_html_() + space + neg_styler._repr_html_(), raw = True)

# Bind globalGrapgh to the `indicator` selection list; as the last expression of
# the cell, the resulting interactive widget is what gets rendered.
widgets.interactive(globalGrapgh, indicator = indicator)

interactive(children=(SelectMultiple(description='Indicator', index=(0,), layout=Layout(height='80px', width='…

### Choropleth for population

This map represents the population of the world. It is useful for analysing the data when indicators are later aggregated by population, because countries with a larger population have more impact on the global indicators.

In [5]:
# Animated population map built from the gold dataframe `df`: one frame per
# value of the year column, countries matched by name.
population_map_options = dict(
    locations = col_country,
    locationmode = 'country names',
    color = "Population",
    hover_name = col_country,
    projection = "natural earth",
    animation_frame = col_year,
    width = 800,
    height = 500,
    color_continuous_scale = 'Reds',
    # Fixed colour range shared by every frame of the animation.
    range_color = [1_000, 340_000_000],
)
fig = px.choropleth(df, **population_map_options)
fig.update(layout_coloraxis_showscale = True)
fig.show()

## Country Indicators
In this code block a widget is implemented to generate a table with the indicators. To use it, select the country from the dropdown and then set the threshold. The default value for the threshold is 0.7, which we consider to be the minimum to consider an indicator correlated to the GDP. To analyze the results:
- H0: the indicator and the GDP are uncorrelated.
- H1: the indicator and the GDP are correlated.

- P-value: the probability of obtaining test results at least as extreme as the result actually observed, assuming H0 is true.
- Confidence level: the proportion of times that an interval estimate is expected to contain the true population parameter.
- Significance level: the probability of the study rejecting the null hypothesis when it is actually true.


The confidence level is set to (1 − `PVALUE_VAR`) × 100 % and the significance level to α = `PVALUE_VAR`.

If `p-value` < α, reject H0 and accept H1.

The code consists of an ipywidgets slider and a dropdown. Once these parameters have been set, it calls the `search` method and applies a style format to the returned Dataframe.

In [6]:
def tableOut(Threshold, Country):
    """Display the styled indicator table that `search` returns for one country.

    Parameters
    ----------
    Threshold, Country
        Forwarded to ``search(Threshold, col_country, Country)``.
        NOTE(review): the filtering itself happens inside `search`, which is
        not visible in this notebook.

    Prints a notice instead of a table when the result is empty. Returns None.
    """
    corr_cols = ['GDP Pearson Corr', 'GDP Spearman Corr']
    pval_cols = ['P-value Pearson', 'P-value Spearman']

    found = search(Threshold, col_country, Country)
    if found.empty:
        print("No indicators have been found.")
        return None

    # Each pass highlights the cells lying between a per-column lower bound and
    # 1.5, in this order:
    #   red      - p-values at or above PVALUE_VAR
    #   #929bfc  - correlations from -1 upwards
    #   #b3b9ff  - correlations from 0 upwards (applied last)
    passes = (
        (pd.Series([PVALUE_VAR, PVALUE_VAR], index=pval_cols), 'color:white; background-color:red;'),
        (pd.Series([-1, -1], index=corr_cols), 'color:white; background-color:#929bfc;'),
        (pd.Series([0, 0], index=corr_cols), 'color:white; background-color:#b3b9ff;'),
    )
    styled = found.style
    for lower_bound, css in passes:
        styled = styled.highlight_between(left=lower_bound, right=1.5, axis=1, props=css)

    # Four decimals for the correlations, twelve for the p-values.
    styled = styled.format('{:,.4f}', subset=corr_cols)
    styled = styled.format('{:,.12f}', subset=pval_cols)

    display(styled)



@interact(Country=sorted(corr_df.index), Threshold=(0, 1, 0.05))
def g(Country = 'Afghanistan', Threshold = 0.7):
    """Widget callback: show the indicator table of the country picked in the dropdown."""
    return tableOut(Threshold=Threshold, Country=Country)

interactive(children=(Dropdown(description='Country', options=('Afghanistan', 'Albania', 'Algeria', 'Andorra',…

## Region Indicators
We will follow the same procedure as before, with a significance level α = `PVALUE_VAR`.

This time, though, we do not have observations of the regions themselves, but their countries. Thus, we need to establish a procedure to estimate its values. We have opted for two methods:
- Aggregation of the indicators, using a population-weighted mean for relative indicators and a sum for absolute indicators. The correlations are then calculated from this aggregated Dataframe. This is a standard procedure that sources such as the World Bank use to aggregate data from different areas.
- Calculating the median of the correlations, that is, from the correlation Dataframe we showed at the beginning of this notebook. That way, we obtain a more centered result and avoid the over-representation of countries like India or China, while giving more relevance to data from smaller countries.


### BY AGGREGATION

In [7]:
def tableRegAgg(Threshold, Region):
    """Display the styled indicator table that `search` returns for one region.

    Both arguments are forwarded to ``search(Threshold, col_region, Region)``.
    NOTE(review): per the notebook text this is the "by aggregation" view; the
    aggregation and filtering live in `search`, which is not visible here.

    Prints a notice instead of a table when the result is empty. Returns None.
    """
    pearson_corr, spearman_corr = 'GDP Pearson Corr', 'GDP Spearman Corr'
    pearson_p, spearman_p = 'P-value Pearson', 'P-value Spearman'

    region_table = search(Threshold, col_region, Region)
    if region_table.empty:
        print("No indicators have been found.")
        return None

    # Per-column lower bounds for the three highlight passes (upper bound 1.5).
    significance_floor = pd.Series([PVALUE_VAR, PVALUE_VAR], index = [pearson_p, spearman_p])
    any_corr_floor = pd.Series([-1, -1], index = [pearson_corr, spearman_corr])
    direct_corr_floor = pd.Series([0, 0], index = [pearson_corr, spearman_corr])

    styler = region_table.style
    # p-values at or above the significance level are flagged in red.
    styler = styler.highlight_between(left = significance_floor, right = 1.5, axis = 1,
                                      props = 'color:white; background-color:red;')
    # Correlations from -1 upwards, then (applied last) those from 0 upwards.
    styler = styler.highlight_between(left = any_corr_floor, right = 1.5, axis = 1,
                                      props = 'color:white; background-color:#929bfc;')
    styler = styler.highlight_between(left = direct_corr_floor, right = 1.5, axis = 1,
                                      props = 'color:white; background-color:#b3b9ff;')
    styler = styler.format('{:,.4f}', subset = [pearson_corr, spearman_corr])
    styler = styler.format('{:,.12f}', subset = [pearson_p, spearman_p])

    display(styler)



@interact(Region=region_list, Threshold=(0, 1, 0.05))
def g(Region = region_list[0], Threshold = 0.7):
    """Widget callback: show the aggregation-based indicator table of the chosen region."""
    return tableRegAgg(Threshold=Threshold, Region=Region)


interactive(children=(Dropdown(description='Region', options=('East Asia and Pacific', 'Europe and Central Asi…

### BY MEDIAN

In [8]:
def _median_corr_by_region(corr_table):
    """Inner-join a per-country correlation table with `region_df` on the index
    and return the median of every column within each region."""
    with_region = corr_table.merge(region_df, how = 'inner', left_index = True, right_index = True)
    return with_region.groupby(level = col_region).median()


median_corr_df_pearson_region = _median_corr_by_region(corr_df_pearson)
median_corr_df_spearman_region = _median_corr_by_region(corr_df_spearman)

def tableRegMed(Region, Threshold):
    """Display the indicators whose median correlation with the GDP is strong in a region.

    Parameters
    ----------
    Region
        Row label of ``median_corr_df_pearson_region`` and
        ``median_corr_df_spearman_region``.
    Threshold : number
        Minimum absolute value that BOTH the Pearson and the Spearman median
        must reach for an indicator to be listed.

    Prints a notice instead of a table when no indicator qualifies. Returns None.
    """
    corr_cols = ['GDP Pearson Corr', 'GDP Spearman Corr']

    # One row per indicator, one column per correlation method.
    table = pd.concat(
        [median_corr_df_pearson_region.loc[Region].rename(corr_cols[0]),
         median_corr_df_spearman_region.loc[Region].rename(corr_cols[1])],
        axis = 1)
    # NaN medians fail both comparisons, so they are dropped here as well.
    table = table.loc[(abs(table[corr_cols[0]]) >= Threshold) & (abs(table[corr_cols[1]]) >= Threshold)]

    if table.empty:
        print("No indicators have been found.")
        return None

    # Order rows by Pearson + Spearman, largest sum first (explicit form of the
    # ordering; indicator labels are assumed unique, as column names are).
    row_order = table.sum(axis = 1).sort_values(ascending = False).index
    table = table.loc[row_order]

    # Correlations from -1 upwards get #929bfc; those from 0 upwards are then
    # re-highlighted with #b3b9ff. The parenthesised chain needs no line
    # continuations.
    styled = (
        table.style
        .highlight_between(left = pd.Series([-1, -1], index = corr_cols), right = 1.5, axis = 1,
                           props = 'color:white; background-color:#929bfc;')
        .highlight_between(left = pd.Series([0, 0], index = corr_cols), right = 1.5, axis = 1,
                           props = 'color:white; background-color:#b3b9ff;')
        .format('{:,.4f}', subset = corr_cols)
    )

    display(styled)



@interact(Region=region_list, Threshold=(0, 1, 0.05))
def g(Region = region_list[0], Threshold = 0.7):
    """Widget callback: show the median-based indicator table of the chosen region."""
    return tableRegMed(Region=Region, Threshold=Threshold)


interactive(children=(Dropdown(description='Region', options=('East Asia and Pacific', 'Europe and Central Asi…

## Global Indicators
This section represents the final analysis of the indicators. It shows a table of the indicators with the highest correlations, applying both methods we used to represent the correlations for the regions: aggregation and median.

With these results we can give an answer to the hypothesis of the project and establish which indicators have a strong relation to the GDP.

### BY AGGREGATION

In [9]:
def tableWorldAgg(Threshold):
    """Display the styled indicator table that `search` returns for the 'Global' scope.

    Threshold is forwarded to ``search(Threshold, 'Global')``.
    NOTE(review): how `search` aggregates and filters is not visible in this
    notebook.

    Prints a notice instead of a table when the result is empty. Returns None.
    """
    world_table = search(Threshold, 'Global')
    if world_table.empty:
        print("No indicators have been found.")
        return None

    columns = {
        'corr': ['GDP Pearson Corr', 'GDP Spearman Corr'],
        'pvalue': ['P-value Pearson', 'P-value Spearman'],
    }

    def lower_bounds(value, kind):
        """Per-column lower bound for one highlight pass."""
        return pd.Series([value, value], index = columns[kind])

    # Highlight order: red for p-values at or above PVALUE_VAR, #929bfc for
    # correlations from -1 upwards, #b3b9ff (last) for correlations from 0
    # upwards; 1.5 is the upper bound of every pass.
    styled = world_table.style.highlight_between(
        left = lower_bounds(PVALUE_VAR, 'pvalue'), right = 1.5, axis = 1,
        props = 'color:white; background-color:red;')
    styled = styled.highlight_between(
        left = lower_bounds(-1, 'corr'), right = 1.5, axis = 1,
        props = 'color:white; background-color:#929bfc;')
    styled = styled.highlight_between(
        left = lower_bounds(0, 'corr'), right = 1.5, axis = 1,
        props = 'color:white; background-color:#b3b9ff;')
    styled = styled.format('{:,.4f}', subset = columns['corr'])
    styled = styled.format('{:,.12f}', subset = columns['pvalue'])

    display(styled)



@interact(Threshold=(0, 1, 0.05))
def g(Threshold = 0.7):
    """Widget callback: show the aggregation-based global indicator table."""
    return tableWorldAgg(Threshold=Threshold)


interactive(children=(FloatSlider(value=0.7, description='Threshold', max=1.0, step=0.05), Output()), _dom_cla…

### BY MEDIAN

In [10]:
def _median_corr_over_countries(corr_table, label):
    """Inner-join a per-country correlation table with `region_df` on the index
    and return the median of every column as a Series named `label`."""
    with_region = corr_table.merge(region_df, how = 'inner', left_index = True, right_index = True)
    return with_region.median().rename(label)


median_corr_df_pearson = _median_corr_over_countries(corr_df_pearson, 'GDP Pearson Corr')
median_corr_df_spearman = _median_corr_over_countries(corr_df_spearman, 'GDP Spearman Corr')

def tableWorldMed(Threshold):
    """Display the indicators whose median correlation with the GDP is strong.

    Parameters
    ----------
    Threshold : number
        Minimum absolute value that BOTH ``median_corr_df_pearson`` and
        ``median_corr_df_spearman`` must reach for an indicator to be listed.

    Prints a notice instead of a table when no indicator qualifies. Returns None.
    """
    corr_cols = ['GDP Pearson Corr', 'GDP Spearman Corr']

    # One row per indicator, one column per correlation method.
    table = pd.concat([median_corr_df_pearson, median_corr_df_spearman], axis = 1)
    # NaN medians fail both comparisons, so they are dropped here as well.
    table = table.loc[(abs(table[corr_cols[0]]) >= Threshold) & (abs(table[corr_cols[1]]) >= Threshold)]

    if table.empty:
        print("No indicators have been found.")
        return None

    # Order rows by Pearson + Spearman, largest sum first (explicit form of the
    # ordering; indicator labels are assumed unique, as column names are).
    row_order = table.sum(axis = 1).sort_values(ascending = False).index
    table = table.loc[row_order]

    # Correlations from -1 upwards get #929bfc; those from 0 upwards are then
    # re-highlighted with #b3b9ff. The parenthesised chain needs no line
    # continuations.
    styled = (
        table.style
        .highlight_between(left = pd.Series([-1, -1], index = corr_cols), right = 1.5, axis = 1,
                           props = 'color:white; background-color:#929bfc;')
        .highlight_between(left = pd.Series([0, 0], index = corr_cols), right = 1.5, axis = 1,
                           props = 'color:white; background-color:#b3b9ff;')
        .format('{:,.4f}', subset = corr_cols)
    )

    display(styled)



@interact(Threshold=(0, 1, 0.05))
def g(Threshold = 0.7):
    """Widget callback: show the median-based global indicator table."""
    return tableWorldMed(Threshold=Threshold)

interactive(children=(FloatSlider(value=0.7, description='Threshold', max=1.0, step=0.05), Output()), _dom_cla…