In [1]:
import os
import pandas as pd
import numpy as np
from numpy import sort
from os import listdir
from os.path import isfile, join

import ipywidgets as widgets
from ipywidgets import interact, interact_manual
import matplotlib.pyplot as plt
import plotly.express as px

from IPython.display import display_html

from matplotlib_venn import venn3
from Project.Utils.norm import norm
from Project.Utils.max_corr import max_corr

from Project.Utils.visualize import search, searchTimeSeries, get_years

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Project folders, resolved against the notebook's working directory.
# Every path ends with '/' so that file names can be appended directly.
data_path = os.getcwd() + '/Databases/'
output_path = os.getcwd() + '/Output/'

# output_path already ends with a separator, so the sub-folders must not
# start with another one (this used to yield '.../Output//Country/').
country_path = output_path + 'Country/'
region_path = output_path + 'Region/'
cluster_path = output_path + 'Cluster/'

In [3]:
# Auxiliary input, given relative to one of the project folders.
file_regions = "AuxiliarData/world-regions-mod.csv"

# Main ("gold") dataframe.
file_gold = "GoldDataframe.csv"

# Correlation tables.
file_corr_pearson = "Corr_DF_Pearson.csv"
file_corr_spearman = "Corr_DF_Spearman.csv"

# Aggregated and normalised dataframes.
file_agg_region = "AggregatedRegion_DataFrame.csv"
file_agg_world = "AggregatedWorld_DataFrame.csv"
file_norm_df = "Norm_DF.csv"

# Shifted correlation tables.
file_shifted_corr_country = "Shifted_Corr_Country.csv"
file_shifted_corr_region = "Shifted_Corr_Region.csv"

In [4]:
# Names of the index levels / columns shared by the dataframes in this notebook.
col_country = "Country"
col_year = "Year"
col_region = "Region"
col_gdp = "GDP"
col_cluster = "Cluster"
col_shift = "Shift"

# Names of the two component columns used by the cluster scatter plot.
col_1comp = "1st_component"
col_2comp = "2nd_component"

In [5]:
# READ GOLD DATAFRAME
# Load the Golden Dataframe and derive every lookup structure that depends on it.

df = pd.read_csv(
    output_path + file_gold,
    on_bad_lines = 'warn',
    index_col = [col_region, col_country, col_year],
)

# Alphabetically sorted countries and regions found in the index.
country_list = list(np.sort(df.index.get_level_values(col_country).unique()))
region_list = list(np.sort(df.index.get_level_values(col_region).unique()))

# Selectable zones for each mode, plus the modes themselves.
mode_dict = {
    col_country: country_list,
    col_region: region_list,
}
mode_list = [col_country, col_region]

# First and last year present in the data.
min_year = df.index.get_level_values(col_year).min()
max_year = df.index.get_level_values(col_year).max()

# Every indicator column in alphabetical order, leaving the GDP out.
indicators_list = sorted(df.columns.tolist())
indicators_list.remove(col_gdp)

In [6]:
# READ CLUSTERS
# For every indicator group, read both Dataframes: the one with the indicator
# values ('<group>.csv') and the one with the components ('<group>_Comp.csv').

ind_dict = {}       # group name -> sorted array with the indicators of the group
cluster_dict = {}   # group name -> indicator values, with the cluster as a column
cluster_list = []   # one single-column frame (cluster number) per group
comp_dict = {}      # group name -> components, indexed by (cluster, country)

# listdir() returns the entries in arbitrary order. Sorting them keeps the key
# order of the dictionaries and the column order of cluster_df deterministic,
# which matters for code that unpacks the groups positionally.
for element in sorted(listdir(cluster_path)):
    url = join(cluster_path, element)
    if not isfile(url):
        continue

    if element.endswith('_Comp.csv'):
        ind = element.removesuffix('_Comp.csv')
        comp_df = pd.read_csv(url, on_bad_lines = 'warn', index_col = [col_cluster, col_country])
        comp_dict[ind] = comp_df
        # Drop every data column and move the cluster level into a column named after the group.
        cluster_number_df = comp_df.drop(columns = comp_df.columns).reset_index(col_cluster, drop = False).rename({col_cluster: ind}, axis = 1)
        cluster_list.append(cluster_number_df)

    elif element.endswith('.csv'):
        ind = element.removesuffix('.csv')
        ind_df = pd.read_csv(url, on_bad_lines = 'warn', index_col = [col_cluster, col_country])
        cluster_dict[ind] = ind_df.reset_index(col_cluster, drop = False)
        ind_dict[ind] = np.sort(ind_df.columns)

# One row per country and one column per group; -1 marks a country that is missing from a group.
cluster_df = pd.concat(cluster_list, axis = 1)
cluster_df = cluster_df.fillna(-1).astype(int)

In [7]:
# Preview of the cluster number of every country, one column per indicator group.
cluster_df

Unnamed: 0_level_0,All indicators,Economic Indicators,Equality Indicators,Social-demographic Indicators
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Albania,0,6,4,3
Algeria,0,7,6,9
Armenia,0,1,8,3
Bosnia and Herzegovina,0,1,0,3
Bulgaria,0,2,0,3
...,...,...,...,...
Sao Tome and Principe,9,1,8,5
Solomon Islands,9,6,8,5
Togo,9,4,8,4
United Arab Emirates,9,0,4,4


In [8]:
def group_of(ind: str, groups = None, all_indicators = None):
    """Return the name of the first indicator group that contains `ind`.

    A group that is as large as the complete list of indicators is treated as
    the catch-all group and skipped, so only the specific groups can be returned.

    Parameters
    ----------
    ind : indicator name to look up.
    groups : mapping of group name -> collection of indicator names.
        Defaults to the module-level `ind_dict`.
    all_indicators : complete list of indicators, only used for its length.
        Defaults to the module-level `indicators_list`.

    Returns
    -------
    The group name, or None when no specific group contains the indicator.
    """
    if groups is None:
        groups = ind_dict
    if all_indicators is None:
        all_indicators = indicators_list

    total = len(all_indicators)
    for group, members in groups.items():
        # TODO: find a better way to recognise the catch-all group than comparing sizes.
        if len(members) < total and ind in members:
            return group
    return None

In [9]:
def plot_cluster_map (df: pd.DataFrame, group_name: str = 'All indicators', *, col_1comp: str = '1st_component', col_2comp: str = '2nd_component'):
    """Show a scatter plot of the two component columns, coloured by cluster.

    `df` is labelled with its index and needs the cluster column plus the two
    component columns; `group_name` becomes the centred title of the figure.
    Nothing is drawn when `df` is None.
    """
    if df is None:
        return None

    # Legend entries follow the sorted cluster numbers.
    cluster_order = np.sort(df.loc[:, col_cluster].unique())

    fig = px.scatter(
        df,
        x = col_1comp,
        y = col_2comp,
        text = df.index,
        size_max = 100,
        color = col_cluster,
        category_orders = {col_cluster: cluster_order},
    )
    fig.update_layout(title_text = group_name, title_x = 0.5)
    fig.update_traces(textposition = 'top center')
    fig.show()

def table_cluster (df: pd.DataFrame, group_name: str = 'All indicators', country: str = '', cluster_number: int = 0):
    """Display the table of a cluster with the row of `country` highlighted.

    A one-line description (cluster number and amount of countries) is shown
    first, followed by the styled table. Every cell carries a tooltip with the
    median of its column. Nothing happens when `df` is None; any failure while
    building or displaying the table is reported with a short message.
    """
    if df is None:
        return None

    def _index_style(labels):
        # Stronger highlight for the index label of the selected country.
        return ['background-color: #aadfff; font-weight: 500' if label == country else '' for label in labels]

    def _row_style(row):
        # Softer highlight across the whole row of the selected country.
        shade = 'background-color: #ccebff;' if row.name == country else ''
        return [shade for _ in row]

    try:
        styled = df.style
        styled.apply_index(_index_style, axis = 0)
        styled.apply(_row_style, axis = 1)
        styled.set_table_styles([{'selector': 'td:hover', 'props': [('background-color', '#ddfdff')]}])

        # One tooltip text per column, repeated on every row.
        medians = {column: 'Column median: ' + str(df.loc[:, column].median()) for column in df.columns}
        styled.set_tooltips(pd.DataFrame(medians, index = df.index))

        # Display a short descriptive title and the Dataframe.
        display(country + ' belongs to Cluster ' + str(cluster_number) + '. This Cluster contains a total of ' + str(df.shape[0]) + ' countries.')
        display(styled)
    except Exception:
        print('No indicators available for this country.')
        return None

def venn_group(cluster_df, country):
    """Draw a three-set Venn diagram of the countries that share a cluster with `country`.

    For each of the three specific indicator groups, the set contains every
    country whose cluster number in that group equals the one of `country`.
    Each region of the diagram is labelled with the names of its countries.

    Parameters
    ----------
    cluster_df : frame indexed by country with one cluster-number column per group.
    country : index label of the country of interest.

    Prints a short message and returns None when the diagram cannot be built
    (for instance, when `country` is not in `cluster_df`).
    """
    def _set_region_names(diagram, region_id, names):
        # venn3 creates no label for an empty region: get_label_by_id returns None there.
        label = diagram.get_label_by_id(region_id)
        if label is not None:
            label.set_text('\n'.join(sorted(names)))

    try:
        # NOTE(review): relies on ind_dict holding exactly four groups, with the
        # catch-all group first and then the economic, equality and
        # socio-demographic groups in that order.
        _, econ_ind, eq_ind, socdem_ind = ind_dict.keys()
        # NOTE(review): a country whose cluster number is the -1 placeholder is
        # matched with every other country carrying -1 in that group -- confirm this is intended.
        _, set_econ, set_eq, set_socdem = (
            set(cluster_df.index[cluster_df[ind] == cluster_df.loc[country, ind]])
            for ind in ind_dict.keys()
        )

        # Size this figure explicitly instead of changing the global rcParams
        # after drawing, which only affected figures created later.
        plt.figure(figsize = (12, 12))
        venn = venn3([set_econ, set_socdem, set_eq], (econ_ind, socdem_ind, eq_ind))

        # Region ids follow the order of the sets passed to venn3: econ, socdem, eq.
        regions = {
            '100': set_econ - set_socdem - set_eq,      # Only econ
            '010': set_socdem - set_econ - set_eq,      # Only socdem
            '001': set_eq - set_econ - set_socdem,      # Only eq
            '110': (set_econ & set_socdem) - set_eq,
            '101': (set_econ & set_eq) - set_socdem,
            '011': (set_socdem & set_eq) - set_econ,
            '111': set_econ & set_socdem & set_eq,
        }
        for region_id, names in regions.items():
            _set_region_names(venn, region_id, names)

        plt.show()

    except Exception:
        print('No indicators available for this country.')
        return None

def plot_time_series(df: pd.DataFrame, mode: str, zone: str, indicator: str, years: tuple, shift: int):
    """Plot the normalised GDP and the normalised indicator of a zone, shifted against each other.

    The rows of `zone` (matched on the `mode` index level) are reduced to one
    median value per year. Both series are cut to windows of equal length that
    are `shift` years apart, normalised with `norm` and drawn against their
    position inside the window (not against the calendar year).
    """
    in_zone = df.index.get_level_values(mode) == zone
    yearly = df.loc[in_zone, [col_gdp, indicator]].groupby(level = col_year).median()

    first_year, last_year = years[0], years[1]
    forward = max(shift, 0)     # positive part of the shift
    backward = min(shift, 0)    # negative part of the shift

    # The GDP window moves with the shift, the indicator window against it.
    gdp_window = yearly.loc[first_year + forward : last_year + backward, [col_gdp]]
    norm_gdp = norm(gdp_window, None)

    ind_window = yearly.loc[first_year - backward : last_year - forward, [indicator]]
    norm_ind = norm(ind_window, None)

    plt.figure(figsize = (8,8))
    for series, colour, label in ((norm_gdp, "red", col_gdp), (norm_ind, "green", indicator)):
        # Dropping the year index aligns both windows by position.
        plt.plot(series.reset_index(drop = True), color = colour, label = label)
    plt.legend(loc = "lower right")
    plt.show()

In [10]:
import itertools
from scipy import stats

def styler_method(df, name, pvalue = None):    
        if pvalue == None:
                pvalue = 0.05
    #try:
        styles = [dict(selector="caption", props=[("background-color", "#98D3FF")])]
        left1 = pd.Series([pvalue], index=['P-value Spearman'])
        left2 = pd.Series([-1], index=['GDP Spearman Corr'])
        left3 = pd.Series([0], index=['GDP Spearman Corr'])
        dfs = df.style.highlight_between(left = left1, right = 1.5, axis = 1, props='color:white; background-color:red;')\
                .highlight_between(left = left2, right = 1.5, axis = 1, props='color:white; background-color:#929bfc;')\
                .highlight_between(left = left3, right = 1.5, axis = 1, props='color:white; background-color:#b3b9ff;')\
                .format('{:,.4f}', subset = ['GDP Spearman Corr'])\
                .format('{:,.12f}', subset = ['P-value Spearman']) \
                .set_caption(name).set_table_styles(styles)\
                .set_table_attributes("style='display:inline'")
    #except:
    #    dfs = 'No indicators have been found for the window dataframe in this range.'
        
        return dfs

def init_highest_table(indicators):
    """Create the zero-filled table that collects the strongest correlations per indicator.

    The result is indexed by 'Indicator' and has four columns: a year range and
    a correlation for the highest positive value, and the same pair for the
    highest negative value. The second year-range column is named
    'Year range ' (with a trailing space) to keep the column names unique.
    Zero is the "nothing found yet" placeholder.
    """
    index = pd.Index(list(indicators), name = "Indicator")

    # The year-range columns use object dtype because they later hold strings
    # such as '2000-2010'; the correlation columns are floats.
    return pd.DataFrame({
        "Year range": pd.Series(0, index = index, dtype = object),
        "Highest positive Spearman corr": pd.Series(0.0, index = index),
        "Year range ": pd.Series(0, index = index, dtype = object),
        "Highest negative Spearman corr": pd.Series(0.0, index = index),
    })

def generate_years_combinations(min_diff: int, min, max):
    """Return every (start, end) pair of years in [min, max] that spans at least `min_diff` years.

    Pairs are ordered as itertools.combinations yields them: by start year,
    then by end year. Narrower ranges are left out.
    """
    # `min` and `max` shadow the builtins, but they are part of the signature.
    return [
        (start, end)
        for start, end in itertools.combinations(range(min, max + 1), 2)
        if end - start >= min_diff
    ]

In [11]:
def globe_corr(df_corr, ind):
    """Show a world choropleth coloured by the column `ind` of `df_corr`.

    `df_corr` is indexed by country name (matched with plotly's
    'country names' location mode). The colour scale is always shown.
    """
    fig = px.choropleth(df_corr, locations = df_corr.index, locationmode='country names',
                        color = ind, projection="natural earth",
                        color_continuous_scale='RdBu',
                        width = 700, height=500)

    fig.update(layout_coloraxis_showscale=True)
    fig.show()

def table_time_window(df, mode, zone, years, threshold, pval, global_caption = '2000-2020'):
    """Display, side by side, the correlation tables of the selected years and of the global range.

    Parameters
    ----------
    df : dataframe whose index has a level named `mode`.
    mode, zone : index level and value used to select the rows.
    years : pair (first year, last year) of the selected window.
    threshold : forwarded to `searchTimeSeries`.
    pval : forwarded to `styler_method` as the P-value bound.
    global_caption : caption of the global table.

    Prints a message and returns None when the year range is reversed.
    """
    # Validate before running the searches.
    if years[0] > years[1]:
        print("Please, select a valid range of years.")
        return None

    def _as_html(table, caption):
        # A table that cannot be styled is replaced by a message.
        try:
            return styler_method(table, caption, pval)._repr_html_()
        except Exception:
            return 'No indicators available for the selected parameters'

    df_zone = df.loc[df.index.get_level_values(mode) == zone]

    # Load the selected year range and the global range.
    df_time = searchTimeSeries(threshold, years[0], years[1], True, df_zone)
    df_global = searchTimeSeries(threshold, years[0], years[1], False, df_zone)

    # Display Data
    space = "\xa0" * 10
    html_time = _as_html(df_time, str(years[0]) + '-' + str(years[1]))
    html_global = _as_html(df_global, global_caption)

    display_html(html_time + space + html_global, raw=True)

def table_high_range(mode, zone, pval):
    """Display, per indicator, the year range with the strongest significant Spearman correlation.

    Every year range of the data that is long enough for `mode` is searched
    (at least 5 years for 'Country', 2 for 'Region'). For each indicator the
    highest positive and the highest negative correlation with a P-value below
    `pval` are kept, together with the range where they were found.

    Reads the module-level `df`, `min_year` and `max_year`.

    Raises
    ------
    ValueError : if `mode` is neither 'Country' nor 'Region'.
    """
    min_diff_by_mode = {'Country': 5, 'Region': 2}
    if mode not in min_diff_by_mode:
        raise ValueError("mode must be 'Country' or 'Region', got " + repr(mode))
    min_diff = min_diff_by_mode[mode]

    pos_corr_col, pos_range_col = "Highest positive Spearman corr", "Year range"
    neg_corr_col, neg_range_col = "Highest negative Spearman corr", "Year range "

    df_zone = df.loc[df.index.get_level_values(mode) == zone]
    indicators = df.columns
    df_highest = init_highest_table(indicators)

    i = 0
    computing_text = "Loading "
    print(computing_text, end="\r")

    # For all the combinations of years...
    for years in generate_years_combinations(min_diff, min_year, max_year):
        i = (i + 1) % 50
        print(computing_text + "." * i, end="\r")

        df_aux = searchTimeSeries(0, years[0], years[1], True, df_zone)
        year_range = str(years[0]) + '-' + str(years[1])

        # Keep only the indicators that are available in this range.
        indicators_inter = list(set(indicators) & set(list(df_aux.index)))

        for indicator in indicators_inter:
            found = df_aux[df_aux.index.get_level_values(0) == indicator]
            corr = found["GDP Spearman Corr"].iloc[0]
            p_value = found["P-value Spearman"].iloc[0]

            # Skip missing or non-significant correlations (a NaN P-value also fails the test).
            if pd.isna(corr) or not (p_value < pval):
                continue

            is_indicator = df_highest.index.get_level_values(0) == indicator
            best_pos = df_highest.loc[is_indicator, pos_corr_col].iloc[0]
            best_neg = df_highest.loc[is_indicator, neg_corr_col].iloc[0]

            if best_pos < corr and corr > 0:
                df_highest.at[indicator, pos_range_col] = year_range
                df_highest.at[indicator, pos_corr_col] = corr
            elif best_neg > corr and corr < 0:
                df_highest.at[indicator, neg_range_col] = year_range
                df_highest.at[indicator, neg_corr_col] = corr

    # Zero is the "nothing found" placeholder: drop untouched rows, show '-' in untouched cells.
    df_highest = df_highest.replace(0, np.nan).dropna(axis=0, how='all').fillna("-")

    # Wipe the progress line before showing the table.
    print("                                                                                    ", end="\r")
    display(df_highest)

In [12]:
# --- Selection widgets -------------------------------------------------------

# Country / Region switch.
dropdown_mode = widgets.Dropdown(options = mode_list, description = 'Select: ')

# Zone to show; its options correspond to the mode selected at creation time.
dropdown_zone = widgets.Dropdown(options = mode_dict[dropdown_mode.value], description = 'Show: ')

# Indicator group and single indicator.
dropdown_group = widgets.Dropdown(options = ind_dict.keys(), description = 'Group: ')
dropdown_ind = widgets.Dropdown(options = indicators_list, description = 'Indicator: ')

# --- Numeric widgets ---------------------------------------------------------

# Range of years, starting on the complete range of the data.
intrangeslider_years = widgets.IntRangeSlider(
    value = [min_year, max_year], min = min_year, max = max_year, step = 1,
    description = 'Years: ',
)

# Shift in years, bounded by the length of the complete range.
intslider_shift = widgets.IntSlider(
    value = 0, min = min_year - max_year + 1, max = max_year - min_year - 1, step = 1,
    description = 'Shifts: '
)

floatslider_threshold = widgets.FloatSlider(
    value = 0.7, min = 0.0, max = 1.0, step = 0.05,
    description = 'Threshold: '
)

floatslider_confidence = widgets.FloatSlider(
    value = 0.95, min = 0.0, max = 1.0, step = 0.05,
    description = 'Confidence: '
)

In [13]:
class WidgetStatus:
    """Snapshot of the widget values plus the dataframes that the plots are drawn from.

    Attributes
    ----------
    mode, zone, ind, ind_group, years, shift, threshold, pval :
        last values taken from the widgets (pval is 1 - confidence).
    df, corr_df, norm_df : dataframes read from the output folder.
    cluster_df : cluster number of every country, one column per group.
    comp_df : components of the selected group, with the cluster as a column.
    clus_num : cluster number of the selected country in the selected group,
        or None when it cannot be determined.
    ind_df : indicator values of the countries in that cluster, or None.
    """

    def __init__(self):

        self.mode = dropdown_mode.value
        self.zone = dropdown_zone.value
        self.ind = dropdown_ind.value
        self.ind_group = dropdown_group.value
        self.years = list(intrangeslider_years.value)
        self.shift = intslider_shift.value
        self.threshold = floatslider_threshold.value
        self.pval = 1 - floatslider_confidence.value

        self.df = pd.read_csv(output_path + file_gold, index_col = [col_country, col_year, col_region])
        self.corr_df = pd.read_csv(output_path + file_corr_spearman, index_col = [col_country])
        self.norm_df = pd.read_csv(output_path + file_norm_df, index_col = [col_country, col_region, col_year])
        self.cluster_df = cluster_df

        self.comp_df = None
        self.ind_df = None
        self.clus_num = None
        self._refresh_cluster()

    def _refresh_cluster(self):
        """Recompute comp_df, clus_num and ind_df for the current zone and group."""
        group = self.ind_group
        if group is None or group not in comp_dict:
            # No usable group: keep the last components, clear the cluster-specific data.
            self.clus_num = None
            self.ind_df = None
            return

        self.comp_df = comp_dict[group].reset_index(col_cluster, drop = False)

        in_zone = self.cluster_df.index.get_level_values(col_country) == self.zone
        matches = self.cluster_df.loc[in_zone, group]
        if len(matches) != 1:
            # The zone is not a country of cluster_df (or is ambiguous).
            self.clus_num = None
            self.ind_df = None
            return
        self.clus_num = matches.item()

        members = cluster_dict.get(group)
        if members is None:
            self.ind_df = None
        else:
            self.ind_df = members.loc[members[col_cluster] == self.clus_num].drop(col_cluster, axis = 'columns')

    def update_zone(self, zone):
        """Store the new zone and, in country mode, refresh the cluster data for it."""
        self.zone = zone
        if self.mode == col_country:
            # The cluster number is looked up in the column of the selected group.
            self._refresh_cluster()

# Single shared state object, read and updated by update_plots.
status = WidgetStatus()

In [14]:
def update_plots(tab = 'General', mode = 'Country', zone = 'Afghanistan', group = 'All indicators', ind = 'Population', years = (2000, 2020), shift = 0, threshold = 0.7, confidence = 0.95):
    """Synchronise the shared `status` with the widget values and render one tab.

    Parameters
    ----------
    tab : 'General', 'Time Series' or 'Cluster'; selects what is rendered.
    mode, zone : 'Country' / 'Region' switch and the selected zone.
    group, ind : selected indicator group and indicator. In country mode a
        change of indicator replaces `group` by the group the indicator belongs to.
    years : pair (first year, last year).
    shift, threshold : forwarded to the time-series views.
    confidence : stored as the P-value bound 1 - confidence.
    """
    def _refresh_cluster_state(group_name):
        # Components, cluster number and cluster members of the current zone in `group_name`.
        status.comp_df = comp_dict[group_name].reset_index(col_cluster, drop = False)
        in_zone = status.cluster_df.index.get_level_values(col_country) == status.zone
        matches = status.cluster_df.loc[in_zone, group_name]
        if len(matches) != 1:
            # The zone has no (unique) row in cluster_df.
            status.clus_num = None
            status.ind_df = None
            return
        status.clus_num = matches.item()
        members = cluster_dict[group_name]
        status.ind_df = members.loc[members[col_cluster] == status.clus_num].drop(col_cluster, axis = 'columns')

    cluster_stale = False

    if ind != status.ind:
        # Track the selection in every mode so that no view uses a stale indicator.
        status.ind = ind
        if mode == col_region:
            status.ind_group = None
        else:
            group = group_of(ind)
            if status.ind_group != group:
                status.ind_group = group
                cluster_stale = group is not None

    if mode is not None and mode != status.mode:
        dropdown_zone.options = mode_dict[mode]
        status.mode = mode

    if zone != status.zone:
        status.update_zone(zone)

    if group != status.ind_group:
        keep_ind = status.ind if (status.ind in ind_dict[group]) else False

        dropdown_ind.options = ind_dict[group]

        if keep_ind:
            status.ind = keep_ind
            dropdown_ind.value = keep_ind

        # Record the selected group: the cluster data below must belong to it,
        # and this branch must not run again on every later call.
        status.ind_group = group
        cluster_stale = True

    if cluster_stale and mode == col_country and status.ind_group is not None:
        _refresh_cluster_state(status.ind_group)

    # Plain values: store them unconditionally.
    status.years[0] = years[0]
    status.years[1] = years[1]
    status.shift = shift
    status.threshold = threshold
    status.pval = (1 - confidence)

    # GENERAL
    if tab == 'General':
        globe_corr(status.corr_df, status.ind)

    # TIME SERIES
    if tab == 'Time Series':
        table_time_window(status.df, status.mode, status.zone, status.years, status.threshold, status.pval)
        plot_time_series(status.df, status.mode, status.zone, status.ind, status.years, status.shift)
        table_high_range(status.mode, status.zone, status.pval)

    # CLUSTER
    if tab == 'Cluster':
        cluster_title = status.ind_group if status.ind_group is not None else 'All indicators'
        plot_cluster_map (status.comp_df, cluster_title)
        # The cluster number is passed so the caption does not fall back to its default of 0.
        table_cluster (status.ind_df, status.ind_group, status.zone, status.clus_num)
        venn_group (status.cluster_df, status.zone)

In [15]:
# One output area per tab.
outGeneral = widgets.Output()
outTime = widgets.Output()
outCluster = widgets.Output()

tab = widgets.Tab(children = [outGeneral, outTime, outCluster])

tab_titles = ['General', 'Time Series', 'Cluster']
for tab_position, tab_title in enumerate(tab_titles):
    tab.set_title(tab_position, tab_title)

display(tab)

# Every pane is driven by the same widgets. Parameters left out of interact()
# would get widgets generated from the defaults of update_plots (the `years`
# tuple would become a plain integer slider), so all of them are passed.
shared_controls = dict(
    mode = dropdown_mode, zone = dropdown_zone, group = dropdown_group, ind = dropdown_ind,
    years = intrangeslider_years, shift = intslider_shift,
    threshold = floatslider_threshold, confidence = floatslider_confidence,
)

for tab_output, tab_title in zip([outGeneral, outTime, outCluster], tab_titles):
    with tab_output:
        # fixed() pins each pane to its own view. Passing the title of the
        # currently selected tab gave every pane the 'General' view.
        widgets.interact(update_plots, tab = widgets.fixed(tab_title), **shared_controls)
        print()

Tab(children=(Output(), Output(), Output()), _titles={'0': 'General', '1': 'Time Series', '2': 'Cluster'})

In [None]:
#tab.get_title(tab.selected_index)

In [None]:
# Scratch check: title of the tab that is currently selected.
tab.get_title(tab.selected_index)

'General'