# Window Time Series

This notebook analyses the correlation between the indicators and the GDP through a time window. This consists of selecting a year range instead of the whole available data, which allows inspecting specific periods of time and their impact. 

The structure of the notebook is the following:
- Time window country/region: see the indicators correlation of the selected year range for the country/region
- Highest correlations country/region: finds the highest positive/negative correlations for the selected country/region.
- Plot for the countries: to compare their GDP with the indicators

We will start importing all the libraries and necessary dataframes.

In [28]:
import os
import pandas as pd
import ipywidgets as widgets
from ipywidgets import Layout
from ipywidgets import AppLayout, Button, GridspecLayout
import numpy as np
from ipywidgets import interact, interact_manual
from math import nan
from IPython.display import display_html
import itertools
from scipy import stats
import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings("ignore")
PVALUE_VAR = 0.05
%store -r PVALUE_VAR

from Project.Utils.visualize import  search, searchTimeSeries, normalize_by_country

# Folder where the generated dataframes are stored (with and without trailing slash).
write_path = os.getcwd() + '/Output' #Path to the folder you want to store the dataframes
output_path = os.getcwd() + '/Output/'

# Names of the key columns of the gold dataframe.
col_country = 'Country'
col_year = 'Year'
col_region = 'Region'

# Flat dataframe as read from disk, plus a version indexed by (Country, Year, Region).
df_gold = pd.read_csv(output_path + 'GoldDataframe.csv')
df_gold_index = df_gold.set_index([col_country, col_year, col_region])

# Sorted options offered by the zone selectors.
country_list = np.sort(df_gold[col_country].unique()).tolist()
region_list = np.sort(df_gold[col_region].unique()).tolist()

# First and last year present in the data.
min_year = min(df_gold_index.index.unique(level=col_year))
max_year = max(df_gold_index.index.unique(level=col_year))

### Styler

A simple code block used to improve the visualization of the dataframes. It does not handle errors itself: the callers catch the exception raised when no data is available to display.

In [2]:
def styler_method(df, name, pvalue = None):    
        if pvalue == None:
                pvalue = PVALUE_VAR
    #try:
        styles = [dict(selector="caption", props=[("background-color", "#98D3FF")])]
        left1 = pd.Series([pvalue], index=['P-value Spearman'])
        left2 = pd.Series([-1], index=['GDP Spearman Corr'])
        left3 = pd.Series([0], index=['GDP Spearman Corr'])
        dfs = df.style.highlight_between(left = left1, right = 1.5, axis = 1, props='color:white; background-color:red;')\
                .highlight_between(left = left2, right = 1.5, axis = 1, props='color:white; background-color:#929bfc;')\
                .highlight_between(left = left3, right = 1.5, axis = 1, props='color:white; background-color:#b3b9ff;')\
                .format('{:,.4f}', subset = ['GDP Spearman Corr'])\
                .format('{:,.12f}', subset = ['P-value Spearman']) \
                .set_caption(name).set_table_styles(styles)\
                .set_table_attributes("style='display:inline'")
    #except:
    #    dfs = 'No indicators have been found for the window dataframe in this range.'
        
        return dfs

## Time Window

See the correlation of the indicators for the selected year range of a country or region. Select the zone and the year range: the results for the selected window are displayed on the left side, while the right side shows the results for the whole data. The only difference between country and region is that for a region the dataframe is normalized, because it contains several countries. Having more countries in the data input yields a better statistical result (lower p-values) compared to the country widget.

It is recommended leaving the 'Threshold' parameter at 0.7 or higher to have statistical significance results.

**Don't select a very narrow year range, as this will generate a result with little input data and higher p-values. A p-value > 0.05 is not considered to have statistical significance.**

This widget allows comparisons for specific years, for example the impact of a crisis on the indicators and the GDP.

In [3]:
import time

def load_by_region(region, df_param):
    """Return the normalized rows of ``df_param`` that belong to ``region``.

    Args:
        region: Value to match against the 'Region' index level.
        df_param: DataFrame whose index has (at least) the levels 'Region',
            'Year' and 'Country'.

    Returns:
        The selected rows after ``normalize_by_country``, sorted by the
        'Year' and 'Country' index levels. The elapsed time is printed.
    """
    # perf_counter is monotonic, so the measured duration cannot be distorted
    # by system clock adjustments as it could with time.time().
    start = time.perf_counter()

    in_region = df_param.index.get_level_values('Region') == region
    df = normalize_by_country(df_param.loc[in_region])
    df.sort_index(level=['Year', 'Country'], inplace=True)

    elapsed = time.perf_counter() - start
    print("[{:.2f} seconds] Normalized Region {}".format(elapsed, region))
    return df

In [4]:
# Precompute contries by region normalized
# Normalize every region once up front so the widgets can reuse the result.
countries_by_region = {
    region: load_by_region(region, df_gold_index) for region in region_list
}

[3.26 seconds] Normalized Region East Asia and Pacific
[6.32 seconds] Normalized Region Europe and Central Asia
[3.51 seconds] Normalized Region Latin America and Caribbean
[1.78 seconds] Normalized Region Middle East and North Africa
[0.25 seconds] Normalized Region North America
[0.92 seconds] Normalized Region South Asia
[4.82 seconds] Normalized Region Sub-Saharan Africa


In [5]:
class WidgetTimeWindowCountryStatus:
    """State remembered by the time-window widget between two callbacks."""

    def __init__(self):
        # No zone has been selected yet, hence no data is cached either.
        self.data = None
        self.zone = None
        # The widget starts in country mode.
        self.filter_by = "Country"


# Single shared instance read and updated by the time-window callback.
status_widget = WidgetTimeWindowCountryStatus()

In [6]:
def _styled_html_or_message(df, caption):
    """Return the styled HTML of ``df``, or a fallback text if styling fails."""
    try:
        return styler_method(df, caption)._repr_html_()
    except Exception:
        # Styling fails when there is nothing suitable to show; keep the
        # widget usable instead of surfacing a traceback.
        return 'No indicators available for the selected parameters'


def time_window_zone(by, zone, threshold, years):
    """Display the correlation tables of a zone for a year window.

    Callback of the time-window widget. The table for the selected window is
    rendered first and the table for the whole data second, side by side.

    Args:
        by: 'Country' or 'Region'; selects how ``zone`` is interpreted.
        zone: Name of the selected country or region.
        threshold: Forwarded unchanged to ``searchTimeSeries``.
        years: Pair ``(first_year, last_year)`` of the selected window.

    Returns:
        None. The result is shown with ``display_html``.
    """
    # Update Widget Status
    if by in ('Country', 'Region'):
        if status_widget.filter_by != by:
            # The mode changed: only swap the zone options and stop. The
            # dropdown then holds a zone of the new kind (presumably this
            # triggers the callback again with a valid zone).
            status_widget.filter_by = by
            zone_drop.options = country_list if by == 'Country' else region_list
            return
        if status_widget.zone != zone:
            if by == 'Country':
                # Search for the entries of the country.
                country_level = df_gold_index.index.get_level_values('Country')
                status_widget.data = df_gold_index.loc[country_level == zone]
            else:
                # Regions were normalized beforehand; reuse the cached frame.
                status_widget.data = countries_by_region[zone]
            status_widget.zone = zone

    # Reject an invalid window before running the expensive searches.
    if years[0] > years[1]:
        print("Please, select a valid range of years.")
        return

    # Update Widget Data
    df_zone = status_widget.data

    # Load the selected year range and the global range (the boolean flag
    # presumably toggles whether the window is applied - see searchTimeSeries).
    df_time = searchTimeSeries(threshold, years[0], years[1], True, df_zone)
    df_global = searchTimeSeries(threshold, years[0], years[1], False, df_zone)

    # Display Data
    # NOTE(review): the caption of the global table is hard-coded; confirm it
    # matches the year span of the dataset.
    html_time = _styled_html_or_message(df_time, str(years[0]) + '-' + str(years[1]))
    html_global = _styled_html_or_message(df_global, '2000-2020')

    space = "\xa0" * 10
    display_html(html_time + space + html_global, raw=True)
    
# Selector between country mode and region mode.
by_drop = widgets.Dropdown(
    options=['Country', 'Region'],
    description='By: ',
)

# Year window; it starts covering the whole data. The bounds reuse the
# min_year / max_year values computed when the data was loaded instead of
# rescanning the 'Year' column for every argument.
intslider = widgets.IntRangeSlider(
    value=[min_year, max_year],
    min=min_year,
    max=max_year,
    step=1,
    description='Years: ',
)

# Zone selector; its options are replaced by the callback when the mode changes.
zone_drop = widgets.Dropdown(
    options=country_list,
    value='Afghanistan',
    description='Zone: ',
)

# Threshold forwarded to the correlation search.
floatslider = widgets.FloatSlider(
    value=0.7,
    min=0,
    max=1.0,
    step=0.05,
    description='Threshold:',
)

widgets.interact(time_window_zone, by=by_drop, zone=zone_drop, threshold=floatslider, years=intslider)

interactive(children=(Dropdown(description='By: ', options=('Country', 'Region'), value='Country'), Dropdown(d…

<function __main__.time_window_zone(by, zone, threshold, years)>

# Highest correlations

Finds the highest positive/negative correlations for the selected zone. Each correlation found is displayed next to the year range where it was located. This widget allows detecting periods of time with important events that changed the correlation of an indicator. The widget only keeps the correlations whose p-value is lower than 0.05.

In the widget for the countries a lot of maximum correlations are found; this happens when there is a low amount of input data.

In the first place, define a method that creates the table where all the results will be stored:

In [7]:
def init_highest_table(indicators):
    """Create the empty result table of the highest-correlation search.

    Args:
        indicators: Iterable with the indicator names; one row is created per
            indicator.

    Returns:
        DataFrame indexed by 'Indicator' with the columns 'Year range',
        'Highest positive Spearman corr', 'Year range ' (trailing space, for
        the negative correlation) and 'Highest negative Spearman corr', all
        initialised to 0.
    """
    # The index is built from a list: a set has no defined order and is
    # rejected as index/columns by recent pandas versions.
    index = pd.Index(list(indicators), name="Indicator")
    size = len(index)

    df_highest = pd.DataFrame(
        {
            "Year range": [0] * size,
            "Highest positive Spearman corr": [0.0] * size,
            "Year range ": [0] * size,
            "Highest negative Spearman corr": [0.0] * size,
        },
        index=index,
    )

    # The year-range columns later receive strings such as '2000-2005', so
    # they are stored as object to avoid assigning text into an int column.
    return df_highest.astype({"Year range": object, "Year range ": object})

Iterable: a list of all the year-range combinations that span at least a minimum number of years (five for countries, two for regions).

Indicators: a list of all the indicators.

In [8]:
# Every column of the indexed gold dataframe is an indicator, except the GDP itself.
indicators = df_gold_index.columns.tolist()
indicators.remove('GDP')


In [12]:
def generate_years_combinations(min_diff: int, min, max):
    """Return every (start, end) year pair spanning at least ``min_diff`` years.

    Args:
        min_diff: Minimum value of ``end - start`` for a pair to be kept.
        min: First year of the range, inclusive. (The name shadows the
            builtin; it is kept for backward compatibility.)
        max: Last year of the range, inclusive.

    Returns:
        List of ``(start, end)`` tuples with ``start < end``, in the order
        produced by ``itertools.combinations``.
    """
    years = range(min, max + 1)
    # Filter while building the list instead of removing items one by one,
    # which rescanned the whole list for every discarded pair.
    return [
        (start, end)
        for start, end in itertools.combinations(years, 2)
        if end - start >= min_diff
    ]

In [13]:
def table_high_country(by, zone):
    """Display, per indicator, the year ranges with the strongest GDP correlation.

    Callback of the highest-correlation widget. Every year range returned by
    ``generate_years_combinations`` is evaluated with ``searchTimeSeries`` and,
    for each indicator, the highest positive and the lowest negative Spearman
    correlation whose p-value is below ``PVALUE_VAR`` are kept together with
    the year range where they were found.

    Args:
        by: 'Country' or 'Region'; selects how ``zone`` is interpreted.
        zone: Name of the selected country or region.

    Returns:
        None. The resulting table is shown with ``display``.
    """
    if by == 'Country':
        # Same mode-switch detection as the plot widget: compare the options
        # themselves rather than only their number.
        if list(zone_drop_high.options) != country_list:
            zone_drop_high.options = country_list
            return
        # Search for the entries of the country.
        df_zone = df_gold_index.loc[df_gold_index.index.get_level_values('Country') == zone]
        min_diff = 5

    elif by == 'Region':
        if list(zone_drop_high.options) != region_list:
            zone_drop_high.options = region_list
            return
        # Regions were normalized beforehand; reuse the cached frame.
        df_zone = countries_by_region[zone]
        min_diff = 2

    df_highest = init_highest_table(indicators)
    positive_col = "Highest positive Spearman corr"
    negative_col = "Highest negative Spearman corr"

    computing_text = "Loading "
    print(computing_text, end="\r")

    # For all the combinations of years...
    year_ranges = generate_years_combinations(min_diff, min_year, max_year)
    for step, years in enumerate(year_ranges, start=1):
        # Progress line: a run of dots that wraps around every 50 steps.
        print(computing_text + "." * (step % 50), end="\r")

        year_range = str(years[0]) + '-' + str(years[1])
        df_aux = searchTimeSeries(0, years[0], years[1], True, df_zone)

        # Only the indicators available for this year range are evaluated.
        for indicator in set(indicators) & set(df_aux.index):
            is_indicator = df_aux.index.get_level_values(0) == indicator
            # .iloc[0] is explicit positional access to the first match.
            corr = df_aux.loc[is_indicator, "GDP Spearman Corr"].iloc[0]
            p_value = df_aux.loc[is_indicator, "P-value Spearman"].iloc[0]

            # NaN never compares equal (or unequal in a useful way) to
            # anything, so missing values must be tested with pd.isna.
            if pd.isna(corr) or pd.isna(p_value) or p_value >= PVALUE_VAR:
                continue

            if corr > 0 and corr > df_highest.at[indicator, positive_col]:
                df_highest.at[indicator, "Year range"] = year_range
                df_highest.at[indicator, positive_col] = corr
            elif corr < 0 and corr < df_highest.at[indicator, negative_col]:
                df_highest.at[indicator, "Year range "] = year_range
                df_highest.at[indicator, negative_col] = corr

    # Rows still holding only the initial zeros found nothing: drop them and
    # show '-' where just one of the two correlations is missing.
    df_highest = df_highest.replace(0, nan).dropna(axis=0, how='all').fillna("-")

    # Wipe the progress line.
    print("                                                                                    ", end="\r")

    display(df_highest)

# Selector between country mode and region mode.
by_drop_high = widgets.Dropdown(description='By: ', options=['Country', 'Region'])

# Zone selector; the callback replaces its options when the mode changes.
zone_drop_high = widgets.Dropdown(
    description='Zone:',
    options=country_list,
    value='Afghanistan',
)

widgets.interact(table_high_country, by=by_drop_high, zone=zone_drop_high)


interactive(children=(Dropdown(description='By: ', options=('Country', 'Region'), value='Country'), Dropdown(d…

<function __main__.table_high_country(by, zone)>

This widget below has been used to check all the results throughout the programming of this notebook. It is also a great tool to see the fluctuations of the GDP and the indicators along the years. It also prints the spearman correlation and the p-value. 

**Remember: the correlation must be higher than 0.7 to be considered correlated and the p-value must be lower than 0.05 to have statistical significance.**

In [31]:

def plot_widget(df_gold=None, countries=None):
    """Build the callback used by the plotting widget.

    Args:
        df_gold: Optional dataframe indexed like ``df_gold_index`` to plot
            from. When omitted, the module-level ``df_gold_index`` is used.
            It is no longer written back to the global.
        countries: Optional mapping from region name to its dataframe. When
            omitted, the module-level ``countries_by_region`` is used.

    Returns:
        The function ``plot_year_range(by, zone, indicator, years)``.
    """

    def plot_year_range(by, zone, indicator, years):
        """Print the Spearman correlation and plot GDP against ``indicator``."""
        # Defaults are resolved here with 'is None': comparing a DataFrame
        # with '!= None' has no single truth value, and assigning
        # 'countries_by_region' in the outer function hid the global.
        data = df_gold_index if df_gold is None else df_gold
        regions = countries_by_region if countries is None else countries

        if by == 'Country':
            if list(zone_drop_plot.options) != country_list:
                zone_drop_plot.options = country_list
                return
            df_zone = data.loc[data.index.get_level_values('Country') == zone]

        elif by == 'Region':
            if list(zone_drop_plot.options) != region_list:
                zone_drop_plot.options = region_list
                return
            df_zone = regions[zone]

        # Keep only the rows inside the selected year window.
        year_values = df_zone.index.get_level_values("Year")
        df_zone = df_zone.loc[(year_values >= years[0]) & (year_values <= years[1])]

        # The correlation is computed before the data is normalized.
        spear = stats.spearmanr(df_zone[indicator], df_zone['GDP'])
        df_zone = normalize_by_country(df_zone)

        # In case by is region, the countries are grouped by year (median).
        in_zone = df_zone.index.get_level_values(by) == zone
        df_zone = df_zone.loc[in_zone, ['GDP', indicator]].groupby(level='Year').median()

        print(spear)
        plt.figure(figsize=(6, 6))
        plt.plot(df_zone.index.get_level_values("Year"), df_zone["GDP"], color="red", label='GDP')
        plt.plot(df_zone.index.get_level_values("Year"), df_zone[indicator], color="green", label=indicator)
        plt.legend(loc="lower right")

    return plot_year_range
    
# Selector between country mode and region mode.
by_drop_plot = widgets.Dropdown(
    options=['Country', 'Region'],
    description='By: ',
)

# Zone selector; the callback replaces its options when the mode changes.
zone_drop_plot = widgets.Dropdown(
    options=country_list,
    value='Afghanistan',
    description='Country: ',
)

indicator_drop_plot = widgets.Dropdown(
    options=sorted(indicators),
    value='AgriShareGDP',
    description='Indicator: :',
)

# Year window; it starts covering the whole data. The bounds reuse the
# min_year / max_year values computed when the data was loaded instead of
# rescanning the 'Year' column for every argument.
intslider_plot = widgets.IntRangeSlider(
    value=[min_year, max_year],
    min=min_year,
    max=max_year,
    step=1,
    description='Years:',
)

# TODO By Region: Say Y axis is Qualitative (Not real values but Normalized to observe evolution vs GDP - Tendency)

widgets.interact(plot_widget(), by=by_drop_plot, zone=zone_drop_plot, indicator=indicator_drop_plot, years=intslider_plot)

interactive(children=(Dropdown(description='By: ', options=('Country', 'Region'), value='Country'), Dropdown(d…

<function __main__.plot_widget.<locals>.plot_year_range(by, zone, indicator, years)>