# Window Time Series

In this notebook, the relationship between the indicators and the GDP is analysed through a time window. This consists of selecting a year range instead of the whole available data, which makes it possible to look at specific periods of time and their impact. 

The structure of the notebook is the following:
- Time window country/region: see the indicators correlation of the selected year range for the country/region
- Highest correlations country/region: finds the highest positive/negative correlations for the selected country/region.

We will start by importing all the libraries and loading the necessary dataframes.

In [1]:
import os
import pandas as pd
import ipywidgets as widgets
from ipywidgets import Layout
from ipywidgets import AppLayout, Button, GridspecLayout
from ipywidgets import interact, interact_manual
from math import nan
from IPython.display import display_html
import itertools
from scipy import stats
import warnings
from sklearn.preprocessing import normalize
import matplotlib.pyplot as plt
warnings.filterwarnings("ignore")
%store -r PVALUE_VAR

from Project.Utils.visualize import  search, searchTimeSeries, norm

# Folder where the dataframes are stored (no trailing slash).
write_path = f"{os.getcwd()}/Output"
# Same folder with a trailing slash, ready to be prefixed to a file name.
output_path = f"{write_path}/"

# Names of the identifying columns.
col_country, col_year, col_region = 'Country', 'Year', 'Region'

# Full dataset, plus a copy of it indexed by (Country, Year, Region).
df_gold = pd.read_csv(f"{output_path}GoldDataframe.csv")
df_gold_index = df_gold.set_index([col_country, col_year, col_region])

# NOTE(review): the variable is named "spearman" but the file that is read is
# 'Corr_DF_pearson.csv' -- confirm which correlation is intended here.
corr_df_spearman = pd.read_csv(f"{output_path}Corr_DF_pearson.csv", index_col=col_country)



### Styler

A simple code block used to improve the visualization of the dataframes. It also catches the exception raised when no data is available to display.

In [2]:
def styler_method(df, name):
    """Return a styled view of a correlation dataframe, or a fallback message.

    The styling highlights 'P-value Spearman' cells between ``PVALUE_VAR`` and
    1.5 in red, and 'GDP Spearman Corr' cells between -1 and 1.5 (colour
    #929bfc) and between 0 and 1.5 (colour #b3b9ff, declared afterwards).
    Correlations are formatted with 4 decimals and p-values with 12.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to style. It must expose the 'GDP Spearman Corr' and
        'P-value Spearman' columns.
    name : str
        Caption shown above the table.

    Returns
    -------
    pandas Styler or str
        The styled table, or an explanatory message when the styling could
        not be built (for example when ``df`` has no data to display).
    """
    try:
        styles = [dict(selector="caption", props=[("background-color", "#98D3FF")])]
        # Lower bounds are labelled with a column name; presumably this limits
        # each highlight rule to that column -- confirm against pandas docs.
        left1 = pd.Series([PVALUE_VAR], index=['P-value Spearman'])
        left2 = pd.Series([-1], index=['GDP Spearman Corr'])
        left3 = pd.Series([0], index=['GDP Spearman Corr'])
        dfs = (
            df.style
            .highlight_between(left=left1, right=1.5, axis=1, props='color:white; background-color:red;')
            .highlight_between(left=left2, right=1.5, axis=1, props='color:white; background-color:#929bfc;')
            .highlight_between(left=left3, right=1.5, axis=1, props='color:white; background-color:#b3b9ff;')
            .format('{:,.4f}', subset=['GDP Spearman Corr'])
            .format('{:,.12f}', subset=['P-value Spearman'])
            .set_caption(name)
            .set_table_styles(styles)
            .set_table_attributes("style='display:inline'")
        )
    except Exception:
        # Deliberate best-effort: any styling failure becomes a message.
        # ``Exception`` (instead of a bare ``except``) lets KeyboardInterrupt
        # and SystemExit propagate.
        dfs = 'No indicators have been found for the window dataframe in this range.'

    return dfs

## Time Window Country

See the indicator correlations of the selected year range for the country. Select the country and the year range: the results for the selected window are displayed on the left side, while the right side shows the table computed over the whole data. It is recommended to leave the 'Threshold' parameter at 0.7 or higher to obtain statistically significant results.

**Don't select a very narrow year range, as this will generate a result with little input data and higher p-values. A p-value > 0.05 is not considered statistically significant.**

This widget makes it possible to compare specific years, for example to see the impact of a crisis on the indicators and the GDP.

In [None]:
def load_by_region(region):
    """Return the rows of ``df_gold_index`` for ``region`` after ``norm``.

    The rows whose 'Region' index level equals ``region`` are selected,
    passed through ``norm`` and sorted by the 'Year' and 'Country' index
    levels.
    """
    in_region = df_gold_index.index.get_level_values('Region') == region
    region_df = norm(df_gold_index.loc[in_region])
    region_df.sort_index(level=['Year', 'Country'], inplace=True)
    return region_df

In [22]:
def timeWindowCountry(By, Zone, Threshold , Years):
    """Display the windowed and the global correlation tables side by side.

    Parameters
    ----------
    By : str
        'Region' to analyse a region; any other value is treated as a country.
    Zone : str
        Name of the region or country to analyse.
    Threshold : float
        Forwarded to ``searchTimeSeries`` as its first argument.
    Years : sequence of two ints
        First and last year of the window.

    The two tables are rendered through ``display_html``; nothing is returned.
    """
    if By == 'Region':
        # Keep the zone dropdown in sync with the selected mode.
        zone_drop.options = sorted(set(df_gold['Region'].tolist()))
        # Search for the entries of the region and normalize.
        df_zone = load_by_region(Zone)
    else:  # By country
        zone_drop.options = sorted(set(df_gold['Country'].tolist()))
        # Search for the entries of the country.
        df_zone = df_gold_index.loc[df_gold_index.index.get_level_values('Country') == Zone]

    # Reject an inverted range before running the searches, whose results
    # would be discarded anyway.
    if Years[0] > Years[1]:
        print("Please, select a valid range of years.")
        return None

    # Load the selected year range and the global range (the boolean flag
    # presumably switches between both -- confirm in searchTimeSeries).
    df_time = searchTimeSeries(Threshold, Years[0], Years[1], True, df_zone)
    df_global = searchTimeSeries(Threshold, Years[0], Years[1], False, df_zone)

    space = "\xa0" * 10
    fallback = 'No indicators available for the selected parameters'

    # styler_method may return a plain string, which has no _repr_html_;
    # that case (and any rendering error) falls back to the message.
    try:
        html_time = styler_method(df_time, str(Years[0]) + '-' + str(Years[1]))._repr_html_()
    except Exception:
        html_time = fallback

    # NOTE(review): the caption of the global table is hard-coded.
    try:
        html_global = styler_method(df_global, '2000-2020')._repr_html_()
    except Exception:
        html_global = fallback

    display_html(html_time + space + html_global, raw=True)
    
# Year bounds of the dataset; they define both the limits of the slider and
# its initial selection.
_first_year = min(set(df_gold['Year']))
_last_year = max(set(df_gold['Year']))

# Switch between a per-country and a per-region analysis.
by_radio = widgets.Dropdown(options=['Country', 'Region'], description='By: ')

# Year window to analyse.
intslider = widgets.IntRangeSlider(
    value=[_first_year, _last_year],
    min=_first_year,
    max=_last_year,
    step=1,
    description='Years:',
)

# Zone to analyse; it starts with the list of countries.
zone_drop = widgets.Dropdown(
    options=sorted(set(df_gold['Country'].tolist())),
    value='Afghanistan',
    description='Zone:',
)

# Threshold handed to timeWindowCountry.
floatslider = widgets.FloatSlider(
    value=0.7,
    min=0,
    max=1.0,
    step=0.05,
    description='Threshold:',
)

widgets.interact(
    timeWindowCountry,
    By=by_radio,
    Zone=zone_drop,
    Threshold=floatslider,
    Years=intslider,
)

interactive(children=(Dropdown(description='By: ', options=('Country', 'Region'), value='Country'), Dropdown(d…

<function __main__.timeWindowCountry(By, Zone, Threshold, Years)>

## Time Window Region

The same as before but for the region. The only difference consists in normalizing the dataframe due to having different countries.

Having more countries in the input data makes it possible to obtain a better statistical result (lower p-values) compared to the country widget.


In [11]:
def timeWindowRegion(Zone, Threshold , Years):
    """Display the windowed and the global correlation tables of a region.

    Parameters
    ----------
    Zone : str
        Name of the region to analyse.
    Threshold : float
        Forwarded to ``searchTimeSeries`` as its first argument.
    Years : sequence of two ints
        First and last year of the window.

    The two tables are rendered through ``display_html``; nothing is returned.
    """
    # Reject an inverted range before doing any work; the results would be
    # discarded anyway.
    if Years[0] > Years[1]:
        print("Please, select a valid range of years.")
        return None

    # Search for the entries of the region and normalize.
    df_region = load_by_region(Zone)

    # Load the selected year range and the global range (the boolean flag
    # presumably switches between both -- confirm in searchTimeSeries).
    df_time = searchTimeSeries(Threshold, Years[0], Years[1], True, df_region)
    df_global = searchTimeSeries(Threshold, Years[0], Years[1], False, df_region)

    dataframe_name = str(Years[0]) + '-' + str(Years[1])
    space = "\xa0" * 10
    fallback = 'No indicators available for the selected parameters'

    # styler_method may return a plain string, which has no _repr_html_;
    # that case (and any rendering error) falls back to the message.
    try:
        html_time = styler_method(df_time, dataframe_name)._repr_html_()
    except Exception:
        html_time = fallback

    # NOTE(review): the caption of the global table is hard-coded.
    try:
        html_global = styler_method(df_global, '2000-2020')._repr_html_()
    except Exception:
        html_global = fallback

    display_html(html_time + space + html_global, raw=True)
    
# Year window to analyse. This rebinds the name ``intslider``.
intslider = widgets.IntRangeSlider(
    value=[min(set(df_gold['Year'])), max(set(df_gold['Year']))],
    min= min(set(df_gold['Year'])),
    max= max(set(df_gold['Year'])),
    step=1,
    description='Years:',
)

# Region to analyse. The label now says 'Region:' because the options are
# regions, not countries.
region_drop = widgets.Dropdown(
    options=sorted(set(df_gold['Region'].tolist())),
    value='South Asia',
    description='Region:',
)

# Threshold handed to timeWindowRegion. This rebinds the name ``floatslider``.
floatslider = widgets.FloatSlider(
    value=0.7,
    min=0,
    max=1.0,
    step=0.05,
    description='Threshold:',
)

widgets.interact(timeWindowRegion, Zone = region_drop, Threshold = floatslider, Years = intslider)

interactive(children=(Dropdown(description='Country:', index=5, options=('East Asia and Pacific', 'Europe and …

<function __main__.timeWindowRegion(Zone, Threshold, Years)>

# Highest correlations country

Finds the highest positive/negative correlations for the selected country. Each correlation found is displayed next to the year range in which it was located. This widget makes it possible to detect periods of time with important events that changed the correlation of an indicator. The widget also keeps only the results whose p-value is lower than 0.05.

In the widget for the countries a lot of maximum correlations are found; this happens because there is not a lot of data to generate the dataframes.

First, define a method that creates the table where all the results will be stored:

In [6]:
def generate_table():
    """Build the empty results table, one row per entry of ``indicators``.

    Returns
    -------
    pandas.DataFrame
        Indexed by "Indicator", with the columns "Year range",
        "Highest positive Spearman corr", "Year range " (trailing space) and
        "Highest negative Spearman corr", all filled with 0.
    """
    pos_range, pos_corr = "Year range", "Highest positive Spearman corr"
    neg_range, neg_corr = "Year range ", "Highest negative Spearman corr"

    # The columns are passed as a list: recent pandas versions reject a set
    # as ``columns`` in the DataFrame constructor.
    df_highest = pd.DataFrame(
        0,
        index=pd.Index(list(indicators), name="Indicator"),
        columns=[pos_range, pos_corr, neg_range, neg_corr],
    )

    # The year-range cells later receive strings and the correlation cells
    # floats, so give the columns matching dtypes up front instead of int.
    return df_highest.astype(
        {pos_range: object, pos_corr: float, neg_range: object, neg_corr: float}
    )

Iterable: a list of all the year-range combinations that span at least five years.

Indicators: a list of all the indicators.

In [7]:
# Every (start, end) pair of years of the dataset with start < end.
iterable_aux = list(
    itertools.combinations(
        range(min(set(df_gold['Year'])), max(set(df_gold['Year'])) + 1), 2
    )
)

# Keep only the pairs whose end lies at least 4 years after the start,
# i.e. ranges covering five or more years.
iterable = [pair for pair in iterable_aux if pair[1] - pair[0] >= 4]

# Indicator names: every column after the first three, except 'GDP'.
indicators = list(df_gold.columns[3:])
indicators.remove('GDP')

In [8]:
def tableHighCountry(Country):
    """Display, per indicator, the strongest significant GDP correlations of a country.

    Every year range in ``iterable`` is evaluated with ``searchTimeSeries``.
    For each indicator present in the result, the largest positive and the
    most negative 'GDP Spearman Corr' whose 'P-value Spearman' is below
    ``PVALUE_VAR`` are kept, together with the year range that produced them.
    Rows for which nothing was found are dropped and the remaining gaps are
    shown as "-".

    Parameters
    ----------
    Country : str
        Value of the 'Country' index level to analyse.
    """
    pos_range, pos_corr = "Year range", "Highest positive Spearman corr"
    neg_range, neg_corr = "Year range ", "Highest negative Spearman corr"

    df_highest = generate_table()
    # Search for the entries of the country.
    df_country = df_gold_index.loc[df_gold_index.index.get_level_values('Country') == Country]
    known_indicators = set(indicators)

    # For all the combinations of years...
    for years in iterable:
        df_aux = searchTimeSeries(0, years[0], years[1], True, df_country)
        year_range = str(years[0]) + '-' + str(years[1])
        aux_labels = df_aux.index.get_level_values(0)

        # Only the indicators that are available for this year range.
        for indicator in known_indicators & set(df_aux.index):
            matches = df_aux[aux_labels == indicator]
            # .iloc[0] takes the first match explicitly by position.
            corr = matches["GDP Spearman Corr"].iloc[0]
            p_value = matches["P-value Spearman"].iloc[0]

            # NaN never compares equal to anything, so ``!= nan`` cannot
            # detect it; use pd.isna instead.
            if pd.isna(corr) or pd.isna(p_value) or p_value >= PVALUE_VAR:
                continue

            if corr > 0 and corr > df_highest.at[indicator, pos_corr]:
                df_highest.at[indicator, pos_range] = year_range
                df_highest.at[indicator, pos_corr] = corr
            elif corr < 0 and corr < df_highest.at[indicator, neg_corr]:
                df_highest.at[indicator, neg_range] = year_range
                df_highest.at[indicator, neg_corr] = corr

    # Cells still holding the initial 0 mean "nothing found".
    df_highest = df_highest.replace(0, nan).dropna(axis=0, how='all').fillna("-")
    display(df_highest)

# Interactive entry point: one option per distinct country of df_gold.
@interact(
    Country = sorted(set(df_gold['Country'].tolist())))
def g(Country = 'Afghanistan'):
    """Show the highest-correlation table of the selected country."""
    return tableHighCountry(Country)

interactive(children=(Dropdown(description='Country', options=('Afghanistan', 'Albania', 'Algeria', 'Angola', …

This widget below has been used to check all the results throughout the programming of this notebook. It is also a great tool to see the fluctuations of the GDP and the indicators along the years. It also prints the spearman correlation and the p-value. 

**Remember: the correlation must be higher than 0.7 to be considered correlated and the p-value must be lower than 0.05 to have statistical significance.**

In [9]:
# Indicator names: every column of the indexed dataframe except 'GDP'.
indicators = list(df_gold_index.columns)
indicators.remove('GDP')

def plotYearRange(Zone, Indicator, Years):
    """Plot the GDP and one indicator of a country over a range of years.

    The rows of ``df_gold_index`` for country ``Zone`` with a 'Year' between
    ``Years[0]`` and ``Years[1]`` (both included) are selected. The Spearman
    result of ``Indicator`` against 'GDP' is computed on those rows and
    printed; the rows are then passed through ``norm`` and both series are
    drawn in a single figure.
    """
    country_level = df_gold_index.index.get_level_values('Country')
    year_level = df_gold_index.index.get_level_values("Year")
    in_window = (country_level == Zone) & (year_level >= Years[0]) & (year_level <= Years[1])
    selected = df_gold_index.loc[in_window]

    # The correlation is computed on the values before they go through norm.
    spearman_result = stats.spearmanr(selected[Indicator], selected['GDP'])
    plotted = norm(selected).reset_index(drop=False)

    print(spearman_result)
    plt.figure(figsize=(6,6))
    for column, colour in (("GDP", "red"), (Indicator, "green")):
        plt.plot(plotted[column], color=colour, label=column)
    plt.legend(loc="lower right")
    



# Country whose series are plotted.
country_drop = widgets.Dropdown(
    options=sorted(set(df_gold['Country'].tolist())),
    value='Afghanistan',
    description='Country: ',
)

# Indicator plotted next to the GDP (label fixed: it used to read
# 'Indicator: :').
indicator_drop = widgets.Dropdown(
    options= sorted(indicators),
    value='AgriShareGDP',
    description='Indicator: ',
)

# Year window to plot. This rebinds the name ``intslider``.
intslider = widgets.IntRangeSlider(
    value=[min(set(df_gold['Year'])), max(set(df_gold['Year']))],
    min= min(set(df_gold['Year'])),
    max= max(set(df_gold['Year'])),
    step=1,
    description='Years:',
)


widgets.interact(plotYearRange, Zone = country_drop, Indicator = indicator_drop, Years = intslider)

interactive(children=(Dropdown(description='Country: ', options=('Afghanistan', 'Albania', 'Algeria', 'Angola'…

<function __main__.plotYearRange(Zone, Indicator, Years)>

# Highest correlations region

A similar widget as the one for the country but using a normalized dataframe.

In [10]:
def tableHighRegion(Region):
    """Display, per indicator, the strongest significant GDP correlations of a region.

    Every year range in ``iterable`` is evaluated with ``searchTimeSeries`` on
    the dataframe returned by ``load_by_region``. For each indicator present
    in the result, the largest positive and the most negative
    'GDP Spearman Corr' whose 'P-value Spearman' is below ``PVALUE_VAR`` are
    kept, together with the year range that produced them. Rows for which
    nothing was found are dropped and the remaining gaps are shown as "-".

    Parameters
    ----------
    Region : str
        Name of the region to analyse.
    """
    pos_range, pos_corr = "Year range", "Highest positive Spearman corr"
    neg_range, neg_corr = "Year range ", "Highest negative Spearman corr"

    df_highest = generate_table()
    # Search for the entries of the region and normalize.
    df = load_by_region(Region)
    known_indicators = set(indicators)

    # For all the combinations of years...
    for years in iterable:
        df_aux = searchTimeSeries(0, years[0], years[1], True, df)
        year_range = str(years[0]) + '-' + str(years[1])
        aux_labels = df_aux.index.get_level_values(0)

        # Only the indicators that are available for this year range.
        for indicator in known_indicators & set(df_aux.index):
            matches = df_aux[aux_labels == indicator]
            # .iloc[0] takes the first match explicitly by position.
            corr = matches["GDP Spearman Corr"].iloc[0]
            p_value = matches["P-value Spearman"].iloc[0]

            # NaN never compares equal to anything, so ``!= nan`` cannot
            # detect it; use pd.isna instead.
            if pd.isna(corr) or pd.isna(p_value) or p_value >= PVALUE_VAR:
                continue

            if corr > 0 and corr > df_highest.at[indicator, pos_corr]:
                df_highest.at[indicator, pos_range] = year_range
                df_highest.at[indicator, pos_corr] = corr
            elif corr < 0 and corr < df_highest.at[indicator, neg_corr]:
                df_highest.at[indicator, neg_range] = year_range
                df_highest.at[indicator, neg_corr] = corr

    # Cells still holding the initial 0 mean "nothing found".
    df_highest = df_highest.replace(0, nan).dropna(axis=0, how='all').fillna("-")
    display(df_highest)

# Interactive entry point: one option per distinct region of df_gold.
# This reuses the name ``g`` of the country wrapper defined earlier.
@interact(
    Region = sorted(set(df_gold['Region'].tolist())))
def g(Region = 'South Asia'):
    """Show the highest-correlation table of the selected region."""
    return tableHighRegion(Region)

interactive(children=(Dropdown(description='Region', index=5, options=('East Asia and Pacific', 'Europe and Ce…