In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import networkx as nx
import scipy.linalg as la
from scipy.stats import linregress, spearmanr, kendalltau, ttest_1samp
import sys
sys.path.append('../')
from draw_network import draw_network

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import matplotlib
matplotlib.rc('xtick',labelsize=22)
matplotlib.rc('ytick',labelsize=16)
matplotlib.rc('font',size = 24)
matplotlib.rc('legend',fontsize = 17)
matplotlib.rc('figure',titlesize = 25)

# Matrix of events in the randomized time series

In [2]:
def randomize_timeseries(time_series):

    """
    Randomizes the time series by shuffling only the non-NaN values.
    Input:
        time_series: numpy array
    Output:
        time_series_copy: numpy array, randomized time series
    """

    # Shuffle only the non-NaN values
    time_series_copy = time_series.copy()   # note for myself: I need to make a copy because otherwise the original time series gets randomized outside this function too
    mask_non_nan = ~np.isnan(time_series_copy)
    shuffled_non_nan = time_series_copy[mask_non_nan]
    np.random.shuffle(shuffled_non_nan)

    # Place the shuffled non-NaN values back into the original time series using the mask
    time_series_copy[mask_non_nan] = shuffled_non_nan

    return time_series_copy




def get_event_matrix(randomize = False, countries = None, years = None, sign_binary = False, threshold = 1):

    """
    Computes the event matrix based on the K_loc time series.
    Input:

        randomize: bool, if True, the time series are randomized
        countries: list of strings, countries to consider. If None, uses all countries in the K_loc time series
        years: list of integers, years to consider. If None, uses all years in the K_loc time series
        sign_binary: bool, if True, the output matrix is binary with only 1 and -1
        threshold: int, if sign_binary is True, this is the threshold for the binary matrix.
    Output:
        events_matrix: numpy array, shape (len(years), len(countries))
    """

    # load the df
    K_loc = pd.read_csv('data/K_loc_timeseries.csv', index_col = 0)

    # initialize stuff
    if np.any(years == None): years = K_loc.index
    if np.any(countries == None): countries = K_loc.columns
    events_matrix = np.zeros((len(years), len(countries)))   

    for i, country in enumerate(countries):

        Kloc_country = K_loc[country].values

        if randomize == True:
            Kloc_country = randomize_timeseries(Kloc_country)

        # compute the difference between consecutive years
        balance_diff = np.nan*np.ones(len(Kloc_country))
        balance_diff[1:] = Kloc_country[1:]-Kloc_country[:-1]

        # find drops and increases and store them as a matrix
        for j, year in enumerate(years):
            
            if balance_diff[j] >= 0.5 and Kloc_country[j] >= 0.5:  # high-level increase events
                events_matrix[j, i] = 3

            elif balance_diff[j] >= 0.3 and Kloc_country[j] >= 0.5:  # medium-level increase events
                events_matrix[j, i] = 2

            elif balance_diff[j] >= 0.1 and Kloc_country[j] >= 0.5:   # low-level increase events
                events_matrix[j, i] = 1
            
            elif balance_diff[j] <= -0.5 and Kloc_country[j] <= 0.5:   # high-level drop events
                events_matrix[j, i] = -3

            elif balance_diff[j] <= -0.3 and Kloc_country[j] <= 0.5:   # medium-level drop events
                events_matrix[j, i] = -2

            elif balance_diff[j] <= -0.1 and Kloc_country[j] <= 0.5:   # low-level drop events
                events_matrix[j, i] = -1

    if sign_binary == True:
        # convert the matrix to one with only 1 and -1
        if threshold == 1:
            # For threshold = 1, it is enough to take the sign of the entries
            events_matrix = np.sign(events_matrix)
        else:
            # For threshold = 2, we keep high and mid-level events (i.e. +-3 and +-2 is mapped to +-1 and the rest is mapped to zero)
            # For threshold = 3, we only keep high-level events
            events_matrix = (events_matrix/threshold).astype(int)

    return events_matrix



def matrix_similarity(A,B, method = 'Frobenius'):
    
    """
    Computes the similarity between two matrices A and B.
    Input:
        A: numpy array or pandas DataFrame
        B: numpy array or pandas DataFrame
        method: string, either 'Pearson', 'spectral_distance' or 'Frobenius'
    Output:
        similarity: float
    """

    if method == 'Pearson':

        if type(A) == pd.DataFrame: A = A.values
        if type(B) == pd.DataFrame: B = B.values

        A_vec = A.flatten()
        B_vec = B.flatten()

        similarity = np.corrcoef(A_vec, B_vec)[0,1]  # Pearson correlation coefficient
        return similarity


    elif method == 'spectral_distance':

        sv_A = np.linalg.svd(A, compute_uv=False)
        sv_B = np.linalg.svd(B, compute_uv=False)
        spectral_distance = np.linalg.norm(sv_A-sv_B)

        return 10/(10+spectral_distance)

    elif method == 'Frobenius':
        
        num = np.trace(A.T@B)
        denom = np.sqrt(np.trace(A.T@A)*np.trace(B.T@B))
        return num/denom

    else:
        raise ValueError('Invalid method. Use either "Pearson", "spectral_distance" or "Frobenius"')
    



# Similarity between matrices

In [4]:
# parameters 
with_signs = True  # consider the nature of events through sign (peak/valley; violent/non-violent)
threshold = 1
M = 500   # number of randomizations
print('With signs:', with_signs, '; Threshold:', threshold)

for method in ['Pearson', 'Frobenius']:

    # load ground truth matrix
    GT_matrix = pd.read_excel('data/Complete_Ground_Truth_Matrix.xlsx', index_col = 0)
    GT_matrix.index = GT_matrix.index.astype(int)

    print('METHOD:', method)

    # compute similarity with the local balance matrix
    LBE_matrix = get_event_matrix(randomize = False, sign_binary = True, threshold = threshold)
    if with_signs == True:
        similarity_LBE = matrix_similarity(GT_matrix, LBE_matrix, method = method)
    else:
        similarity_LBE = matrix_similarity(np.abs(GT_matrix), np.abs(LBE_matrix), method = method)

    # compute similarity with the randomized local balance matrix
    similarity_rand_LBE_v = np.zeros(M)
    for i in range(M):

        rand_LBE_matrix = get_event_matrix(randomize = True, sign_binary = True, threshold = threshold)

        if with_signs == True:
            similarity_rand_LBE_v[i] = matrix_similarity(GT_matrix, rand_LBE_matrix, method = method) 
        else:
            similarity_rand_LBE_v[i] = matrix_similarity(np.abs(GT_matrix), np.abs(rand_LBE_matrix), method = method)

    # statistical test for similarity
    t_statistic, p_value = ttest_1samp(similarity_rand_LBE_v, similarity_LBE)

    print('Similarity GT-LBE:', similarity_LBE)
    print('Similarity GT-randomized LBE:', similarity_rand_LBE_v.mean())
    print('p-value:', p_value)
    print('----------------------------------')


With signs: True ; Threshold: 1
METHOD: Pearson
Similarity GT-LBE: 0.3064842898222374
Similarity GT-randomized LBE: 0.038429012543968
p-value: 0.0
----------------------------------
METHOD: Frobenius
Similarity GT-LBE: 0.3136010090592925
Similarity GT-randomized LBE: 0.051150658053030866
p-value: 0.0
----------------------------------
