### Extraction of features from a given correlation matrix

Reference: https://gmarti.gitlab.io/qfin/2020/08/14/correlation-matrix-features.html

In [1]:
%matplotlib inline

import tensorflow as tf
from tensorflow.keras import layers
import numpy as np
import pandas as pd
from scipy.stats import rankdata
from scipy.cluster import hierarchy
from scipy.cluster.hierarchy import cophenet
from scipy.spatial.distance import squareform
import fastcluster
import networkx as nx
from statsmodels.stats.correlation_tools import corr_nearest
import matplotlib.pyplot as plt
from pprint import pprint
import warnings
warnings.filterwarnings("ignore")

In [2]:
def compute_mst_stats(corr):
    """Summarise the minimum spanning tree (MST) built from a correlation matrix.

    Correlations are mapped to distances ``(1 - corr) / 2``, a graph is built
    from that distance matrix, and statistics are computed on its minimum
    spanning tree.

    Parameters
    ----------
    corr : numpy.ndarray
        Square correlation matrix (presumably symmetric with a unit
        diagonal -- confirm against the caller).

    Returns
    -------
    pandas.Series
        Float-typed Series with:

        * ``mst_avg_shortest`` -- average shortest path length of the MST,
          computed without passing a ``weight`` argument;
        * ``mst_centrality_{mean,std,min,25%,50%,75%,max}`` -- summary
          statistics of the nodes' closeness centrality in the MST.
    """
    dist = (1 - corr) / 2

    # `nx.from_numpy_matrix` was removed in networkx 3.0. Prefer its
    # replacement and only fall back to the legacy name on old installs.
    build_graph = getattr(nx, 'from_numpy_array', None)
    if build_graph is None:
        build_graph = nx.from_numpy_matrix
    G = build_graph(dist)
    mst = nx.minimum_spanning_tree(G)

    features = {'mst_avg_shortest': nx.average_shortest_path_length(mst)}

    closeness_centrality = pd.Series(
        list(nx.closeness_centrality(mst).values())).describe()
    # index[0] is 'count', which is skipped.
    for stat in closeness_centrality.index[1:]:
        features[f'mst_centrality_{stat}'] = closeness_centrality[stat]

    # An explicit float dtype avoids the object-typed Series that a bare
    # `pd.Series()` yields on pandas >= 2.0, which would make downstream
    # `DataFrame.describe()` treat the features as non-numeric.
    return pd.Series(features, dtype=float)

In [3]:
def compute_features_from_correl(model_corr, n_observations=252):
    """Extract a vector of descriptive features from one correlation matrix.

    Parameters
    ----------
    model_corr : numpy.ndarray
        Square ``(n, n)`` correlation matrix. It is treated as symmetric:
        only the strict upper triangle is used for coefficient statistics
        and the eigen-decomposition uses a symmetric solver.
    n_observations : int, optional
        Number of observations ``T`` used in the Marchenko-Pastur cutoff
        ``(1 + sqrt(n / T)) ** 2``. Defaults to 252, the value that was
        previously hard-coded.

    Returns
    -------
    pandas.Series or None
        Float-typed Series sorted by label, containing:

        * ``coeffs_*`` -- distribution statistics of the off-diagonal
          coefficients (describe() stats plus 1/10/90/99% quantiles);
        * ``varex_eig1``, ``varex_eig_top5``, ``varex_eig_top30``,
          ``varex_eig_MP`` -- share of the eigenvalue sum carried by the
          leading eigenvalues / those above the Marchenko-Pastur cutoff;
        * ``determinant`` and ``condition_number``;
        * ``pf_*`` -- statistics of the first eigenvector's entries;
        * ``mst_*`` -- features returned by ``compute_mst_stats``;
        * ``coph_corr_{ward,single,complete,average}`` -- cophenetic
          correlation of each hierarchical clustering.

        ``None`` is returned when the mean entry of the (sign-normalised)
        first eigenvector is below ``1e-5``.
    """
    n = model_corr.shape[0]
    a, b = np.triu_indices(n, k=1)

    feats = {}

    # --- distribution of the pairwise coefficients -------------------------
    coeffs = pd.Series(model_corr[a, b].flatten())
    coeffs_stats = coeffs.describe()
    for stat in coeffs_stats.index[1:]:  # index[0] is 'count'
        feats[f'coeffs_{stat}'] = coeffs_stats[stat]
    feats['coeffs_1%'] = coeffs.quantile(q=0.01)
    feats['coeffs_99%'] = coeffs.quantile(q=0.99)
    feats['coeffs_10%'] = coeffs.quantile(q=0.1)
    feats['coeffs_90%'] = coeffs.quantile(q=0.9)

    # --- spectrum ----------------------------------------------------------
    # `eigh` is the solver for symmetric matrices: it always returns real
    # eigenvalues (general `eig` may return complex values with spurious
    # imaginary parts, which breaks the float() conversions downstream).
    # It returns eigenvalues in ascending order, so reverse to descending.
    eigenvals, eigenvecs = np.linalg.eigh(model_corr)
    eigenvals = eigenvals[::-1]
    eigenvecs = eigenvecs[:, ::-1]

    # Eigenvector of the largest eigenvalue; its sign is arbitrary, so flip
    # it such that the majority of entries are positive.
    pf_vector = eigenvecs[:, 0]
    if np.sum(pf_vector < 0) > np.sum(pf_vector > 0):
        pf_vector = -pf_vector

    total = eigenvals.sum()
    feats['varex_eig1'] = float(eigenvals[0] / total)
    feats['varex_eig_top5'] = float(eigenvals[:5].sum()) / float(total)
    feats['varex_eig_top30'] = float(eigenvals[:30].sum()) / float(total)

    # Marcenko-Pastur (RMT): variance explained by eigenvalues lying above
    # the upper edge of the MP distribution.
    T, N = n_observations, n
    MP_cutoff = (1 + np.sqrt(N / T))**2
    feats['varex_eig_MP'] = (float(eigenvals[eigenvals > MP_cutoff].sum()) /
                             float(total))

    # determinant (product of the eigenvalues)
    feats['determinant'] = np.prod(eigenvals)

    # condition number (largest over smallest eigenvalue, in magnitude)
    feats['condition_number'] = abs(eigenvals[0]) / abs(eigenvals[-1])

    # --- first eigenvector -------------------------------------------------
    pf_stats = pd.Series(pf_vector).describe()
    if pf_stats['mean'] < 1e-5:
        # Matrix rejected: caller is expected to skip None results.
        return None
    for stat in pf_stats.index[1:]:
        feats[f'pf_{stat}'] = float(pf_stats[stat])

    # --- minimum spanning tree ---------------------------------------------
    feats.update(compute_mst_stats(model_corr).to_dict())

    # --- hierarchical clustering -------------------------------------------
    # Clip at 0 so floating-point round-off (corr marginally above 1) cannot
    # produce NaN distances under the square root.
    dist = np.sqrt(2 * np.clip(1 - model_corr, 0, None))
    condensed = dist[a, b]  # loop-invariant condensed distance vector
    for algo in ['ward', 'single', 'complete', 'average']:
        Z = fastcluster.linkage(condensed, method=algo)
        feats[f'coph_corr_{algo}'] = cophenet(Z, condensed)[0]

    # Explicit float dtype: a bare `pd.Series()` is object-typed on
    # pandas >= 2.0, which would make the final feature table non-numeric.
    return pd.Series(feats, dtype=float).sort_index()

In [4]:
def compute_dataset_features(mats):
    """Compute the feature table for a stack of correlation matrices.

    Parameters
    ----------
    mats : numpy.ndarray
        3-D array indexed as ``mats[i, :, :]``; each slice along the first
        axis is passed to ``compute_features_from_correl``.

    Returns
    -------
    pandas.DataFrame
        One row per matrix for which ``compute_features_from_correl``
        returned a result (matrices yielding ``None`` are skipped), one
        column per feature. An empty DataFrame is returned when no matrix
        produced features.
    """
    all_features = []
    for i in range(mats.shape[0]):
        model_corr = mats[i, :, :]

        features = compute_features_from_correl(model_corr)

        if features is not None:
            all_features.append(features)

    if not all_features:
        # pd.concat raises ValueError on an empty list; an empty stack (or
        # one where every matrix was rejected) yields an empty table instead.
        return pd.DataFrame()

    # Each Series becomes a column; transpose so that rows are matrices.
    return pd.concat(all_features, axis=1).T

In [5]:
# Load the stack of empirical correlation matrices. The file is looked up
# relative to the current working directory; np.load raises
# FileNotFoundError (as in the recorded output of this cell) if it is absent.
empirical_matrices = np.load('empirical_matrices.npy')

# One row of features per matrix along the first axis (matrices rejected by
# compute_features_from_correl are skipped).
empirical_features = compute_dataset_features(empirical_matrices)

# Display summary statistics of every feature across the dataset.
empirical_features.describe()

FileNotFoundError: [Errno 2] No such file or directory: 'empirical_matrices.npy'