In [1]:
import os.path
import numpy as np
import matplotlib.pyplot as plt
import glob
import pandas as pd
import scipy
import scipy.cluster.hierarchy as sch
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import squareform
import pickle
from SPD_SURE import *

In [2]:
def cor_cluster(corr, label, threshold = 0.5, n_node = None, plot = False):
    """Group the nodes of a correlation matrix by hierarchical clustering.

    The dissimilarity between two nodes is ``1 - |corr|``. An average-linkage
    dendrogram is built from it and cut at distance ``threshold * corr.max()``
    to obtain flat clusters.

    Parameters
    ----------
    corr : (N, N) array_like
        Correlation matrix. Values are clipped to [-1, 1] and the matrix is
        symmetrised on a private copy; the caller's array is not modified.
    label : sequence of length N
        Node labels; only used as tick labels when ``plot`` is True.
    threshold : float, optional
        Fraction of ``corr.max()`` at which the dendrogram is cut.
    n_node : int or None, optional
        Number of nodes shown in the plot (taken from the nodes sorted by
        descending cluster id). ``None`` shows all nodes.
    plot : bool, optional
        If True, draw the cluster-ordered correlation matrix.

    Returns
    -------
    ind : (N,) ndarray
        Flat cluster id of every node, as returned by ``fcluster``.
    """
    # np.clip returns a new array, so the caller's matrix is left untouched
    # (in-place clipping would silently alter it).
    corr = np.clip(np.asarray(corr, dtype=float), -1.0, 1.0)

    # force to be symmetric
    corr = (corr + corr.T) / 2

    dissimilarity = 1 - np.abs(corr)
    hierarchy = linkage(squareform(dissimilarity), method='average')
    ind = fcluster(hierarchy, threshold * corr.max(), criterion='distance')

    if plot:
        if n_node is None:
            n_node = corr.shape[0]

        # Nodes sorted by descending cluster id so that members of the same
        # cluster sit next to each other in the picture.
        order = np.argsort(ind)[::-1][:n_node]
        columns = [label[i] for i in order]
        corr_ = corr[:, order][order]

        # Plot the correlation matrix
        fig_size = 10
        fig, ax = plt.subplots(figsize=(fig_size, fig_size))
        cax = ax.matshow(corr_, cmap='RdYlGn')
        plt.xticks(range(len(columns)), columns, rotation=90)
        plt.yticks(range(len(columns)), columns)

        # Add the colorbar legend
        fig.colorbar(cax, ticks=[-1, 0, 1], aspect=40, shrink=.8)

    return ind

def check_SPD(X, eps = 1e-3):
    """Return a copy of a stack of symmetric matrices made positive definite.

    For every matrix whose smallest eigenvalue is not strictly positive, a
    multiple of the identity, ``|min_eigval| + eps``, is added so that its
    smallest eigenvalue becomes ``eps``. Matrices that are already positive
    definite are returned unchanged.

    Parameters
    ----------
    X : (n, N, N) array_like
        Stack of n symmetric N x N matrices (``eigvalsh`` assumes symmetry).
    eps : float, optional
        Smallest eigenvalue given to a repaired matrix. Defaults to 1e-3.

    Returns
    -------
    res : (n, N, N) ndarray of float
        New array; ``X`` itself is not modified.
    """
    X = np.asarray(X, dtype=float)
    N = X.shape[1]

    # eigvalsh works on the whole stack at once; keep the smallest eigenvalue
    # of each matrix.
    min_eigval = np.linalg.eigvalsh(X).min(axis=-1)

    # A zero eigenvalue means the matrix is singular (not positive definite),
    # so it is shifted as well, not only the strictly negative case.
    shift = np.where(min_eigval <= 0, np.abs(min_eigval) + eps, 0.0)

    return X + shift[:, None, None] * np.eye(N)

# ADHD200_CC200

In [3]:
# Subject table of the ADHD200_CC200 dataset; the 'network_name' column is
# used below to build each subject's connectivity-matrix file name.
sub_list = pd.read_csv(
    'UMCD_dataset/ADHD200_CC200/ADHD200_CC200_list.csv',
    header=0,
)
sub_id = sub_list.loc[:, 'network_name']

# Number of subjects in each subject pool
sub_list.value_counts(subset='subject_pool')

subject_pool
Typically Developing          330
ADHD-Combined                 109
ADHD-Inattentive               74
ADHD-Hyperactive/Impulsive      7
dtype: int64

In [4]:
# Load every subject's connectivity matrix, group the matrices by subject
# pool, then average within each group and write the group means to disk.
con_mat_list = glob.glob('UMCD_dataset/ADHD200_CC200/ADHD200_CC200/*_connectivity_matrix*.txt')

# subject_pool value -> matrices of the subjects in that pool.
# Subjects from any other pool are skipped.
adhd_groups = {
    'Typically Developing': [],
    'ADHD-Combined': [],
    'ADHD-Inattentive': [],
}

for net_name in sub_id:
    mat_path = 'UMCD_dataset/ADHD200_CC200/ADHD200_CC200/' + net_name + '_connectivity_matrix_file.txt'
    if os.path.isfile(mat_path):
        con_mat = np.loadtxt(mat_path)
        # set the diagonal of the matrix to 1
        np.fill_diagonal(con_mat, 1)
        pool = sub_list.loc[sub_list['network_name'] == net_name].subject_pool.item()
        if pool in adhd_groups:
            adhd_groups[pool].append(con_mat)

# Stack each group into one array and average over subjects (axis 0).
con_mat_TD = np.array(adhd_groups['Typically Developing'])
mean_con_mat_TD = con_mat_TD.mean(axis=0)

con_mat_ADHD_C = np.array(adhd_groups['ADHD-Combined'])
mean_con_mat_ADHD_C = con_mat_ADHD_C.mean(axis=0)

con_mat_ADHD_I = np.array(adhd_groups['ADHD-Inattentive'])
mean_con_mat_ADHD_I = con_mat_ADHD_I.mean(axis=0)

np.savetxt('UMCD_dataset/ADHD200_CC200/mean_connectivity_matrix_TD.txt', mean_con_mat_TD, delimiter=',')
np.savetxt('UMCD_dataset/ADHD200_CC200/mean_connectivity_matrix_ADHD_C.txt', mean_con_mat_ADHD_C, delimiter=',')
np.savetxt('UMCD_dataset/ADHD200_CC200/mean_connectivity_matrix_ADHD_I.txt', mean_con_mat_ADHD_I, delimiter=',')

In [5]:
# Nodes are labelled by their row/column index in the connectivity matrix.
# (Region-name abbreviations could instead be read from
# 'UMCD_dataset/ADHD200_CC200/ADHD200_CC200/KKI_1043241_region_names_abbrev_file.txt'.)
label = np.array(list(range(mean_con_mat_TD.shape[0])))

# For each group: cluster the mean connectivity matrix, then keep at most the
# first 10 nodes of one hard-coded cluster id.
# NOTE(review): the ids 77 / 65 / 61 were presumably chosen by inspecting
# np.unique(ind, return_counts=True) -- confirm they still apply if the data change.
ind = cor_cluster(mean_con_mat_TD, label, threshold=0.55)
in_cluster = (ind == 77)
TD_region = label[in_cluster][:10]

ind = cor_cluster(mean_con_mat_ADHD_C, label, threshold=0.55)
in_cluster = (ind == 65)
ADHD_C_region = label[in_cluster][:10]

ind = cor_cluster(mean_con_mat_ADHD_I, label, threshold=0.55)
in_cluster = (ind == 61)
ADHD_I_region = label[in_cluster][:10]

print('Typically Developing: ', TD_region)
print('ADHD_Combined: ', ADHD_C_region)
print('ADHD-Inattentive: ', ADHD_I_region)

Typically Developing:  [  4  27  60 108 126 128 134 145 147 166]
ADHD_Combined:  [  4  27  60 108 126 128 134 145 147 166]
ADHD-Inattentive:  [  4  27  60 108 126 128 134 145 147 166]


# PRURIM

In [6]:
# Subject table of the PRURIM dataset; the 'network_name' column is used
# below to build each subject's connectivity-matrix file name.
sub_list = pd.read_csv(
    'UMCD_dataset/PRURIM/PRURIM_list.csv',
    header=0,
)
sub_id = sub_list.loc[:, 'network_name']

# Number of subjects in each subject pool
sub_list.value_counts(subset='subject_pool')

subject_pool
Healthy      15
Psoriasis    14
dtype: int64

In [7]:
# Load every PRURIM subject's connectivity matrix, group the matrices by
# subject pool, then average within each group and write the means to disk.

# subject_pool value -> matrices of the subjects in that pool.
# Subjects from any other pool are skipped.
prurim_groups = {
    'Healthy': [],
    'Psoriasis': [],
}

for net_name in sub_id:
    mat_path = 'UMCD_dataset/PRURIM/PRURIM/' + net_name + '_connectivity_matrix_file.txt'
    if os.path.isfile(mat_path):
        con_mat = np.loadtxt(mat_path)
        # set the diagonal of the matrix to 1
        np.fill_diagonal(con_mat, 1)
        pool = sub_list.loc[sub_list['network_name'] == net_name].subject_pool.item()
        if pool in prurim_groups:
            prurim_groups[pool].append(con_mat)

# Stack each group into one array and average over subjects (axis 0).
con_mat_H = np.array(prurim_groups['Healthy'])
mean_con_mat_H = con_mat_H.mean(axis=0)

con_mat_P = np.array(prurim_groups['Psoriasis'])
mean_con_mat_P = con_mat_P.mean(axis=0)

np.savetxt('UMCD_dataset/PRURIM/mean_connectivity_matrix_Healthy.txt', mean_con_mat_H, delimiter=',')
np.savetxt('UMCD_dataset/PRURIM/mean_connectivity_matrix_Psoriasis.txt', mean_con_mat_P, delimiter=',')

In [8]:
# Nodes are labelled by their row/column index in the connectivity matrix.
label = np.array(list(range(mean_con_mat_H.shape[0])))

# For each group: cluster the mean connectivity matrix, then keep at most the
# first 10 nodes of one hard-coded cluster id.
# NOTE(review): the ids 15 / 8 were presumably chosen by inspecting
# np.unique(ind, return_counts=True) -- confirm they still apply if the data change.
ind = cor_cluster(mean_con_mat_H, label, threshold=0.65)
in_cluster = (ind == 15)
H_region = label[in_cluster][:10]

ind = cor_cluster(mean_con_mat_P, label, threshold=0.5)
in_cluster = (ind == 8)
P_region = label[in_cluster][:10]

print('Healthy: ', H_region)
print('Psoriasis: ', P_region)


Healthy:  [16 17 28 29 62 63 78 79 80 81]
Psoriasis:  [42 43 44 45 46 47 48 49 50 51]


# UCSF_MAC_PSP

In [9]:
# Subject table of the UCSF_MAC_PSP dataset; the 'network_name' column is
# used below to build each subject's connectivity-matrix file name.
sub_list = pd.read_csv(
    'UMCD_dataset/UCSF_MAC_PSP/UCSF_MAC_PSP_list.csv',
    header=0,
)
sub_id = sub_list.loc[:, 'network_name']

# Number of subjects in each subject pool
sub_list.value_counts(subset='subject_pool')

subject_pool
Control                           40
Progressive Supranuclear Palsy    24
dtype: int64

In [10]:
# Load every UCSF_MAC_PSP subject's connectivity matrix, group the matrices by
# subject pool, then average within each group and write the means to disk.

# subject_pool value -> matrices of the subjects in that pool.
# Subjects from any other pool are skipped.
psp_groups = {
    'Control': [],
    'Progressive Supranuclear Palsy': [],
}

for net_name in sub_id:
    mat_path = 'UMCD_dataset/UCSF_MAC_PSP/UCSF_MAC_PSP/' + net_name + '_connectivity_matrix_file.txt'
    if os.path.isfile(mat_path):
        con_mat = np.loadtxt(mat_path)
        # set the diagonal of the matrix to 1
        np.fill_diagonal(con_mat, 1)
        pool = sub_list.loc[sub_list['network_name'] == net_name].subject_pool.item()
        if pool in psp_groups:
            psp_groups[pool].append(con_mat)

# Stack each group into one array and average over subjects (axis 0).
con_mat_CON = np.array(psp_groups['Control'])
mean_con_mat_CON = con_mat_CON.mean(axis=0)

con_mat_PSP = np.array(psp_groups['Progressive Supranuclear Palsy'])
mean_con_mat_PSP = con_mat_PSP.mean(axis=0)

np.savetxt('UMCD_dataset/UCSF_MAC_PSP/mean_connectivity_matrix_Control.txt', mean_con_mat_CON, delimiter=',')
np.savetxt('UMCD_dataset/UCSF_MAC_PSP/mean_connectivity_matrix_PSP.txt', mean_con_mat_PSP, delimiter=',')

In [11]:
# Nodes are labelled by their row/column index in the connectivity matrix.
# (Region-name abbreviations could instead be read from
# 'UMCD_dataset/UCSF_MAC_PSP/UCSF_MAC_PSP/HC_1_t1_region_names_abbrev_file.txt'.)
label = np.array(list(range(mean_con_mat_PSP.shape[0])))

# For each group: cluster the mean connectivity matrix, then keep at most the
# first 10 nodes of one hard-coded cluster id.
# NOTE(review): the ids 3 / 2 were presumably chosen by inspecting
# np.unique(ind, return_counts=True) -- confirm they still apply if the data change.
ind = cor_cluster(mean_con_mat_CON, label, threshold=0.5)
in_cluster = (ind == 3)
CON_region = label[in_cluster][:10]

ind = cor_cluster(mean_con_mat_PSP, label, threshold=0.75)
in_cluster = (ind == 2)
PSP_region = label[in_cluster][:10]

print('Control: ', CON_region)
print('Progressive Supranuclear Palsy: ', PSP_region)

Control:  [ 0  1  3  4  5  6  8  9 11 12]
Progressive Supranuclear Palsy:  [ 0  1  5  7  8  9 13 15 16 18]


# Save results

In [12]:
# Bundle the per-group connectivity-matrix stacks and the selected region
# indices into one archive; each dict key is the array's name in the archive.
saved_arrays = {
    # stacked per-subject connectivity matrices of every group
    'con_mat_TD': con_mat_TD,
    'con_mat_ADHD_C': con_mat_ADHD_C,
    'con_mat_ADHD_I': con_mat_ADHD_I,
    'con_mat_H': con_mat_H,
    'con_mat_P': con_mat_P,
    'con_mat_CON': con_mat_CON,
    'con_mat_PSP': con_mat_PSP,
    # node indices selected for every group
    'TD_region': TD_region,
    'ADHD_C_region': ADHD_C_region,
    'ADHD_I_region': ADHD_I_region,
    'H_region': H_region,
    'P_region': P_region,
    'CON_region': CON_region,
    'PSP_region': PSP_region,
}
np.savez('connectivity_matrix', **saved_arrays)

# Compute LE mean and cov for the selected sub-network 

In [14]:
# Reload the archive holding the connectivity matrices and selected regions.
mat = np.load('connectivity_matrix.npz')

names = np.array(['TD', 'ADHD_C', 'ADHD_I', 'H', 'P', 'CON', 'PSP'])
N = 10  # number of regions/nodes
p = len(names)
# N(N+1)/2 -- presumably the size of the vectorised symmetric N x N matrix
# that cov_logE works with; verify against SPD_SURE.
q = N * (N + 1) // 2

M = np.zeros((p, N, N))
Sigma = np.zeros((p, q, q))

for i, name in enumerate(names):
    group_mats = mat['con_mat_' + name]
    region = mat[name + '_region']

    # Restrict every subject's matrix to the selected nodes (rows, then
    # columns) and repair any sub-matrix with a negative eigenvalue.
    selected_rows = group_mats[:, region]
    sub_mats = check_SPD(selected_rows[:, :, region])

    M[i] = FM_logE(sub_mats)
    Sigma[i] = cov_logE(sub_mats)

# The covariance matrices get the same eigenvalue repair.
Sigma = check_SPD(Sigma)

In [15]:
# Smallest eigenvalue of each matrix in Sigma
[np.linalg.eigvalsh(cov_i).min() for cov_i in Sigma]

[0.0047747733991427765,
 0.0014275247074175077,
 0.00023901388920429708,
 0.0010000000000000093,
 0.0010000000000000785,
 0.0010000000000000388,
 0.0010000000000001353]

In [16]:
# Save the per-group means and covariances together with the group names.
# The full `names` array is stored under the existing key 'name'; the bare
# `name` is only the leftover loop variable and would hold just the last group.
np.savez('LE_mean_cov', M = M, Sigma = Sigma, name = names)