In [None]:
import pandas as pd
import numpy as np
import os
import random
from scipy.stats import pearsonr,spearmanr
from scipy import stats

In [None]:
# FDR (Benjamini-Hochberg) multiple-testing correction
from statsmodels.stats import multitest

def fdr_correction(P):
    """Benjamini-Hochberg FDR correction of an array of p-values.

    Parameters
    ----------
    P : numpy.ndarray
        P-values of any shape. The array is flattened, so all entries are
        corrected together as a single family of tests.

    Returns
    -------
    numpy.ndarray
        The corrected p-values, reshaped to the shape of ``P``.
    """
    original_shape = P.shape
    # Element 1 of the multipletests result holds the corrected p-values.
    corrected = multitest.multipletests(
        P.flatten(), alpha=0.05, method='fdr_bh'
    )[1]
    return corrected.reshape(original_shape)

In [None]:
# Function for calculating the chance-corrected Jaccard index
from sklearn.metrics import confusion_matrix,jaccard_score
def _significant_mask(p_values, t, n_regions):
    """Return a boolean mask of the entries of ``p_values`` that pass threshold ``t``."""
    if t == 'fdr':
        return fdr_correction(p_values) < 0.05
    if t == 'bonferroni':
        return p_values < 0.05 / n_regions
    return p_values < float(t)


def _chance_corrected_jaccard(sig1, sig2):
    """Return (S - ES) / (1 - ES) for two boolean masks of equal length.

    S is the observed Jaccard index of the two masks and ES is the same index
    computed on the table expected from the marginal counts alone.
    """
    total = sig1.size
    pos1 = int(np.count_nonzero(sig1))
    pos2 = int(np.count_nonzero(sig2))
    both = int(np.count_nonzero(sig1 & sig2))
    neg1 = total - pos1
    neg2 = total - pos2

    # Observed index: intersection over union (0 when both masks are empty).
    union = pos1 + pos2 - both
    S = both / union if union else 0.0

    # Expected index: cell counts replaced by products of the marginals.
    expected_both = pos1 * pos2
    expected_union = expected_both + pos1 * neg2 + pos2 * neg1
    ES = expected_both / expected_union if expected_union else 0.0

    if ES == 1:
        # Every region is significant in both masks: the correction is 0/0.
        # Report 0, the same value the empty/empty degenerate case yields.
        return 0.0
    return (S - ES) / (1 - ES)


def Jaccard_index(data1,data2,t=None):
    """Chance-corrected Jaccard agreement between paired significance maps.

    For each pair ``data1[i, j]`` / ``data2[i, j]`` the regions whose value in
    column 1 (the p-value) passes the threshold are marked significant in each
    map, and the chance-corrected Jaccard index of the two sets is computed.
    The indices are then summarised over ``j`` for every ``i``.

    Counts are taken directly from boolean masks, so the result is always based
    on a full 2x2 table even when every region (or no region) is significant.

    Parameters
    ----------
    data1, data2 : numpy.ndarray
        4-D arrays indexed ``[i, j, region, column]`` with p-values in column 1.
        Presumably ``i`` is the subsampling step and ``j`` the random
        repetition -- confirm against the code that writes these arrays.
    t : str or float
        ``'fdr'`` (FDR-corrected p < 0.05), ``'bonferroni'``
        (p < 0.05 / number of regions), or a number / numeric string used
        directly as the uncorrected p-value threshold.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(data1.shape[0], 2)``: column 0 is the mean and
        column 1 the standard deviation of the index over ``j``.

    Raises
    ------
    TypeError
        If ``t`` is None (the default), since no threshold can be derived.
    """
    if t is None:
        raise TypeError(
            "t must be 'fdr', 'bonferroni' or a numeric p-value threshold, not None"
        )

    n_outer, n_inner, n_regions = data1.shape[:3]
    odata1 = np.zeros((n_outer, 2))
    for i in range(n_outer):
        scores = []
        for j in range(n_inner):
            sig1 = _significant_mask(data1[i, j, :, 1], t, n_regions)
            sig2 = _significant_mask(data2[i, j, :, 1], t, n_regions)
            scores.append(_chance_corrected_jaccard(sig1, sig2))
        odata1[i, 0] = np.mean(scores)
        odata1[i, 1] = np.std(scores)
    return odata1

In [None]:
# Function for calculating the intraclass correlation coefficient (ICC)
import pingouin as pg
from scipy import stats

def calculate_ICC(data1,data2,t = None):
    """Mean and standard deviation of the ICC2 between paired maps.

    For each pair ``data1[i, j]`` / ``data2[i, j]``, column 0 of both maps is
    z-scored across all regions. The ``round(t * n_regions)`` regions with the
    smallest values in column 1 of ``data1[i, j]`` are then selected, and the
    two maps are treated as raters 'A' and 'B' of those regions. The ICC2 of
    that table is collected and summarised over ``j`` for every ``i``.

    Parameters
    ----------
    data1, data2 : numpy.ndarray
        4-D arrays indexed ``[subsampling step, random repetition, region,
        column]``. Presumably column 0 is the correlation and column 1 its
        p-value -- confirm against the code that writes these arrays.
    t : str or float
        Fraction of regions to include (e.g. ``'0.2'`` or ``0.2``).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(data1.shape[0], 2)``: column 0 is the mean and
        column 1 the standard deviation of the ICC over the repetitions.

    Raises
    ------
    TypeError
        If ``t`` is None (the default).
    ValueError
        If ``t`` selects fewer than one region or more regions than exist.
    """
    if t is None:
        raise TypeError(
            "t must be the fraction of regions to include (e.g. 0.2), not None"
        )
    t = float(t)

    subsampling_times, random_num, total_regions = data1.shape[:3]
    region_num = round(t * total_regions)
    # Fail before the (slow) loop instead of deep inside it with an opaque error.
    if not 1 <= region_num <= total_regions:
        raise ValueError(
            "t=%r selects %d of %d regions; it must select between 1 and all regions"
            % (t, region_num, total_regions)
        )

    ICC_data = np.zeros((subsampling_times,2))
    for i in range(subsampling_times):
        print(i)
        temp_icc = []
        for j in range(random_num):
            tdata1 = data1[i,j,:,:]
            tdata2 = data2[i,j,:,:]
            # Standardise over all regions before selecting the subset.
            zdata1 = stats.zscore(tdata1[:,0])
            zdata2 = stats.zscore(tdata2[:,0])
            # The same regions (ranked on data1 only) are used for both maps
            # so that the two raters score identical targets.
            top_regions = np.argsort(tdata1[:,1])[:region_num]
            ratings_a = zdata1[top_regions]
            ratings_b = zdata2[top_regions]

            # Long format, one row per (region, rater), interleaved A, B, A, B, ...
            my_df_data = pd.DataFrame({
                'region': [k + 1 for k in range(region_num) for _ in range(2)],
                'random_time': ['A', 'B'] * region_num,
                'corr': [value for pair in zip(ratings_a, ratings_b) for value in pair],
            })
            # ICC2: A random sample of k raters rate each target. The measure is
            # one of absolute agreement in the ratings. It is the second row
            # (index 1) of the table returned by pingouin.
            my_icc = pg.intraclass_corr(data=my_df_data, targets='region', raters='random_time', ratings='corr')['ICC'].values[1]
            temp_icc.append(my_icc)

        ICC_data[i,0] = np.mean(temp_icc)
        ICC_data[i,1] = np.std(temp_icc)
    return ICC_data

In [None]:
# Compute the Jaccard index for every result folder and save it as CSV.
file_path = '/data/sliu/sampling_ukbb_analysis/new_results/'
# os.listdir also returns plain and hidden files, which cannot hold the .npy
# inputs; keep sub-directories only and sort them for a reproducible order.
files = sorted(name for name in os.listdir(file_path)
               if os.path.isdir(os.path.join(file_path, name)))

# t selects the significance threshold passed to Jaccard_index:
# '0.05' / '0.01' (uncorrected p), 'fdr' (FDR-corrected p < 0.05) or
# 'bonferroni' (p < 0.05 / number of regions).
# It is also embedded verbatim in the output file names.
t = 'bonferroni'
measures = ('CT', 'CSA', 'FC')
for f in files:
    print(f)
    random_dir = os.path.join(file_path, f, 'random')

    # Load every input pair first so a missing file fails before any computation.
    random_data = {
        measure: (
            np.load(os.path.join(random_dir, 'random_data_' + measure + '1.npy')),
            np.load(os.path.join(random_dir, 'random_data_' + measure + '2.npy')),
        )
        for measure in measures
    }

    # NOTE(review): results are written relative to the current working
    # directory, not to file_path -- confirm this is intended.
    out_dir = os.path.join('new_results', f)
    os.makedirs(out_dir, exist_ok=True)

    for measure in measures:
        first, second = random_data[measure]
        reliability = Jaccard_index(first, second, t=t)
        out_name = os.path.join(out_dir, measure + '_Jaccard_index_' + str(t) + '.csv')
        pd.DataFrame(data=reliability).to_csv(out_name, index=False)

In [None]:
# Compute the ICC for every result folder and save it as CSV.
file_path = '/data/sliu/sampling_ukbb_analysis/new_results/'
# os.listdir also returns plain and hidden files, which cannot hold the .npy
# inputs; keep sub-directories only and sort them for a reproducible order.
files = sorted(name for name in os.listdir(file_path)
               if os.path.isdir(os.path.join(file_path, name)))

# t is the fraction of brain regions included in the ICC, passed to
# calculate_ICC: e.g. '0.1', '0.15', '0.2', '0.25', '0.5' or '1'.
# It is also embedded verbatim in the output file names.
t = '0.2'
measures = ('CT', 'CSA', 'FC')
for f in files:
    print(f)
    random_dir = os.path.join(file_path, f, 'random')

    # Load every input pair first so a missing file fails before any computation.
    random_data = {
        measure: (
            np.load(os.path.join(random_dir, 'random_data_' + measure + '1.npy')),
            np.load(os.path.join(random_dir, 'random_data_' + measure + '2.npy')),
        )
        for measure in measures
    }

    # NOTE(review): results are written relative to the current working
    # directory, not to file_path -- confirm this is intended.
    out_dir = os.path.join('new_results', f)
    os.makedirs(out_dir, exist_ok=True)

    for measure in measures:
        first, second = random_data[measure]
        reliability = calculate_ICC(first, second, t=t)
        out_name = os.path.join(out_dir, measure + '_ICC_' + str(t) + '.csv')
        pd.DataFrame(data=reliability).to_csv(out_name, index=False)