In [None]:
import pandas as pd
import numpy as np
import os
import random
from scipy.stats import ttest_ind

In [None]:
# Locate and load the imaging measures and the variable for one task.
data_path = '/data/sliu/sampling_ukbb_analysis/data/'
t = 'Numeric_memory'  # task folder name; also reused when building output paths
data_path2 = os.path.join(data_path, t)

# File names expected inside the task folder.
CSA_file_name = 'CSA_with_controlling_for_total_brain.csv'
CT_file_name = 'CT_with_controlling_for_total_brain.csv'
FC_file_name = 'FC_data.csv'
variable_name = 'variable.csv'

# Full paths to each file.
CSA_file_path = os.path.join(data_path2, CSA_file_name)
CT_file_path = os.path.join(data_path2, CT_file_name)
FC_file_path = os.path.join(data_path2, FC_file_name)
variable_file_path = os.path.join(data_path2, variable_name)

# Imaging tables: keep every column except the last one, as plain arrays.
# NOTE(review): presumably the last column is an ID/non-feature column - confirm.
CSA_data = pd.read_csv(CSA_file_path).iloc[:, :-1].values
CT_data = pd.read_csv(CT_file_path).iloc[:, :-1].values
FC_data = pd.read_csv(FC_file_path).iloc[:, :-1].values

# Variable: only the first column of the file is used.
variable_data = pd.read_csv(variable_file_path).iloc[:, 0].values

In [None]:
def split_sample(data, variable):
    """Split the rows of ``data`` into a low and a high half ranked by ``variable``.

    Rows are ordered by ascending ``variable``; the first ``len(variable) // 2``
    rows form the low group and the remaining rows form the high group (so the
    high group receives the extra row when the length is odd).

    Parameters
    ----------
    data : 2-D array indexed as ``data[rows, :]``
    variable : 1-D array with one value per row of ``data``

    Returns
    -------
    (low_rows, high_rows) : tuple of 2-D arrays
    """
    ranking = np.argsort(variable)
    cut = variable.shape[0] // 2
    low_rows, high_rows = ranking[:cut], ranking[cut:]
    return data[low_rows, :], data[high_rows, :]

In [None]:
# calculate the t statistics between high and low groups
def _median_split_indices(scores):
    """Return (low_idx, high_idx): positions of the lower and upper half of ``scores``."""
    order = np.argsort(scores)
    half = scores.shape[0] // 2
    return order[:half], order[half:]


def _high_vs_low_ttest(data, low_rows, high_rows):
    """Column-wise two-sample t-test (high vs. low rows); returns an (n_columns, 2) array of [t, p]."""
    t_vals, p_vals = ttest_ind(data[high_rows, :], data[low_rows, :], axis=0)
    return np.column_stack((t_vals, p_vals))


def calculate_t_values(CSA_data, CT_data, FC_data, variable,
                       random_num=100, sample_step=100, seed=None):
    """Repeated split-half t-tests on random, disjoint subsamples of growing size.

    For every subsample size ``sample_step * (i + 1)`` and every repetition,
    two disjoint sets of rows are drawn at random. Within each set the rows
    are split into a low and a high half by ``variable`` and, for every column
    of each data table, a two-sample t-test (high vs. low) is computed.

    Parameters
    ----------
    CSA_data, CT_data, FC_data : 2-D arrays
        Data tables sharing the same rows; columns are tested one by one.
    variable : 1-D array
        One value per row, used to rank rows into low/high halves.
    random_num : int, optional
        Number of random repetitions per subsample size (default 100).
    sample_step : int, optional
        Increment of the subsample size (default 100).
    seed : optional
        If given, sampling uses a private ``random.Random(seed)`` so results are
        reproducible. If ``None`` (default) the global ``random`` module is
        used, exactly as before.

    Returns
    -------
    tuple of six arrays
        ``(CSA1, CT1, FC1, CSA2, CT2, FC2)``; each has shape
        ``(subsampling_times, random_num, n_columns, 2)`` where the last axis
        holds ``[t statistic, p value]`` and ``1``/``2`` refer to the first and
        second disjoint subsample. ``n_columns`` is taken from the matching
        input table.

    Raises
    ------
    ValueError
        If the inputs do not all have the same number of rows, or if
        ``sample_step`` is not positive.
    """
    if sample_step < 1:
        raise ValueError('sample_step must be a positive integer')

    n_rows = CSA_data.shape[0]
    for label, arr in (('CT_data', CT_data), ('FC_data', FC_data), ('variable', variable)):
        if arr.shape[0] != n_rows:
            raise ValueError(
                '%s has %d rows but CSA_data has %d' % (label, arr.shape[0], n_rows))

    # Two disjoint subsamples are drawn each time, so the largest subsample
    # may contain at most half of the rows.
    subsampling_times = n_rows // (2 * sample_step)

    # Private generator when a seed is requested; otherwise the module-level one.
    rng = random if seed is None else random.Random(seed)

    modalities = (CSA_data, CT_data, FC_data)
    # Each result array is sized from its own table, so a CT table with a
    # different number of columns than CSA is neither truncated nor overrun.
    results1 = [np.zeros((subsampling_times, random_num, m.shape[1], 2)) for m in modalities]
    results2 = [np.zeros((subsampling_times, random_num, m.shape[1], 2)) for m in modalities]

    # Loop-invariant index pool, built once.
    total_list = list(range(n_rows))
    total_set = set(total_list)

    for i in range(subsampling_times):
        print(i)
        random_sample = sample_step * (i + 1)
        for j in range(random_num):
            random_inds1 = rng.sample(total_list, random_sample)
            rest_total_list = list(total_set - set(random_inds1))
            random_inds2 = rng.sample(rest_total_list, random_sample)

            for inds, results in ((random_inds1, results1), (random_inds2, results2)):
                inds = np.asarray(inds)
                # Rank once per subsample; the same split is reused for all tables.
                low_idx, high_idx = _median_split_indices(variable[inds])
                low_rows, high_rows = inds[low_idx], inds[high_idx]
                for data, out in zip(modalities, results):
                    out[i, j] = _high_vs_low_ttest(data, low_rows, high_rows)

    return tuple(results1 + results2)

In [None]:
# Run the two-sample t-tests for subsample sizes from 100 up to the maximal
# sampling size, then save the six result arrays as .npy files.
results_dir = os.path.join('two_sample_ttest', 'results', t, 'random1')

# All output paths are built from the same directory so none can drift
# (the FC1 path previously lacked the separator before the file name).
CSA_file_name1 = os.path.join(results_dir, 'random_data_CSA1.npy')
CSA_file_name2 = os.path.join(results_dir, 'random_data_CSA2.npy')

CT_file_name1 = os.path.join(results_dir, 'random_data_CT1.npy')
CT_file_name2 = os.path.join(results_dir, 'random_data_CT2.npy')

FC_file_name1 = os.path.join(results_dir, 'random_data_FC1.npy')
FC_file_name2 = os.path.join(results_dir, 'random_data_FC2.npy')

# Create the output folder before the long computation, so a missing
# directory cannot cause the results to be lost at save time.
os.makedirs(results_dir, exist_ok=True)

random_data_CSA1,random_data_CT1,random_data_FC1,random_data_CSA2,random_data_CT2,random_data_FC2 = calculate_t_values(CSA_data,CT_data,FC_data,variable_data)

np.save(CSA_file_name1,random_data_CSA1)
np.save(CSA_file_name2,random_data_CSA2)

np.save(CT_file_name1,random_data_CT1)
np.save(CT_file_name2,random_data_CT2)

np.save(FC_file_name1,random_data_FC1)
np.save(FC_file_name2,random_data_FC2)