In [None]:
import pandas as pd
import numpy as np
import os
import random
from scipy.stats import ttest_ind

In [None]:
import seaborn as sns
from matplotlib import pyplot as plt

def plot_distribution(variable, trait, xlabel='Numeric memory', vline_height=6000):
    """Plot a discrete histogram of ``variable`` with quartile markers and save it.

    The figure is written to
    ``two_sample_ttest/results/<trait>/<trait>_distribution.png`` (300 dpi);
    the directory is created if it does not exist yet. The figure is left
    open so that a notebook front-end can still render it.

    Parameters
    ----------
    variable : array-like
        Values to plot; they are placed in a one-column DataFrame named
        ``'value'``.
    trait : str
        Name used for the output sub-directory and file name.
    xlabel : str, optional
        Label of the x axis (default ``'Numeric memory'``).
    vline_height : float, optional
        Upper end (in count units) of the vertical quartile lines
        (default ``6000``).

    Returns
    -------
    None
    """
    plot_data = pd.DataFrame(data=variable, columns=['value'])

    # displot is a figure-level function: it creates its own figure, so
    # opening one with plt.figure() beforehand would only leave an empty
    # figure behind.
    grid = sns.displot(plot_data, x="value", discrete=True)
    ax = grid.ax

    ax.tick_params(direction='out', length=6, width=2, labelsize=15)
    ax.set_ylabel('Count', fontsize=18)
    ax.set_xlabel(xlabel, fontsize=18)

    # Mark the 25th (red), 50th (black) and 75th (blue) percentiles.
    q1, q2, q3 = (np.quantile(variable, q) for q in (0.25, 0.5, 0.75))
    ax.vlines((q1, q2, q3), 0, vline_height, colors=("r", "black", "b"),
              linestyles=("solid", "solid", "solid"))

    # One integer tick per value from 1 up to the maximum of the variable.
    ticks = np.arange(1, np.max(variable) + 1)
    ax.set_xticks(ticks)
    ax.set_xticklabels([str(int(tick)) for tick in ticks])

    output_dir = 'two_sample_ttest/results/' + trait
    os.makedirs(output_dir, exist_ok=True)
    output_path = output_dir + '/' + trait + '_distribution.png'
    ax.figure.savefig(output_path, dpi=300, bbox_inches='tight')

In [None]:
# Plot the variable distribution, then print the Q1/Q3 cut-offs together with
# the count and fraction of samples at or below Q1 and at or above Q3
plot_distribution(variable_data,t)

Q1, Q2, Q3 = (np.quantile(variable_data, q) for q in (0.25, 0.5, 0.75))

n_samples_total = variable_data.shape[0]
n_at_or_below_q1 = np.sum(variable_data <= Q1)
n_at_or_above_q3 = np.sum(variable_data >= Q3)
print(
    Q1,
    Q3,
    n_at_or_below_q1,
    n_at_or_below_q1/n_samples_total,
    n_at_or_above_q3,
    n_at_or_above_q3/n_samples_total,
)

In [None]:
def split_variable(variable):
    """Split sample positions into a low and a high group at the median.

    Parameters
    ----------
    variable : array-like
        Values to split on.

    Returns
    -------
    tuple of numpy.ndarray
        ``(low, high)``: positions (along the first axis) of the entries
        strictly below the median, and of the entries greater than or equal
        to the median. Entries equal to the median therefore land in
        ``high``.
    """
    values = np.asarray(variable)
    median = np.quantile(values, 0.5)
    low = np.where(values < median)[0]
    high = np.where(values >= median)[0]
    return low, high

In [None]:
# calculate subsampled two-sample t statistics between the high and low groups
# returned by split_variable (a split at the median of the variable)
def _draw_disjoint_index_pair(group_size, sample_size):
    """Draw two non-overlapping random index lists from ``range(group_size)``."""
    population = list(range(group_size))
    first = random.sample(population, sample_size)
    remaining = list(set(population) - set(first))
    second = random.sample(remaining, sample_size)
    return first, second


def _ttest_columns(high, low):
    """Return an ``(n_columns, 2)`` array of ``[t, p]`` for high vs. low, per column."""
    # axis=0 tests every column in a single call instead of a Python loop.
    t_stat, p_value = ttest_ind(high, low, axis=0)
    return np.column_stack((t_stat, p_value))


def calculate_t_values(CSA_data, CT_data, FC_data, variable,
                       random_num=100, sample_step=50):
    """Run repeated, size-graded two-sample t-tests between high and low groups.

    Rows are divided into a low and a high group with
    ``split_variable(variable)``. For each subsample size
    ``sample_step * (i + 1)`` and each of ``random_num`` repetitions, two
    disjoint random subsamples of that size are drawn from each group (with
    the ``random`` module). Every column of every data matrix is then
    compared with ``ttest_ind(high, low)``, once for the first subsample and
    once for the second.

    Parameters
    ----------
    CSA_data, CT_data, FC_data : numpy.ndarray
        2-D arrays indexed as ``data[rows, :]``; rows are selected with the
        positions returned by ``split_variable``.
    variable : array-like
        Values passed to ``split_variable`` to define the two groups.
    random_num : int, optional
        Number of random repetitions per subsample size (default 100).
    sample_step : int, optional
        Increment of the subsample size (default 50). The number of sizes
        is ``min(len(low), len(high)) // (2 * sample_step)``, so two
        disjoint subsamples always fit into the smaller group.

    Returns
    -------
    tuple of six numpy.ndarray
        ``(CSA1, CT1, FC1, CSA2, CT2, FC2)``. The suffix 1 / 2 refers to the
        first / second disjoint subsample. Each array has shape
        ``(n_sizes, random_num, n_columns, 2)``, where ``n_columns`` is the
        column count of the corresponding input; the last axis holds the t
        statistic at index 0 and the p value at index 1. If the smaller
        group has fewer than ``2 * sample_step`` rows, ``n_sizes`` is 0 and
        the arrays are empty.
    """
    low_rows, high_rows = split_variable(variable)
    n_low, n_high = len(low_rows), len(high_rows)

    modalities = (CSA_data, CT_data, FC_data)
    low_groups = [data[low_rows, :] for data in modalities]
    high_groups = [data[high_rows, :] for data in modalities]

    subsampling_times = min(n_low, n_high) // (2 * sample_step)

    first_results = [
        np.zeros((subsampling_times, random_num, data.shape[1], 2))
        for data in modalities
    ]
    second_results = [
        np.zeros((subsampling_times, random_num, data.shape[1], 2))
        for data in modalities
    ]

    for i in range(subsampling_times):
        print(i)
        sample_size = sample_step * (i + 1)
        for j in range(random_num):
            # Draw order (low pair, then high pair) is kept fixed so that a
            # seeded ``random`` state always yields the same subsamples.
            low_first, low_second = _draw_disjoint_index_pair(n_low, sample_size)
            high_first, high_second = _draw_disjoint_index_pair(n_high, sample_size)

            for m in range(len(modalities)):
                first_results[m][i, j] = _ttest_columns(
                    high_groups[m][high_first, :], low_groups[m][low_first, :])
                second_results[m][i, j] = _ttest_columns(
                    high_groups[m][high_second, :], low_groups[m][low_second, :])

    return tuple(first_results + second_results)

In [None]:
# Run the subsampled t-tests; results are unpacked as CSA/CT/FC for the first
# random subsample (suffix 1), then CSA/CT/FC for the second (suffix 2)
(
    random_data_CSA1,
    random_data_CT1,
    random_data_FC1,
    random_data_CSA2,
    random_data_CT2,
    random_data_FC2,
) = calculate_t_values(CSA_data, CT_data, FC_data, variable_data)

In [None]:
# Save the six result arrays under two_sample_ttest/results/<t>/random2/.
# The directory is created first so that a missing folder cannot make
# np.save fail after the expensive computation has finished.
output_dir = 'two_sample_ttest/results/'+t+'/random2'
os.makedirs(output_dir, exist_ok=True)

CSA_file_name1 = output_dir+'/random_data_CSA1.npy'
CSA_file_name2 = output_dir+'/random_data_CSA2.npy'

CT_file_name1 = output_dir+'/random_data_CT1.npy'
CT_file_name2 = output_dir+'/random_data_CT2.npy'

FC_file_name1 = output_dir+'/random_data_FC1.npy'
FC_file_name2 = output_dir+'/random_data_FC2.npy'

for file_name, result_array in (
    (CSA_file_name1, random_data_CSA1),
    (CSA_file_name2, random_data_CSA2),
    (CT_file_name1, random_data_CT1),
    (CT_file_name2, random_data_CT2),
    (FC_file_name1, random_data_FC1),
    (FC_file_name2, random_data_FC2),
):
    np.save(file_name, result_array)