In [None]:
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
from scipy.stats import pearsonr,spearmanr
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Load the three imaging feature matrices and the target variable for one phenotype.
t = 'Age'  # phenotype sub-directory; the saving cell further down reuses this name
data_path = '/data/sliu/sampling_ukbb_analysis/data/'
data_path2 = os.path.join(data_path, t)

# File names inside the phenotype directory.
CSA_file_name = 'CSA_with_controlling_for_total_brain.csv'
CT_file_name = 'CT_with_controlling_for_total_brain.csv'
FC_file_name = 'FC_data.csv'
variable_name = 'variable_with_controlling_for_total_brain.csv'

# Full paths, built in one pass over the file names above.
CSA_file_path, CT_file_path, FC_file_path, variable_file_path = (
    os.path.join(data_path2, file_name)
    for file_name in (CSA_file_name, CT_file_name, FC_file_name, variable_name)
)

# Each imaging table is read with its last column dropped.
CSA_data, CT_data, FC_data = (
    pd.read_csv(csv_path).iloc[:, :-1].values
    for csv_path in (CSA_file_path, CT_file_path, FC_file_path)
)
# The target is the first column of the variable table.
variable = pd.read_csv(variable_file_path).iloc[:, 0].values

In [None]:
def _subsample_sizes(n_rows):
    """Return the ladder of per-half sample sizes for a data set with ``n_rows`` rows.

    The ladder is 100, 200, ..., 1000, then 2000, 3000, ... while the size stays
    below ``n_rows / 2``; the first size that would reach half the data is
    replaced by ``int(n_rows / 2)`` and ends the ladder. At most
    ``int(n_rows / 200)`` sizes are produced, so every size is <= n_rows / 2 and
    two disjoint halves of that size always exist.
    """
    sizes = []
    for step in range(int(n_rows / 200)):
        if step < 10:
            sizes.append(int(100 * (step + 1)))
            continue
        size = 1000 + 1000 * (step - 9)
        if size < n_rows / 2:
            sizes.append(size)
        else:
            sizes.append(int(n_rows / 2))
            break
    return sizes


def compute_feature_importances(imaging_data, variable, random_num=50,
                                n_estimators=20, random_state=None):
    """Fit random forests on disjoint random halves and collect feature importances.

    For every sample size in the ladder (see ``_subsample_sizes``) and every
    repeat, two non-overlapping row subsets of that size are drawn, one
    ``RandomForestRegressor`` is fitted per subset, and both
    ``feature_importances_`` vectors are stored.

    Parameters
    ----------
    imaging_data : 2-D array, indexed as ``imaging_data[rows, :]``
        Feature matrix (rows are observations, columns are features).
    variable : 1-D array, indexed as ``variable[rows]``
        Regression target, one value per row of ``imaging_data``.
    random_num : int, default 50
        Number of random split-half repeats per sample size.
    n_estimators : int, default 20
        Number of trees in each forest.
    random_state : int or None, default None
        ``None`` keeps the previous behaviour: row sampling uses the global
        ``random`` module and the forests are unseeded. An integer makes both
        the row sampling and the forests reproducible.

    Returns
    -------
    ndarray of shape (n_sample_sizes, random_num, n_features, 2)
        ``[..., 0]`` holds the importances from the first half, ``[..., 1]``
        those from the second half.

    Raises
    ------
    ValueError
        If ``variable`` does not have one entry per row of ``imaging_data``.
    """
    n_rows, n_features = imaging_data.shape[:2]
    if len(variable) != n_rows:
        raise ValueError(
            "variable has %d entries but imaging_data has %d rows"
            % (len(variable), n_rows)
        )

    random_samples = _subsample_sizes(n_rows)
    # NOTE(review): a previously commented-out line trimmed the last four sizes
    # ("birth month"); callers needing that must slice the result themselves.
    stability = np.zeros((len(random_samples), random_num, n_features, 2))

    # A private generator is used only when a seed is given, so that an
    # unseeded call consumes the global `random` stream exactly as before.
    rng = random if random_state is None else random.Random(random_state)

    # Loop-invariant: the full row index list and its set form.
    all_rows = list(range(n_rows))
    all_rows_set = set(all_rows)

    for i, half_size in enumerate(random_samples):
        print(half_size)
        for j in range(random_num):
            first_half = rng.sample(all_rows, half_size)
            # The second half is drawn only from rows not used by the first.
            remaining = list(all_rows_set - set(first_half))
            second_half = rng.sample(remaining, half_size)

            for half, rows in enumerate((first_half, second_half)):
                # Seeds must lie in [0, 2**32 - 1] for scikit-learn.
                seed = None if random_state is None else rng.randrange(2 ** 32)
                forest = RandomForestRegressor(n_estimators=n_estimators,
                                               random_state=seed)
                forest.fit(imaging_data[rows, :], variable[rows])
                stability[i, j, :, half] = forest.feature_importances_

    return stability

In [None]:
# Calculate split-half feature importances for each modality and save them.
# The output paths are built and the output directory is created BEFORE the
# expensive computations, so a bad `t` or a missing directory fails immediately
# instead of after all three runs; each result is written as soon as it exists.
CSA_file_name = 'stable_slection/'+t+'/feature_importances_CSA.npy'
CT_file_name = 'stable_slection/'+t+'/feature_importances_CT.npy'
FC_file_name = 'stable_slection/'+t+'/feature_importances_FC.npy'
os.makedirs(os.path.dirname(CSA_file_name), exist_ok=True)

CT_selection_stability = compute_feature_importances(CT_data,variable)
np.save(CT_file_name,CT_selection_stability)

CSA_selection_stability = compute_feature_importances(CSA_data,variable)
np.save(CSA_file_name,CSA_selection_stability)

FC_selection_stability = compute_feature_importances(FC_data,variable)
np.save(FC_file_name,FC_selection_stability)

In [None]:
#calculating Jaccard index
from sklearn.metrics import confusion_matrix

def Jaccard_index(data,t=None):
    """Chance-corrected Jaccard overlap between the top features of two halves.

    For every sample size ``i`` and repeat ``j`` the ``k = int(t * n_features)``
    features with the largest importance are selected separately from
    ``data[i, j, :, 0]`` and ``data[i, j, :, 1]``. With ``a`` features shared by
    both selections the observed Jaccard index is ``S = a / (2k - a)``. The
    expected value for two independent selections of ``k`` out of ``p``
    features is ``ES = k / (2p - k)``, and the reported score is
    ``(S - ES) / (1 - ES)``: 1 for identical selections, 0 at chance level.

    Parameters
    ----------
    data : ndarray of shape (n_sample_sizes, n_repeats, n_features, 2)
        Feature importances of the two halves along the last axis.
    t : float
        Fraction of features to keep. Required; the ``None`` default exists
        only to keep the signature unchanged.

    Returns
    -------
    ndarray of shape (n_sample_sizes, 2)
        Column 0 is the mean and column 1 the standard deviation (``np.std``)
        of the corrected index over the repeats.

    Raises
    ------
    TypeError
        If ``t`` is None.
    ValueError
        If the selection would be empty or would contain every feature; the
        corrected index is undefined in both cases.
    """
    if t is None:
        raise TypeError(
            "Jaccard_index() requires `t`, the fraction of top features to compare"
        )

    n_sizes, n_repeats, n_features = data.shape[:3]
    num_feature = int(t * n_features)
    if not 0 < num_feature < n_features:
        raise ValueError(
            "t=%r selects %d of %d features; need at least 1 and fewer than all"
            % (t, num_feature, n_features)
        )

    # Expected Jaccard index of two independent size-k selections out of p.
    expected = num_feature / (2 * n_features - num_feature)

    odata1 = np.zeros((n_sizes, 2))
    for i in range(n_sizes):
        corrected = []
        for j in range(n_repeats):
            # Negating the importances turns the ascending argsort into a
            # descending ranking; the first k indices are the top features.
            top1 = np.argsort(-data[i, j, :, 0])[:num_feature]
            top2 = np.argsort(-data[i, j, :, 1])[:num_feature]

            # Count how many of the second half's top features are also in
            # the first half's selection.
            in_top1 = np.zeros(n_features, dtype=bool)
            in_top1[top1] = True
            shared = int(np.count_nonzero(in_top1[top2]))

            # Union size is k + k - shared.
            observed = shared / (2 * num_feature - shared)
            corrected.append((observed - expected) / (1 - expected))

        odata1[i, 0] = np.mean(corrected)
        odata1[i, 1] = np.std(corrected)
    return odata1

In [None]:
# Plot the Jaccard index against sample size: one figure per
# (top-feature fraction, modality) pair, one line per phenotype.
files = ['Age', 'BMI','IQ','Numeric_memory', 'Neuroticism', 'Alcohol','Birth']
# How many important features are included?:5%, 10%, 15%, 20%, 25%
ts = [0.05,0.1,0.15,0.2,0.25]
orders = [1,2,3,4,5]  # not read in this cell; kept in case another cell uses it

types = ['CSA','CT','FC']
colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#8c564b', '#9467bd', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']

# The loop variable is deliberately NOT called `t`: the module-level `t` holds
# the phenotype name used to build the save paths, and overwriting it with a
# float would break that cell when it is re-run.
for top_fraction in ts:
    for my_type in types:
        fig, ax = plt.subplots(figsize=(16,8))
        for i, f in enumerate(files):
            importance_file = 'stable_slection/'+f+'/feature_importances_'+my_type+'.npy'
            importance_data = np.load(importance_file)

            reliability = Jaccard_index(importance_data, top_fraction)

            sample_sizes = pd.read_csv('stable_slection/'+f+'/'+my_type+'_selection_stability.csv').iloc[:,0].values

            # 'Birth' uses all but the last four sample sizes — presumably its
            # importances cover four fewer sizes; TODO confirm.
            x = sample_sizes[:-4] if f == 'Birth' else sample_sizes
            y = reliability[:,0]

            # No 'k' format string: it conflicted with the explicit color.
            ax.plot(x, y, color=colors[i], label=f, linewidth=4)
            # NOTE(review): labels are set but no legend is drawn — confirm intent.

        # Disabled chance-level reference line (f_random_line / s_random_line
        # are not defined in the visible cells):
#         if my_type == 'FC':
#             plt.hlines(y=f_random_line,xmin=-500,xmax=15000,linestyles='dashed',linewidth=3,color ='black')
#         else:
#             plt.hlines(y=s_random_line,xmin=-500,xmax=15000,linestyles='dashed',linewidth=3,color ='black')

        for side in ('top', 'bottom', 'left', 'right'):
            ax.spines[side].set_linewidth(2)
        ax.set_ylabel('Jaccard index',fontsize=30)
        ax.set_title(my_type+ ": top " + str(int(100*top_fraction))+'% important features',fontsize=30)
        ax.set_ylim(-0.05,1.1)
        ax.set_xlim(-500,15000)
        ax.set_xlabel('sample size',fontsize=30)
        ax.tick_params(direction='out', length=10, width=4,labelsize=30)

        output_file = 'stable_slection/correct_Jaccard_'+my_type+'_'+str(top_fraction)+'.png'
        fig.savefig(output_file,dpi=300,bbox_inches = 'tight')