In [1]:
import pandas as pd
import numpy as np
import os
from matplotlib import pyplot as plt

In [2]:
from sklearn.metrics import confusion_matrix

def Jaccard_index(data, t=None):
    """Chance-corrected Jaccard overlap between two top-feature selections.

    For every pair ``(i, j)`` the ``int(t * n_features)`` features with the
    largest values in ``data[i, j, :, 0]`` and in ``data[i, j, :, 1]`` are
    selected.  The Jaccard index ``S`` of the two selections (shared features
    divided by features present in at least one selection) is then rescaled to
    ``(S - ES) / (1 - ES)``.

    ``ES`` is the Jaccard index of a reference 2x2 table whose cells are the
    products of the averaged marginal counts.  Because both selections always
    contain exactly ``k = int(t * n_features)`` distinct features, those
    averages are ``k`` (selected) and ``n_features - k`` (not selected), so
    ``ES`` depends only on ``k`` and ``n_features``.

    Parameters
    ----------
    data : array_like, 4-D
        Indexed as ``data[i, j, feature, c]``; only ``c = 0`` and ``c = 1``
        are read.  NOTE(review): axis 1 presumably holds bootstrap repeats and
        the last axis the two halves being compared - confirm with the code
        that writes the ``.npy`` files.
    t : float
        Fraction of features to keep.  The count is ``int(t * n_features)``,
        i.e. truncated, not rounded.  Required despite the ``None`` default,
        which is kept only so the signature stays unchanged.

    Returns
    -------
    numpy.ndarray of shape ``(data.shape[0], 2)``
        Column 0 is the mean of the corrected index over axis 1, column 1 its
        standard deviation (``np.std`` default, ``ddof=0``).

    Raises
    ------
    TypeError
        If ``t`` is ``None``.
    ValueError
        If ``data`` is not 4-D with at least two entries on the last axis, or
        if ``t`` selects no feature or every feature (the corrected index is
        undefined in both cases).
    """
    if t is None:
        raise TypeError("Jaccard_index() needs 't', the fraction of top features to compare")

    data = np.asarray(data)
    if data.ndim != 4 or data.shape[3] < 2:
        raise ValueError(
            "data must be 4-D with at least 2 entries on the last axis, got shape %s"
            % (data.shape,)
        )

    n_outer, n_inner, n_features = data.shape[:3]
    num_feature = int(t * n_features)
    if not 0 < num_feature < n_features:
        raise ValueError(
            "t=%r selects %d of %d features; need at least 1 and fewer than all"
            % (t, num_feature, n_features)
        )

    # Reference value: identical for every (i, j), so compute it once.
    selected = float(num_feature)
    unselected = float(n_features - num_feature)
    expected_both = selected ** 2
    expected_single = unselected * selected
    ES = expected_both / (expected_both + expected_single + expected_single)

    odata1 = np.zeros((n_outer, 2))
    for i in range(n_outer):
        corrected = []
        for j in range(n_inner):
            # Negate so that argsort's ascending order ranks the largest first.
            l1 = np.argsort(-data[i, j, :, 0])[:num_feature]
            l2 = np.argsort(-data[i, j, :, 1])[:num_feature]

            in_first = np.zeros(n_features, dtype=bool)
            in_first[l1] = True
            n_both = int(np.count_nonzero(in_first[l2]))

            # Each selection has num_feature members, so the union holds
            # 2 * num_feature - n_both features.
            S = n_both / (2 * num_feature - n_both)
            corrected.append((S - ES) / (1 - ES))

        odata1[i, 0] = np.mean(corrected)
        odata1[i, 1] = np.std(corrected)
    return odata1

In [1]:
# One figure per (feature fraction, data type): the mean corrected Jaccard
# index returned by Jaccard_index, drawn as one line per entry of `files`.
files = ['Age', 'BMI','IQ','NM', 'NS', 'alcohol','BM']
ts = [0.05,0.1,0.15,0.2,0.25]
orders = [1,2,3,4,5]  # not read in this cell; kept in case another cell uses it

types = ['CSA','CT','FC']
colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#8c564b', '#9467bd', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']

# savefig does not create missing directories, so make sure the target exists.
os.makedirs('figures', exist_ok=True)

for t in ts:
    for my_type in types:
        fig, ax = plt.subplots(figsize=(16,8))
        for i, f in enumerate(files):
            importance_file = 'bootstrap_feature_selection/'+f+'/feature_importances_'+my_type+'.npy'
            importance_data = np.load(importance_file)

            reliability = Jaccard_index(importance_data,t)

            # The last row of the result is left out of the plot.
            y = reliability[:-1,0]
            # x values: 100, 200, ..., 1000 for the first ten rows,
            # then 2000, 3000, ... for the rows after that.
            x = [100*n if n <= 10 else 1000+(n-10)*1000 for n in range(1, len(y)+1)]

            # Colour is given once, by keyword; a 'k' fmt string alongside it
            # only triggers a "redundantly defined" warning. The palette wraps
            # around if there are more files than colours.
            ax.plot(x, y, color=colors[i % len(colors)], label=f, linewidth=4)
        # NOTE(review): every line carries a label but no legend is drawn;
        # add ax.legend() if the figure should be self-explanatory.
        # Disabled reference line; f_random_line / s_random_line are not
        # defined in this cell.
#         if my_type == 'FC':
#             plt.hlines(y=f_random_line,xmin=-500,xmax=15000,linestyles='dashed',linewidth=3,color ='black')
#         else:
#             plt.hlines(y=s_random_line,xmin=-500,xmax=15000,linestyles='dashed',linewidth=3,color ='black')
        for side in ('top', 'bottom', 'left', 'right'):
            ax.spines[side].set_linewidth(3)
        ax.set_ylabel('Jaccard index',fontsize=30)
        # round() rather than int(): int() truncates, so float noise in
        # 100*t could print a percentage that is one too low.
        ax.set_title(my_type+ ": top " + str(round(100*t))+'% important features',fontsize=30)
        ax.set_ylim(-0.05,1.1)
        ax.set_xlim(-500,18000)
        ax.set_xlabel('sample size',fontsize=30)
        ax.tick_params(direction='out', length=10, width=4,labelsize=30)
        output_file = 'figures/feature_selection_'+my_type+'_'+str(t)+'.png'
        fig.savefig(output_file,dpi=300,bbox_inches = 'tight')
        # Render the figure now and release it, so the 15 figures are not all
        # held in memory until the cell finishes.
        plt.show()
        plt.close(fig)