In [1]:
import warnings
warnings.filterwarnings('ignore')
from numpy import *
import numpy as np
import pandas as pd
import sys, os
import torch
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

from GCFNN import GCFNN
import torch.nn.functional as F
from numpy.random import randint

In [2]:
def concatenated_tensor(X_set, M):
    """Join every view tensor and the view mask into one wide tensor.

    Parameters
    ----------
    X_set : indexable (list or dict keyed by 0..n-1) of torch.Tensor
        One tensor per view; entry ``i`` is read for every ``i`` in
        ``range(M.shape[1])``.
    M : numpy.ndarray
        Mask array; its number of columns decides how many views are read
        from ``X_set``. It is converted with ``torch.from_numpy`` and
        placed after the last view.

    Returns
    -------
    torch.Tensor
        ``[X_set[0], ..., X_set[n-1], M]`` concatenated along ``dim=1``.
    """
    view_count = M.shape[1]
    # Views first, in index order ...
    pieces = [X_set[view] for view in range(view_count)]
    # ... then the mask as the trailing columns.
    pieces.append(torch.from_numpy(M))
    return torch.cat(pieces, dim=1)

In [3]:
# Load the three input tables; in each file the first CSV column becomes the row index.
meta_data=pd.read_csv('meta.csv',index_col=0)
# Missing metabolomics values are replaced by 0 at load time; the metagenomics
# table is read as-is (no fillna).
metabolomics=pd.read_csv('metabolomics.csv',index_col=0).fillna(0)
metagenomics=pd.read_csv('metagenomics_species.csv',index_col=0)
# Binary label encoding: 'Healthy' -> 0, every other listed group -> 1.
data_class={'Healthy':0,'Stage_0':1, 'Stage_I_II':1, 'Stage_III_IV':1, 'HS':1, 'MP':1}
# NOTE(review): a group name that is not a key of data_class is mapped to NaN here.
meta_data['Study.Group']=meta_data['Study.Group'].map(data_class)

# View 1 is the metagenomics table, view 2 the metabolomics table.
x1=metagenomics
x2=metabolomics
# Print the shape of each omics table and of the metadata table.
print('输入各组学数据表：',x1.shape,x2.shape,meta_data.shape)

# Rebuild the data tables and the label table on one shared sample index.
# Column-wise concat aligns both omics tables on the union of their sample
# indices; a sample absent from one table gets an all-NaN row for that table.
# The join is computed once and then split back into the two views by column position.
merged = pd.concat([x1, x2], axis=1)
n_x1_cols = x1.shape[1]
X1 = merged.iloc[:, :n_x1_cols]
X2 = merged.iloc[:, n_x1_cols:n_x1_cols + x2.shape[1]]
index_new = X1.index
print(index_new)
# Despite its name this is the fraction of (sample, view) pairs that are
# present: 1.0 means every sample has both views.
missing_rate=(len(x1)+len(x2))/(len(index_new)*2)
print('数据完整率missing_rate :',missing_rate)

# Align the labels with the merged sample order.
meta_data=meta_data.reindex(index_new)
# Both slices already carry index_new, so no further reindex is needed.
# np.array copies, so the in-place zero fill below never touches `merged`.
X1 = np.array(X1)
X2 = np.array(X2)
print('集合所有样本重构数据表：',X1.shape,X2.shape,meta_data.shape)
X_set=[X1,X2]
M        = len(X_set)
Mask     = np.ones([np.shape(X_set[0])[0], M])
# Build the view-availability mask: Mask[s, m] == 0 when sample s has no data
# at all (an all-NaN row) for view m; such rows are filled with 0.
for m_idx in range(M):
    missing_rows = np.isnan(X_set[m_idx]).all(axis=1)
    Mask[missing_rows, m_idx] = 0
    X_set[m_idx][missing_rows] = 0
    
# Preprocess the metagenomics data
def log(base, x):
    """Return the logarithm of ``x`` in the given ``base``.

    Uses the change-of-base identity ``ln(x) / ln(base)``; ``x`` may be a
    scalar or a NumPy array (evaluated element-wise by ``np.log``).
    """
    ln_x = np.log(x)
    ln_base = np.log(base)
    return ln_x / ln_base
# Log2-transform the first view: log2(2*x + 1e-5); the small offset keeps
# zero entries finite.
# NOTE(review): rows zero-filled for a missing view are transformed as well
# and become log2(1e-5) (about -16.6) rather than 0 - confirm this is intended.
X_set[0]=log(2,2*X_set[0]+0.00001)

# Build the integer label vector. A sample without a metadata row, or with a
# group name that was not mapped to a class, has a NaN label; fail early with
# a clear message instead of an obscure indexing error further down.
labels = meta_data['Study.Group']
unlabeled = labels.index[labels.isna()].tolist()
if unlabeled:
    raise ValueError(
        'Study.Group is missing or unmapped for {} sample(s), e.g. {}'.format(
            len(unlabeled), unlabeled[:5]))
Y = labels.to_numpy().astype(int)
# One-hot encode the two classes: column 0 = class 0, column 1 = class 1.
Y_onehot = np.zeros((Y.shape[0], 2))
Y_onehot[np.arange(Y.shape[0]), Y] = 1
# The last printed value is the number of class-0 samples.
print('标签转换为one-hot编码：', Y_onehot.shape, np.sum(Y_onehot[:, 0]))

输入各组学数据表： (347, 851) (347, 450) (347, 1)
Int64Index([10021, 10023, 10025, 10029, 10031, 10033, 10034, 10036, 10037,
            10038,
            ...
            10818, 10825, 10827, 10829, 10835, 10838, 10839, 10847, 10850,
            10853],
           dtype='int64', name='Sample', length=347)
数据完整率missing_rate : 1.0
集合所有样本重构数据表： (347, 851) (347, 450) (347, 1)
标签转换为one-hot编码： (347, 2) 127.0


In [4]:
# Folder in which the model returned for each fold is saved.
folder_path = 'model_shap'
os.makedirs(folder_path, exist_ok=True)

# Seed the global NumPy and PyTorch generators so repeated runs start from
# the same random state.
# NOTE(review): GCFNN internals are not visible here; this only helps as far
# as the model draws from these generators.
SEED = 50
np.random.seed(SEED)
torch.manual_seed(SEED)


def Normalize(data):
    """Compute the statistics used to scale one view.

    :param data: input array (all entries are pooled, no per-feature axis)
    :return: tuple ``(mean, max, min)`` taken over every entry of ``data``
    """
    mean = np.mean(data)
    mx = np.max(data)
    mn = np.min(data)
    return mean, mx, mn


cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
auc_cv = []
acc_cv = []
f1_cv = []
n_views = len(X_set)
# Stratify on the integer class recovered from the one-hot labels.
for i, (train_index, test_index) in enumerate(cv.split(X_set[0], np.argmax(Y_onehot, axis=1))):
    # Labels and masks do not depend on the view, so slice them once per fold.
    tr_Y_onehot, te_Y_onehot = Y_onehot[train_index], Y_onehot[test_index]
    tr_M, te_M = Mask[train_index], Mask[test_index]
    tr_X_set = {m: X_set[m][train_index] for m in range(n_views)}
    te_X_set = {m: X_set[m][test_index] for m in range(n_views)}

    # Scale each view with statistics from the training split only, and apply
    # the same shift/scale to the test split.
    for m in range(n_views):
        mean, mx, mn = Normalize(tr_X_set[m])
        scale = mx - mn
        if scale == 0:
            # A constant view would otherwise divide by zero and produce NaN.
            scale = 1.0
        tr_X_set[m] = (tr_X_set[m] - mean) / scale
        te_X_set[m] = (te_X_set[m] - mean) / scale

    x_dim_set    = [tr_X_set[m].shape[1] for m in range(len(tr_X_set))]
    y_dim        = np.shape(tr_Y_onehot)[1]

    # The settings are rebuilt for every fold so that each model gets its own
    # fresh dictionaries.
    input_dims = {
        'x_dim_set': x_dim_set,
        'y_dim': y_dim,
        'z_dim': 50,
        #'steps_per_batch': steps_per_batch
    }
    network_settings = {
        'dim_enc': 100,
        'num_layers_enc': 3,   #view-specific
        'dim_specificpre': 100,
        'num_layers_specificpre': 2,  #multi-view
        'dim_joint_pre': 100,
        'num_layers_jointpre': 2,
        'dropout': 0.5,
        'edge_per_node': 4,
        'ITERATION': 2000
    }

    # NumPy -> torch, then pack all views plus the mask into one input tensor.
    for m in range(n_views):
        tr_X_set[m] = torch.from_numpy(tr_X_set[m])
        te_X_set[m] = torch.from_numpy(te_X_set[m])
    tr_Y_onehot = torch.from_numpy(tr_Y_onehot)
    te_Y_onehot = torch.from_numpy(te_Y_onehot)
    tr_input = concatenated_tensor(tr_X_set, tr_M)
    te_input = concatenated_tensor(te_X_set, te_M)

    # Use the GPU when one is available instead of assuming it, so the cell
    # also runs on CPU-only machines.
    # NOTE(review): assumes GCFNN accepts cuda=False - confirm.
    GPU_use = torch.cuda.is_available()
    model = GCFNN(input_dims, network_settings, cuda = GPU_use)
    if GPU_use:
        model = model.cuda()
        tr_input = tr_input.cuda()
        te_input = te_input.cuda()
        tr_Y_onehot = tr_Y_onehot.cuda()
        te_Y_onehot = te_Y_onehot.cuda()
    # NOTE(review): the returned metrics are presumably the best values seen
    # on the test fold during training; if so they are optimistic - confirm.
    model_save, maxf1, maxacc, maxauc = model.train_model(train_input = tr_input, test_input = te_input, alpha = 1.0, beta = 0.01, l_rate = 0.0005, tr_Y_onehot = tr_Y_onehot, te_Y_onehot = te_Y_onehot)
    model_path = os.path.join(folder_path, 'model_'+str(i)+'.pth')
    torch.save(model_save, model_path)
    f1_cv.append(maxf1)
    acc_cv.append(maxacc)
    auc_cv.append(maxauc)
# Per-fold metrics followed by mean and standard deviation across folds.
print('f1', f1_cv)
print('average f1: {}±{}'.format(np.mean(f1_cv), np.std(f1_cv)))
print('acc', acc_cv)
print('average acc: {}±{}'.format(np.mean(acc_cv), np.std(acc_cv)))
print('auc', auc_cv)
print('average auc: {}±{}'.format(np.mean(auc_cv), np.std(auc_cv)))

Train F1: 0.7664
00050: TRAIN| LT=2.114 LP=0.670 LKL=10.428 LPS=1.326 LKLS=1.410 | 
Test F1: 0.7719
Test ACC: 0.6286
Test AUC: 0.6346
Train F1: 0.8030
00100: TRAIN| LT=1.883 LP=0.570 LKL=10.985 LPS=1.139 LKLS=6.340 | 
Test F1: 0.6818
Test ACC: 0.6000
Test AUC: 0.6617
Train F1: 0.8378
00150: TRAIN| LT=1.637 LP=0.412 LKL=11.443 LPS=1.023 LKLS=8.780 | 
Test F1: 0.6098
Test ACC: 0.5429
Test AUC: 0.6914
Train F1: 0.8515
00200: TRAIN| LT=1.622 LP=0.382 LKL=11.844 LPS=1.028 LKLS=9.408 | 
Test F1: 0.7253
Test ACC: 0.6429
Test AUC: 0.7264
Train F1: 0.8678
00250: TRAIN| LT=1.525 LP=0.322 LKL=12.090 LPS=0.982 LKLS=10.008 | 
Test F1: 0.7059
Test ACC: 0.6429
Test AUC: 0.7439
Train F1: 0.8901
00300: TRAIN| LT=1.503 LP=0.335 LKL=12.093 LPS=0.952 LKLS=9.511 | 
Test F1: 0.7579
Test ACC: 0.6714
Test AUC: 0.7649
Train F1: 0.8934
00350: TRAIN| LT=1.392 LP=0.262 LKL=12.274 LPS=0.909 LKLS=9.765 | 
Test F1: 0.7416
Test ACC: 0.6714
Test AUC: 0.7675
Train F1: 0.9192
00400: TRAIN| LT=1.383 LP=0.244 LKL=12.520 L