In [1]:
import warnings
warnings.filterwarnings('ignore')
from numpy import *
import numpy as np
import pandas as pd
import sys, os
import torch
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

from GCFNN import GCFNN
import torch.nn.functional as F
from numpy.random import randint

In [2]:
def concatenated_tensor(X_set, M):
    """Join every view's feature tensor and the view mask column-wise.

    Parameters
    ----------
    X_set : mapping or sequence indexed by ``0 .. n_views - 1``
        Per-view 2-D ``torch.Tensor`` objects to concatenate.
    M : numpy.ndarray or torch.Tensor
        2-D mask. Its number of columns decides how many views are read
        from ``X_set``; the mask itself is appended as the last columns.

    Returns
    -------
    torch.Tensor
        ``[X_set[0] | X_set[1] | ... | M]`` concatenated along ``dim=1``.
    """
    # as_tensor accepts both ndarrays and tensors, whereas from_numpy raises
    # a TypeError when the mask has already been converted to a tensor.
    mask = torch.as_tensor(M)

    # One feature tensor per mask column, in view order.
    pieces = [X_set[view] for view in range(mask.shape[1])]

    # torch.cat needs all inputs on one device, so follow the views.
    if pieces:
        mask = mask.to(pieces[0].device)
    pieces.append(mask)

    # Concatenate along the feature axis.
    return torch.cat(pieces, dim=1)

In [3]:
# Read the sample metadata and the two omics tables (first column is the index).
meta_data=pd.read_csv('meta.csv',index_col=0)
# Missing metabolite values are replaced by 0 before the tables are aligned.
metabolomics=pd.read_csv('metabolomics.csv',index_col=0).fillna(0)
metagenomics=pd.read_csv('metagenomics_species.csv',index_col=0)
# Encode the study group as an integer class label.
data_class={'Control':0,'ESRD':1}
meta_data['Study.Group']=meta_data['Study.Group'].map(data_class)

x1=metagenomics
x2=metabolomics
print('输入各组学数据表：',x1.shape,x2.shape,meta_data.shape)

# Rebuild the data tables on a shared sample index. The column-wise concat is
# done once; a sample missing from one table gets an all-NaN row in that view.
aligned_views = pd.concat([x1,x2],axis=1)
X1 = aligned_views.iloc[:, :x1.shape[1]]
X2 = aligned_views.iloc[:, x1.shape[1]:x1.shape[1]+x2.shape[1]]
index_new=X1.index
print(index_new)
# Despite its name this is the fraction of (sample, view) slots that are
# present, i.e. a completeness ratio; 1.0 means no view is missing.
missing_rate=(len(x1)+len(x2))/(len(index_new)*2)
print('数据完整率missing_rate :',missing_rate)

meta_data=meta_data.reindex(index_new)
# Explicit np.array (not the star-imported `array`) and an explicit float
# dtype, so NaN is representable and np.isnan below is always valid.
X1=np.array(X1.reindex(index_new), dtype=float)
X2=np.array(X2.reindex(index_new), dtype=float)
print('集合所有样本重构数据表：',X1.shape,X2.shape,meta_data.shape)
X_set=[X1,X2]
M        = len(X_set)
Mask     = np.ones((X_set[0].shape[0], M))
# Build the view-availability mask: Mask[s, m] is 0 when every feature of
# view m is NaN for sample s; those rows are then filled with 0.
# NOTE(review): rows with only some NaN entries are left untouched, and the
# metagenomics table is not fillna'd on load - confirm it contains none.
for m_idx in range(M):
    Mask[np.isnan(X_set[m_idx]).all(axis=1), m_idx] = 0
    X_set[m_idx][Mask[:, m_idx] == 0] = 0
#对宏基因组数据预处理
def log(base,x):
    """Logarithm of ``x`` in an arbitrary ``base`` (change-of-base formula).

    :param base: base of the logarithm
    :param x: scalar or array of values
    :return: ``ln(x) / ln(base)``, element-wise for array input
    """
    natural_log_x = np.log(x)
    natural_log_base = np.log(base)
    return natural_log_x / natural_log_base
# Log2-transform the metagenomics view; the small offset keeps zeros finite.
# NOTE(review): rows that were zero-filled for a missing view are transformed
# as well - confirm that this is intended.
X_set[0]=log(2,2*X_set[0]+0.00001)

# Integer class label per sample, taken from the metadata table.
labels = meta_data['Study.Group']
if labels.isna().any():
    # A NaN here means an unmapped group name or a sample without metadata;
    # left alone it would surface later as an opaque float-index IndexError.
    raise ValueError(
        'Study.Group is missing or unmapped for samples: '
        + ', '.join(map(str, labels.index[labels.isna()]))
    )
Y = labels.to_numpy().astype(int)

# One-hot encode with one column per class declared in data_class.
n_classes = len(data_class)
Y_onehot = np.zeros((Y.shape[0], n_classes))
Y_onehot[np.arange(Y.shape[0]), Y] = 1
print('标签转换为one-hot编码：', Y_onehot.shape, np.sum(Y_onehot[:, 0]))

输入各组学数据表： (287, 780) (287, 276) (287, 1)
Index(['CON-001', 'CON-002', 'CON-003', 'CON-004', 'CON-005', 'CON-006',
       'CON-007', 'CON-008', 'CON-009', 'CON-010',
       ...
       'KD-214', 'KD-215', 'KD-216', 'KD-217', 'KD-218', 'KD-219', 'KD-220',
       'KD-221', 'KD-222', 'KD-223'],
      dtype='object', name='Sample', length=287)
数据完整率missing_rate : 1.0
集合所有样本重构数据表： (287, 780) (287, 276) (287, 1)
标签转换为one-hot编码： (287, 2) 67.0


In [4]:
# Folder that receives one saved model per cross-validation fold.
folder_path = 'model_shap'
os.makedirs(folder_path, exist_ok=True)

# Stratified K-fold split; the fold count is named once so that the averages
# printed at the end cannot drift away from it.
N_SPLITS = 5
cv = StratifiedKFold(n_splits=N_SPLITS,shuffle = True,random_state = 20)
auc_cv = []
acc_cv = []
f1_cv = []

# Use the GPU only when one is actually available, so that the cell also
# runs on CPU-only machines instead of failing in .cuda().
GPU_use = torch.cuda.is_available()


def Normalize(data):
    """
    Compute the scalar statistics used to rescale one view.

    :param data: Input data
    :return: (mean, max, min) taken over all entries of data
    """
    mean = np.mean(data)
    mx = np.max(data)
    mn = np.min(data)
    return mean, mx, mn


# Hyper-parameters handed to GCFNN; they do not depend on the fold.
network_settings = {
    'dim_enc': 100,
    'num_layers_enc': 3,   #view-specific
    'dim_specificpre': 100,
    'num_layers_specificpre': 2,  #multi-view
    'dim_joint_pre': 100,
    'num_layers_jointpre': 2,
    'dropout': 0.5,
    'edge_per_node': 1,
    'ITERATION': 2000
}

for i,(train_index, test_index) in enumerate(cv.split(X_set[0],np.argmax(Y_onehot,axis=1))):
    # va_X_set is created but not filled anywhere in this cell.
    tr_X_set, te_X_set, va_X_set = {}, {}, {}

    # Labels and masks are shared by all views, so split them once per fold.
    tr_Y_onehot, te_Y_onehot = Y_onehot[train_index], Y_onehot[test_index]
    tr_M, te_M = Mask[train_index], Mask[test_index]

    for m in range(len(X_set)):
        tr_X_set[m] = X_set[m][train_index]
        te_X_set[m] = X_set[m][test_index]

        # Normalization: statistics come from the training split only and
        # are re-used for the test split.
        mean, mx, mn = Normalize(tr_X_set[m])
        value_range = mx - mn
        if value_range == 0:
            # A constant view would otherwise cause a division by zero.
            value_range = 1.0
        tr_X_set[m] = (tr_X_set[m] - mean) / value_range
        te_X_set[m] = (te_X_set[m] - mean) / value_range

    x_dim_set    = [tr_X_set[m].shape[1] for m in range(len(tr_X_set))]
    y_dim        = np.shape(tr_Y_onehot)[1]

    input_dims = {
        'x_dim_set': x_dim_set,
        'y_dim': y_dim,
        'z_dim': 50,
    }

    # Convert everything to tensors and build the flat model input
    # [view 0 | view 1 | mask].
    for m in range(len(X_set)):
        tr_X_set[m] = torch.from_numpy(tr_X_set[m])
        te_X_set[m] = torch.from_numpy(te_X_set[m])
    tr_Y_onehot = torch.from_numpy(tr_Y_onehot)
    te_Y_onehot = torch.from_numpy(te_Y_onehot)
    tr_input = concatenated_tensor(tr_X_set, tr_M)
    te_input = concatenated_tensor(te_X_set, te_M)

    model = GCFNN(input_dims, network_settings, cuda = GPU_use)
    if GPU_use:
        model = model.cuda()
        tr_input = tr_input.cuda()
        te_input = te_input.cuda()
        tr_Y_onehot = tr_Y_onehot.cuda()
        te_Y_onehot = te_Y_onehot.cuda()

    # NOTE(review): the returned names suggest best-over-training metrics
    # measured on the test fold; if so they are optimistic - confirm against
    # GCFNN.train_model.
    model_save, maxf1, maxacc, maxauc = model.train_model(train_input = tr_input, test_input = te_input, alpha = 1.0, beta = 0.01, l_rate = 0.0005, tr_Y_onehot = tr_Y_onehot, te_Y_onehot = te_Y_onehot)
    model_path = os.path.join(folder_path, 'model_'+str(i)+'.pth')
    torch.save(model_save, model_path)
    f1_cv.append(maxf1)
    acc_cv.append(maxacc)
    auc_cv.append(maxauc)

# Average over the folds that actually ran rather than a hard-coded 5.
print('f1',f1_cv)
print('average maxf1',sum(f1_cv)/len(f1_cv))
print('acc',acc_cv)
print('average maxacc',sum(acc_cv)/len(acc_cv))
print('auc',auc_cv)
print('average maxauc',sum(auc_cv)/len(auc_cv))

Train F1: 0.8878
00050: TRAIN| LT=1.551 LP=0.428 LKL=12.331 LPS=0.883 LKLS=11.708 | 
Test F1: 0.8800
Test ACC: 0.7931
Test AUC: 0.9562
Train F1: 0.9915
00100: TRAIN| LT=0.949 LP=0.106 LKL=12.963 LPS=0.570 LKLS=14.372 | 
Test F1: 0.9462
Test ACC: 0.9138
Test AUC: 0.9724
Train F1: 0.9972
00150: TRAIN| LT=0.855 LP=0.054 LKL=12.467 LPS=0.559 LKLS=11.720 | 
Test F1: 0.9462
Test ACC: 0.9138
Test AUC: 0.9756
Train F1: 0.9972
00200: TRAIN| LT=0.777 LP=0.035 LKL=12.355 LPS=0.513 LKLS=10.641 | 
Test F1: 0.9362
Test ACC: 0.8966
Test AUC: 0.9821
Train F1: 0.9857
00250: TRAIN| LT=0.731 LP=0.055 LKL=12.324 LPS=0.446 LKLS=10.716 | 
Test F1: 0.9362
Test ACC: 0.8966
Test AUC: 0.9805
Train F1: 0.9943
00300: TRAIN| LT=0.609 LP=0.016 LKL=12.602 LPS=0.361 LKLS=10.575 | 
Test F1: 0.9348
Test ACC: 0.8966
Test AUC: 0.9821
Train F1: 0.9943
00350: TRAIN| LT=0.538 LP=0.011 LKL=12.557 LPS=0.294 LKLS=10.764 | 
Test F1: 0.9247
Test ACC: 0.8793
Test AUC: 0.9805
Train F1: 0.9972
00400: TRAIN| LT=0.468 LP=0.013 LKL=12