In [1]:
import pandas as pd
from tqdm import tqdm
import numpy as np
import random
import os
import time#可以用来简单地记录时间
import matplotlib.pyplot as plt#画图
from sklearn.metrics import roc_auc_score, precision_recall_curve, roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.metrics import  average_precision_score

import torch#深度学习的pytoch平台
from torch import Tensor
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, Dataset, DataLoader, RandomSampler, SequentialSampler
from torch.nn.utils.rnn import pad_sequence

In [2]:
# from models.lstm import LSTMModel
# from trainer.train_and_evaluate import ModelTrainer, ModelEvaluator
# from data_provider.dataset_generate import TimeSeriesDataset
from models.lstm_itransformer import lstm_itransformerModel

## 加载数据

In [3]:
# Previous naming of the centres, kept for reference:
# database_names_all = ['ams', 'eicu', 'inspire', 'mimiciii', 'mimiciv', 'salz', 'zhejiang']
database_names_all = ['ams', 'eicu', 'mimic','salz', 'inspire', 'zhejiang']

# Position (within database_names_all) of the centre used for training/internal testing.
selected_database_num = 2
internal_database = database_names_all[selected_database_num]

# Every other centre is treated as an external validation cohort.
external_database = [
    name for name in database_names_all if name != internal_database
]
print(internal_database, external_database)

mimic ['ams', 'eicu', 'salz', 'inspire', 'zhejiang']


In [4]:
# Directory holding the per-centre CSV files. The original machine-specific
# location stays the default; set the ICU_DATA_DIR environment variable to run
# the notebook elsewhere without editing this cell.
_DEFAULT_DATA_DIR = "E:\\Research\\Time series research\\Federated learning Time series research\\0.data\\Multi-center time series data\\"
file_path = os.environ.get("ICU_DATA_DIR", _DEFAULT_DATA_DIR)
# Later cells build paths with `file_path + <file name>`, so a trailing
# separator is required.
if not file_path.endswith(("\\", "/")):
    file_path += os.sep
internal_data_path = 'icu_mortality_' + internal_database + '.csv'

In [5]:
# Load the internal centre's long-format table (the displayed preview shows
# repeated rows per `id`).
df_internal = pd.read_csv(file_path + internal_data_path)
# Bare expression: shows the notebook's preview of the frame.
df_internal

Unnamed: 0,id,gender,age,height,weight,bmi,admission_type,death_hosp,los_icu_day,ethnicity,...,oasis,sapsii,respiration,coagulation,liver,cardiovascular,cns,renal,sofa,mods
0,5,0,1.017520,-0.021828,-0.79298,-0.998913,1,0,1.73,other,...,33.0,35.0,0,1,0,1,4,4,10.0,0
1,5,0,1.017520,-0.021828,-0.79298,-0.998913,1,0,1.73,other,...,33.0,37.0,0,1,0,1,4,4,10.0,0
2,5,0,1.017520,-0.021828,-0.79298,-0.998913,1,0,1.73,other,...,33.0,37.0,0,1,0,1,4,4,10.0,0
3,5,0,1.017520,-0.021828,-0.79298,-0.998913,1,0,1.73,other,...,35.0,37.0,0,1,0,1,4,4,10.0,0
4,5,0,1.017520,-0.021828,-0.79298,-0.998913,1,0,1.73,other,...,33.0,35.0,0,1,0,1,4,4,10.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3918410,76271,1,-0.637632,0.583969,-0.54191,-0.931153,0,0,1.17,white,...,6.0,21.0,2,0,0,1,1,0,4.0,0
3918411,76271,1,-0.637632,0.583969,-0.54191,-0.931153,0,0,1.17,white,...,6.0,21.0,0,0,0,0,0,0,,0
3918412,76271,1,-0.637632,0.583969,-0.54191,-0.931153,0,0,1.17,white,...,6.0,21.0,0,0,0,0,0,0,,0
3918413,76271,1,-0.637632,0.583969,-0.54191,-0.931153,0,0,1.17,white,...,6.0,21.0,0,0,0,0,0,0,,0


In [6]:
# Patient-level outcome counts: take the last `death_hosp` value of each `id`.
print(df_internal.groupby('id')['death_hosp'].last().value_counts())

death_hosp
0    35436
1     5614
Name: count, dtype: int64


In [7]:
# One label per patient (last recorded `death_hosp`). Taking both the ids and
# the stratification labels from the same Series guarantees they are aligned;
# `df['id'].unique()` is in order of appearance whereas `groupby` sorts by id,
# so the two only line up when the frame happens to be sorted by id.
patient_labels = df_internal.groupby('id')['death_hosp'].last()

train_ids, test_ids = train_test_split(
    patient_labels.index.to_numpy(),     # split by patient id
    test_size=0.2,                       # share of patients held out for testing
    random_state=42,                     # fixed seed for a reproducible split
    stratify=patient_labels.to_numpy(),  # keep the outcome ratio in both parts
)
train_ids

array([40804, 52798, 37874, ..., 24565, 47758, 43247], dtype=int64)

In [8]:
# 获取划分后的DataFrame
train_df = df_internal[df_internal['id'].isin(train_ids)]
test_df = df_internal[df_internal['id'].isin(test_ids)]

print("训练集比例:")
print(train_df.groupby('id')['death_hosp'].last().value_counts(normalize=True))
print("\n测试集比例:")
print(test_df.groupby('id')['death_hosp'].last().value_counts(normalize=True))

训练集比例:
death_hosp
0    0.863246
1    0.136754
Name: proportion, dtype: float64

测试集比例:
death_hosp
0    0.863216
1    0.136784
Name: proportion, dtype: float64


In [9]:
# Absolute patient-level outcome counts for the training and test parts.
print("训练集:")
print(train_df.groupby('id')['death_hosp'].last().value_counts())
print("\n测试集:")
print(test_df.groupby('id')['death_hosp'].last().value_counts())

训练集:
death_hosp
0    28349
1     4491
Name: count, dtype: int64

测试集:
death_hosp
0    7087
1    1123
Name: count, dtype: int64


## 数据生成器

In [10]:
class TimeSeriesDataset(Dataset):
    """Windowed per-patient samples drawn from a long-format DataFrame.

    The frame is grouped by its ``id`` column and ordered by its ``hr`` column.
    Every sample is described by a tuple ``(pid, start, end, y)`` stored in
    ``self.indices``; the feature rows are sliced lazily in ``__getitem__``.

    Window conventions:
        * ``'sliding'`` and ``'fix'``: the input window is ``start <= hr < end``
          (exactly ``window_size`` rows); labels look at hours from ``end`` on.
        * ``'cumulative'``: the input window is ``1 <= hr <= end``; labels look
          at ``end <= hr <= end + forecast_horizon``.
    """

    _MODES = ('sliding', 'cumulative', 'fix')

    def __init__(self, df, feature_cols, window_size=24, forecast_horizon=24, stride=1,
                 mode='sliding', shuffle=True, label=['death_hosp'], random_state=42, max_len=None):
        """
        Build the sample index for ``df``.

        Args:
            df: DataFrame holding all rows; needs ``id``, ``hr``, the feature
                columns and the label column(s).
            feature_cols: list of column names used as model input.
            window_size: input length (rows) for 'sliding' and 'fix' modes.
            forecast_horizon: number of hours after the window inspected for a
                positive label ('sliding' and 'cumulative' modes).
            stride: step between consecutive window positions.
            mode: 'sliding', 'cumulative' or 'fix'.
            shuffle: shuffle the sample index once after it is built.
            label: label column name, or list of names (one target per name).
            random_state: seed used for the shuffle.
            max_len: if given, keep only the last ``max_len`` rows of a sample.

        Raises:
            ValueError: if ``mode`` is not one of the supported modes.
        """
        if mode not in self._MODES:
            raise ValueError(f"mode must be one of {self._MODES}, got {mode!r}")

        self.df = df
        self.feature_cols = feature_cols
        self.window_size = window_size
        self.forecast_horizon = forecast_horizon
        self.mode = mode
        self.shuffle = shuffle
        self.random_state = random_state
        self.max_len = max_len
        self.indices = []
        self.stride = stride
        # Accept a bare column name as well as a list, and copy so the shared
        # default list object is never aliased by an instance.
        self.label = [label] if isinstance(label, str) else list(label)
        # pid -> positional row numbers in self.df, filled by _precompute_indices.
        self._row_positions = {}

        # Enumerate every valid sample once, up front.
        self._precompute_indices()

    def _labels_in(self, group, in_horizon):
        """Return one 0/1 flag per label column: any row in ``in_horizon`` equal to 1."""
        return [int((in_horizon & (group[label] == 1)).any()) for label in self.label]

    def _precompute_indices(self):
        """Fill ``self.indices`` with ``(pid, start, end, y)`` for every valid sample."""
        # A private generator gives the same order as seeding the global
        # `random` module would, without resetting global RNG state.
        rng = random.Random(self.random_state)

        grouped = self.df.groupby('id')
        # Remember where each patient's rows live so __getitem__ does not have
        # to scan the whole frame for every sample.
        self._row_positions = grouped.indices

        for pid, group in tqdm(grouped):
            group = group.sort_values('hr')
            hr = group['hr']
            max_hr = hr.max()

            if self.mode == 'sliding':
                for start in range(1, max_hr - self.window_size, self.stride):
                    end = start + self.window_size
                    forecast_end = end + self.forecast_horizon
                    # Keep only windows with no missing hour.
                    if ((hr >= start) & (hr < end)).sum() == self.window_size:
                        y = self._labels_in(group, (hr >= end) & (hr < forecast_end))
                        self.indices.append((pid, start, end, y))

            elif self.mode == 'cumulative':
                for end in range(1, max_hr + 1, self.stride):
                    start = 1
                    forecast_end = end + self.forecast_horizon
                    y = self._labels_in(group, (hr >= end) & (hr <= forecast_end))
                    self.indices.append((pid, start, end, y))

            elif self.mode == 'fix':
                start = 1
                end = start + self.window_size
                if ((hr >= start) & (hr < end)).sum() == self.window_size:
                    # Any positive label after the fixed window counts.
                    y = self._labels_in(group, hr >= end)
                    self.indices.append((pid, start, end, y))

        if self.shuffle:
            rng.shuffle(self.indices)

    def __len__(self):
        """Number of precomputed samples."""
        return len(self.indices)

    def __getitem__(self, idx):
        """Return ``(X, y, seq_len)``: float feature tensor, float label tensor, row count."""
        pid, start, end, y = self.indices[idx]
        group = self.df.iloc[self._row_positions[pid]].sort_values('hr')
        hr = group['hr']

        # The upper bound must match how the sample was defined: cumulative
        # samples include hour `end`; sliding/fix windows stop before it
        # (hour `end` already belongs to the label horizon).
        if self.mode == 'cumulative':
            in_window = (hr >= start) & (hr <= end)
        else:
            in_window = (hr >= start) & (hr < end)

        X = group.loc[in_window, self.feature_cols].values

        X_tensor = torch.FloatTensor(X)
        y_tensor = torch.FloatTensor(y)

        if self.max_len is not None:
            X_tensor = X_tensor[-self.max_len:]  # keep the most recent rows only

        seq_len = len(X_tensor)

        return X_tensor, y_tensor, seq_len

In [11]:
# One "<variable>_mask" column name per time-varying variable.
mask_cols = [
    f"{variable}_mask"
    for variable in (
        'heart_rate', 'sbp', 'mbp', 'resp_rate', 'temperature', 'spo2',
        'albumin', 'aniongap', 'bun', 'calcium', 'chloride',
        'creatinine', 'glucose', 'sodium', 'potassium',
        'fibrinogen', 'inr', 'pt', 'ptt',
        'hematocrit', 'hemoglobin', 'platelet', 'wbc',
        'alt', 'ast', 'bilirubin',
        'pao2', 'paco2', 'fio2', 'pao2fio2ratio', 'ph', 'baseexcess', 'lactate', 'sao2',
        'troponin', 'magnesium', 'bnp', 'neutrophils', 'gcs', 'alkaline_phosphatase',
        'norepinephrine', 'epinephrine', 'dobutamine', 'dopamine', 'ventilation',
        'lymphocytes', 'bicarbonate', 'urineoutput',
    )
]
# mask_cols = mask_cols + ['hr_encode']

In [12]:
# 定义特征列
# Model input columns: five static columns, the time-varying variables (same
# names that mask_cols is derived from) and 'hr_encode'.
feature_cols = ['gender', 'age', 'height', 'weight', 'bmi',
                'heart_rate', 'sbp', 'mbp',  'resp_rate', 'temperature', 'spo2', 'albumin', 'aniongap', 'bun', 'calcium', 'chloride', 
                'creatinine', 'glucose', 'sodium', 'potassium', 'fibrinogen', 'inr', 'pt', 'ptt', 'hematocrit', 'hemoglobin', 'platelet', 'wbc', 
                'alt', 'ast', 'bilirubin', 'pao2', 'paco2', 'fio2', 'pao2fio2ratio', 'ph', 'baseexcess', 'lactate', 'sao2', 'troponin', 'magnesium', 
                'bnp', 'neutrophils', 'gcs', 'alkaline_phosphatase', 'norepinephrine', 'epinephrine', 'dobutamine', 'dopamine', 'ventilation',
                'lymphocytes', 'bicarbonate', 'urineoutput', 'hr_encode',
               ] 
# Optional: also feed the missingness-mask columns to the model.
# feature_cols = feature_cols + mask_cols    

# Alternative sliding-window datasets (disabled):
# train_dataset = TimeSeriesDataset(train_df, feature_cols, window_size=24, forecast_horizon=24, mode='sliding', shuffle=True)
# test_dataset = TimeSeriesDataset(test_df, feature_cols, window_size=24, forecast_horizon=24, mode='sliding', shuffle=False)

# Cumulative-window datasets: one sample every 4 hours per patient, with the
# default forecast_horizon of 24. Order is left unshuffled here; the training
# DataLoader does its own shuffling.
train_dataset = TimeSeriesDataset(train_df, feature_cols, mode='cumulative', shuffle=False, stride=4, label=['death_hosp'])
test_dataset = TimeSeriesDataset(test_df, feature_cols, mode='cumulative', shuffle=False, stride=4, label=['death_hosp'])

100%|███████████████████████████████████████████████████████████████████████████| 32840/32840 [01:08<00:00, 476.85it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 8210/8210 [00:17<00:00, 466.64it/s]


In [13]:
# 提取标签列
# Sample-level label distribution of the training dataset: take the first
# (only) target of every precomputed sample.
ys = np.array([y[0] for (pid, start, end, y) in train_dataset.indices])
unique, counts = np.unique(ys, return_counts=True)
label_dist = dict(zip(unique, counts))

print("\n使用numpy统计:")
for label, count in label_dist.items():
    # Count and percentage of samples carrying this label value.
    print(f"{label}: {count} 个 ({(count/len(ys))*100:.1f}%)")


使用numpy统计:
0: 764577 个 (96.5%)
1: 27870 个 (3.5%)


In [14]:
# 提取标签列
# Sample-level label distribution of the test dataset (same computation as for
# the training dataset; note that `ys` and `label_dist` are overwritten).
ys = np.array([y[0] for (pid, start, end, y) in test_dataset.indices])
unique, counts = np.unique(ys, return_counts=True)
label_dist = dict(zip(unique, counts))

print("\n使用numpy统计:")
for label, count in label_dist.items():
    # Count and percentage of samples carrying this label value.
    print(f"{label}: {count} 个 ({(count/len(ys))*100:.1f}%)")


使用numpy统计:
0: 194489 个 (96.5%)
1: 6973 个 (3.5%)


In [15]:
# Peek at the first ten (pid, start, end, y) sample descriptors.
train_dataset.indices[:10]

[(5, 1, 1, [0]),
 (5, 1, 5, [0]),
 (5, 1, 9, [0]),
 (5, 1, 13, [0]),
 (5, 1, 17, [0]),
 (5, 1, 21, [0]),
 (5, 1, 25, [0]),
 (5, 1, 29, [0]),
 (5, 1, 33, [0]),
 (5, 1, 37, [0])]

In [16]:
def collate_fn(batch, padding_strategy='last_value'):
    """
    Collate variable-length sequences into one padded batch.

    Args:
        batch: iterable of ``(sequence, target, length)`` where ``sequence`` is
            a ``(length, feature_dim)`` tensor and ``target`` a tensor.
        padding_strategy: how positions after a sequence's end are filled:
            'last_value' (repeat the final row), 'zero', 'mean' (per-feature
            mean of the sequence) or 'repeat' (cycle the sequence).

    Returns:
        ``(sequences_padded, targets, padding_mask)`` sorted by decreasing
        length: a ``(batch, max_len, feature_dim)`` float tensor, the stacked
        targets, and a ``(batch, max_len, 1)`` float mask that is 1 on real
        steps and 0 on padding.

    Raises:
        ValueError: if ``padding_strategy`` is not one of the four strategies.
    """
    strategies = ('last_value', 'zero', 'mean', 'repeat')
    if padding_strategy not in strategies:
        raise ValueError(
            f"padding_strategy must be one of {strategies}, got {padding_strategy!r}"
        )

    sequences, targets, lengths = zip(*batch)

    # Longest sequence first.
    lengths = torch.tensor(lengths)
    lengths, sort_idx = lengths.sort(descending=True)
    order = sort_idx.tolist()  # plain ints for indexing the Python tuples
    sequences = [sequences[i] for i in order]
    targets = torch.stack([targets[i] for i in order])

    max_len = int(lengths[0])
    batch_size = len(sequences)
    feature_dim = sequences[0].shape[-1]

    sequences_padded = torch.zeros(batch_size, max_len, feature_dim)

    for i, (seq, seq_len) in enumerate(zip(sequences, lengths.tolist())):
        sequences_padded[i, :seq_len] = seq

        pad_len = max_len - seq_len
        # Nothing to fill, or nothing to fill *from*: an empty sequence has no
        # last row / mean / content to repeat, so its padding stays zero.
        if pad_len == 0 or seq_len == 0 or padding_strategy == 'zero':
            continue

        if padding_strategy == 'last_value':
            sequences_padded[i, seq_len:] = seq[-1]
        elif padding_strategy == 'mean':
            sequences_padded[i, seq_len:] = seq.mean(dim=0)
        else:  # 'repeat': cycle the sequence until the padding is filled
            repeat_times = -(-pad_len // seq_len)  # ceil division
            sequences_padded[i, seq_len:] = seq.repeat(repeat_times, 1)[:pad_len]

    # 1.0 on real time steps, 0.0 on padded ones.
    padding_mask = torch.arange(max_len).expand(batch_size, max_len) < lengths.unsqueeze(1)
    padding_mask = padding_mask.float().unsqueeze(-1)

    return sequences_padded, targets, padding_mask

In [17]:
# Batches of 64 padded sequences; only the training loader shuffles.
# collate_fn is used with its default padding strategy ('last_value').
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=0, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=0, collate_fn=collate_fn)

In [18]:
# Number of training samples (windows, not patients).
len(train_dataset)

792447

In [19]:
# for x, y, padding_mask in train_loader:
#     print(x.shape)
#     print(y)
#     break

## 检查设备

In [20]:
# Pick the compute device: GPU when available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
    # The original message mixed '%d' with str.format and printed a literal '%d'.
    print('There are {} GPU(s) available.'.format(torch.cuda.device_count()))
    print('We will use the GPU: {}'.format(torch.cuda.get_device_name(0)))

# Without a GPU, everything runs on the CPU instead.
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

torch.cuda.empty_cache()

# Seed every random number generator in use to make runs reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

There are %d GPU(s) available. 1
We will use the GPU: NVIDIA GeForce RTX 2080


## 定义模型

In [21]:
import torch
import torch.nn as nn


class LSTMModel(nn.Module):
    """Stacked LSTM followed by dropout and a linear head.

    The head reads the LSTM output at each sequence's last non-padded step.
    With ``num_class == 1`` the output is passed through a sigmoid; otherwise
    the linear layer's output is returned unchanged.
    """

    def __init__(self, input_size, hidden_size=64, num_layers=2, dropout=0.3, num_class=1):
        """
        Args:
            input_size: number of input features per time step.
            hidden_size: LSTM hidden state size.
            num_layers: number of stacked LSTM layers.
            dropout: dropout rate, used before the linear head and (only when
                ``num_layers > 1``) passed to the LSTM itself.
            num_class: number of outputs of the linear head.
        """
        super().__init__()
        self.num_class = num_class

        # Dropout is only handed to nn.LSTM for stacked configurations.
        lstm_dropout = dropout if num_layers > 1 else 0
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=lstm_dropout,
        )
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_size, self.num_class)

    def forward(self, x, padding_mask):
        """Run the network on a padded batch.

        Args:
            x: ``(batch, steps, features)`` input (the LSTM is batch-first).
            padding_mask: mask whose trailing singleton dimension is squeezed
                and then summed over dim 1 to obtain each sequence's length.

        Returns:
            ``(batch, num_class)`` tensor.
        """
        hidden_seq = self.lstm(x)[0]

        # Number of real (unpadded) steps per sequence.
        valid_steps = padding_mask.squeeze(-1).sum(dim=1).long()
        rows = torch.arange(x.size(0))
        # Output at the final real step of every sequence: (batch, hidden_size).
        final_hidden = hidden_seq[rows, valid_steps - 1, :]

        scores = self.fc(self.dropout(final_hidden))

        return torch.sigmoid(scores) if self.num_class == 1 else scores

In [22]:
# 初始化模型
# Two-output LSTM over the selected feature columns.
model = LSTMModel(input_size=len(feature_cols), hidden_size=128, num_layers=2, dropout=0.1, num_class=2)
# model = lstm_itransformerModel(seq_len=24, d_model=64, d_ff=64, e_layers=3, enc_in=len(feature_cols), ,num_class=2)

# Restore previously trained weights. map_location remaps the stored tensors to
# the device chosen earlier, so a checkpoint saved on a GPU also loads on a
# CPU-only machine.
model_path = './weights_lstm_cum_24_2.pth'
model.load_state_dict(torch.load(model_path, map_location=device))
model.to(device)

LSTMModel(
  (lstm): LSTM(54, 128, num_layers=2, batch_first=True, dropout=0.1)
  (dropout): Dropout(p=0.1, inplace=False)
  (fc): Linear(in_features=128, out_features=2, bias=True)
)

## 训练模型

In [23]:
from tqdm import tqdm
import numpy as np

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

from sklearn.metrics import roc_auc_score, precision_recall_curve, roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.metrics import  average_precision_score

# 训练与评估框架
class ModelTrainer:
    """Training / evaluation loop for models called as ``model(X, padding_mask)``."""

    def __init__(self, model, train_loader, val_loader, criterion, optimizer,device='cuda'):
        """
        Args:
            model: model instance; moved to ``device``.
            train_loader: loader yielding ``(X, y, padding_mask)`` training batches.
            val_loader: loader used for validation after every epoch.
            criterion: loss called as ``criterion(outputs, class_index_targets)``.
            optimizer: optimizer over the model's parameters.
            device: device used for the model and the batches.
        """
        self.model = model.to(device)
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.device = device
        self.criterion = criterion
        self.optimizer = optimizer
        self.best_auc = 0
        self.best_model = None

    def _compute_loss(self, outputs, y):
        """Loss for one batch; shared by training and validation.

        With a single target column the criterion is applied directly. With
        several columns the per-target losses are summed.
        NOTE(review): the multi-target branch indexes ``outputs[i]`` and so
        presumably expects one output per target - confirm against the model.
        """
        if y.shape[1] == 1:
            return self.criterion(outputs, y.long().squeeze(-1))

        loss = torch.tensor(0.0, device=self.device)
        for i in range(len(outputs)):
            loss = loss + self.criterion(outputs[i], y[:, i].long().squeeze(-1))
        return loss

    def train_epoch(self, epoch):
        """Train for one epoch and return the sample-weighted mean loss."""
        self.model.train()
        total_loss = 0
        progress_bar = tqdm(self.train_loader, desc=f"Epoch {epoch + 1} [Train]")

        for X, y, padding_mask in progress_bar:
            X, y = X.to(self.device), y.to(self.device)
            padding_mask = padding_mask.to(self.device)

            # Forward pass.
            outputs = self.model(X,padding_mask)
            loss = self._compute_loss(outputs, y)

            # Backward pass with gradient-norm clipping.
            self.optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=4.0)
            self.optimizer.step()

            total_loss += loss.item() * X.size(0)
            progress_bar.set_postfix(loss=loss.item())

        return total_loss / len(self.train_loader.dataset)

    def validate(self, testloader):
        """Evaluate the model on ``testloader``.

        Returns:
            ``(loss, auroc, auprc, all_preds, all_labels)`` where ``all_preds``
            is the array of model outputs and ``all_labels`` the list of
            targets. AUROC/AUPRC are computed from output column 1.

        Side effect: when the AUROC exceeds ``self.best_auc`` the current
        weights are snapshotted into ``self.best_model`` - for whichever loader
        was passed in.
        """
        import copy

        self.model.eval()
        val_loss = 0
        all_preds = []
        all_labels = []

        with torch.no_grad():
            for X, y, padding_mask in tqdm(testloader, desc="Validating"):
                X, y = X.to(self.device), y.to(self.device)
                padding_mask = padding_mask.to(self.device)
                outputs = self.model(X,padding_mask)
                loss = self._compute_loss(outputs, y)

                val_loss += loss.item() * X.size(0)
                all_preds.extend(outputs.cpu().numpy())
                all_labels.extend(y.cpu().numpy())

        val_loss /= len(testloader.dataset)
        val_auc = roc_auc_score(all_labels, np.array(all_preds)[:,1], average='macro')
        val_auprc = average_precision_score(all_labels, np.array(all_preds)[:,1])

        # Keep the best weights. state_dict() tensors share storage with the
        # live parameters, so a shallow dict copy would keep changing as
        # training continues; a deep copy freezes the snapshot.
        if val_auc > self.best_auc:
            self.best_auc = val_auc
            self.best_model = copy.deepcopy(self.model.state_dict())

        return val_loss, val_auc, val_auprc, np.array(all_preds), all_labels

    def train(self, num_epochs=50, early_stop_patience=5):
        """Run the full training loop.

        Args:
            num_epochs: number of epochs to train.
            early_stop_patience: accepted for compatibility; early stopping is
                currently disabled, so this value is not used.

        Returns:
            ``(train_losses, val_losses, val_aucs)``, one entry per epoch.
        """
        train_losses = []
        val_losses = []
        val_aucs = []

        for epoch in range(num_epochs):
            train_loss = self.train_epoch(epoch)
            # validate() also returns predictions and labels; not needed here.
            val_loss, val_auc, val_auprc, _, _ = self.validate(self.val_loader)

            train_losses.append(train_loss)
            val_losses.append(val_loss)
            val_aucs.append(val_auc)

            print(f"Epoch {epoch + 1}/{num_epochs}:")
            print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")
            print(f"Val AUC: {val_auc:.4f} | Val AUPRC: {val_auprc:.4f}")

        # Restore the best snapshot, if any validation ever produced one.
        if self.best_model is not None:
            self.model.load_state_dict(self.best_model)
        return train_losses, val_losses, val_aucs

In [24]:
# 训练模型
# criterion = nn.BCELoss()
# Loss over the model's two output columns and the optimizer.
criterion =  nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# Pass the detected device explicitly: ModelTrainer otherwise defaults to
# 'cuda', which fails on a CPU-only machine.
trainer = ModelTrainer(model, train_loader, test_loader, criterion, optimizer, device=device)
# train_losses, val_losses, val_aucs = trainer.train(num_epochs=2)

In [25]:
# Evaluate the loaded weights on the internal test loader.
total_loss, auroc, auprc, all_preds, all_labels = trainer.validate(test_loader)

Validating: 100%|██████████████████████████████████████████████████████████████████| 3148/3148 [03:22<00:00, 15.51it/s]


In [26]:
# train_loader

In [27]:
# torch.save(model.state_dict(), 'weights_lstm_cum_24_2.pth')

In [28]:
# for i in external_database:
#     print("Database name: " + i)
#     external_data_path = 'icu_mortality_' + i + '.csv'
#     df_external = pd.read_csv(file_path + external_data_path)

#     external_dataset = TimeSeriesDataset(df_external, feature_cols, mode='cumulative', shuffle=False, stride=2, label=['death_hosp'])
#     # external_dataset = TimeSeriesDataset(df_external, feature_cols, window_size=24, forecast_horizon=24, mode='sliding', shuffle=False)
#     external_loader = DataLoader(external_dataset, batch_size=64, shuffle=False, num_workers=0, collate_fn=collate_fn)

#     total_loss, auroc, auprc = trainer.validate(external_loader)
#     print("total_loss, auroc, auprc: ", total_loss, auroc, auprc)

# 外部验证

In [30]:
import numpy as np
from sklearn.metrics import (roc_auc_score, precision_recall_curve, 
                             accuracy_score, f1_score, precision_score, 
                             recall_score, confusion_matrix, average_precision_score)

def calculate_all_metrics(all_preds, all_labels, threshold=0.5):
    """Compute binary-classification metrics from scores and 0/1 labels.

    Args:
        all_preds: 1-D array-like of scores for the positive class.
        all_labels: 1-D array-like of true labels (0 or 1).
        threshold: scores ``>= threshold`` are predicted positive. It is
            compared with the scores as given, so it must be on their scale.

    Returns:
        dict with AUROC, Sensitivity, Specificity, Accuracy, F1_score,
        Precision, AUPRC and the confusion-matrix counts TP/FP/TN/FN.
    """
    all_preds = np.asarray(all_preds)

    # Hard predictions at the chosen threshold.
    binary_preds = (all_preds >= threshold).astype(int)

    # Pin both classes so the matrix is always 2x2; without `labels` a batch
    # containing a single class yields a 1x1 matrix and the unpacking fails.
    tn, fp, fn, tp = confusion_matrix(all_labels, binary_preds, labels=[0, 1]).ravel()

    metrics = {}

    # Threshold-free ranking metrics use the raw scores.
    metrics['AUROC'] = roc_auc_score(all_labels, all_preds)

    # Sensitivity (recall of the positive class).
    metrics['Sensitivity'] = recall_score(all_labels, binary_preds)

    # Specificity (recall of the negative class); 0 when there are no negatives.
    metrics['Specificity'] = tn / (tn + fp) if (tn + fp) > 0 else 0

    metrics['Accuracy'] = accuracy_score(all_labels, binary_preds)
    metrics['F1_score'] = f1_score(all_labels, binary_preds)
    metrics['Precision'] = precision_score(all_labels, binary_preds)
    metrics['AUPRC'] = average_precision_score(all_labels, all_preds)

    # Raw confusion-matrix counts.
    metrics['TP'] = tp
    metrics['FP'] = fp
    metrics['TN'] = tn
    metrics['FN'] = fn

    return metrics

In [29]:
# External validation on the 'zhejiang' centre.
external_data_path = 'icu_mortality_' + 'zhejiang' + '.csv'
df_external = pd.read_csv(file_path + external_data_path)
print(df_external.shape)

# df_external = pd.concat([df_external, df_external_1], axis=0)
# print(df_external.shape)

# Same dataset settings as the internal train/test datasets.
external_dataset = TimeSeriesDataset(df_external, feature_cols, mode='cumulative', shuffle=False, stride=4, label=['death_hosp'])
external_loader = DataLoader(external_dataset, batch_size=16, shuffle=False, num_workers=0, collate_fn=collate_fn)

# Overwrites the all_preds / all_labels produced by any earlier evaluation.
total_loss, auroc, auprc, all_preds, all_labels = trainer.validate(external_loader)
print("total_loss, auroc, auprc: ", total_loss, auroc, auprc)

(235970, 68)


100%|█████████████████████████████████████████████████████████████████████████████| 1247/1247 [00:05<00:00, 243.75it/s]
Validating: 100%|██████████████████████████████████████████████████████████████████| 3717/3717 [00:56<00:00, 65.71it/s]


total_loss, auroc, auprc:  0.16621928574758776 0.7841673402908435 0.20897451726565947


In [40]:
# Metrics for the most recent evaluation (all_preds / all_labels). Column 1 of
# the model output is used as the score; the model was built with num_class=2,
# for which LSTMModel applies no sigmoid, so the threshold is on the raw-score
# scale rather than a probability.
print( "Asia: \n") 
print(calculate_all_metrics(np.array(all_preds)[:, 1].reshape(-1), np.array(all_labels).reshape(-1), threshold=-2.25))

Asia: 

{'AUROC': 0.7841673402908435, 'Sensitivity': 0.7200229489386116, 'Specificity': 0.703107677383592, 'Accuracy': 0.7036034369692792, 'F1_score': 0.1246461737100859, 'Precision': 0.06822877025116886, 'AUPRC': 0.20897451726565947, 'TP': 1255, 'FP': 17139, 'TN': 40589, 'FN': 488}


In [41]:
# External validation on the two European centres ('ams' + 'salz') combined.
external_data_path = 'icu_mortality_' + 'ams' + '.csv'
df_external_1 = pd.read_csv(file_path + external_data_path)
print(df_external_1.shape)

external_data_path = 'icu_mortality_' + 'salz' + '.csv'
df_external = pd.read_csv(file_path + external_data_path)
print(df_external.shape)

# df_external = pd.concat([df_external, df_external_1], axis=0)
# print(df_external.shape)

# Shift the second frame's ids so they start right after the first frame's
# largest id. Adding max_id alone would map an id of 0 onto max_id itself and
# merge two different patients; this offset equals max_id whenever the smallest
# id in the second frame is 1.
max_id = df_external_1['id'].max()
id_offset = max_id - df_external['id'].min() + 1
df_external['id'] = df_external['id'] + id_offset

# Stack both centres into one frame.
df_external = pd.concat([df_external_1, df_external], ignore_index=True)
print(df_external.shape)

external_dataset = TimeSeriesDataset(df_external, feature_cols, mode='cumulative', shuffle=False, stride=4, label=['death_hosp'])
external_loader = DataLoader(external_dataset, batch_size=16, shuffle=False, num_workers=0, collate_fn=collate_fn)

total_loss, auroc, auprc, all_preds, all_labels = trainer.validate(external_loader)
print("total_loss, auroc, auprc: ", total_loss, auroc, auprc)

(341758, 75)
(685979, 75)
(1027737, 75)


100%|█████████████████████████████████████████████████████████████████████████████| 9084/9084 [00:23<00:00, 386.93it/s]
Validating: 100%|████████████████████████████████████████████████████████████████| 16276/16276 [06:18<00:00, 43.00it/s]


total_loss, auroc, auprc:  0.16613505996387173 0.7592584734084125 0.19525585178148336


In [47]:
# Metrics for the most recent evaluation (all_preds / all_labels). The score is
# output column 1 of a model built with num_class=2 (no sigmoid applied), so
# the threshold is on the raw-score scale.
print( "Europe: \n") 
print(calculate_all_metrics(np.array(all_preds)[:, 1].reshape(-1), np.array(all_labels).reshape(-1), threshold=-2.1))

Europe: 

{'AUROC': 0.7592584734084125, 'Sensitivity': 0.699469652327637, 'Specificity': 0.6798186298669295, 'Accuracy': 0.6803308590584956, 'F1_score': 0.10239158094498717, 'Precision': 0.055238848686506734, 'AUPRC': 0.19525585178148336, 'TP': 4748, 'FP': 81206, 'TN': 172419, 'FN': 2040}


In [48]:
# External validation on the 'eicu' centre.
external_data_path = 'icu_mortality_' + 'eicu' + '.csv'
df_external = pd.read_csv(file_path + external_data_path)
print(df_external.shape)

# df_external = pd.concat([df_external, df_external_1], axis=0)
# print(df_external.shape)

# Same dataset settings as the internal train/test datasets.
external_dataset = TimeSeriesDataset(df_external, feature_cols, mode='cumulative', shuffle=False, stride=4, label=['death_hosp'])
external_loader = DataLoader(external_dataset, batch_size=64, shuffle=False, num_workers=0, collate_fn=collate_fn)

# Overwrites the all_preds / all_labels produced by any earlier evaluation.
total_loss, auroc, auprc, all_preds, all_labels = trainer.validate(external_loader)
print("total_loss, auroc, auprc: ", total_loss, auroc, auprc)

(3084332, 76)


100%|███████████████████████████████████████████████████████████████████████████| 36768/36768 [01:08<00:00, 535.08it/s]
Validating: 100%|████████████████████████████████████████████████████████████████| 12265/12265 [29:51<00:00,  6.85it/s]


total_loss, auroc, auprc:  0.1568996512350814 0.7449005941099592 0.14328891976870295


In [53]:
# Metrics for the most recent evaluation (all_preds / all_labels). The score is
# output column 1 of a model built with num_class=2 (no sigmoid applied), so
# the threshold is on the raw-score scale.
print( "US: \n") 
print(calculate_all_metrics(np.array(all_preds)[:, 1].reshape(-1), np.array(all_labels).reshape(-1), threshold=-2.3))

US: 

{'AUROC': 0.7449005941099592, 'Sensitivity': 0.6806397098223487, 'Specificity': 0.6797501702428623, 'Accuracy': 0.679777664251617, 'F1_score': 0.11613293433808869, 'Precision': 0.06348223896663079, 'AUPRC': 0.14328891976870295, 'TP': 16513, 'FP': 243607, 'TN': 517071, 'FN': 7748}


In [54]:
# Re-run the internal test evaluation so all_preds / all_labels refer to the
# internal test set again (earlier cells overwrote them).
total_loss, auroc, auprc, all_preds, all_labels = trainer.validate(test_loader)

Validating: 100%|██████████████████████████████████████████████████████████████████| 3148/3148 [03:24<00:00, 15.39it/s]


In [57]:
# Metrics for the most recent evaluation (all_preds / all_labels). The score is
# output column 1 of a model built with num_class=2 (no sigmoid applied), so
# the threshold is on the raw-score scale.
print( "Internal test set: \n") 
print(calculate_all_metrics(np.array(all_preds)[:, 1].reshape(-1), np.array(all_labels).reshape(-1), threshold=-2.2))

Internal test set: 

{'AUROC': 0.8030108504018683, 'Sensitivity': 0.7365552846694393, 'Specificity': 0.7128115214742222, 'Accuracy': 0.7136333402825347, 'F1_score': 0.1511388382084633, 'Precision': 0.08420914561164762, 'AUPRC': 0.31134726687206205, 'TP': 5136, 'FP': 55855, 'TN': 138634, 'FN': 1837}
