In [55]:
import os 
import timeit
from pathlib import Path

import math
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset 

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, log_loss, roc_curve, precision_recall_curve, average_precision_score
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler

from tqdm import tqdm 
import matplotlib
import matplotlib.pyplot as plt

In [56]:
class LogisticRegression(nn.Module):
    def __init__(self, input_size, output_size = 1):
        super(LogisticRegression, self).__init__()
        self.linear = nn.Linear(input_size, output_size)
        self.sigmoid = nn.Sigmoid() 

    def forward(self, x):
        x = self.linear(x)
        x = self.sigmoid(x)
        return x

class CLDataSet(Dataset):
    def __init__(self, x_data, y_data):
        self.x_data = torch.Tensor(x_data)
        self.y_data = torch.Tensor(y_data)

    def __getitem__(self, index):
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        return len(self.x_data)


In [57]:
VAR_LIST = ['age','bmi','sex','emop','dept_DN','dept_GS','dept_OG','dept_OL','dept_OS','dept_PS','dept_UR','dept_nan','anetype_GA','anetype_NA','anetype_NB','anetype_nan','anetype_MAC','preop_pt','preop_aptt', 'preop_alb','preop_bun','preop_cr','preop_glu','preop_gpt','preop_got','preop_hb','preop_k','preop_na','preop_plt','preop_wbc']



In [58]:
def load_data(data_path: str, hosid: str = 'amc', target = 'death30', var_list = VAR_LIST, impute: str = 'median') -> pd.DataFrame:  
  df = pd.read_csv(data_path)
  
  # drop features
  if hosid == 'snuh':
    df.drop(columns=["hoslos", "opdur","anedur","op_code","asa_1","asa_2","asa_3","asa_4","asa_5","asa_nan","iculos","icu1"],inplace=True)
  elif hosid == 'eumc': 
    df.drop(columns=["hoslos", "opdur","anedur","op_code","asa_1","asa_2","asa_3","asa_4","asa_5","asa_nan","iculos","icu1"],inplace=True)
  elif hosid == 'brmh': 
    df.drop(columns=["hoslos", "opdur","anedur","op_code","asa_1","asa_2","asa_3","asa_4","asa_5","asa_nan","iculos","icu1"],inplace=True)
  
  X = df[var_list]
  y = df[target]
  
  # missing imputation 
  X.fillna(X.median(), inplace = True)
  return X, y

In [77]:
def train_LR(x_train, y_train, batch_size: int, epoch_size: int, learning_rate = .1, l1_ratio = 0.5, C = 0.1):

    train_dataset = CLDataSet(x_train, y_train) 
    train_loader = DataLoader(train_dataset,
                              batch_size = batch_size,
                              pin_memory=True)

    input_size = 30 #$x_train.shape[1]
    #device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = LogisticRegression(input_size, 1)#.to(device)

    criterion = nn.BCELoss() #nn.BCEWithLogitsLoss()
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)
     
    lambda1 = l1_ratio 
    lambda2 = 1 - l1_ratio 

    # 4. 모델 학습
    for epoch in range(epoch_size):
        for data, target in train_loader: 
            #data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)

            # Elastic Net Regularization 추가
            l1 = 0
            l2 = 0
            for param in model.parameters():
                l1 += torch.norm(param, 1)
                l2 += torch.norm(param, 2)
            loss = loss + lambda1 * l1 + lambda2 * l2
            #l1_norm = sum(p.abs().sum() for p in model.parameters())
            #l1_loss = l1_ratio * l1_norm
            
            #l2_norm = sum(p.pow(2.0).sum()
            #            for p in model.parameters())
            #l2_loss = (1 / C) * l2_norm

            #loss = loss + l1_loss + l2_loss

            loss.backward()
            optimizer.step()

        if (epoch + 1) % 10 == 0:
            print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, epoch_size, loss.item()))
    
    # save the model
    #torch.save(model.state_dict(), 'saved_model.pth')
    return model, loss

In [78]:
def test_LR(model, x_test, y_test, batch_size):
  # Set model to evaluation mode
  model.eval()
  
  test_dataset = CLDataSet(x_test, y_test)
  test_loader = DataLoader(test_dataset, 
                           batch_size = batch_size,
                           shuffle=False,
                           pin_memory=True)

  # Make predictions on test set
  y_true = []
  y_pred = []
  with torch.no_grad():
    for data, target in test_loader: 
      output = model(data)
      y_pred.extend(output.tolist())
      y_true.extend(target.tolist())
  
  # Compute AUROC and AUPRC
  auroc = roc_auc_score(y_true, y_pred)
  auprc = average_precision_score(y_true, y_pred)
  print(f"AUROC: {auroc:.4f}, AUPRC: {auprc:.4f}")
  
  # Compute ROC and PR curves
  fpr, tpr, _ = roc_curve(y_true, y_pred)
  precision, recall, _ = precision_recall_curve(y_true, y_pred)
  
  # Plot ROC and PR curves
  fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
  
  ax1.plot(fpr, tpr, label=f"AUROC = {auroc:.4f}")
  ax1.plot([0, 1], [0, 1], "k--")
  ax1.set_xlabel("False Positive Rate")
  ax1.set_ylabel("True Positive Rate")
  ax1.set_title("Receiver Operating Characteristic (ROC) Curve")
  ax1.legend()
  
  ax2.plot(recall, precision, label=f"AUPRC = {auprc:.4f}")
  ax2.set_xlabel("Recall")
  ax2.set_ylabel("Precision")
  ax2.set_title("Precision-Recall Curve")
  ax2.legend()
  
  plt.show()
  
  return auroc, auprc, precision, recall 


In [79]:
random_state = 120
np.random.seed(random_state) 

In [87]:
validation_auroc_list = []
validation_auprc_list = []
validation_loss_list = []

num_bootstraps = 50
sampling_size = 5000
batch_size = 5000
epoch_size = 300
learning_rate = 0.05
l1_ratio = 0.1
C = 0.1
optimizer = 'SGD'



In [88]:
train_loss = []
val_auroc = []
val_auprc = []

X, y = load_data("/data/abel.eo/data/amc/raw_data/210612_amc_light.csv")

X = X.values
y = y.values

num_classes = len(np.unique(y))
#test_size = len(y) - sampling_size
#test_ratio = test_size / len(y)
test_ratio = .2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.fillna(X.median(), inplace = True)


In [89]:
for sample in range(num_bootstraps):
  X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=test_ratio, shuffle=True, stratify=y)
  print(f'X size : {X_train.shape}')
  scaler = MinMaxScaler()

  x_train = scaler.fit_transform(X_train).tolist()
  y_train = scaler.fit_transform(Y_train.reshape(-1, 1)).tolist()
  
  x_test = scaler.fit_transform(X_test).tolist()
  y_test = scaler.fit_transform(Y_test.reshape(-1, 1)).tolist()
  
  #x_train = torch.tensor(x_train, dtype = torch.float)
  #x_test = torch.tensor(x_test, dtype = torch.float)
  #y_train = torch.tensor(y_train, dtype= torch.float)
  #y_test = torch.tensor(y_test, dtype = torch.float)
  #y_test = torch.tensor(y_test.reshape(-1, 1), dtype = torch.float)
  
  #train logistic regression 
  model, sample_loss = train_LR(x_train, y_train, 
                                batch_size, 
                                epoch_size, 
                                learning_rate, 
                                l1_ratio, 
                                C)
  train_loss.append(sample_loss.cpu().detach().numpy())

  sample_auroc, sample_auprc, _, _ = test_LR(model, x_test, y_test, batch_size)
  val_auroc.append(sample_auroc)#.detach().numpy())
  val_auprc.append(sample_auprc)#.detach().numpy())

X size : (53217, 30)
Epoch [10/300], Loss: 0.7373


In [54]:
sampling_size, epoch_size, batch_size, np.median(train_loss), np.median(val_auroc), np.median(val_auprc)

(5000, 100, 15000, 0.7185607, 0.6356524994349431, 0.004016987266158922)

In [81]:
sampling_size, epoch_size, batch_size, np.median(train_loss), np.median(val_auroc), np.median(val_auprc)

(5000, 300, 1024, 0.72877204, 0.48196816331785713, 0.013489136200691197)

In [84]:
sampling_size, epoch_size, batch_size, np.median(train_loss), np.median(val_auroc), np.median(val_auprc)

(5000, 300, 2048, 0.25735503, 0.5382600027548575, 0.01272536218042938)

In [87]:
sampling_size, epoch_size, batch_size, np.median(train_loss), np.median(val_auroc), np.median(val_auprc)

(5000, 300, 5000, 0.7050331, 0.4283803780944674, 0.008402691838680101)

In [75]:
sampling_size, epoch_size, batch_size, np.median(train_loss), np.median(val_auroc), np.median(val_auprc)

(10000, 120, 10000, 1.3223097, 0.43399444035765056, 0.011111961742432973)