In [1]:
import logging
import os
from pathlib import Path

import numpy as np
import pandas as pd
import psycopg2
import torch
import torch.nn as nn
import torch.nn.functional as F
from catalyst.utils import set_global_seed
from torch.utils.data import DataLoader, Dataset
from transformers import AutoConfig, AutoModel
from transformers import AutoTokenizer
from transformers import BertTokenizerFast, BertForSequenceClassification

# Connection settings are read from the environment so that no credential is
# stored in the notebook. Host, database and user fall back to the values this
# notebook has always used; the password deliberately has no fallback.
db_password = os.environ.get("MIMIC_DB_PASSWORD")
if db_password is None:
    raise RuntimeError(
        "Set the MIMIC_DB_PASSWORD environment variable before running this cell."
    )

conn = psycopg2.connect(
    host=os.environ.get("MIMIC_DB_HOST", "168.131.30.66"),
    dbname=os.environ.get("MIMIC_DB_NAME", "mimic_iii"),
    user=os.environ.get("MIMIC_DB_USER", "dbuser"),
    password=db_password,
)
try:
    # The cursor context manager closes the cursor once the rows are fetched.
    with conn.cursor() as cur:
        cur.execute("SELECT * FROM capstone.cap_data;")
        result = cur.fetchall()
finally:
    # Every row is now held in `result`, so the database session can be released.
    conn.close()

In [None]:
# Column labels applied positionally to the rows held in `result`.
columns = [
    "row_id",
    "subject_id",
    "hadm_id",
    "intime",
    "outtime",
    "information",
    "readmission",
]
df = pd.DataFrame(data=result, columns=columns)
df.head()

In [None]:
import re

# Ordered (pattern, replacement) rewrite rules applied by `preprocess`.
# Raw strings keep regex escapes such as `\.` out of Python's own
# string-escape processing (non-raw '\.' is an invalid escape sequence).
_PREPROCESS_RULES = (
    (r"\[(.*?)\]", ""),       # bracketed de-identification placeholders
    (r"[0-9]+\.", ""),        # "1." / "2." numbering, which the segmenter splits on
    (r"dr\.", "doctor"),
    (r"m\.d\.", "md"),
    (r"admission date:", ""),
    (r"discharge date:", ""),
    (r"--|__|==", ""),        # separator runs
)


def preprocess(x):
    """Clean one clinical note and collapse its whitespace.

    The rewrite rules are applied in order: bracketed placeholders and
    "<digits>." numbering are removed, "dr." / "m.d." are expanded so their
    periods are not taken for sentence ends, the "admission date:" /
    "discharge date:" headers and "--", "__", "==" runs are dropped.
    Matching is case-sensitive (no regex flags are used), so the text is
    presumably lowercased upstream -- TODO confirm with the caller.

    Args:
        x (str): raw note text.

    Returns:
        str: cleaned text in which every run of whitespace (including
        newlines) is replaced by a single space, with no leading or
        trailing whitespace.
    """
    y = x
    for pattern, replacement in _PREPROCESS_RULES:
        y = re.sub(pattern, replacement, y)

    # split() without arguments drops all whitespace runs; join normalizes them.
    return " ".join(y.split())

def make_str(lst):
    """Concatenate sentences into one string, terminating each with a newline.

    Args:
        lst: iterable of str sentences.

    Returns:
        str: every sentence followed by "\\n", in input order; "" for an
        empty iterable.
    """
    # A single join avoids re-copying the growing string on every iteration.
    return "".join(sentence + "\n" for sentence in lst)

In [2]:
# Wrapper around Torch Dataset to perform classification
class MIMICDataset(Dataset):
    """Torch dataset that tokenizes texts for sequence classification.

    Each item is a mapping with the key "features" (token ids), the other
    tensors returned by the tokenizer (e.g. "attention_mask") and, when
    labels were supplied, "targets" (the encoded class index).
    """

    def __init__( self, texts, labels, label_dict = None,
                 max_seq_length = 100000, model_name = "bvanaken/CORe-clinical-mortality-prediction"):
        """
        Args:
            texts: list of str, one document per sample.
            labels: per-sample labels, or None for unlabeled data.
            label_dict: mapping from label value to class index. When None and
                labels are given, it is built from the sorted unique labels.
            max_seq_length: length every sample is padded/truncated to. It is
                capped at the tokenizer's `model_max_length` when that is set.
            model_name: name or path passed to AutoTokenizer.from_pretrained.
        """
        self.texts = texts
        self.labels = labels
        self.label_dict = label_dict

        if self.label_dict is None and labels is not None:
            # Sorting makes the label -> index assignment deterministic.
            self.label_dict = {
                label: idx for idx, label in enumerate(sorted(set(labels)))
            }

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # silence tokenizer warnings
        logging.getLogger("transformers.tokenization_utils").setLevel(logging.FATAL)

        # Every sample is padded to max_seq_length, so a value above what the
        # model accepts only produces oversized tensors; cap it at the
        # tokenizer's own limit when one is available.
        model_limit = getattr(self.tokenizer, "model_max_length", None)
        if max_seq_length is not None and model_limit is not None:
            max_seq_length = min(max_seq_length, model_limit)
        self.max_seq_length = max_seq_length

        # special tokens for transformers
        # in the simplest case a [CLS] token is added in the beginning
        # and [SEP] token is added in the end of a piece of text
        # [CLS] <indexes text tokens> [SEP] .. <[PAD]>
        self.sep_vid = self.tokenizer.vocab["[SEP]"]
        self.cls_vid = self.tokenizer.vocab["[CLS]"]
        self.pad_vid = self.tokenizer.vocab["[PAD]"]

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, index):
        """Tokenize the text at `index`.

        Returns:
            dict[str, torch.Tensor]: "features" (token ids) plus the other
            tokenizer outputs, each with the tokenizer's leading batch
            dimension removed, and "targets" (0-dim long tensor) when labels
            are available. Labels missing from `label_dict` are encoded as -1.
        """
        encoded = self.tokenizer.encode_plus(
            self.texts[index],
            add_special_tokens=True,
            padding="max_length",
            max_length=self.max_seq_length,
            return_tensors="pt",
            truncation=True,
            return_attention_mask=True,
        )

        # return_tensors="pt" gives every tensor a leading dimension of 1.
        # Drop it from all of them (not only the token ids) so the DataLoader
        # stacks each key to (batch, seq_len).
        output_dict = {name: tensor.squeeze(0) for name, tensor in encoded.items()}

        # for Catalyst, there needs to be a key called features
        output_dict["features"] = output_dict.pop("input_ids")

        # encoding target
        if self.labels is not None:
            label_id = self.label_dict.get(self.labels[index], -1)
            output_dict["targets"] = torch.tensor(label_id, dtype=torch.long)

        return output_dict

In [None]:
def read_data(params): # params -> dict
    """
    Read the train/validation/test CSV files, wrap them in MIMICDataset objects
    and build the data loaders in the layout Catalyst expects.

    :param params: a dictionary read from the config.yml file. Keys used:
        params["data"]: path_to_data, train_filename, validation_filename,
            test_filename, text_field_name, label_field_name
        params["model"]: max_seq_length, model_name
        params["training"]: batch_size
    :return: tuple (train_val_loaders, test_loaders) of two dictionaries:
        {"train": DataLoader, "valid": DataLoader} and {"test": DataLoader}
    """
    data_cfg = params["data"]
    model_cfg = params["model"]
    batch_size = params["training"]["batch_size"]
    data_dir = Path(data_cfg["path_to_data"])

    def build_dataset(filename_key, label_dict=None):
        # One CSV split -> one MIMICDataset.
        frame = pd.read_csv(data_dir / data_cfg[filename_key])
        return MIMICDataset(
            texts=frame[data_cfg["text_field_name"]].values.tolist(),
            labels=frame[data_cfg["label_field_name"]].values,
            label_dict=label_dict,
            max_seq_length=model_cfg["max_seq_length"],
            model_name=model_cfg["model_name"],
        )

    def build_loader(dataset, shuffle=False):
        return DataLoader(dataset=dataset, batch_size=batch_size, shuffle=shuffle)

    train_dataset = build_dataset("train_filename")
    # Validation and test reuse the label -> index mapping learned from train.
    # Building it per split would assign different indices to the same label
    # whenever a split does not contain every class.
    valid_dataset = build_dataset(
        "validation_filename", label_dict=train_dataset.label_dict
    )
    test_dataset = build_dataset("test_filename", label_dict=train_dataset.label_dict)

    train_val_loaders = {
        # Only the training split is shuffled, so mini-batches differ between
        # epochs; evaluation order stays fixed so predictions line up with rows.
        "train": build_loader(train_dataset, shuffle=True),
        "valid": build_loader(valid_dataset),
    }
    test_loaders = {"test": build_loader(test_dataset)}

    return train_val_loaders, test_loaders # tuple with 2 dictionaries

In [None]:
class Classifier(nn.Module):
    """
    Transformer encoder followed by masked mean pooling, dropout and a linear
    classification head.

    Simplified version of the same class by HuggingFace.
    See transformers/modeling_distilbert.py in the transformers repository.
    """

    def __init__(self, pretrained_model_name, num_classes=2, dropout: float = 0.3):
        """
        Args:
            pretrained_model_name: name or path passed to
                AutoConfig/AutoModel.from_pretrained
            num_classes: number of output scores per sample
            dropout: dropout probability applied to the pooled representation
        """
        super().__init__()

        config = AutoConfig.from_pretrained(
            pretrained_model_name, num_labels=num_classes
        )

        self.model = AutoModel.from_pretrained(pretrained_model_name, config=config)
        self.classifier = nn.Linear(config.hidden_size, num_classes)
        self.dropout = nn.Dropout(dropout)

    # propagation
    def forward(self, features, attention_mask=None, head_mask=None):
        """
        Args:
            features (torch.Tensor): ids of each token,
                size [bs, seq_length]
            attention_mask (torch.Tensor): binary tensor, used to select
                tokens which are used to compute attention scores
                in the self-attention heads, size [bs, seq_length]
                (a [bs, 1, seq_length] mask is also accepted and flattened)
            head_mask (torch.Tensor): 1.0 in head_mask indicates that
                we keep the head, size: [num_heads]
                or [num_hidden_layers x num_heads]
        Returns:
            PyTorch Tensor with predicted class scores, size [bs, num_classes]
        """
        assert attention_mask is not None, "attention mask is none"

        # Bring the mask to the same [bs, seq_length] layout as the token ids;
        # masks carrying an extra singleton dimension are flattened here.
        if attention_mask.shape != features.shape:
            attention_mask = attention_mask.reshape(features.shape)

        # taking BERTModel output
        # see https://huggingface.co/transformers/model_doc/bert.html#transformers.BertModel
        bert_output = self.model(
            input_ids=features, attention_mask=attention_mask, head_mask=head_mask
        )
        # we only need the hidden state here and don't need
        # transformer output, so index 0
        seq_output = bert_output[0]  # (bs, seq_len, dim)

        # Mean pooling over real tokens only: positions where the mask is 0
        # (padding) get zero weight, so they do not dilute the average.
        token_weights = attention_mask.unsqueeze(-1).to(seq_output.dtype)  # (bs, seq_len, 1)
        summed = (seq_output * token_weights).sum(dim=1)  # (bs, dim)
        # clamp avoids dividing by zero for a row whose mask is all zeros
        token_counts = token_weights.sum(dim=1).clamp(min=1.0)  # (bs, 1)
        pooled_output = summed / token_counts  # (bs, dim)

        pooled_output = self.dropout(pooled_output)  # (bs, dim)
        scores = self.classifier(pooled_output)  # (bs, num_classes)

        return scores

In [None]:
import yaml
from catalyst.dl import SupervisedRunner
from catalyst.dl.callbacks import (
    AccuracyCallback,
    CheckpointCallback,
    InferCallback,
    OptimizerCallback,
)
from catalyst.utils import prepare_cudnn, set_global_seed

In [None]:
# `project_root` is not assigned in any of the cells above. Keep a value set by
# an earlier cell if there is one; otherwise assume config.yml sits in the
# current working directory.
project_root = Path(globals().get("project_root", Path.cwd()))

with open(project_root / "config.yml", encoding="utf-8") as f:
    # safe_load only constructs plain Python scalars and containers, which is
    # all a configuration file needs.
    params = yaml.safe_load(f)


train_val_loaders, test_loaders = read_data(params)



In [None]:
# Build the classifier from the model name and class count in config.yml.
model = Classifier(
    pretrained_model_name=params["model"]["model_name"],
    num_classes=params["model"]["num_classes"],
)

In [None]:
# Loss, optimizer and learning-rate schedule used for fine-tuning.
criterion = torch.nn.CrossEntropyLoss()

# The configured rate is passed through float() before being handed to Adam.
learning_rate = float(params["training"]["learn_rate"])
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer)

In [None]:
# reproducibility
# Reproducibility: seed from config.yml and request deterministic cuDNN.
set_global_seed(params["general"]["seed"])
prepare_cudnn(deterministic=True)

# The runner reads the "features" and "attention_mask" entries of every batch
# and passes them to the model's forward method as arguments of the same name.
trainning = SupervisedRunner(input_key=("features", "attention_mask"))

# Train with Catalyst on the "train"/"valid" loaders; accuracy is tracked and
# the optimizer callback is given the accumulation step count from config.yml.
trainning.train(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    loaders=train_val_loaders,
    callbacks=[
        AccuracyCallback(num_classes=int(params["model"]["num_classes"])),
        OptimizerCallback(accumulation_steps=int(params["training"]["accum_steps"])),
    ],
    logdir=params["training"]["log_dir"],
    num_epochs=int(params["training"]["num_epochs"]),
    verbose=True,
)

# Inference on the test loader, resuming from the best.pth checkpoint that is
# expected under <log_dir>/checkpoints.
torch.cuda.empty_cache()
trainning.infer(
    model=model,
    loaders=test_loaders,
    callbacks=[
        CheckpointCallback(
            resume=f"{params['training']['log_dir']}/checkpoints/best.pth"
        ),
        InferCallback(),
    ],
    verbose=True,
)

# Save the predicted scores for the test set as text.
# NOTE(review): this assumes index 0 of `callbacks` is the InferCallback after
# infer() has run -- confirm against the installed Catalyst version.
predicted_scores = trainning.callbacks[0].predictions["logits"]
np.savetxt(X=predicted_scores, fname=params["data"]["path_to_test_pred_scores"])

In [None]:
from transformers import Trainer, TrainingArguments


def compute_metrics(eval_pred):
    """Compute accuracy from the predictions object Trainer passes in.

    Args:
        eval_pred: object exposing `predictions` (per-class scores) and
            `label_ids` (true class indices).

    Returns:
        dict with a single "accuracy" entry.
    """
    predicted_classes = np.argmax(eval_pred.predictions, axis=-1)
    return {"accuracy": float((predicted_classes == eval_pred.label_ids).mean())}


# read_data() only returns loaders, so the training dataset is taken back out
# of the "train" loader instead of relying on a name that is never defined at
# notebook scope.
train_dataset = train_val_loaders["train"].dataset

# NOTE(review): Trainer passes every key of a batch to the model as a keyword
# argument and expects a loss in the output when labels are present, while
# Classifier.forward only takes features/attention_mask/head_mask and returns
# raw scores. This cell presumably needs a model/dataset adapter (or
# BertForSequenceClassification) before it can train -- confirm.

# Hyperparameters used by Trainer
training_args = TrainingArguments(
    output_dir='./results',          # required: folder for predictions and checkpoints
    num_train_epochs=2,              # number of training epochs
    per_device_train_batch_size=8,   # batch size used for training
    per_device_eval_batch_size=16,   # batch size used for evaluation
)

trainer = Trainer(
    model=model,                     # model to train
    args=training_args,              # training arguments defined above
    train_dataset=train_dataset,     # training dataset
    compute_metrics=compute_metrics,
)

# run fine-tuning
trainer.train()

In [None]:
# The test dataset only exists inside the loader returned by read_data();
# no `test_dataset` name is defined at notebook scope.
trainer.evaluate(eval_dataset=test_loaders["test"].dataset)