# ICD Tokenize

The data set is a list of ICD labels and their descriptions.  
The goal is to tokenize the descriptions and train a model to predict the ICD labels.

In [17]:
! pip install -U transformers datasets accelerate

Collecting accelerate
  Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.24.1


In [2]:
import torch
import numpy as np
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EvalPrediction
from sklearn.metrics import precision_recall_fscore_support

In [3]:
dataset = load_dataset("eddielin0926/chinese-icd", split="train")
print(dataset)

Dataset({
    features: ['year', 'month', 'no', 'death', 'input_code', 'result_code', 'check', 'serial_no', 'catalog', 'inputs', 'results', 'icds', 'encodes'],
    num_rows: 1477240
})


In [4]:
df = dataset.to_pandas()
display(df.head())
print(df.shape)

Unnamed: 0,year,month,no,death,input_code,result_code,check,serial_no,catalog,inputs,results,icds,encodes
0,2022,1,1,1,1,1,True,6710,0,[急性腎衰竭],[急性腎衰竭],[N179],[2086]
1,2022,1,1,1,1,1,True,6710,1,[],[],[],[]
2,2022,1,1,1,1,1,True,6710,2,[],[],[],[]
3,2022,1,1,1,1,1,True,6710,3,[],[],[],[]
4,2022,1,1,1,1,1,True,6710,4,[],[],[],[]


(1477240, 13)


In [5]:
df = df[df["inputs"].map(lambda d: len(d)) > 0]
display(df.head())
print(df.shape)

Unnamed: 0,year,month,no,death,input_code,result_code,check,serial_no,catalog,inputs,results,icds,encodes
0,2022,1,1,1,1,1,True,6710,0,[急性腎衰竭],[急性腎衰竭],[N179],[2086]
5,2022,1,2,1,1,1,True,1621,0,[心室顫動],[心室顫動],[I490],[2569]
6,2022,1,2,1,1,1,True,1621,1,[菌血症],[菌血症],[R788],[1058]
7,2022,1,2,1,1,1,True,1621,2,[慢性腎臟疾病],[慢性腎臟疾病],[N039],[635]
8,2022,1,2,1,1,1,True,1621,3,"[冠狀動脈疾病, 繞道手術後, 心臟衰竭]","[冠狀動脈疾病, 心臟衰竭]","[I251, I509]","[1695, 2781]"


(638632, 13)


In [6]:
dataset = Dataset.from_pandas(df, features=dataset.features, preserve_index=False)
print(dataset)

Dataset({
    features: ['year', 'month', 'no', 'death', 'input_code', 'result_code', 'check', 'serial_no', 'catalog', 'inputs', 'results', 'icds', 'encodes'],
    num_rows: 638632
})


In [7]:
dataset = dataset.train_test_split(train_size = 0.1, test_size = 0.01)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['year', 'month', 'no', 'death', 'input_code', 'result_code', 'check', 'serial_no', 'catalog', 'inputs', 'results', 'icds', 'encodes'],
        num_rows: 63863
    })
    test: Dataset({
        features: ['year', 'month', 'no', 'death', 'input_code', 'result_code', 'check', 'serial_no', 'catalog', 'inputs', 'results', 'icds', 'encodes'],
        num_rows: 6387
    })
})


### Configuration
Defining some key variables that will be used later on in the training

In [8]:
MODEL_CHECKPOINT = "bert-base-chinese"
BATCH_SIZE = 128
EPOCHS = 1
MAX_LEN = 80
LEARNING_RATE = 1e-05

## Pre-processing

In [9]:
class_label = dataset['train'].features["encodes"].feature
num_labels = class_label.num_classes
print("number of labels: ", num_labels)

number of labels:  3568


In [10]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT, use_fast=True)
print(tokenizer)

BertTokenizerFast(name_or_path='bert-base-chinese', vocab_size=21128, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}


In [11]:
def preprocess(ds):
    texts = [" ".join(txt) for txt in ds["inputs"]]
    encoded_data = tokenizer(texts, padding="max_length", truncation=True)

    labels_matrix = []
    # fill numpy array
    for idx, labels in enumerate(ds["encodes"]):
        labels_matrix.append([1.0 if i in labels else 0.0 for i in range(num_labels)])

    encoded_data["labels"] = labels_matrix

    return encoded_data

tokenized_datasets = dataset.map(preprocess, batched=True, remove_columns=dataset['train'].column_names)
print(tokenized_datasets['train'][0])

Map:   0%|          | 0/63863 [00:00<?, ? examples/s]

Map:   0%|          | 0/6387 [00:00<?, ? examples/s]

{'input_ids': [101, 5591, 2814, 6752, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [12]:
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=num_labels,
    problem_type="multi_label_classification"
)

print(model)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [13]:
def preprocess_logits_for_metrics(logits: torch.Tensor, labels: torch.Tensor):
    probs = torch.sigmoid(logits)
    preds = torch.where(probs > 0.5, 1, 0)
    return preds

In [14]:
def compute_metrics(eval_pred: EvalPrediction):
    preds, refs = eval_pred

    precision, recall, fbeta_score, support = precision_recall_fscore_support(refs, preds, average="macro", zero_division=0)

    result = {
        'precision': precision,
        'recall': recall,
        'fbeta_score': fbeta_score
    }

    return result

In [15]:
model_name = MODEL_CHECKPOINT.split("/")[-1]

args = TrainingArguments(
    f"{model_name}-finetuned-icd-{num_labels}",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    save_total_limit=5,
    learning_rate=LEARNING_RATE,
    num_train_epochs=EPOCHS,
    load_best_model_at_end=True
)

In [16]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    tokenizer=tokenizer,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    compute_metrics=compute_metrics
)

In [17]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,Fbeta Score
1,0.003,0.002884,0.0,0.0,0.0


TrainOutput(global_step=7983, training_loss=0.0260864375274641, metrics={'train_runtime': 6234.0537, 'train_samples_per_second': 10.244, 'train_steps_per_second': 1.281, 'total_flos': 1.7341056286900224e+16, 'train_loss': 0.0260864375274641, 'epoch': 1.0})