This notebook fine-tunes Clinical-BigBird, a BERT-style transformer for long sequences, to predict the 'M' (distant metastasis) label of the TNM staging classification from TCGA report text.

In [2]:
import os
import pickle
from datetime import timedelta
import numpy as np
import pandas as pd
import time
import copy

import sys
sys.path.append('..')
import utils
import llm_utils

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

import torch
import torchinfo
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, set_seed


In [3]:
# Run configuration: everything a reader may want to tune lives in this cell.

# Prediction target and pretrained checkpoint
tnm_label = 'm'
model_name = "yikuan8/Clinical-BigBird"

# Optimisation hyper-parameters
seq_len = 4096   # maximum number of tokens kept per document
epochs = 10
lr = 2e-5
bs = 4           # per-device batch size for training and evaluation

# Hardware selection ("-1" leaves CUDA_VISIBLE_DEVICES untouched)
cuda_gpu_id = "0"

# Input and output locations
data_dir = "../../data/tnm_stage"
out_path = "./model_weights"
out_preds_path = "./model_preds"

In [4]:
# Restrict the process to the selected GPU; "-1" keeps whatever device
# visibility was configured outside the notebook.
if cuda_gpu_id != "-1":
    os.environ["CUDA_VISIBLE_DEVICES"] = cuda_gpu_id

# The DataLoader worker processes are forked after the tokenizer has already
# been used, which makes huggingface/tokenizers print a warning on every fork.
# Configuring the variable explicitly (as that warning recommends) silences it;
# setdefault keeps any value the user exported beforehand.
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

# Fail fast, with a readable message, before touching CUDA-only settings.
assert torch.cuda.is_available(), "A CUDA-capable GPU is required to run this notebook"
torch.backends.cuda.matmul.allow_tf32 = True
print("Number of GPUs available:", torch.cuda.device_count())

Number of GPUs available: 1


# Data loading

In [7]:
# Single encoder instance used below to map the string stage labels to integer class ids
label_enc = LabelEncoder()

## Training

In [8]:
# Load the training split of the TNM-stage report data
train_csv_path = os.path.join(data_dir, "train_tcga_reports_tnm_stage.csv")
df_train = pd.read_csv(train_csv_path)

In [9]:
# (rows, columns) of the training split
df_train.shape

(1947, 6)

In [10]:
# Class distribution of the target label in the training split
df_train[f'{tnm_label}_label'].value_counts()

m_label
M0    1821
M1     126
Name: count, dtype: int64

In [11]:
# Fit the encoder on the training labels and store the integer class ids in a new column
df_train[f'{tnm_label}_class'] = label_enc.fit_transform(df_train[f'{tnm_label}_label'])

## Validation

In [12]:
# Load the validation split of the TNM-stage report data
val_csv_path = os.path.join(data_dir, "val_tcga_reports_tnm_stage.csv")
df_val = pd.read_csv(val_csv_path)

In [13]:
# (rows, columns) of the validation split
df_val.shape

(780, 6)

In [14]:
# Reuse the encoder fitted on the training labels so that a class id means the
# same label in every split. Refitting here (fit_transform) could silently
# remap the ids if this split's label set differed; transform raises on
# labels the encoder has not seen instead.
df_val[f'{tnm_label}_class'] = label_enc.transform(df_val[f'{tnm_label}_label'])

## Test

In [15]:
# Load the held-out test split of the TNM-stage report data
test_csv_path = os.path.join(data_dir, "test_tcga_reports_tnm_stage.csv")
df_test = pd.read_csv(test_csv_path)

In [16]:
# (rows, columns) of the test split
df_test.shape

(1170, 6)

In [17]:
# Reuse the encoder fitted on the training labels so that a class id means the
# same label in every split. Refitting here (fit_transform) could silently
# remap the ids if this split's label set differed; transform raises on
# labels the encoder has not seen instead.
df_test[f'{tnm_label}_class'] = label_enc.transform(df_test[f'{tnm_label}_label'])

# Model training

## Tokenization

In [18]:
# Load the tokenizer that belongs to the pretrained checkpoint named in model_name
tokenizer = AutoTokenizer.from_pretrained(model_name)



In [19]:
# Turn every split into plain Python lists: report texts as str, class ids as int
class_col = f'{tnm_label}_class'

arr_train_text = [str(text) for text in df_train['text']]
arr_train_label = [int(class_id) for class_id in df_train[class_col]]

arr_val_text = [str(text) for text in df_val['text']]
arr_val_label = [int(class_id) for class_id in df_val[class_col]]

arr_test_text = [str(text) for text in df_test['text']]
arr_test_label = [int(class_id) for class_id in df_test[class_col]]

We first analyze the token length of each document in the corpus:

In [20]:
# Tokenize every document of all three splits without truncation or padding,
# keeping only the token ids so the true document lengths can be measured.
arr_corpus_text = arr_train_text + arr_val_text + arr_test_text
arr_tok = [
    tokenizer(doc_text, truncation=False, padding=False)['input_ids']
    for doc_text in arr_corpus_text
]

Token indices sequence length is longer than the specified maximum sequence length for this model (5074 > 4096). Running this sequence through the model will result in indexing errors


In [21]:
# Number of tokens per document, followed by summary statistics
arr_tok_len = pd.Series(list(map(len, arr_tok)))
print(arr_tok_len.describe())

count    3897.000000
mean      877.958686
std       824.078881
min        27.000000
25%       242.000000
50%       634.000000
75%      1242.000000
max      5447.000000
dtype: float64


In [22]:
# True = document has at most seq_len tokens, False = it is longer
fits_in_seq_len = arr_tok_len <= seq_len

df_fit_counts = pd.DataFrame({
    "abs": fits_in_seq_len.value_counts(normalize=False),
    "rel": fits_in_seq_len.value_counts(normalize=True)
})
print(df_fit_counts)
print()

        abs       rel
True   3877  0.994868
False    20  0.005132



Only 20 documents (about 0.5% of the corpus) are longer than the 4,096-token limit; they are truncated during tokenization below.

In [23]:
# Tokenize the training texts: cut anything beyond seq_len tokens, pad the
# batch and return PyTorch tensors.
train_tok_kwargs = {
    "truncation": True,
    "padding": True,
    "max_length": seq_len,
    "return_tensors": "pt",
}
train_encodings = tokenizer(arr_train_text, **train_tok_kwargs)

In [24]:
# Tokenize the validation texts: cut anything beyond seq_len tokens, pad the
# batch and return PyTorch tensors.
val_tok_kwargs = {
    "truncation": True,
    "padding": True,
    "max_length": seq_len,
    "return_tensors": "pt",
}
val_encodings = tokenizer(arr_val_text, **val_tok_kwargs)

In [25]:
# Tokenize the test texts: cut anything beyond seq_len tokens, pad the
# batch and return PyTorch tensors.
test_tok_kwargs = {
    "truncation": True,
    "padding": True,
    "max_length": seq_len,
    "return_tensors": "pt",
}
test_encodings = tokenizer(arr_test_text, **test_tok_kwargs)

In [26]:
# Pair the training encodings with their class ids in the project's dataset wrapper
train_label_tensor = torch.tensor(arr_train_label)
train_dataset = llm_utils.CustomDataset(encodings=train_encodings, labels=train_label_tensor)

In [27]:
# Pair the validation encodings with their class ids in the project's dataset wrapper
val_label_tensor = torch.tensor(arr_val_label)
val_dataset = llm_utils.CustomDataset(encodings=val_encodings, labels=val_label_tensor)

In [28]:
# Pair the test encodings with their class ids in the project's dataset wrapper
test_label_tensor = torch.tensor(arr_test_label)
test_dataset = llm_utils.CustomDataset(encodings=test_encodings, labels=test_label_tensor)

In [29]:
# Sanity check: the dataset sizes should match the row counts of the data frames
for split_name, split_dataset in (
    ("Train", train_dataset),
    ("Val", val_dataset),
    ("Test", test_dataset),
):
    print(f"{split_name} data length:", len(split_dataset))

Train data length: 1947
Val data length: 780
Test data length: 1170


## Model fine-tuning

In [30]:
# Seed the random number generators (transformers.set_seed) before the model is instantiated
set_seed(0)

In [31]:
# Load the pretrained checkpoint with a classification head that has one output
# per label known to the encoder.
n_classes = len(label_enc.classes_)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=n_classes)

Some weights of BigBirdForSequenceClassification were not initialized from the model checkpoint at yikuan8/Clinical-BigBird and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:
# Print a per-layer parameter summary of the loaded model
print(torchinfo.summary(model))

Layer (type:depth-idx)                                            Param #
BigBirdForSequenceClassification                                  --
├─BigBirdModel: 1-1                                               --
│    └─BigBirdEmbeddings: 2-1                                     --
│    │    └─Embedding: 3-1                                        38,674,944
│    │    └─Embedding: 3-2                                        3,145,728
│    │    └─Embedding: 3-3                                        1,536
│    │    └─LayerNorm: 3-4                                        1,536
│    │    └─Dropout: 3-5                                          --
│    └─BigBirdEncoder: 2-2                                        --
│    │    └─ModuleList: 3-6                                       85,054,464
│    └─Linear: 2-3                                                590,592
│    └─Tanh: 2-4                                                  --
├─BigBirdClassificationHead: 1-2                                

In [33]:
torch.backends.cuda.matmul.allow_tf32 = True

# Keyword arguments grouped by purpose; the values are identical to a direct call.
training_kwargs = dict(
    # --- bookkeeping ---
    output_dir=out_path,                 # where checkpoints are written
    seed=0,
    disable_tqdm=False,
    # --- hardware / data loading ---
    tf32=True,
    dataloader_num_workers=4,
    # --- optimisation ---
    num_train_epochs=epochs,
    per_device_train_batch_size=bs,      # batch size per device during training
    per_device_eval_batch_size=bs,       # batch size per device during evaluation
    learning_rate=lr,
    warmup_steps=0,                      # no learning-rate warmup
    # --- evaluate, log and checkpoint once per epoch ---
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    save_safetensors=False,
    # --- model selection: reload the checkpoint with the highest "f1" ---
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)
training_args = TrainingArguments(**training_kwargs)

In [34]:
# Wire the model, the training configuration, both datasets and the metric
# function into a single Trainer instance.
trainer_kwargs = {
    "model": model,                       # the instantiated Transformers model to fine-tune
    "args": training_args,                # training arguments defined in the previous cell
    "train_dataset": train_dataset,       # data used for the gradient updates
    "eval_dataset": val_dataset,          # data used for the per-epoch evaluation
    "compute_metrics": llm_utils.compute_metrics_text_class,
}
trainer = Trainer(**trainer_kwargs)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [35]:
# Wall-clock timestamps taken immediately before and after the training run;
# their difference is printed in the following cell.
start_time = time.time()

trainer.train()

end_time = time.time()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Pred,Support
1,0.2976,0.295119,93.7,87.8,93.7,90.7,0,49
2,0.2774,0.30976,93.7,87.8,93.7,90.7,0,49
3,0.2806,0.276799,93.7,87.8,93.7,90.7,0,49
4,0.2705,0.294411,93.7,87.8,93.7,90.7,0,49
5,0.2506,0.29724,93.1,90.2,93.1,91.2,13,49
6,0.2357,0.286851,93.1,89.8,93.1,91.0,11,49
7,0.1984,0.335915,92.4,89.6,92.4,90.8,18,49
8,0.1562,0.396941,91.8,91.0,91.8,91.4,39,49
9,0.1022,0.575433,89.6,90.1,89.6,89.8,54,49
10,0.0695,0.659178,88.5,90.2,88.5,89.3,67,49


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [36]:
# Format the elapsed wall-clock seconds as H:MM:SS
training_duration = timedelta(seconds=end_time - start_time)
print("Total training time:", str(training_duration))

Total training time: 0:38:17.868699


In [37]:
# Work on a deep copy so that the trainer's own log history is not modified below
arr_train_logs = copy.deepcopy(trainer.state.log_history)

In [38]:
# Remove the last log entry from the list and keep it separately; it is printed
# in the next cell as the overall training statistics.
train_stats = arr_train_logs.pop()

In [39]:
# Header line followed by the statistics dict on its own line
print("Training stats:", train_stats, sep="\n")

Training stats:
{'train_runtime': 2296.1607, 'train_samples_per_second': 8.479, 'train_steps_per_second': 2.121, 'total_flos': 4.126477313654784e+16, 'train_loss': 0.21387272076929864, 'epoch': 10.0, 'step': 4870}


In [40]:
# Two log entries are expected per epoch.
assert len(arr_train_logs) == epochs * 2

# Merge each even-positioned entry with the entry that follows it, giving one
# combined record per epoch (keys of the second entry win on collisions).
arr_print_logs = [
    {**first_entry, **second_entry}
    for first_entry, second_entry in zip(arr_train_logs[0::2], arr_train_logs[1::2])
]

# Index the table by 1-based epoch number
df_print_logs = pd.DataFrame(arr_print_logs, index=range(1, epochs + 1))

In [41]:
# Display the merged per-epoch log table
df_print_logs

Unnamed: 0,loss,grad_norm,learning_rate,epoch,step,eval_loss,eval_accuracy,eval_precision,eval_recall,eval_f1,eval_pred,eval_support,eval_runtime,eval_samples_per_second,eval_steps_per_second
1,0.2976,0.25247,1.8e-05,1.0,487,0.295119,93.7,87.8,93.7,90.7,0,49,18.5442,42.062,10.515
2,0.2774,0.144672,1.6e-05,2.0,974,0.30976,93.7,87.8,93.7,90.7,0,49,18.6768,41.763,10.441
3,0.2806,0.22728,1.4e-05,3.0,1461,0.276799,93.7,87.8,93.7,90.7,0,49,18.6772,41.762,10.441
4,0.2705,0.142246,1.2e-05,4.0,1948,0.294411,93.7,87.8,93.7,90.7,0,49,18.7218,41.663,10.416
5,0.2506,0.137976,1e-05,5.0,2435,0.29724,93.1,90.2,93.1,91.2,13,49,18.6786,41.759,10.44
6,0.2357,0.365701,8e-06,6.0,2922,0.286851,93.1,89.8,93.1,91.0,11,49,18.6868,41.741,10.435
7,0.1984,23.874207,6e-06,7.0,3409,0.335915,92.4,89.6,92.4,90.8,18,49,18.7061,41.698,10.424
8,0.1562,0.071523,4e-06,8.0,3896,0.396941,91.8,91.0,91.8,91.4,39,49,18.6869,41.74,10.435
9,0.1022,0.035296,2e-06,9.0,4383,0.575433,89.6,90.1,89.6,89.8,54,49,18.6794,41.757,10.439
10,0.0695,0.047155,0.0,10.0,4870,0.659178,88.5,90.2,88.5,89.3,67,49,18.5389,42.074,10.518


# Evaluation

## Validation

In [42]:
# Run the trainer's model on the validation split and keep the full prediction output
val_preds = trainer.predict(val_dataset)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [43]:
# Recompute the aggregate metrics from the stored validation predictions
print("Performance on val set:", llm_utils.compute_metrics_text_class(val_preds))

Performance on val set: {'accuracy': 91.8, 'precision': 91.0, 'recall': 91.8, 'f1': 91.4, 'pred': 39, 'support': 49}


In [44]:
# Highest-scoring class per document, mapped back to the original string labels
val_pred_class_ids = val_preds[0].argmax(axis=1)
arr_val_label_preds = label_enc.inverse_transform(val_pred_class_ids)

In [45]:
# Plain accuracy of the predicted string labels against the validation ground truth
val_true_labels = df_val[f'{tnm_label}_label'].values
accuracy_score(y_true=val_true_labels, y_pred=arr_val_label_preds)

0.9179487179487179

In [46]:
# Per-label performance table for the validation split
val_perf_kwargs = {
    "arr_gs": df_val[f'{tnm_label}_label'].values,
    "arr_preds": arr_val_label_preds,
    "arr_labels": label_enc.classes_,
    "col_label": f"{tnm_label}_label",
    "df_data": df_val,
    "df_train_data": df_train,
}
utils.calculate_performance(**val_perf_kwargs)

Unnamed: 0,label,precision,recall,f1,n_train,n_val
0,M0,0.950067,0.963064,0.956522,1821,731
1,M1,0.307692,0.244898,0.272727,126,49


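We save the raw model predictions for the validation set (the per-class output scores returned by `trainer.predict`, i.e. logits rather than normalized probabilities):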

In [47]:
# Create the output directory on first use so the dump does not fail with
# FileNotFoundError on a fresh checkout.
os.makedirs(out_preds_path, exist_ok=True)

# File name pattern: <label>_label_<checkpoint name without organisation>_val_preds.pkl
val_preds_file = os.path.join(
    out_preds_path,
    f"{tnm_label}_label_{model_name.split('/')[-1]}_val_preds.pkl"
)
with open(val_preds_file, 'wb') as file:
    # Store only the predictions array (first element of the predict() output)
    pickle.dump(val_preds[0], file)

## Test

In [48]:
# Run the trainer's model on the test split and keep the full prediction output
test_preds = trainer.predict(test_dataset)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [49]:
# Recompute the aggregate metrics from the stored test predictions
print("Performance on test set:", llm_utils.compute_metrics_text_class(test_preds))

Performance on test set: {'accuracy': 92.1, 'precision': 90.4, 'recall': 92.1, 'f1': 91.1, 'pred': 45, 'support': 74}


In [50]:
# Highest-scoring class per document, mapped back to the original string labels
test_pred_class_ids = test_preds[0].argmax(axis=1)
arr_test_label_preds = label_enc.inverse_transform(test_pred_class_ids)

In [51]:
# Plain accuracy of the predicted string labels against the test ground truth
test_true_labels = df_test[f'{tnm_label}_label'].values
accuracy_score(y_true=test_true_labels, y_pred=arr_test_label_preds)

0.9205128205128205

In [52]:
# Per-label performance table for the test split
test_perf_kwargs = {
    "arr_gs": df_test[f'{tnm_label}_label'].values,
    "arr_preds": arr_test_label_preds,
    "arr_labels": label_enc.classes_,
    "col_label": f"{tnm_label}_label",
    "df_data": df_test,
    "df_train_data": df_train,
}
utils.calculate_performance(**test_perf_kwargs)

Unnamed: 0,label,precision,recall,f1,n_train,n_val
0,M0,0.945778,0.970803,0.958127,1821,1096
1,M1,0.288889,0.175676,0.218487,126,74


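We save the raw model predictions for the test set (the per-class output scores returned by `trainer.predict`, i.e. logits rather than normalized probabilities):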

In [53]:
# Create the output directory on first use so the dump does not fail with
# FileNotFoundError on a fresh checkout.
os.makedirs(out_preds_path, exist_ok=True)

# File name pattern: <label>_label_<checkpoint name without organisation>_test_preds.pkl
test_preds_file = os.path.join(
    out_preds_path,
    f"{tnm_label}_label_{model_name.split('/')[-1]}_test_preds.pkl"
)
with open(test_preds_file, 'wb') as file:
    # Store only the predictions array (first element of the predict() output)
    pickle.dump(test_preds[0], file)