This notebook uses a BERT-based model to predict the 'N' label of the TNM staging classification.

In [1]:
import os
current_dir = "/home/lopezgg/common/projects/ai-campus/AI-Campus-Project-7-NLP/code/tnm_stage"
os.chdir(current_dir)

In [2]:
import os
import pickle
from datetime import timedelta
import numpy as np
import pandas as pd
import time
import copy

import sys
sys.path.append('..')
import utils

from sklearn.preprocessing import LabelEncoder

import torch
import torchinfo
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, set_seed
from sklearn.metrics import accuracy_score


In [3]:
# Constants and arguments
seq_len = 4096
epochs = 10
lr = 2e-5
bs = 6
cuda_gpu_id = "0"

tnm_label = 't'

model_name = "yikuan8/Clinical-BigBird"
data_dir = "../../data/tnm_stage"
out_path = "./model_weights"
out_preds_path = "./model_preds"

In [4]:
if cuda_gpu_id != "-1":
    os.environ["CUDA_VISIBLE_DEVICES"] = cuda_gpu_id
_ = torch.device('cuda')
torch.backends.cuda.matmul.allow_tf32 = True
assert torch.cuda.is_available()
print("Number of GPUs available:", torch.cuda.device_count())

Number of GPUs available: 1


# Data loading

In [5]:
label_enc = LabelEncoder()

## Training

In [6]:
df_train = pd.read_csv(os.path.join(data_dir, "train_tcga_reports_tnm_stage.csv"))

In [7]:
df_train.shape

(1947, 6)

In [8]:
df_train[f'{tnm_label}_label'].value_counts()

t_label
T2    689
T3    596
T1    435
T4    227
Name: count, dtype: int64

In [9]:
df_train[f'{tnm_label}_class'] = label_enc.fit_transform(df_train[f'{tnm_label}_label'])

## Validation

In [10]:
df_val = pd.read_csv(os.path.join(data_dir, "val_tcga_reports_tnm_stage.csv"))

In [11]:
df_val.shape

(780, 6)

In [12]:
df_val[f'{tnm_label}_class'] = label_enc.fit_transform(df_val[f'{tnm_label}_label'])

## Test

In [13]:
df_test = pd.read_csv(os.path.join(data_dir, "test_tcga_reports_tnm_stage.csv"))

In [14]:
df_test.shape

(1170, 6)

In [15]:
df_test[f'{tnm_label}_class'] = label_enc.fit_transform(df_test[f'{tnm_label}_label'])

# Model training

## Tokenization

In [16]:
tokenizer = AutoTokenizer.from_pretrained(model_name)



In [17]:
arr_train_text = df_train['text'].apply(str).to_list()
arr_train_label = df_train[f'{tnm_label}_class'].apply(int).to_list()

arr_val_text = df_val['text'].apply(str).to_list()
arr_val_label = df_val[f'{tnm_label}_class'].apply(int).to_list()

arr_test_text = df_test['text'].apply(str).to_list()
arr_test_label = df_test[f'{tnm_label}_class'].apply(int).to_list()

We first analyze the token length of each document in the corpus:

In [18]:
arr_corpus_text = arr_train_text + arr_val_text + arr_test_text
arr_tok = []
for document in arr_corpus_text:
    tokens = tokenizer(
        document,
        truncation=False,
        padding=False
    )
    arr_tok.append(tokens['input_ids'])

Token indices sequence length is longer than the specified maximum sequence length for this model (5074 > 4096). Running this sequence through the model will result in indexing errors


In [19]:
arr_tok_len = pd.Series([len(seq) for seq in arr_tok])
print(arr_tok_len.describe())

count    3897.000000
mean      877.958686
std       824.078881
min        27.000000
25%       242.000000
50%       634.000000
75%      1242.000000
max      5447.000000
dtype: float64


In [20]:
print(pd.DataFrame({
    "abs": (arr_tok_len <= seq_len).value_counts(normalize=False),
    "rel": (arr_tok_len <= seq_len).value_counts(normalize=True)
}))
print()

        abs       rel
True   3877  0.994868
False    20  0.005132



Only 20 documents do not fit into the model.

In [21]:
train_encodings = tokenizer(
    arr_train_text,
    truncation=True,
    padding=True,
    max_length=seq_len,
    return_tensors="pt"
)

In [22]:
val_encodings = tokenizer(
    arr_val_text,
    truncation=True,
    padding=True,
    max_length=seq_len,
    return_tensors="pt"
)

In [23]:
test_encodings = tokenizer(
    arr_test_text,
    truncation=True,
    padding=True,
    max_length=seq_len,
    return_tensors="pt"
)

In [24]:
train_dataset = utils.CustomDataset(
    encodings=train_encodings,
    labels=torch.tensor(arr_train_label)
)

In [25]:
val_dataset = utils.CustomDataset(
    encodings=val_encodings,
    labels=torch.tensor(arr_val_label)
)

In [26]:
test_dataset = utils.CustomDataset(
    encodings=test_encodings,
    labels=torch.tensor(arr_test_label)
)

In [27]:
print("Train data length:", len(train_dataset))
print("Val data length:", len(val_dataset))
print("Test data length:", len(test_dataset))

Train data length: 1947
Val data length: 780
Test data length: 1170


## Model fine-tuning

In [28]:
set_seed(0)

In [29]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_enc.classes_)
)

Some weights of BigBirdForSequenceClassification were not initialized from the model checkpoint at yikuan8/Clinical-BigBird and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
print(torchinfo.summary(model))

Layer (type:depth-idx)                                            Param #
BigBirdForSequenceClassification                                  --
├─BigBirdModel: 1-1                                               --
│    └─BigBirdEmbeddings: 2-1                                     --
│    │    └─Embedding: 3-1                                        38,674,944
│    │    └─Embedding: 3-2                                        3,145,728
│    │    └─Embedding: 3-3                                        1,536
│    │    └─LayerNorm: 3-4                                        1,536
│    │    └─Dropout: 3-5                                          --
│    └─BigBirdEncoder: 2-2                                        --
│    │    └─ModuleList: 3-6                                       85,054,464
│    └─Linear: 2-3                                                590,592
│    └─Tanh: 2-4                                                  --
├─BigBirdClassificationHead: 1-2                                

In [31]:
torch.backends.cuda.matmul.allow_tf32 = True

training_args = TrainingArguments(
    tf32=True,
    dataloader_num_workers=4,
    output_dir=out_path,          # output directory
    disable_tqdm=False,
    num_train_epochs=epochs,
    per_device_train_batch_size=bs,  # batch size per device during training
    per_device_eval_batch_size=bs,   # batch size for evaluation
    learning_rate=lr,
    warmup_steps=0,                # number of warmup steps for learning rate scheduler
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    save_safetensors=False,
    seed=0
)

In [32]:
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,             # evaluation dataset
    compute_metrics=utils.compute_metrics_text_class
)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [33]:
start_time = time.time()

trainer.train()

end_time = time.time()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.208,1.162644,52.1,52.6,52.1,50.5
2,1.0539,1.099547,54.4,55.6,54.4,53.8
3,0.844,0.73137,74.5,75.1,74.5,73.9
4,0.4977,0.679947,77.3,77.3,77.3,77.2
5,0.3786,0.719719,79.9,80.0,79.9,79.8
6,0.2715,0.882529,78.6,78.8,78.6,78.5
7,0.2113,1.020149,78.3,78.7,78.3,78.3
8,0.1528,1.01135,80.1,80.2,80.1,80.1
9,0.1177,1.07833,79.2,79.5,79.2,79.2
10,0.0864,1.084226,79.2,79.4,79.2,79.2


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [34]:
print("Total training time:", str(timedelta(seconds=end_time - start_time)))

Total training time: 0:37:04.148488


In [35]:
arr_train_logs = copy.deepcopy(trainer.state.log_history)

In [36]:
train_stats = arr_train_logs.pop()

In [37]:
print("Training stats:")
print(train_stats)

Training stats:
{'train_runtime': 2222.5959, 'train_samples_per_second': 8.76, 'train_steps_per_second': 1.462, 'total_flos': 4.12655090614272e+16, 'train_loss': 0.48219405834491436, 'epoch': 10.0, 'step': 3250}


In [38]:
assert len(arr_train_logs) == epochs * 2

arr_print_logs = []
for i in range(0, len(arr_train_logs), 2):
    arr_print_logs.append({**arr_train_logs[i], **arr_train_logs[i+1]})

df_print_logs = pd.DataFrame(
    arr_print_logs,
    index=range(1, epochs+1)
)

In [39]:
df_print_logs

Unnamed: 0,loss,grad_norm,learning_rate,epoch,step,eval_loss,eval_accuracy,eval_precision,eval_recall,eval_f1,eval_runtime,eval_samples_per_second,eval_steps_per_second
1,1.208,7.888074,1.8e-05,1.0,325,1.162644,52.1,52.6,52.1,50.5,18.0811,43.139,7.19
2,1.0539,10.183757,1.6e-05,2.0,650,1.099547,54.4,55.6,54.4,53.8,18.2052,42.845,7.141
3,0.844,15.342978,1.4e-05,3.0,975,0.73137,74.5,75.1,74.5,73.9,18.2033,42.849,7.142
4,0.4977,29.082773,1.2e-05,4.0,1300,0.679947,77.3,77.3,77.3,77.2,18.1886,42.884,7.147
5,0.3786,6.901114,1e-05,5.0,1625,0.719719,79.9,80.0,79.9,79.8,18.2359,42.773,7.129
6,0.2715,0.166738,8e-06,6.0,1950,0.882529,78.6,78.8,78.6,78.5,18.2333,42.779,7.13
7,0.2113,14.992931,6e-06,7.0,2275,1.020149,78.3,78.7,78.3,78.3,18.2128,42.827,7.138
8,0.1528,0.058437,4e-06,8.0,2600,1.01135,80.1,80.2,80.1,80.1,18.2172,42.817,7.136
9,0.1177,31.321905,2e-06,9.0,2925,1.07833,79.2,79.5,79.2,79.2,18.2102,42.833,7.139
10,0.0864,0.037645,0.0,10.0,3250,1.084226,79.2,79.4,79.2,79.2,18.1317,43.019,7.17


# Evaluation

## Validation

In [40]:
val_preds = trainer.predict(val_dataset)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [41]:
print("Performance on val set:", utils.compute_metrics_text_class(val_preds))

Performance on val set: {'accuracy': 80.1, 'precision': 80.2, 'recall': 80.1, 'f1': 80.1}


In [42]:
arr_val_label_preds = label_enc.inverse_transform(val_preds[0].argmax(axis=1))

In [43]:
accuracy_score(
    y_true=df_val[f'{tnm_label}_label'].values,
    y_pred=arr_val_label_preds
)

0.8012820512820513

In [44]:
utils.calculate_performance(
    arr_gs=df_val[f'{tnm_label}_label'].values,
    arr_preds=arr_val_label_preds,
    arr_labels=label_enc.classes_,
    col_label=f"{tnm_label}_label",
    df_data=df_val,
    df_train_data=df_train
)

Unnamed: 0,label,precision,recall,f1,n_train,n_val
0,T1,0.766667,0.788571,0.777465,435,175
1,T2,0.790941,0.825455,0.807829,689,275
2,T3,0.831224,0.820833,0.825996,596,240
3,T4,0.828947,0.7,0.759036,227,90


We save the model predictions (probability values):

In [45]:
with open(
    os.path.join(out_preds_path, f"{tnm_label}_label_{model_name.split('/')[-1]}_val_preds.pkl"),
    'wb'
) as file:
    pickle.dump(val_preds[0], file)

## Test

In [46]:
test_preds = trainer.predict(test_dataset)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [47]:
print("Performance on test set:", utils.compute_metrics_text_class(test_preds))

Performance on test set: {'accuracy': 79.6, 'precision': 79.6, 'recall': 79.6, 'f1': 79.6}


In [48]:
arr_test_label_preds = label_enc.inverse_transform(test_preds[0].argmax(axis=1))

In [49]:
accuracy_score(
    y_true=df_test[f'{tnm_label}_label'].values,
    y_pred=arr_test_label_preds
)

0.7957264957264957

In [50]:
utils.calculate_performance(
    arr_gs=df_test[f'{tnm_label}_label'].values,
    arr_preds=arr_test_label_preds,
    arr_labels=label_enc.classes_,
    col_label=f"{tnm_label}_label",
    df_data=df_test,
    df_train_data=df_train
)

Unnamed: 0,label,precision,recall,f1,n_train,n_val
0,T1,0.753906,0.736641,0.745174,435,262
1,T2,0.799065,0.830097,0.814286,689,412
2,T3,0.814085,0.802778,0.808392,596,360
3,T4,0.816794,0.786765,0.801498,227,136


We save the model predictions (probability values):

In [51]:
with open(
    os.path.join(out_preds_path, f"{tnm_label}_label_{model_name.split('/')[-1]}_test_preds.pkl"),
    'wb'
) as file:
    pickle.dump(test_preds[0], file)