In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from collections import Counter
import numpy as np
from tqdm import tqdm
import pandas as pd
import os

In [3]:
MODEL_NAME = 'bert-base-uncased'
BATCH_SIZE = 16
LEARNING_RATE = 2e-5
EPOCHS = 10
EARLY_STOPPING_PATIENCE = 3
RANDOM_SEED = 42
FILE_PATH_TRAIN = '/content/drive/MyDrive/medical-tc/medical_tc_train.csv'
FILE_PATH_TEST = '/content/drive/MyDrive/medical-tc/medical_tc_test.csv'
TEXT_COL = 'medical_abstract'
LABEL_COL = 'condition_label'

In [4]:
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

In [5]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Đang sử dụng GPU: {torch.cuda.get_device_name(0)}")
else:
    device = torch.device("cpu")
    print("Không tìm thấy GPU, đang sử dụng CPU.")

Đang sử dụng GPU: Tesla T4


In [6]:
data_train = pd.read_csv(FILE_PATH_TRAIN)
data_test = pd.read_csv(FILE_PATH_TEST)

unique_labels = sorted(data_train[LABEL_COL].unique())
label_to_id = {label: id for id, label in enumerate(unique_labels)}

data_train['label_id'] = data_train[LABEL_COL].map(label_to_id).astype(int)
data_test['label_id'] = data_test[LABEL_COL].map(label_to_id).astype(int)

train_texts = data_train[TEXT_COL].tolist()
train_labels = data_train['label_id'].tolist()
test_texts = data_test[TEXT_COL].tolist()
test_labels = data_test['label_id'].tolist()

In [7]:
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts,
    train_labels,
    test_size=0.2,
    random_state=RANDOM_SEED,
    stratify=train_labels
)

In [8]:
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

class MedicalDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = str(self.texts[item])
        label = self.labels[item]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

train_dataset = MedicalDataset(train_texts, train_labels, tokenizer)
val_dataset = MedicalDataset(val_texts, val_labels, tokenizer)
test_dataset = MedicalDataset(test_texts, test_labels, tokenizer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [9]:
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [10]:
num_classes = len(unique_labels)
print(f"Số lượng classes: {num_classes}")
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_classes)
model.to(device)

Số lượng classes: 5


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [11]:
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
total_steps = len(train_dataloader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

In [12]:
def evaluate(model, dataloader):
    model.eval()
    predictions = []
    actual_labels = []
    total_loss = 0
    with torch.no_grad():
        for batch in tqdm(dataloader, desc='Evaluating'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits
            total_loss += loss.item()
            preds = torch.argmax(logits, dim=1).flatten().cpu().numpy()
            actual = labels.flatten().cpu().numpy()
            predictions.extend(preds)
            actual_labels.extend(actual)
    avg_loss = total_loss / len(dataloader)
    precision, recall, f1, _ = precision_recall_fscore_support(actual_labels, predictions, average='weighted', zero_division=0)
    accuracy = accuracy_score(actual_labels, predictions)
    return avg_loss, accuracy, precision, recall, f1

In [13]:
best_f1 = 0
patience_counter = 0

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for batch in tqdm(train_dataloader, desc=f'Epoch {epoch+1}'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)
    val_loss, val_accuracy, val_precision, val_recall, val_f1 = evaluate(model, val_dataloader)

    print(f'Epoch {epoch+1}/{EPOCHS}')
    print(f'  Train Loss: {avg_train_loss:.4f}')
    print(f'  Val Loss:   {val_loss:.4f}')
    print(f'  Val Accuracy:   {val_accuracy:.4f}')
    print(f'  Val Precision:  {val_precision:.4f}')
    print(f'  Val Recall:     {val_recall:.4f}')
    print(f'  Val F1 Score:   {val_f1:.4f}')

    if val_f1 > best_f1:
        best_f1 = val_f1
        patience_counter = 0
        model.save_pretrained('./best_model')
    else:
        patience_counter += 1
        if patience_counter >= EARLY_STOPPING_PATIENCE:
            print(f'Dừng sớm do không có cải thiện F1 score trong {EARLY_STOPPING_PATIENCE} epochs.')
            break

print(f'\nHuấn luyện hoàn tất! F1 Score tốt nhất trên tập validation: {best_f1:.4f}')

Epoch 1: 100%|██████████| 578/578 [03:15<00:00,  2.95it/s]
Evaluating: 100%|██████████| 145/145 [00:23<00:00,  6.09it/s]


Epoch 1/10
  Train Loss: 1.0501
  Val Loss:   0.9098
  Val Accuracy:   0.6307
  Val Precision:  0.6250
  Val Recall:     0.6307
  Val F1 Score:   0.6199


Epoch 2: 100%|██████████| 578/578 [03:17<00:00,  2.92it/s]
Evaluating: 100%|██████████| 145/145 [00:23<00:00,  6.11it/s]


Epoch 2/10
  Train Loss: 0.8331
  Val Loss:   0.8968
  Val Accuracy:   0.6186
  Val Precision:  0.6082
  Val Recall:     0.6186
  Val F1 Score:   0.6067


Epoch 3: 100%|██████████| 578/578 [03:17<00:00,  2.92it/s]
Evaluating: 100%|██████████| 145/145 [00:23<00:00,  6.13it/s]


Epoch 3/10
  Train Loss: 0.7357
  Val Loss:   0.9059
  Val Accuracy:   0.6234
  Val Precision:  0.6190
  Val Recall:     0.6234
  Val F1 Score:   0.6148


Epoch 4: 100%|██████████| 578/578 [03:17<00:00,  2.92it/s]
Evaluating: 100%|██████████| 145/145 [00:23<00:00,  6.22it/s]

Epoch 4/10
  Train Loss: 0.6450
  Val Loss:   0.9884
  Val Accuracy:   0.5991
  Val Precision:  0.5971
  Val Recall:     0.5991
  Val F1 Score:   0.5834
Dừng sớm do không có cải thiện F1 score trong 3 epochs.

Huấn luyện hoàn tất! F1 Score tốt nhất trên tập validation: 0.6199





In [14]:
best_model = BertForSequenceClassification.from_pretrained('./best_model')
best_model.to(device)
test_loss, test_accuracy, test_precision, test_recall, test_f1 = evaluate(best_model, test_dataloader)
print(f'\nBest Model on Test Set:')
print(f'  Test Loss:    {test_loss:.4f}')
print(f'  Test Accuracy:  {test_accuracy:.4f}')
print(f'  Test Precision: {test_precision:.4f}')
print(f'  Test Recall:    {test_recall:.4f}')
print(f'  Test F1 Score:  {test_f1:.4f}')

Evaluating: 100%|██████████| 181/181 [00:31<00:00,  5.78it/s]


Best Model on Test Set:
  Test Loss:    0.9186
  Test Accuracy:  0.6170
  Test Precision: 0.6082
  Test Recall:    0.6170
  Test F1 Score:  0.6051



