In [1]:
from datasets import load_dataset
from typing import List, Dict
import torch
dataset = load_dataset("parquet", data_files={
    "train": "/kaggle/input/dataset/train-00000-of-00001-baac38b53532b0da.parquet",
    "test": "/kaggle/input/dataset/test-00000-of-00001-1019821dbb200a34.parquet"
})

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [2]:
!pip install git+https://github.com/kmkurn/pytorch-crf.git --quiet

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for pytorch-crf (setup.py) ... [?25l[?25hdone


In [3]:
print(dataset["train"][0])

{'tokens': ['Selegiline', '-', 'induced', 'postural', 'hypotension', 'in', 'Parkinson', "'", 's', 'disease', ':', 'a', 'longitudinal', 'study', 'on', 'the', 'effects', 'of', 'drug', 'withdrawal', '.'], 'tags': [0, 0, 0, 1, 2, 0, 1, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'sentence_id': 'BC5CDR-0'}


In [4]:
from torch.utils.data import DataLoader, Dataset
from collections import Counter
def build_vocab(dataset: List[Dict], min_freq: int = 1):
    token_counter = Counter()
    for item in dataset:
        token_counter.update(item["tokens"])
    vocab = {"<PAD>": 0, "<UNK>": 1}
    for token, count in token_counter.items():
        if count >= min_freq:
            vocab[token] = len(vocab)
    return vocab
tag2idx = {"O": 0, "B-Disease": 1, "I-Disease": 2}
idx2tag = {v: k for k, v in tag2idx.items()}

class NERDataset(Dataset):
    def __init__(self, data: List[Dict], vocab: Dict[str, int], tag2idx: Dict[str, int], max_len: int = 128):
        self.data = data
        self.vocab = vocab
        self.tag2idx = tag2idx
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        tokens = self.data[idx]["tokens"][:self.max_len]
        tags = self.data[idx]["tags"][:self.max_len]

        input_ids = [self.vocab.get(token, self.vocab["<UNK>"]) for token in tokens]
        tag_ids = tags
        pad_len = self.max_len - len(input_ids)

        return {
            "input_ids": torch.tensor(input_ids + [self.vocab["<PAD>"]]*pad_len, dtype=torch.long),
            "tags": torch.tensor(tag_ids + [0]*pad_len, dtype=torch.long),
            "attention_mask": torch.tensor([1]*len(input_ids) + [0]*pad_len, dtype=torch.long)
        }

In [5]:
vocab = build_vocab(dataset["train"], min_freq=1)

train_dataset = NERDataset(dataset["train"], vocab, tag2idx)
test_dataset = NERDataset(dataset["test"], vocab, tag2idx)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)


In [6]:
import torch
import torch.nn as nn
from torchcrf import CRF

class BiLSTM_CRF(nn.Module):
    def __init__(self, vocab_size, tagset_size,hidden_dim, embedding_dim=100, pad_idx=0, dropout_rate=0.3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.embedding_dropout = nn.Dropout(dropout_rate)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2, num_layers=1, bidirectional=True, batch_first=True)
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.dropout = nn.Dropout(dropout_rate)
        self.crf = CRF(tagset_size, batch_first=True)
    
    def forward(self, input_ids, tags=None, mask=None):
        embedded = self.embedding(input_ids)
        embedded = self.embedding_dropout(embedded)
        lstm_out, _ = self.lstm(embedded)
        lstm_out = self.dropout(lstm_out) 
        emissions = self.hidden2tag(lstm_out)
        
        if tags is not None:
            loss = -self.crf(emissions, tags, mask=mask, reduction='mean')
            return loss
        else:
            predictions = self.crf.decode(emissions, mask=mask)
            return predictions

In [7]:
from sklearn.metrics import classification_report
from tqdm.notebook import tqdm
def train_step(model, dataloader, optimizer, device):
    model.train()
    total_loss = 0

    for batch in tqdm(dataloader, desc="Training"):
        batch = {k: v.to(device) for k, v in batch.items()}
        loss = model(
            input_ids=batch["input_ids"],
            tags=batch["tags"],
            mask=batch["attention_mask"].bool()
        )

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    return total_loss / len(dataloader)

def eval_step(model, dataloader, device):
    model.eval()
    total_loss = 0
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            mask = batch["attention_mask"].bool()
            loss = model(
                input_ids=batch["input_ids"],
                tags=batch["tags"],
                mask=mask
            )
            total_loss += loss.item()
            predictions = model(
                input_ids=batch["input_ids"],
                tags=None,
                mask=mask
            )
            for pred_seq, true_seq, mask_seq in zip(predictions, batch["tags"], mask):
                true_seq = true_seq[mask_seq].cpu().tolist()
                all_preds.extend(pred_seq)
                all_labels.extend(true_seq)

    report = classification_report(all_labels, all_preds, output_dict=True, zero_division=0)
    return total_loss / len(dataloader), report


In [8]:
from torch import nn
import torch.optim as optim
from transformers import get_scheduler
from tqdm.notebook import tqdm

def train_model(model, train_loader, test_loader, device, epochs=3, lr=2e-5):
    model.to(device)
    optimizer = optim.AdamW(model.parameters(), lr=lr,weight_decay=1e-5)

    scheduler = get_scheduler(
        name="linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=epochs * len(train_loader)
    )

    best_f1 = 0
    for epoch in tqdm(range(epochs), desc="Epochs"):
        print(f"\nEpoch {epoch+1}/{epochs}")

        train_loss = train_step(model, train_loader, optimizer, device)
        test_loss, test_report = eval_step(model, test_loader, device)

        scheduler.step()
        f1 = test_report['macro avg']['f1-score']
        print(f"Train Loss: {train_loss:.4f} | Test Loss: {test_loss:.4f}")
        print(f"F1-score: {f1:.4f}")
        print(f"Recall:    {test_report['macro avg']['recall']:.4f}")
        print(f"Precision: {test_report['macro avg']['precision']:.4f}")

        if f1 > best_f1:
            best_f1 = f1
            torch.save(model.state_dict(), "best_model.pt")
            print(f"Saved new best model at epoch {epoch+1} with F1-score {f1:.4f}")

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BiLSTM_CRF(
    vocab_size=len(vocab),
    tagset_size=len(tag2idx),
    hidden_dim=300,
    embedding_dim=100,
    pad_idx=vocab["<PAD>"],
)
train_model(model, train_dataloader, test_dataloader, device, epochs=15, lr=1e-3)

Epochs:   0%|          | 0/15 [00:00<?, ?it/s]


Epoch 1/15


Training:   0%|          | 0/484 [00:00<?, ?it/s]

Train Loss: 5.4180 | Test Loss: 3.2916
F1-score: 0.6742
Recall:    0.6150
Precision: 0.7761
Saved new best model at epoch 1 with F1-score 0.6742

Epoch 2/15


Training:   0%|          | 0/484 [00:00<?, ?it/s]

Train Loss: 2.8062 | Test Loss: 2.2780
F1-score: 0.7739
Recall:    0.7347
Precision: 0.8239
Saved new best model at epoch 2 with F1-score 0.7739

Epoch 3/15


Training:   0%|          | 0/484 [00:00<?, ?it/s]

Train Loss: 1.9768 | Test Loss: 1.8120
F1-score: 0.8092
Recall:    0.7757
Precision: 0.8490
Saved new best model at epoch 3 with F1-score 0.8092

Epoch 4/15


Training:   0%|          | 0/484 [00:00<?, ?it/s]

Train Loss: 1.5147 | Test Loss: 1.6460
F1-score: 0.8247
Recall:    0.8086
Precision: 0.8430
Saved new best model at epoch 4 with F1-score 0.8247

Epoch 5/15


Training:   0%|          | 0/484 [00:00<?, ?it/s]

Train Loss: 1.2088 | Test Loss: 1.5613
F1-score: 0.8282
Recall:    0.8344
Precision: 0.8235
Saved new best model at epoch 5 with F1-score 0.8282

Epoch 6/15


Training:   0%|          | 0/484 [00:00<?, ?it/s]

Train Loss: 0.9887 | Test Loss: 1.4361
F1-score: 0.8408
Recall:    0.8389
Precision: 0.8433
Saved new best model at epoch 6 with F1-score 0.8408

Epoch 7/15


Training:   0%|          | 0/484 [00:00<?, ?it/s]

Train Loss: 0.8284 | Test Loss: 1.3809
F1-score: 0.8430
Recall:    0.8366
Precision: 0.8502
Saved new best model at epoch 7 with F1-score 0.8430

Epoch 8/15


Training:   0%|          | 0/484 [00:00<?, ?it/s]

Train Loss: 0.6994 | Test Loss: 1.3983
F1-score: 0.8433
Recall:    0.8505
Precision: 0.8368
Saved new best model at epoch 8 with F1-score 0.8433

Epoch 9/15


Training:   0%|          | 0/484 [00:00<?, ?it/s]

Train Loss: 0.5987 | Test Loss: 1.3810
F1-score: 0.8451
Recall:    0.8485
Precision: 0.8419
Saved new best model at epoch 9 with F1-score 0.8451

Epoch 10/15


Training:   0%|          | 0/484 [00:00<?, ?it/s]

Train Loss: 0.5438 | Test Loss: 1.3808
F1-score: 0.8519
Recall:    0.8532
Precision: 0.8510
Saved new best model at epoch 10 with F1-score 0.8519

Epoch 11/15


Training:   0%|          | 0/484 [00:00<?, ?it/s]

Train Loss: 0.4630 | Test Loss: 1.4394
F1-score: 0.8469
Recall:    0.8457
Precision: 0.8480

Epoch 12/15


Training:   0%|          | 0/484 [00:00<?, ?it/s]

Train Loss: 0.4183 | Test Loss: 1.4737
F1-score: 0.8474
Recall:    0.8651
Precision: 0.8312

Epoch 13/15


Training:   0%|          | 0/484 [00:00<?, ?it/s]

Train Loss: 0.3711 | Test Loss: 1.4334
F1-score: 0.8481
Recall:    0.8645
Precision: 0.8337

Epoch 14/15


Training:   0%|          | 0/484 [00:00<?, ?it/s]

Train Loss: 0.3372 | Test Loss: 1.4179
F1-score: 0.8543
Recall:    0.8579
Precision: 0.8507
Saved new best model at epoch 14 with F1-score 0.8543

Epoch 15/15


Training:   0%|          | 0/484 [00:00<?, ?it/s]

Train Loss: 0.3108 | Test Loss: 1.4755
F1-score: 0.8502
Recall:    0.8540
Precision: 0.8466
