# load dataset

In [34]:
from datasets import load_dataset

raw_dataset = load_dataset("tner/bc5cdr")

  0%|          | 0/3 [00:00<?, ?it/s]

## dataset labels

```python
label2id = {
    "O": 0,
    "B-Chemical": 1,
    "B-Disease": 2,
    "I-Disease": 3,
    "I-Chemical": 4
}
```

## dataset structure

In [35]:
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 5228
    })
    validation: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 5330
    })
    test: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 5865
    })
})

In [36]:
raw_dataset["train"].features

{'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'tags': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None)}

In [37]:
raw_dataset["train"][0]

{'tokens': ['Naloxone',
  'reverses',
  'the',
  'antihypertensive',
  'effect',
  'of',
  'clonidine',
  '.'],
 'tags': [1, 0, 0, 0, 0, 0, 1, 0]}

# import modules

In [38]:
!pip install evaluate
!pip install seqeval

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [39]:
# data preprocessing
from transformers import AutoTokenizer

# dataloader
from transformers import DataCollatorForTokenClassification
from torch.utils.data.dataloader import DataLoader

# model fine-tuning
from transformers import AutoModelForTokenClassification, get_scheduler
from tqdm.auto import tqdm
import torch

# model evaluation
import evaluate
import seqeval

# hyperparameters

In [40]:
label_names = ["0", "B-Chemical", "B-Disease", "I-Disease", "I-Chemical"]

model_checkpoint = "distilbert-base-uncased"
batch_size = 32
num_train_epochs = 15
learning_rate = 0.0001

# preprocess dataset

In [41]:
# reorganize columns

raw_dataset = raw_dataset.rename_column("tokens", "words")
raw_dataset = raw_dataset.rename_column("tags", "labels")

In [42]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [43]:
def shift_label(label):
	# change B-XXX to I-XXX
	label_map = {1: 4, 2: 3}
	label = label_map.get(label, label) # (arg1:key for target value, arg2:default value)
	return label

def align_labels_with_tokens(labels, word_ids):
	new_labels = []
	current_word = None
	for word_id in word_ids:
		if word_id is None: # default word id for special tokens
			new_labels.append(-100)
		elif word_id != current_word:
			# start a new word!
			current_word = word_id
			new_labels.append(labels[word_id])
		else:
			new_labels.append(shift_label(labels[word_id]))
	return new_labels

def tokenize_and_align_labels(examples):
	tokenized_inputs = tokenizer(examples["words"], truncation=True, is_split_into_words=True)
	new_labels_list = []
	for i, labels in enumerate(examples["labels"]):
		word_ids = tokenized_inputs.word_ids(i)
		new_labels_list.append(align_labels_with_tokens(labels, word_ids))
	tokenized_inputs["labels"] = new_labels_list
	return tokenized_inputs

In [44]:
tokenized_datasets = raw_dataset.map(tokenize_and_align_labels, batched=True)

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

In [45]:
datasets = ["train", "validation", "test"]

for split in datasets:
    dataset = tokenized_datasets[split]
    dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

In [46]:
tokenized_datasets["train"][0]

{'labels': tensor([-100,    1,    4,    4,    4,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    1,    4,    4,    0, -100]),
 'input_ids': tensor([  101,  6583,  4135, 22500,  2063,  7901,  2015,  1996,  3424, 10536,
          4842, 25808,  3512,  3466,  1997, 18856, 10698, 10672,  1012,   102]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])}

# prepare training & eval data

In [47]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [48]:
train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, batch_size=batch_size, 
collate_fn=data_collator)

eval_dataloader = DataLoader(tokenized_datasets["validation"], batch_size=batch_size, collate_fn=data_collator)

# fine-tune model

In [49]:
label_names = ["0", "B-Chemical", "B-Disease", "I-Disease", "I-Chemical"]

id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

In [50]:
# instantiate model

model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, id2label=id2label, label2id=label2id)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [51]:
# check number of labels

model.config.num_labels

5

In [52]:
# check model inputs

for batch in train_dataloader:
    break
{k: v.shape for k, v in batch.items()}

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': torch.Size([32, 79]),
 'attention_mask': torch.Size([32, 79]),
 'labels': torch.Size([32, 79])}

In [53]:
# move model to GPU

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

DistilBertForTokenClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
    

In [54]:
# define optimizer and dynamic learning rate

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [55]:
# define metrics

metric = evaluate.load("seqeval")

def postprocess(predictions, labels):
    predictions = predictions.detach().cpu().clone().numpy()
    labels = labels.detach().cpu().clone().numpy()

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return true_predictions, true_labels

In [56]:
# full training loop

progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    
    # Training
    model.train()
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        
        outputs = model(**batch)
        
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()

        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        
        with torch.no_grad():
            outputs = model(**batch)

        predictions = outputs.logits.argmax(dim=-1)
        labels = batch["labels"]

        true_predictions, true_labels = postprocess(predictions, labels)
        metric.add_batch(predictions=true_predictions, references=true_labels)

    results = metric.compute()
    
    print(
        f"epoch {epoch+1}:",
        {
            key: results[f"overall_{key}"]
            for key in ["precision", "recall", "f1", "accuracy"]
        },
    )

  0%|          | 0/2460 [00:00<?, ?it/s]

epoch 1: {'precision': 0.7508711707494669, 'recall': 0.7804627527300249, 'f1': 0.7653810470510272, 'accuracy': 0.9562497879990502}
epoch 2: {'precision': 0.770694087403599, 'recall': 0.8103578765271922, 'f1': 0.7900284599978918, 'accuracy': 0.9578915233540246}
epoch 3: {'precision': 0.7837304677360744, 'recall': 0.816142285652503, 'f1': 0.7996080612271922, 'accuracy': 0.9591669210678064}
epoch 4: {'precision': 0.8051196016884944, 'recall': 0.8042491080116769, 'f1': 0.8046841194288187, 'accuracy': 0.9598860282894067}
epoch 5: {'precision': 0.7851124321530111, 'recall': 0.8210617364039355, 'f1': 0.8026847765769098, 'accuracy': 0.9589837522472101}
epoch 6: {'precision': 0.8094096334185849, 'recall': 0.8212239160990378, 'f1': 0.8152739762786455, 'accuracy': 0.9622061666836268}
epoch 7: {'precision': 0.8165719282903366, 'recall': 0.8076548816088226, 'f1': 0.8120889275425341, 'accuracy': 0.9603066381737391}
epoch 8: {'precision': 0.8060844591000957, 'recall': 0.8193318196561791, 'f1': 0.8126

In [57]:
save_path = '/kaggle/working/distilbert_ner_v4_pytorch'
model.save_pretrained(save_path)

# test model

### using test_dataloader

In [58]:
test_dataloader = DataLoader(tokenized_datasets["test"], batch_size=batch_size, collate_fn=data_collator)

In [59]:
y_preds = []

for batch in test_dataloader:
    batch = {k:v.to(device) for k,v in batch.items()}
    
    with torch.no_grad():
        outputs = model(**batch)
        
    predictions = outputs.logits.argmax(dim=-1)
    labels = batch["labels"]
    y_preds.extend(predictions)

    true_predictions, true_labels = postprocess(predictions, labels)
    metric.add_batch(predictions=true_predictions, references=true_labels)

results = metric.compute()
print(
    {
        key: results[f"overall_{key}"]
        for key in ["precision", "recall", "f1", "accuracy"]
    },
)

{'precision': 0.7914875990265624, 'recall': 0.8086974923288541, 'f1': 0.8, 'accuracy': 0.9603710489564248}


In [60]:
y_test = tokenized_datasets["test"]["labels"]

def postprocess_for_testing(predictions, labels):
    
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return true_predictions, true_labels

In [61]:
seq_preds, seq_labels = postprocess_for_testing(y_preds, y_test)

In [62]:
print(len(seq_preds))
print(len(seq_labels))

5865
5865


## classification report

In [63]:
from seqeval.metrics import classification_report
from seqeval.metrics import f1_score

In [64]:
print(f1_score(seq_preds, seq_labels))
print(classification_report(seq_preds, seq_labels))

0.8
              precision    recall  f1-score   support

    Chemical       0.88      0.87      0.87      5480
     Disease       0.81      0.79      0.80      4548
           _       0.77      0.75      0.76      9285

   micro avg       0.81      0.79      0.80     19313
   macro avg       0.82      0.80      0.81     19313
weighted avg       0.81      0.79      0.80     19313



In [67]:
print(seq_preds[567])
print(seq_labels[567])

['0', '0', '0', '0', '0', '0', 'B-Chemical', 'I-Chemical', 'B-Disease', '0']
['B-Disease', 'I-Disease', 'I-Disease', 'I-Disease', '0', '0', '0', 'B-Chemical', 'B-Disease', '0']
