In [1]:
!pip install datasets -q
!pip install tokenizers -q
!pip install transformers -q
!pip install seqeval -q
!pip install datasets transformers==4.28.0

Collecting transformers==4.28.0
  Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m40.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.33.0
    Uninstalling transformers-4.33.0:
      Successfully uninstalled transformers-4.33.0
Successfully installed transformers-4.28.0


In [2]:
import torch
import numpy as np
import pandas as pd
from datasets import load_metric
from torch.utils.data import DataLoader
from datasets import Dataset, ClassLabel, Sequence, Features, Value, DatasetDict
from transformers import AutoTokenizer,AutoModelForTokenClassification, AdamW, DataCollatorForTokenClassification



In [3]:
file="/kaggle/input/151s5d1fs6e15fa/"
df = pd.read_json(file+'train.json',lines=True)
test_df = pd.read_json(file+'test.json',lines=True)
valid_df = pd.read_json(file+'valid.json',lines=True)
print(len(df))
print(len(test_df))
print(len(valid_df))
df[:2]

5228
5865
5330


Unnamed: 0,tags,tokens
0,"[1, 0, 0, 0, 0, 0, 1, 0]","[Naloxone, reverses, the, antihypertensive, ef..."
1,"[0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[In, unanesthetized, ,, spontaneously, hyperte..."


In [4]:
tag_name = ["O",
    "B-Chemical",
    "B-Disease",
    "I-Disease",
    "I-Chemical"]

In [5]:
tags = ClassLabel(num_classes=len(tag_name), names=tag_name)

In [6]:
tags

ClassLabel(num_classes=5, names=['O', 'B-Chemical', 'B-Disease', 'I-Disease', 'I-Chemical'], id=None)

In [7]:
dataset_structure = {"ner_tags":Sequence(tags),
                 'tokens': Sequence(feature=Value(dtype='string'))}

In [8]:
dataset_structure

{'ner_tags': Sequence(feature=ClassLabel(num_classes=5, names=['O', 'B-Chemical', 'B-Disease', 'I-Disease', 'I-Chemical'], id=None), length=-1, id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}

In [9]:
dataset_structure["ner_tags"].feature.names

['O', 'B-Chemical', 'B-Disease', 'I-Disease', 'I-Chemical']

In [10]:
def df_to_dataset(df, columns=['tags', 'tokens']):
  ner_tags = df['tags']
  tokens = df['tokens']
  d = {'ner_tags':ner_tags, 'tokens':tokens}
  dataset = Dataset.from_dict(mapping=d,features=Features(dataset_structure),)
  return dataset

dataset = df_to_dataset(df)
test_dataset =  df_to_dataset(test_df)
valid_dataset =  df_to_dataset(valid_df)

dataset = DatasetDict({
    'train': dataset,
    'test': test_dataset,
    'valid': valid_dataset})

label_names = dataset['train'].features["ner_tags"].feature.names
label_names

['O', 'B-Chemical', 'B-Disease', 'I-Disease', 'I-Chemical']

In [11]:
dataset['train']

Dataset({
    features: ['ner_tags', 'tokens'],
    num_rows: 5228
})

In [12]:
dataset['train'][:1]

{'ner_tags': [[1, 0, 0, 0, 0, 0, 1, 0]],
 'tokens': [['Naloxone',
   'reverses',
   'the',
   'antihypertensive',
   'effect',
   'of',
   'clonidine',
   '.']]}

In [30]:
model_name = 'microsoft/deberta-v3-base' 
tokenizer = AutoTokenizer.from_pretrained(model_name)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [31]:
def tokenize_adjust_labels(all_samples_per_split):
  tokenized_samples = tokenizer.batch_encode_plus(all_samples_per_split["tokens"], is_split_into_words=True, truncation=True)
  total_adjusted_labels = []

  for k in range(0, len(tokenized_samples["input_ids"])):
    word_ids_list = tokenized_samples.word_ids(batch_index=k)
    existing_label_ids = all_samples_per_split["ner_tags"][k]
    adjusted_label_ids = []
    prev_wid = -1
    i = -1
    for word_idx in word_ids_list:
      if(word_idx is None):
        adjusted_label_ids.append(-100)
      elif(word_idx!=prev_wid):
        i = i + 1
        adjusted_label_ids.append(existing_label_ids[i])
        prev_wid = word_idx
      else:
        label_name = label_names[existing_label_ids[i]]
        adjusted_label_ids.append(existing_label_ids[i])

    total_adjusted_labels.append(adjusted_label_ids)

  tokenized_samples["labels"] = total_adjusted_labels
  return tokenized_samples

tokenized_dataset = dataset.map(tokenize_adjust_labels,batched=True,remove_columns=list(dataset["train"].features.keys()))

  0%|          | 0/6 [00:00<?, ?ba/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

In [32]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 5228
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 5865
    })
    valid: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 5330
    })
})

In [33]:
tokenized_dataset['train'][0]['input_ids']

[1, 7460, 43623, 2268, 57375, 262, 97424, 1290, 265, 114354, 323, 2]

In [34]:
tokenized_dataset['train'][0]['attention_mask']

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [35]:
tokenized_dataset['train'][0]['labels']

[-100, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, -100]

In [36]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

def create_mini_batch(samples):
    tokens_tensors = [torch.tensor(s["input_ids"]) for s in samples]
    segments_tensors = [torch.tensor(s["attention_mask"]) for s in samples]
    label_ids = [torch.tensor(s["labels"]) for s in samples]

    tokens_tensors = pad_sequence(tokens_tensors, batch_first=True)
    segments_tensors = pad_sequence(segments_tensors,  batch_first=True)
    label_ids = pad_sequence(label_ids,  batch_first=True)

    masks_tensors = torch.zeros(tokens_tensors.shape,  dtype=torch.long)
    masks_tensors = masks_tensors.masked_fill(tokens_tensors != 0, 1)

    return tokens_tensors, segments_tensors, masks_tensors, label_ids

BATCH_SIZE = 16
trainloader = DataLoader(tokenized_dataset["train"], batch_size=BATCH_SIZE, collate_fn=create_mini_batch)
validloader = DataLoader(tokenized_dataset["valid"], batch_size=BATCH_SIZE, collate_fn=create_mini_batch)

In [37]:
data = next(iter(trainloader))
tokens_tensors, segments_tensors, masks_tensors, label_ids = data

print(f"""
tokens_tensors.shape   = {tokens_tensors.shape}
{tokens_tensors}
------------------------
segments_tensors.shape = {segments_tensors.shape}
{segments_tensors}
------------------------
masks_tensors.shape    = {masks_tensors.shape}
{masks_tensors}
------------------------
label_ids.shape        = {label_ids.shape}
{label_ids}
""")


tokens_tensors.shape   = torch.Size([16, 108])
tensor([[    1,  7460, 43623,  ...,     0,     0,     0],
        [    1,   344, 11812,  ...,     0,     0,     0],
        [    1,   392,   264,  ...,     0,     0,     0],
        ...,
        [    1,  1942,   323,  ...,     0,     0,     0],
        [    1,   264,  3372,  ...,     0,     0,     0],
        [    1,   977,  4056,  ...,     0,     0,     0]])
------------------------
segments_tensors.shape = torch.Size([16, 108])
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
------------------------
masks_tensors.shape    = torch.Size([16, 108])
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
----------

In [38]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [39]:
model = AutoModelForTokenClassification.from_pretrained(model_name,num_labels=len(label_names))
model.to(device)

Downloading pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2ForTokenClassification: ['mask_predictions.dense.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.bias', 'deberta.embeddings.position_embeddings.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.bias', 'mask_predictions.dense.weight']
- This IS expected if you are initializing DebertaV2ForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a Be

DebertaV2ForTokenClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=Tr

In [40]:
from seqeval.metrics import f1_score
metric = load_metric("seqeval")

def get_predictions(model, dataloader, compute_acc=False):
    
    pred_list = []
    label_list=[]

    with torch.no_grad():

        for data in dataloader:
            if next(model.parameters()).is_cuda:
                data = [t.to("cuda:0") for t in data if t is not None]

            tokens_tensors, segments_tensors, masks_tensors = data[:3]

            outputs = model(input_ids=tokens_tensors, attention_mask=masks_tensors)

            predictions = torch.argmax(outputs[0].data, 2)

            labels = data[3]

            true_predictions = [
                [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
                for prediction, label in zip(predictions, labels)
            ]
            true_labels = [
                [label_names[l] for (p, l) in zip(prediction, label) if l != -100]
                for prediction, label in zip(predictions, labels)
            ]

            for i in true_predictions[0]:
              pred_list.append(i)

            for i in true_labels[0]:
              label_list.append(i)
            
    p=[pred_list]
    tp=[label_list]
    results = metric.compute(predictions=p, references=tp)
    results["overall_f1"]
    return results["overall_f1"]

In [41]:
model.train()

optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)

EPOCHS = 8

for epoch in range(EPOCHS):

    running_loss = 0.0

    for data in trainloader:

        tokens_tensors, segments_tensors, masks_tensors, labels = [t.to(device) for t in data]

        optimizer.zero_grad()

        outputs = model(input_ids=tokens_tensors, attention_mask=masks_tensors, labels=labels)

        loss = outputs[0]

        loss.backward()

        optimizer.step()

        running_loss += loss.item()

    acc = get_predictions(model, trainloader, compute_acc=True)
    tacc = get_predictions(model, validloader, compute_acc=True)

    print('[epoch %d] loss: %.3f, train_f1_score: %.3f, valid_f1_score: %.3f' %(epoch + 1, running_loss, acc ,tacc))

[epoch 1] loss: 31.472, train_f1_score: 0.887, valid_f1_score: 0.841
[epoch 2] loss: 11.560, train_f1_score: 0.951, valid_f1_score: 0.863
[epoch 3] loss: 6.547, train_f1_score: 0.958, valid_f1_score: 0.886
[epoch 4] loss: 4.234, train_f1_score: 0.972, valid_f1_score: 0.901
[epoch 5] loss: 3.073, train_f1_score: 0.980, valid_f1_score: 0.912
[epoch 6] loss: 2.902, train_f1_score: 0.971, valid_f1_score: 0.886
[epoch 7] loss: 2.007, train_f1_score: 0.990, valid_f1_score: 0.911
[epoch 8] loss: 1.678, train_f1_score: 0.989, valid_f1_score: 0.906


In [42]:
testloader = DataLoader(tokenized_dataset["test"], batch_size=BATCH_SIZE, collate_fn=create_mini_batch)
test_acc = get_predictions(model, testloader, compute_acc=True)
print('test_f1_score:',test_acc)

test_f1_score: 0.9095334685598376
