In [1]:
!pip install datasets -q
!pip install tokenizers -q
!pip install transformers -q
!pip install seqeval -q
!pip install datasets transformers==4.28.0

Collecting transformers==4.28.0
  Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m75.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.33.0
    Uninstalling transformers-4.33.0:
      Successfully uninstalled transformers-4.33.0
Successfully installed transformers-4.28.0


In [2]:
import torch
import numpy as np
import pandas as pd
from datasets import load_metric
from torch.utils.data import DataLoader
from datasets import Dataset, ClassLabel, Sequence, Features, Value, DatasetDict
from transformers import AutoModel,AutoTokenizer,AutoModelForSequenceClassification,AutoModelForTokenClassification, AdamW, DataCollatorForTokenClassification



In [3]:
path="/kaggle/input/151s5d1fs6e15fa/"
df = pd.read_json(path+"train.json",lines=True)
test_df = pd.read_json(path+"test.json",lines=True)
valid_df = pd.read_json(path+"valid.json",lines=True)
print(len(df))
print(len(test_df))
print(len(valid_df))
df[:2]

5228
5865
5330


Unnamed: 0,tags,tokens
0,"[1, 0, 0, 0, 0, 0, 1, 0]","[Naloxone, reverses, the, antihypertensive, ef..."
1,"[0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[In, unanesthetized, ,, spontaneously, hyperte..."


In [4]:
tag_name = ["O",
    "B-Chemical",
    "B-Disease",
    "I-Disease",
    "I-Chemical"]

In [5]:
tags = ClassLabel(num_classes=len(tag_name), names=tag_name)

In [6]:
tags

ClassLabel(num_classes=5, names=['O', 'B-Chemical', 'B-Disease', 'I-Disease', 'I-Chemical'], id=None)

In [7]:
dataset_structure = {"ner_tags":Sequence(tags),
                 'tokens': Sequence(feature=Value(dtype='string'))}

In [8]:
dataset_structure

{'ner_tags': Sequence(feature=ClassLabel(num_classes=5, names=['O', 'B-Chemical', 'B-Disease', 'I-Disease', 'I-Chemical'], id=None), length=-1, id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}

In [9]:
dataset_structure["ner_tags"].feature.names

['O', 'B-Chemical', 'B-Disease', 'I-Disease', 'I-Chemical']

In [10]:
def df_to_dataset(df, columns=['tags', 'tokens']):
  ner_tags = df['tags']
  tokens = df['tokens']
  d = {'ner_tags':ner_tags, 'tokens':tokens}
  dataset = Dataset.from_dict(mapping=d,features=Features(dataset_structure),)
  return dataset

dataset = df_to_dataset(df)
test_dataset =  df_to_dataset(test_df)
valid_dataset =  df_to_dataset(valid_df)

dataset = DatasetDict({
    'train': dataset,
    'test': test_dataset,
    'valid': valid_dataset})

label_names = dataset['train'].features["ner_tags"].feature.names
label_names

['O', 'B-Chemical', 'B-Disease', 'I-Disease', 'I-Chemical']

In [11]:
dataset['train']

Dataset({
    features: ['ner_tags', 'tokens'],
    num_rows: 5228
})

In [12]:
dataset['train'][:1]

{'ner_tags': [[1, 0, 0, 0, 0, 0, 1, 0]],
 'tokens': [['Naloxone',
   'reverses',
   'the',
   'antihypertensive',
   'effect',
   'of',
   'clonidine',
   '.']]}

In [13]:
model_name= 'microsoft/deberta-v3-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [14]:
def tokenize_adjust_labels(all_samples_per_split):
    tokenized_samples = tokenizer.batch_encode_plus(all_samples_per_split["tokens"],is_split_into_words=True, truncation=True)
    total_adjusted_labels = []
    word_id_list=[]
    for k in range(0, len(tokenized_samples["input_ids"])):
        word_ids_list = tokenized_samples.word_ids(batch_index=k)
        existing_label_ids = all_samples_per_split["ner_tags"][k]
        adjusted_label_ids = []
        word_id=[]
        prev_wid = -1
        i = -1
        for word_idx in word_ids_list:
            if(word_idx is None):
                adjusted_label_ids.append(-100)
                word_id.append(-100)
            elif(word_idx!=prev_wid):
                i = i + 1
                adjusted_label_ids.append(existing_label_ids[i])
                word_id.append(word_idx)
                prev_wid = word_idx
            else:
                word_id.append(word_idx)
                label_name = label_names[existing_label_ids[i]]
                adjusted_label_ids.append(existing_label_ids[i])

        total_adjusted_labels.append(adjusted_label_ids)
        word_id_list.append(word_id)

    tokenized_samples["labels"] = total_adjusted_labels
    tokenized_samples["word_id"] = word_id_list
    return tokenized_samples

tokenized_dataset = dataset.map(tokenize_adjust_labels,batched=True,remove_columns=list(dataset["train"].features.keys()))

  0%|          | 0/6 [00:00<?, ?ba/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

In [15]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels', 'word_id'],
        num_rows: 5228
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels', 'word_id'],
        num_rows: 5865
    })
    valid: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels', 'word_id'],
        num_rows: 5330
    })
})

In [16]:
tokenized_dataset['train'][0]['input_ids']

[1, 7460, 43623, 2268, 57375, 262, 97424, 1290, 265, 114354, 323, 2]

In [17]:
tokenized_dataset['train'][0]['attention_mask']

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [18]:
tokenized_dataset['train'][0]['labels']

[-100, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, -100]

In [19]:
data_collator = DataCollatorForTokenClassification(tokenizer)

In [20]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [21]:
model = AutoModelForTokenClassification.from_pretrained(model_name,num_labels=len(label_names))
model.to(device)

Downloading pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2ForTokenClassification: ['mask_predictions.classifier.weight', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'deberta.embeddings.position_embeddings.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaV2ForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a Be

DebertaV2ForTokenClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=Tr

In [22]:
metric = load_metric("seqeval")

def compute_metrics(p):
    predictions, labels = p

    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_names[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

In [23]:
example = dataset["train"][1]
labels = [label_names[i] for i in example[f"ner_tags"]]
metric.compute(predictions=[labels], references=[labels])

{'Chemical': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 2},
 'Disease': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

In [24]:
from transformers import TrainingArguments, Trainer

epochs = 8
batch_size = 16
logging_steps = len(tokenized_dataset['train']) // batch_size

training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/bert-fine-tune-ner/results",
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
    disable_tqdm=False,
    logging_steps=logging_steps)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["valid"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)
#fdda45603e7e5fa15e9169efd986fd0b3c940fed

In [25]:
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1522,0.092505,0.894792,0.911959,0.903294,0.968172
2,0.0528,0.092163,0.904539,0.926135,0.91521,0.97107
3,0.0283,0.106242,0.917122,0.922351,0.919729,0.972263
4,0.0148,0.126877,0.919189,0.920806,0.919997,0.972575
5,0.0092,0.140108,0.899928,0.936954,0.918068,0.971534
6,0.0055,0.147129,0.912291,0.937913,0.924924,0.973704
7,0.0029,0.158457,0.919528,0.933543,0.926482,0.974488
8,0.0017,0.157877,0.914641,0.936527,0.925455,0.973992


TrainOutput(global_step=2616, training_loss=0.03333646807227204, metrics={'train_runtime': 841.712, 'train_samples_per_second': 49.689, 'train_steps_per_second': 3.108, 'total_flos': 1402810519738464.0, 'train_loss': 0.03333646807227204, 'epoch': 8.0})

In [26]:
trainer.evaluate()

{'eval_loss': 0.1578766256570816,
 'eval_precision': 0.914641128402644,
 'eval_recall': 0.936527392879983,
 'eval_f1': 0.9254548805856175,
 'eval_accuracy': 0.9739919790591004,
 'eval_runtime': 22.6964,
 'eval_samples_per_second': 234.839,
 'eval_steps_per_second': 14.716,
 'epoch': 8.0}

In [27]:
predictions, labels, _ = trainer.predict(tokenized_dataset["test"])

predictions = np.argmax(predictions, axis=2)

true_predictions = [
    [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

true_labels = [
    [label_names[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

{'Chemical': {'precision': 0.9388902081849125,
  'recall': 0.9583097681182839,
  'f1': 0.9485005997600959,
  'number': 12377},
 'Disease': {'precision': 0.8419341156510675,
  'recall': 0.886952636282395,
  'f1': 0.86385725683615,
  'number': 6714},
 'overall_precision': 0.9040901248350756,
 'overall_recall': 0.9332146037399822,
 'overall_f1': 0.9184215274376886,
 'overall_accuracy': 0.9730983162958509}

In [28]:
tokenized_dataset["test"]

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels', 'word_id'],
    num_rows: 5865
})

In [29]:
key=0
print(tokenized_dataset["test"]["labels"][key])
print(tokenized_dataset["test"]["word_id"][key])
print(tokenized_dataset["test"]["input_ids"][key])
print(predictions[key])
print(labels[key])

[-100, 1, 1, 1, 0, 0, 2, 0, -100]
[-100, 0, 0, 0, 1, 2, 3, 4, -100]
[1, 1107, 25784, 67469, 341, 1635, 66368, 323, 2]
[0 1 1 1 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0]
[-100    1    1    1    0    0    2    0 -100 -100 -100 -100 -100 -100
 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100
 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100
 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100
 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100
 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100
 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100
 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -100 -10

In [30]:
p=[]
l=[]
for k in range(0, len(predictions)):
    pred = []
    label=[]
    s=tokenized_dataset["test"]["word_id"][k]    
    t=[]
    t2=[]
    prev_wid=0
    t.append(predictions[k][1])
    t2.append(labels[k][1])
    for i in range(2,len(s)-1):
        if(s[i]!=prev_wid):
            n=max(t,key=t.count)
            pred.append(n)
            nn=max(t2,key=t2.count)
            label.append(nn)
            t=[]
            t2=[]
            prev_wid = i
        t.append(predictions[k][i])
        t2.append(labels[k][i])
    pred.append(max(t,key=t.count))
    label.append(max(t2,key=t2.count))
    p.append(pred)
    l.append(label)  

In [31]:
print(len(p))   
print(len(l))   

5865
5865


In [32]:
pred=[[label_names[ppp] for ppp in pp ]for pp in p]
la=[[label_names[ppp] for ppp in pp ]for pp in l]

In [33]:
from seqeval.metrics import f1_score
print("before:", results["overall_f1"])
print("after:", f1_score(pred,la))

before: 0.9184215274376886
after: 0.9172904307854769
