In [1]:
import seqeval

import evaluate

metric = evaluate.load("seqeval")

import json
import sys
import numpy as np
import evaluate
import torch
from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForTokenClassification

def load_dnrti():
    ret = {}
    for split_name in ['train_json', 'dev_json', 'test_json']:
        data = []
        with open(f'/home/vikrant/Desktop/NER/DNRTI/{split_name}.jsonl', 'r') as reader:
            for line in reader:
                data.append(json.loads(line))
        ret[split_name] = Dataset.from_list(data)
    return DatasetDict(ret)

ds = load_dnrti()

label2id = {
    'I-Org': 0,
    'I-SecTeam': 1,
    'O': 2,
    'B-Exp': 3,
    'I-Purp': 4,
    'B-Purp': 5,
    'I-Features': 6,
    'I-Time': 7,
    'B-SecTeam': 8,
    'B-SamFile': 9,
    'B-Area': 10,
    'I-Area': 11,
    'B-HackOrg': 12,
    'B-Way': 13,
    'B-OffAct': 14,
    'B-Org': 15,
    'I-Exp': 16,
    'I-OffAct': 17,
    'B-Features': 18,
    'B-Time': 19,
    'I-SamFile': 20,
    'I-Way': 21,
    'I-HackOrg': 22
}


checkpoint = "dslim/bert-base-NER"

tokenizer = AutoTokenizer.from_pretrained(checkpoint, add_prefix_space=True)


epochs = 4
batch_size = 16
learning_rate = 1e-4
max_length = 120


id2label = {v: k for k, v in label2id.items()}
label_list = list(label2id.keys()) # ds["train"].features[f"ner_tags"].feature.names


model = AutoModelForTokenClassification.from_pretrained(
    checkpoint, num_labels=len(label2id), id2label=id2label, label2id=label2id, ignore_mismatched_sizes=True,
)
if torch.cuda.is_available():
  device = "cuda"
else:
  device = "cpu"

model = model.to(device)

print("Current CUDA Device: [{}] {}".format(torch.cuda.current_device(), torch.cuda.get_device_name(torch.cuda.current_device())))
print("Number of CUDA Devices: {}".format(torch.cuda.device_count()))

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], is_split_into_words=True, padding='longest', max_length=max_length, truncation=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


tokenized_ds = ds.map(tokenize_and_align_labels, batched=True)
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

training_args = TrainingArguments(
    output_dir="my_awesome_ds_model",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train_json"],
    eval_dataset=tokenized_ds["test_json"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()





Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at dslim/bert-base-NER and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([23]) in the model instantiated
- classifier.weight: found shape torch.Size([9, 768])

Current CUDA Device: [0] NVIDIA RTX A5000
Number of CUDA Devices: 1


Map:   0%|          | 0/5260 [00:00<?, ? examples/s]

Map:   0%|          | 0/661 [00:00<?, ? examples/s]

Map:   0%|          | 0/663 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.195393,0.763802,0.785653,0.774574,0.934796
2,0.300800,0.147478,0.812754,0.854398,0.833056,0.954589
3,0.300800,0.140066,0.8622,0.873612,0.867869,0.961941
4,0.101100,0.146025,0.859983,0.886422,0.873003,0.964542


TrainOutput(global_step=1316, training_loss=0.16225219207694103, metrics={'train_runtime': 164.4459, 'train_samples_per_second': 127.945, 'train_steps_per_second': 8.003, 'total_flos': 1254013369959840.0, 'train_loss': 0.16225219207694103, 'epoch': 4.0})

In [12]:
predictions = trainer.predict(tokenized_ds["test_json"])
predictions

PredictionOutput(predictions=array([[[-8.6082029e-01, -1.5843542e+00,  1.0574989e+01, ...,
         -1.0110166e+00, -5.3579456e-01, -1.1848768e+00],
        [-1.6466786e+00,  3.1291062e-01, -9.5396407e-02, ...,
         -1.8256509e-01, -9.3834066e-01, -3.3222732e-01],
        [ 3.6779183e-01,  3.0631654e+00,  2.9438074e+00, ...,
         -2.9395938e-01, -1.2250012e+00,  4.5430553e-01],
        ...,
        [ 1.2712060e-01, -8.1365019e-02,  9.8557014e+00, ...,
         -1.2822399e-01, -9.2375940e-01, -3.7288493e-01],
        [-9.5845944e-01, -2.4787630e-01,  6.0922318e+00, ...,
         -1.2010353e+00, -1.9077986e+00,  1.3461667e-01],
        [-9.2712152e-01, -2.3863177e-01,  6.0802412e+00, ...,
         -1.2513655e+00, -1.9135616e+00,  1.9581741e-01]],

       [[-1.0206460e+00, -1.8249142e+00,  9.7541237e+00, ...,
         -9.9438852e-01, -5.0042647e-01, -1.1152949e+00],
        [-1.0920552e+00, -1.3906964e+00,  9.4506969e+00, ...,
         -1.2908159e+00, -1.0836993e+00, -1.2342025e+0

In [3]:
predictions = np.argmax(predictions.predictions, axis = -1)

In [4]:
predictions

array([[ 2,  8,  8, ...,  2,  2,  2],
       [ 2,  2,  2, ...,  8,  1,  2],
       [ 2,  8,  1, ...,  2,  2,  2],
       ...,
       [ 2,  2,  2, ...,  2,  2,  2],
       [ 2,  2, 13, ..., 13, 13, 21],
       [ 2, 12, 12, ...,  2,  2,  2]])

In [10]:
print(predictions[0])

[ 2  8  8  1  2  2 12  2  2  2 12 22 22 22  2  2  2  2  2  2  2  2  2  2
 12  2  2  2  2  2  2  2  2  2  2  2  2  8  2 12  2  2 22 22  2  2  2  2
  2  8  8  2  2  2  2  2 22  2 12 22 22  2  2  2  2  2  2  2  2  2  2  2
  2  2  2  2  2  2  2  2  2  2  8  2  2  2  2]


In [9]:
print(ds["test_json"]["ner_tags"][0])

[8, 2, 2, 12, 2, 12, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]


In [11]:
len(predictions[0])

87

In [13]:
#METRICS VALUES ON TETS SET

print(compute_metrics(predictions[:][0:2]))

{'precision': 0.8621997471554994, 'recall': 0.8736122971818958, 'f1': 0.8678685047720043, 'accuracy': 0.9619408471413222}
