- Installazione dipendenze

In [None]:
!pip install accelerate -U
!pip install datasets
!pip install tokenizers
!pip install transformers
!pip install evaluate
!pip install seqeval

In [22]:
import dataclasses
import os

import evaluate
import numpy as np
import pandas as pd
import torch
# NOTE: `load_metric` only exists in `datasets` releases older than 3.0.
from datasets import load_dataset, load_metric, Dataset, DatasetDict
from seqeval.metrics import f1_score, precision_score, recall_score, classification_report, accuracy_score
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
)

# Load the "seqeval" metric through `evaluate`; `compute_metrics` later calls
# `metric.compute(...)` on the decoded tag sequences.
metric = evaluate.load("seqeval")

- Inizializzazione modello base e relativo tokenizer

In [35]:
# Checkpoint to fine-tune; the tokenizer is loaded from the same checkpoint.
# Alternative previously tried here: "KISTI-AI/scideberta-cs"
# (that variant also had `add_prefix_space=True` on the tokenizer call).
model_name = "allenai/scibert_scivocab_uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/228k [00:00<?, ?B/s]

- Caricamento datasets

In [39]:

# Directory holding the SciERC NER files, one JSON file per split.
# NOTE(review): this expects Google Drive to be mounted at /content/drive; no
# mount cell is visible in this notebook - confirm it is mounted before running.
DATA_DIR = "/content/drive/MyDrive/Tesi/dataset_nuovi"


def _load_json_split(file_name):
    """Load one JSON file from DATA_DIR and return it as a single dataset."""
    # When `data_files` is a single path, the JSON loader exposes its content
    # under a split named "train", whatever the file holds; that is why
    # split="train" is also used for the dev and test files.
    return load_dataset("json", data_files=f"{DATA_DIR}/{file_name}", split="train")


train_dataset = _load_json_split("scierc_train_ner_with_inc.json")
val_dataset = _load_json_split("scierc_dev_ner_with_inc.json")
test_dataset = _load_json_split("scierc_test_ner_with_inc.json")


In [40]:
# Bundle the three splits under the conventional split names so they can be
# processed together with a single `.map(...)` call.
datasets = DatasetDict(
    dict(
        train=train_dataset,
        validation=val_dataset,
        test=test_dataset,
    )
)

- Creazione lista delle label adattate al modello IOB

In [41]:
# Tag inventory for the token classifier: "O" first, then a B-/I- pair for each
# entity type. The tokenization function builds tags as "B-"/"I-" + the bare
# entity type found in the data, so every such combination must be listed here.
label_list = ["O"] + [
    f"{prefix}-{entity_type}"
    for entity_type in ("Task", "Method", "Metric", "Material", "OtherScientificTerm", "Generic")
    for prefix in ("B", "I")
]

# Tag string -> integer class id (position in `label_list`).
label_map = dict(zip(label_list, range(len(label_list))))

- Funzione che tokenizza i dataset caricati in precedenza con il tokenizer del modello; le label corrispondenti ai token vengono adattate allo stile IOB per rappresentare meglio le suddivisioni in sotto-token introdotte dal tokenizer.

Esempio:

Prima della funzione:

"a", "rigidly-moving", "Lambertian", "object"

"O","Material","Material","Material"


Dopo la funzione :


"a","rigidly","-","moving","Lambertian","object"

"O","B-Material","I-Material","I-Material","B-Material","B-Material"





In [36]:
def tokenize_and_align_labels_IB(examples):
    """Tokenize a batch of pre-split sentences and align word labels to sub-tokens.

    Each word in the raw data carries one label: either "O" or a bare entity
    type (e.g. "Material"). Every produced token receives a label id looked up
    in the module-level ``label_map``:

    * tokens with no word id (special tokens) -> -100
    * any sub-token of an "O" word            -> ``label_map["O"]``
    * first sub-token of an entity word       -> ``label_map["B-<type>"]``
    * further sub-tokens of that same word    -> ``label_map["I-<type>"]``

    The B-/I- prefix therefore marks the position of a sub-token inside a
    *word*: every entity word starts with its own B- tag, even when the
    previous word has the same entity type.

    Args:
        examples: Batch with a "sentence" entry (one list of words per
            example) and a parallel "ner" entry (one label per word).

    Returns:
        The tokenizer output for the batch, with an added "labels" entry
        holding one list of label ids per example.

    Raises:
        KeyError: If a word label (after prefixing) is missing from ``label_map``.
    """
    tokenized_inputs = tokenizer(
        examples["sentence"],
        truncation=True,
        is_split_into_words=True,
    )

    labels = []
    for i, word_labels in enumerate(examples["ner"]):
        label_ids = []
        # Index of the last entity word for which a B- tag was emitted.
        previous_word_id = None

        for word_idx in tokenized_inputs.word_ids(batch_index=i):
            if word_idx is None:
                # Special token: -100 is the marker that compute_metrics filters out.
                label_ids.append(-100)
                continue

            tag = word_labels[word_idx]
            if tag == "O":
                label_ids.append(label_map[tag])
            elif word_idx != previous_word_id:
                # First sub-token of an entity word.
                label_ids.append(label_map["B-" + tag])
                previous_word_id = word_idx
            else:
                # Continuation sub-token of the same entity word.
                label_ids.append(label_map["I-" + tag])

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [43]:
# Tokenize and label-align every split in batches. The original columns are
# dropped (using the train split's column names for all splits), so only the
# entries returned by the tokenization function remain.
tokenized_datasets = datasets.map(
    tokenize_and_align_labels_IB,
    batched=True,
    remove_columns=datasets["train"].column_names,
)

In [33]:

# Batch collator for token classification, built around the model's tokenizer.
# Per the Transformers documentation it pads inputs and labels per batch
# (labels are padded with -100 by default) - confirm for the installed version.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

-Creazione funzione per calcolare le metriche durante il finetuning

In [12]:
import numpy as np


def compute_metrics(eval_preds):
    """Score a batch of predictions with the module-level seqeval ``metric``.

    Args:
        eval_preds: Pair ``(logits, labels)``. The predicted class of each
            token is the argmax of ``logits`` over the last axis; ``labels``
            holds the reference ids, with -100 marking positions to skip.

    Returns:
        Dict with the overall "precision", "recall", "f1" and "accuracy"
        reported by the metric. The full metric output is also printed.
    """
    logits, labels = eval_preds
    predicted_ids = np.argmax(logits, axis=-1)

    reference_tags = []
    predicted_tags = []
    for predicted_row, reference_row in zip(predicted_ids, labels):
        # Keep only positions whose reference id is a real label (not -100),
        # then map both sides from ids to tag strings.
        kept = [(p, r) for p, r in zip(predicted_row, reference_row) if r != -100]
        reference_tags.append([label_list[r] for _, r in kept])
        predicted_tags.append([label_list[p] for p, _ in kept])

    scores = metric.compute(predictions=predicted_tags, references=reference_tags)
    print(scores)

    return {
        name: scores["overall_" + name]
        for name in ("precision", "recall", "f1", "accuracy")
    }

- Training arguments

In [13]:

# The per-epoch evaluation option was renamed from `evaluation_strategy` to
# `eval_strategy` in newer Transformers releases. Since the notebook installs
# an unpinned `transformers`, use whichever field the installed version declares.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in {field.name for field in dataclasses.fields(TrainingArguments)}
    else "evaluation_strategy"
)

args = TrainingArguments(
    output_dir="/content/drive/MyDrive/Tesi/Model/",
    # No intermediate checkpoints are written; the final model is saved
    # explicitly at the end of the notebook.
    save_strategy="no",
    # Only relevant if checkpoint saving is enabled via `save_strategy`.
    save_total_limit=2,
    learning_rate=2e-5,
    num_train_epochs=10,
    weight_decay=0.01,
    # Evaluate on the validation split at the end of every epoch.
    **{_eval_strategy_key: "epoch"},
)

In [14]:
# Both directions of the class-id <-> tag mapping, handed to the model so its
# config carries readable label names.
id2label = dict(enumerate(label_list))
label2id = {tag: class_id for class_id, tag in id2label.items()}

In [None]:
from transformers import AutoModelForTokenClassification

# Load the base checkpoint with a token-classification head and attach the
# id <-> tag mappings defined from `label_list`.
# NOTE(review): presumably the number of output classes is derived from
# `id2label` - confirm against the Transformers documentation.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    id2label=id2label,
    label2id=label2id,
)

-Creazione trainer e inizio addestramento

In [None]:
from transformers import Trainer

# Wire the model, training arguments, tokenized splits, collator and metric
# function into a Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],  # dev split, scored with compute_metrics
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # NOTE(review): recent Transformers releases deprecate `tokenizer=` here in
    # favour of `processing_class=` - confirm against the installed version.
    tokenizer=tokenizer,
)
# Run the fine-tuning loop configured in `args`.
trainer.train()

In [17]:
#outputs = trainer.predict(tokenized_datasets["test"])
#print(outputs.metrics)

-Salvataggio modelli e dataset

In [None]:
# Persist the tokenized, label-aligned splits to Drive so they can be reloaded
# without re-running the tokenization step.
tokenized_datasets.save_to_disk("/content/drive/MyDrive/Tesi/dataset_nuovi/scibert_with_inc_IB/dataset_tokenizzati")

In [20]:
# Save the fine-tuned model to Drive.
# NOTE(review): the target directory is named "scideberta_with_inc_IB", but
# `model_name` in this notebook is the SciBERT checkpoint and the tokenized
# datasets are saved under "scibert_with_inc_IB". Confirm the intended
# directory so a model from a different run is not overwritten.
trainer.save_model(output_dir="/content/drive/MyDrive/Tesi/dataset_nuovi/scideberta_with_inc_IB/model")