
- Installazione dipendenze

In [None]:
!pip install torch

!pip install datasets
!pip install tokenizers
!pip install transformers
!pip install seqeval

In [8]:
import os
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
from datasets import load_dataset, load_metric, Dataset, DatasetDict
import numpy as np
from seqeval.metrics import f1_score, precision_score, recall_score, classification_report, accuracy_score
from transformers import AutoModelForTokenClassification
import torch
from transformers import BertForTokenClassification, BertTokenizer
import datasets
import json

- Definizione del percorso del modello e caricamento del tokenizer

In [4]:
# Directory the checkpoint is read from; the tokenizer is loaded from the
# same location so that it matches the model created further down.
model_path = "/content/drive/MyDrive/Tesi/Model"

tokenizer = AutoTokenizer.from_pretrained(model_path)

- Definizione lista delle label

In [5]:
# Label inventory: "O" followed by a B-/I- pair for each entity type.
# The position of a label in this list is its integer class id.
label_list = [
    "O",
    "B-Task", "I-Task",
    "B-Method", "I-Method",
    "B-Metric", "I-Metric",
    "B-Material", "I-Material",
    "B-OtherScientificTerm", "I-OtherScientificTerm",
    "B-Generic", "I-Generic",
]

# Label string -> integer id, used when the label sequences are built.
label_map = dict(zip(label_list, range(len(label_list))))

- Caricamento del dataset e successiva tokenizzazione (questa sezione può essere saltata se si dispone già del dataset tokenizzato)

In [None]:
# Each JSON file is loaded separately. When a single file is passed through
# `data_files`, the loader puts everything into a split called "train", which
# is why split='train' is also requested for the dev and test files.
train_dataset = load_dataset(
    'json',
    data_files='/content/drive/MyDrive/Tesi/dataset/scierc_train_ner_with_inc.json',
    split='train',
)
val_dataset = load_dataset(
    'json',
    data_files='/content/drive/MyDrive/Tesi/dataset/scierc_dev_ner_with_inc.json',
    split='train',
)
test_dataset = load_dataset(
    'json',
    data_files='/content/drive/MyDrive/Tesi/dataset/scierc_test_ner_with_inc.json',
    split='train',
)

In [None]:
# Group the three splits under their conventional names.
# NOTE(review): this rebinds the name `datasets`, which the import cell also
# uses for the `datasets` module; after this cell runs, `datasets` refers to
# this DatasetDict. Consider renaming it together with the cell that maps the
# tokenization function over it.
datasets = DatasetDict(
    {
        "train": train_dataset,
        "validation": val_dataset,
        "test": test_dataset,
    }
)

In [None]:
def tokenize_and_align_labels_IB(examples):
    """Tokenize pre-split sentences and build one label id per sub-token.

    Args:
        examples: batch with a "sentence" column (lists of words) and a "ner"
            column (one tag per word). Tags are expected to be "O" or a bare
            entity type, since the "B-"/"I-" prefix is added here before the
            lookup in ``label_map``.

    Returns:
        The tokenizer output with an extra "labels" entry holding, for each
        sentence, a list of label ids aligned with the sub-tokens:

        * -100 for tokens that belong to no word (``word_ids`` gives ``None``);
        * the "O" id for every sub-token of a word tagged "O";
        * the "B-<type>" id for the first sub-token of a tagged word;
        * the "I-<type>" id for the remaining sub-tokens of that word.
    """
    encoded = tokenizer(
        examples["sentence"],
        truncation=True,
        is_split_into_words=True,
    )

    all_label_ids = []
    for batch_idx, word_tags in enumerate(examples["ner"]):
        # Index of the most recent tagged word that already received its "B-".
        opened_word_id = None
        sequence_label_ids = []

        for word_id in encoded.word_ids(batch_index=batch_idx):
            if word_id is None:
                sequence_label_ids.append(-100)
                continue

            tag = word_tags[word_id]
            if tag == "O":
                sequence_label_ids.append(label_map[tag])
            elif word_id != opened_word_id:
                # First sub-token of a tagged word.
                sequence_label_ids.append(label_map["B-" + tag])
                opened_word_id = word_id
            else:
                # Later sub-token of the same tagged word.
                sequence_label_ids.append(label_map["I-" + tag])

        all_label_ids.append(sequence_label_ids)

    encoded["labels"] = all_label_ids
    return encoded

In [None]:
# Tokenize every split in batches; the original columns are dropped so that
# only the tokenizer output and the aligned "labels" remain.
tokenized_datasets = datasets.map(
    tokenize_and_align_labels_IB,
    batched=True,
    remove_columns=datasets["train"].column_names,
)

In [12]:
# Two-way mapping between class ids and label strings, handed to the model
# configuration when the model is created.
id2label = dict(enumerate(label_list))
label2id = {label: idx for idx, label in id2label.items()}

- Creazione oggetto model

In [13]:
# Load the token-classification model from the same directory as the
# tokenizer and attach the id <-> label mappings to it.
label_mappings = {"id2label": id2label, "label2id": label2id}
model = AutoModelForTokenClassification.from_pretrained(model_path, **label_mappings)

- Caricamento dataset tokenizzato dalla memoria

In [9]:
# Load the previously tokenized splits from disk.
# `DatasetDict.load_from_disk` is called explicitly instead of
# `datasets.load_from_disk`: the name `datasets` is rebound to a DatasetDict by
# the optional tokenization cells, so going through it would make this cell
# depend on which cells were executed before it.
tokenized_datasets = DatasetDict.load_from_disk("/content/drive/MyDrive/Tesi/dataset/dataset_tokenizzati")

- Generazione risposte

In [None]:
# Run the model on every test sentence and collect, per sentence, the predicted
# label id of each non-special token.
results = []
cls_token_id = tokenizer.cls_token_id  # ID of [CLS] token
sep_token_id = tokenizer.sep_token_id  # ID of [SEP] token
special_token_ids = {cls_token_id, sep_token_id}

# Make sure dropout is disabled during inference (harmless if already in eval mode).
model.eval()

for sentence in tokenized_datasets["test"]["input_ids"]:
    # Feed the sequence exactly as the tokenizer produced it, [CLS]/[SEP]
    # included. Stripping them before the forward pass gives the model inputs
    # in a different format from the tokenized data.
    # NOTE(review): assumes the checkpoint was trained on sequences that
    # include the special tokens — confirm against the training notebook.
    input_ids = torch.tensor(sentence).unsqueeze(0)  # add the batch dimension

    with torch.no_grad():
        logits = model(input_ids=input_ids).logits

    predicted_label_ids = torch.argmax(logits, dim=-1).squeeze(0).tolist()

    # Drop the predictions at the special-token positions, so each entry of
    # `results` still has one label id per non-special token.
    results.append([
        label_id
        for token_id, label_id in zip(sentence, predicted_label_ids)
        if token_id not in special_token_ids
    ])

- Salvataggio predictions

In [15]:
# Destination JSON file for the predicted label ids.
file_risultati = "/content/drive/MyDrive/Tesi/Model/result/result_predictions_with_inc.json"

In [19]:
# Write the predictions as a single JSON document.
# Mode "w" is used on purpose: in append mode, every re-run of this cell would
# add another JSON object to the file, which could then no longer be parsed
# with `json.load`.
with open(file_risultati, "w", encoding="utf-8") as f:
    json.dump({"predictions_id": results}, f)