In [5]:
def estrai_testo_da_iob(percorso_file_iob, percorso_output_txt):
    """Rebuild plain-text sentences from an IOB file.

    The input is read as UTF-8. The first whitespace-separated field of every
    non-blank line is taken as the token; a blank (or whitespace-only) line
    closes the current sentence. Each sentence is written to the output file
    as its tokens joined by single spaces, one sentence per line.

    Args:
        percorso_file_iob: path of the IOB file to read.
        percorso_output_txt: path of the text file to (over)write.
    """
    frasi = []
    token_correnti = []
    with open(percorso_file_iob, 'r', encoding='utf-8') as sorgente:
        for linea in sorgente:
            campi = linea.split()
            if campi:
                token_correnti.append(campi[0])
                continue
            # Blank line: close the sentence, ignoring runs of blank lines.
            if token_correnti:
                frasi.append(' '.join(token_correnti))
                token_correnti = []
    # The last sentence may not be followed by a blank line.
    if token_correnti:
        frasi.append(' '.join(token_correnti))

    with open(percorso_output_txt, 'w', encoding='utf-8') as destinazione:
        destinazione.writelines(frase + '\n' for frase in frasi)

# Example usage: rebuild the sentences of dev.txt into dev_riconcatenato.txt
estrai_testo_da_iob('dev.txt', 'dev_riconcatenato.txt')


In [None]:
!pip install google-colab --upgrade # upgrade google-colab to latest version
from google.colab import data_table # Importing the data_table submodule which might contain the 'display_dataframe_to_user' function

import pandas as pd # Import the pandas library and give it the alias 'pd'
from pathlib import Path # Import the Path object from the pathlib module
def correggi_formato_iob(percorso_input, percorso_output):
    """Normalize an IOB file to two tab-separated columns (token, label).

    The input is decoded as latin-1 and the output is written as UTF-8.
    For every input line one output line is produced:

    * a line with at least two whitespace-separated fields becomes
      ``<first field>\\t<last field>``;
    * a blank line, or a line with a single field, becomes an empty line.

    Args:
        percorso_input: path of the IOB file to read.
        percorso_output: path of the normalized file to (over)write.
    """
    with open(percorso_input, 'r', encoding='latin-1') as sorgente:
        with open(percorso_output, 'w', encoding='utf-8') as destinazione:
            for linea in sorgente:
                campi = linea.split()
                if len(campi) < 2:
                    # Blank or label-less line: emit an empty line.
                    destinazione.write('\n')
                else:
                    # The last column is taken as the label.
                    destinazione.write(f"{campi[0]}\t{campi[-1]}\n")

# Input -> output file names; both are resolved under /content/ in the loop below.
# NOTE(review): only dev_.txt is listed here, while later cells also load
# train_iob_corretto.txt and test_iob_corretto.txt — confirm those files are
# produced elsewhere.
iob_files = {
    "dev_.txt": "dev_iob_corretto.txt",
}

# Apply the format fix to every file listed in iob_files and remember where
# each corrected file was written.
output_paths_iob = {}
for input_name, output_name in iob_files.items():
    input_path = Path("/content/") / input_name
    output_path = Path("/content/") / output_name
    correggi_formato_iob(input_path, output_path)
    output_paths_iob[output_name] = str(output_path)

# Show a table with the name and path of each corrected file
df_iob = pd.DataFrame(list(output_paths_iob.items()), columns=["Nome File", "Percorso"])
# Instead of tools.display_dataframe_to_user and name argument, try displaying the DataFrame directly with data_table:
data_table.DataTable(df_iob)
# or
# display(df_iob) # this is another way to display dataframe.

In [2]:
!pip install -q transformers datasets seqeval


In [3]:
def load_iob_file(path):
    """Read a UTF-8 IOB file into parallel lists of tokens and tags.

    Every non-blank line contributes its first whitespace-separated field as
    the token and its last field as the tag (for a single-field line the two
    coincide). Blank lines separate sentences; consecutive blank lines do not
    create empty sentences.

    Args:
        path: path of the IOB file.

    Returns:
        A ``(sentences, labels)`` tuple: ``sentences`` is a list of token
        lists and ``labels`` the matching list of tag lists.
    """
    sentences, labels = [], []
    current = []  # (token, tag) pairs of the sentence being read
    with open(path, encoding="utf-8") as handle:
        for raw in handle:
            fields = raw.split()
            if fields:
                current.append((fields[0], fields[-1]))
            elif current:
                sentences.append([token for token, _ in current])
                labels.append([tag for _, tag in current])
                current = []
    # Flush a final sentence that is not terminated by a blank line.
    if current:
        sentences.append([token for token, _ in current])
        labels.append([tag for _, tag in current])
    return sentences, labels

# Load the three corrected splits as (sentences, labels) pairs; the file names
# are relative, so they are resolved against the current working directory.
train_sents, train_labels = load_iob_file("train_iob_corretto.txt")
dev_sents, dev_labels = load_iob_file("dev_iob_corretto.txt")
test_sents, test_labels = load_iob_file("test_iob_corretto.txt")


In [4]:
# Read the label names, one per line. The encoding is explicit (UTF-8, like the
# other files read in this notebook) and blank lines are skipped so they do not
# turn into an empty-string label.
with open("classes.txt", encoding="utf-8") as f:
    label_list = [line.strip() for line in f if line.strip()]
# Put the 'O' class first and drop repeated names (keeping first occurrence):
# a duplicate would make len(label_list) larger than the id maps built below.
label_list = list(dict.fromkeys(["O"] + label_list))
label2id = {l: i for i, l in enumerate(label_list)}
id2label = {i: l for l, i in label2id.items()}


In [5]:
from transformers import AutoTokenizer

# Load the tokenizer stored under the "expertai/LLaMAntino-3-SLIMER-IT" checkpoint name.
tokenizer = AutoTokenizer.from_pretrained("expertai/LLaMAntino-3-SLIMER-IT")

def tokenize_and_align_labels(sentences, labels, label_all_tokens=True):
    """Tokenize pre-split sentences and align word-level tags to sub-tokens.

    Relies on the module-level ``tokenizer`` and ``label2id``.

    NOTE(review): a later cell defines another ``tokenize_and_align_labels``
    with a different signature, which replaces this one once it is run.

    Args:
        sentences: list of sentences, each a list of word strings.
        labels: list of tag-string lists, parallel to ``sentences``.
        label_all_tokens: when True (the default, and the behavior this
            function always had) every sub-token of a word receives the id of
            that word's tag. When False only the first sub-token of each word
            is labelled and the following ones get -100.

    Returns:
        The tokenizer output with an extra ``"labels"`` entry: one list of ids
        per sentence. Positions whose word id is None get -100, the value the
        Hugging Face token-classification examples use for positions to be
        ignored. Tags missing from ``label2id`` fall back to the id of ``"O"``.
    """
    tokenized_inputs = tokenizer(
        sentences,
        is_split_into_words=True,
        return_offsets_mapping=True,
        padding=True,
        truncation=True
    )
    aligned_labels = []
    for i, label in enumerate(labels):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        aligned = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                # No source word for this position.
                aligned.append(-100)
            elif label_all_tokens or word_idx != previous_word_idx:
                aligned.append(label2id.get(label[word_idx], label2id["O"]))
            else:
                # Continuation sub-token of the previous word.
                aligned.append(-100)
            previous_word_idx = word_idx
        aligned_labels.append(aligned)
    tokenized_inputs["labels"] = aligned_labels
    return tokenized_inputs


In [6]:
!pip install -q datasets


In [7]:
from datasets import Dataset, DatasetDict

def load_iob_dataset(path):
    """Parse a UTF-8 IOB file into sentence-level token and tag lists.

    The first whitespace-separated column of each non-blank line is the token
    and the last column is its tag. A blank line ends the current sentence;
    repeated blank lines never produce empty sentences.

    Args:
        path: location of the IOB file.

    Returns:
        ``(sentences, labels)`` — two parallel lists, holding one list of
        tokens and one list of tags per sentence.
    """
    all_tokens, all_tags = [], []
    sent_tokens, sent_tags = [], []

    def _close_sentence():
        """Store the sentence collected so far, if any, and start a new one."""
        nonlocal sent_tokens, sent_tags
        if sent_tokens:
            all_tokens.append(sent_tokens)
            all_tags.append(sent_tags)
            sent_tokens, sent_tags = [], []

    with open(path, encoding='utf-8') as source:
        for raw_line in source:
            columns = raw_line.strip().split()
            if not columns:
                _close_sentence()
                continue
            sent_tokens.append(columns[0])
            sent_tags.append(columns[-1])
    # The file may end without a trailing blank line.
    _close_sentence()
    return all_tokens, all_tags

# Load the three corrected splits from /content/ as parallel token/tag lists.
train_tokens, train_tags = load_iob_dataset("/content/train_iob_corretto.txt")
dev_tokens, dev_tags = load_iob_dataset("/content/dev_iob_corretto.txt")
test_tokens, test_tags = load_iob_dataset("/content/test_iob_corretto.txt")


In [8]:
# Wrap each split in a Dataset with two columns: "tokens" (word lists) and
# "ner_tags" (the matching tag strings).
train_ds = Dataset.from_dict({"tokens": train_tokens, "ner_tags": train_tags})
dev_ds = Dataset.from_dict({"tokens": dev_tokens, "ner_tags": dev_tags})
test_ds = Dataset.from_dict({"tokens": test_tokens, "ner_tags": test_tags})

# Group the splits; the dev split is exposed under the "validation" key.
dataset = DatasetDict({
    "train": train_ds,
    "validation": dev_ds,
    "test": test_ds
})


In [9]:
def tokenize_and_align_labels(dataset, label_all_tokens=True):
    """Tokenize a batch of pre-split sentences and align its tags to sub-tokens.

    Relies on the module-level ``tokenizer`` and ``label2id``. Sequences are
    truncated and padded to a fixed length of 256 tokens.

    Args:
        dataset: mapping with a ``"tokens"`` entry (list of word lists) and a
            ``"ner_tags"`` entry (list of tag-string lists of the same shape).
        label_all_tokens: when True (the default, and the behavior this
            function always had) every sub-token of a word receives the id of
            that word's tag. When False only the first sub-token of each word
            is labelled and the following ones get -100.

    Returns:
        The tokenizer output with an extra ``"labels"`` entry: one list of ids
        per sentence. Positions whose word id is None get -100, the value the
        Hugging Face token-classification examples use for positions to be
        ignored. Tags missing from ``label2id`` fall back to id 0.
    """
    tokenized_inputs = tokenizer(
        dataset["tokens"],
        is_split_into_words=True,
        truncation=True,
        padding="max_length",
        max_length=256,
    )

    labels = []
    for i, label in enumerate(dataset["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        aligned_labels = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                # No source word for this position.
                aligned_labels.append(-100)
            elif label_all_tokens or word_idx != previous_word_idx:
                aligned_labels.append(label2id.get(label[word_idx], 0))
            else:
                # Continuation sub-token of the previous word.
                aligned_labels.append(-100)
            previous_word_idx = word_idx
        labels.append(aligned_labels)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Apply the tokenization/alignment to every split, one batch at a time
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)


Map:   0%|          | 0/21 [00:00<?, ? examples/s]

Map:   0%|          | 0/14 [00:00<?, ? examples/s]

Map:   0%|          | 0/28 [00:00<?, ? examples/s]

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Checkpoint name shared by the tokenizer and the model.
model_name = "expertai/LLaMAntino-3-SLIMER-IT"

# NOTE(review): this rebinds the global `tokenizer` that an earlier cell already
# created from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Load the checkpoint with a token-classification head sized to label_list and
# pass both id<->label maps to the model.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
import inspect

from transformers import TrainingArguments, Trainer

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases and the old spelling is deprecated, so use whichever keyword the
# installed TrainingArguments accepts (the pip install above is unpinned).
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    report_to="none",  # avoid logging-integration problems in Colab
    **{_eval_strategy_key: "epoch"},
)

# Likewise, newer Trainer versions take the tokenizer as `processing_class`
# and deprecate the `tokenizer` keyword.
_tokenizer_key = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    **{_tokenizer_key: tokenizer},
)