In [1]:
from pathlib import Path
from collections import Counter
from sklearn.model_selection import train_test_split

def load_conll_sentences(filepath):
    """Read a CoNLL-style file and return its sentences as lists of lines.

    Sentences are separated by one or more blank lines. A line that holds
    only whitespace also counts as a separator, and runs of consecutive
    blank lines never produce empty sentences or empty-string entries.

    Args:
        filepath: Path of the UTF-8 encoded file to read.

    Returns:
        A list of sentences; each sentence is the list of its non-blank
        lines in file order. An empty or all-blank file yields ``[]``.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        # Strip the whole text so leading/trailing blank lines disappear.
        raw = f.read().strip()

    sentences = []
    current = []
    for line in raw.split('\n'):
        if line.strip():
            current.append(line)
        elif current:
            # First blank line after some content closes the sentence;
            # further blank lines are ignored because `current` is empty.
            sentences.append(current)
            current = []
    if current:
        sentences.append(current)
    return sentences

def get_main_label(sentence):
    """Return the first non-'O' label found in a sentence, or 'O' if none.

    The label of a line is its last whitespace-separated field; lines with
    fewer than two fields (including blank lines) carry no label and are
    skipped.

    Args:
        sentence: Iterable of text lines belonging to one sentence.

    Returns:
        The last field of the first line whose last field is not 'O',
        or the string 'O' when no such line exists.
    """
    for row in sentence:
        fields = row.split()
        if len(fields) < 2:
            continue
        tag = fields[-1]
        if tag != 'O':
            return tag
    return 'O'

def stratified_sample(sentences, ratio=0.2, seed=42):
    """Draw a label-stratified random subset of the sentences.

    Each sentence is assigned the label returned by ``get_main_label`` and
    the subset is drawn with ``train_test_split`` so that label proportions
    are kept.

    Stratified splitting rejects any class that has a single member, so
    labels occurring only once are folded into the most frequent label
    before the split. When no label is that rare, the stratification
    labels are left untouched.

    Args:
        sentences: List of sentences (each a list of lines).
        ratio: Passed as ``train_size`` to ``train_test_split``.
        seed: Passed as ``random_state`` to ``train_test_split``.

    Returns:
        The sampled sentences (the "train" side of the split).
    """
    labels = [get_main_label(s) for s in sentences]

    counts = Counter(labels)
    rare = {label for label, count in counts.items() if count < 2}
    if rare:
        # Fold singleton labels into the dominant class so that every
        # stratum handed to the splitter has at least two members.
        fallback = counts.most_common(1)[0][0]
        labels = [fallback if label in rare else label for label in labels]

    sampled, _ = train_test_split(
        sentences, train_size=ratio, stratify=labels, random_state=seed
    )
    return sampled

def save_conll_sentences(sentences, output_path):
    """Write sentences to a UTF-8 file, one line per entry.

    Lines of a sentence are joined with a newline and sentences are
    separated by one blank line. No newline is written after the last
    sentence.

    Args:
        sentences: Iterable of sentences, each an iterable of line strings.
        output_path: Destination file path; an existing file is overwritten.
    """
    with open(output_path, 'w', encoding='utf-8') as out:
        blocks = []
        for sentence_lines in sentences:
            blocks.append('\n'.join(sentence_lines))
        text = '\n\n'.join(blocks)
        out.write(text)

def sample_and_save(input_path, output_name, ratio=0.2, output_dir="/content", seed=42):
    """Load a CoNLL file, draw a stratified sample and write it to disk.

    Args:
        input_path: Path of the CoNLL file to sample from.
        output_name: File name of the sampled file inside ``output_dir``.
        ratio: Sampling ratio forwarded to ``stratified_sample``.
        output_dir: Directory that receives the sampled file. Defaults to
            ``/content``, the location that was previously hard-coded.
        seed: Random seed forwarded to ``stratified_sample``.

    Returns:
        The path of the written file, as a string.
    """
    sents = load_conll_sentences(input_path)
    sampled = stratified_sample(sents, ratio, seed)
    # Plain string formatting keeps the returned path identical to the
    # previous hard-coded form when the default directory is used.
    output_path = f"{output_dir}/{output_name}"
    save_conll_sentences(sampled, output_path)
    return output_path

# Sample the three uploaded splits, in train / dev / test order
train_sampled, dev_sampled, test_sampled = [
    sample_and_save(source_file, sampled_name)
    for source_file, sampled_name in (
        ('/content/train.txt', 'train_sampled.txt'),
        ('/content/dev.txt', 'dev_sampled.txt'),
        ('/content/test.txt', 'test_sampled.txt'),
    )
]

# Cell output: the paths of the files that were written
train_sampled, dev_sampled, test_sampled


('/content/train_sampled.txt',
 '/content/dev_sampled.txt',
 '/content/test_sampled.txt')

In [2]:
!pip install google-colab --upgrade # upgrade google-colab to latest version
from google.colab import data_table # Importing the data_table submodule which might contain the 'display_dataframe_to_user' function

import pandas as pd # Import the pandas library and give it the alias 'pd'
from pathlib import Path # Import the Path object from the pathlib module
def correggi_formato_iob(percorso_input, percorso_output, codifica_input='utf-8'):
    """Rewrite a column-formatted file as two tab-separated columns.

    For every input line with at least two whitespace-separated fields,
    the first field (token) and the last field (label) are written as
    ``token<TAB>label``. Blank lines, and lines with fewer than two fields,
    are written as an empty line.

    The input is decoded as UTF-8 by default. Decoding UTF-8 text as
    Latin-1 never fails but silently corrupts every non-ASCII character,
    so a different encoding must be requested explicitly.

    Args:
        percorso_input: Path of the file to read.
        percorso_output: Path of the file to write (always UTF-8).
        codifica_input: Encoding used to decode ``percorso_input``.
    """
    with open(percorso_input, 'r', encoding=codifica_input) as infile, \
         open(percorso_output, 'w', encoding='utf-8') as outfile:
        for riga in infile:
            # split() with no argument also discards surrounding whitespace,
            # so a blank line yields an empty list.
            parti = riga.split()
            if len(parti) >= 2:
                # The last column is taken as the label.
                outfile.write(f"{parti[0]}\t{parti[-1]}\n")
            else:
                # Blank or malformed line: emit an empty line.
                outfile.write('\n')

# Mapping: sampled input file name -> corrected output file name
iob_files = {
    "train_sampled.txt": "train_sampled_iob_corretto.txt",
    "test_sampled.txt": "test_sampled_iob_corretto.txt",
    "dev_sampled.txt": "dev_sampled_iob_corretto.txt",
}

# Convert every listed file and remember where each result was written
output_paths_iob = {}
for input_name, output_name in iob_files.items():
    input_path, output_path = (
        Path("/content/") / input_name,
        Path("/content/") / output_name,
    )
    correggi_formato_iob(input_path, output_path)
    output_paths_iob[output_name] = str(output_path)

# Summarise the corrected files (name and path) in a table
df_iob = pd.DataFrame(
    [(file_name, file_path) for file_name, file_path in output_paths_iob.items()],
    columns=["Nome File", "Percorso"],
)
# Cell output: render the summary as an interactive Colab data table
data_table.DataTable(df_iob)

Collecting jedi>=0.16 (from ipython==7.34.0->google-colab)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading jedi-0.19.2-py2.py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: jedi
Successfully installed jedi-0.19.2


Unnamed: 0,Nome File,Percorso
0,train_sampled_iob_corretto.txt,/content/train_sampled_iob_corretto.txt
1,test_sampled_iob_corretto.txt,/content/test_sampled_iob_corretto.txt
2,dev_sampled_iob_corretto.txt,/content/dev_sampled_iob_corretto.txt
