# Setup

In [None]:
import pandas as pd
import numpy as np

In [None]:
import sys

# Get data

**Local**

In [None]:
# Name of the corpus; it is used both as the folder name and as the JSON file
# name below. NOTE(review): "CORPUS_NAME" looks like a placeholder to be
# replaced with a real corpus name - confirm.
corpus_name = "CORPUS_NAME"
# Load ../corpora/<corpus_name>/<corpus_name>.json into a DataFrame.
df = pd.read_json(f"../corpora/{corpus_name}/{corpus_name}.json")
# Display the loaded DataFrame as the cell output.
df

# One hot encoding

In [None]:
def extract_entity_classes_from_df(df):
    """
    Collect the distinct NER labels found in the ``ner_tokens`` column.

    A leading ``B-`` or ``I-`` marker is removed from each tag; the rest of
    the tag is left untouched, so labels that contain those character
    sequences internally are preserved (``B-JOB-TITLE`` -> ``JOB-TITLE``).
    Tags without such a prefix (for example ``O``) are kept as they are.

    Parameters:
    df (pandas.DataFrame): DataFrame whose ``ner_tokens`` column holds one
        iterable of tag strings per row.

    Returns:
    list: The distinct labels, sorted so that the order is identical on
        every run.
    """
    entity_classes = set()

    for ner_tokens in df['ner_tokens']:
        for token in ner_tokens:
            # Strip the BIO marker only at the start of the tag; a plain
            # str.replace would also remove "B-"/"I-" inside the label.
            if token.startswith(("B-", "I-")):
                token = token[2:]
            entity_classes.add(token)

    # Set iteration order for strings changes between interpreter runs;
    # sorting keeps the class order (and anything indexed by it) stable.
    return sorted(entity_classes)

# Extract the NER entity classes from the DataFrame
entity_classes = extract_entity_classes_from_df(df)

# Report the classes that were found (the printed message is in Portuguese).
print("Classes de Entidades NER encontradas:")
print(entity_classes)
# Number of distinct classes, shown as the cell output.
len(entity_classes)

In [None]:
from collections import Counter

# Lower-case every label so that names differing only by case collide
lowercase_labels_list = [label.lower() for label in entity_classes]

# Tally how often each lower-cased label appears
counter = Counter(lowercase_labels_list)

# Keep the labels that were seen more than once
repeated_items = [label for label in counter if counter[label] > 1]

print(f"Itens repetidos: {repeated_items}")


In [None]:
def bio_to_one_hot(bio_tags, entity_types):
    """
    Convert a list of BIO tags into a presence vector over entity types.

    Parameters:
    bio_tags (list): BIO tags of one sentence (e.g. ``"B-PER"``, ``"O"``).
    entity_types (list): Entity type names; position ``i`` of the result
        corresponds to ``entity_types[i]``.

    Returns:
    list: Floats, ``1.0`` at the positions of the entity types that occur in
        ``bio_tags`` and ``0.0`` elsewhere. Tags whose entity is not listed
        in ``entity_types`` are ignored.
    """
    # Resolve each entity type to its position once instead of searching the
    # list for every tag. setdefault keeps the first position of a repeated
    # name, which is what list.index would return.
    positions = {}
    for position, entity in enumerate(entity_types):
        positions.setdefault(entity, position)

    # Start with an all-zero vector
    one_hot_vector = np.zeros(len(entity_types))

    for tag in bio_tags:
        # 'O' (Outside) carries no entity
        if tag == 'O':
            continue
        # Split on the first hyphen only, so entity names that contain
        # hyphens themselves ("B-JOB-TITLE") stay intact.
        _, separator, entity = tag.partition('-')
        if not separator:
            # A tag without any prefix is taken as the entity name itself.
            entity = tag
        position = positions.get(entity)
        if position is not None:
            one_hot_vector[position] = 1

    return one_hot_vector.tolist()

In [None]:
# Add a 'classes' column: for every row, the presence vector over
# entity_classes computed from that row's BIO tags.
df['classes'] = df['ner_tokens'].apply(lambda x: bio_to_one_hot(x, entity_classes))

In [None]:
# Display the DataFrame, now including the 'classes' column.
df

In [None]:
# Keep only the first row of each distinct sentence. The surviving rows keep
# their original index labels, so the index can have gaps afterwards.
df.drop_duplicates(subset=['sentences'], inplace=True, keep="first")
# Display the de-duplicated DataFrame.
df

## Handout

In [None]:
# Inputs for the stratified splitters: the sentences and, aligned with them,
# the per-sentence class vectors used for stratification.
X = df.sentences.values.tolist()
y = df.classes.values.tolist()

# Number of folds for the cross-validation and the seed handed to the splitters.
k_folds = 5
random_state=42

# Atual

In [None]:
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
import numpy as np


def handout(X, y, df, n_splits, test_size, random_state=42):
    """
    Split ``df`` once into a train and a test part, stratified on ``y``.

    Parameters:
    X (list): One entry per row of ``df``, in row order.
    y (list): Multilabel indicator vector of each row, aligned with ``X``.
    df (pandas.DataFrame): Rows to split; row ``i`` must correspond to
        ``X[i]`` / ``y[i]``. Its index labels may be arbitrary.
    n_splits (int): Accepted for backward compatibility and ignored; exactly
        one split is produced.
    test_size (float): Forwarded to ``MultilabelStratifiedShuffleSplit`` as
        ``test_size``.
    random_state (int): Seed forwarded to the splitter.

    Returns:
    dict: ``{"train": DataFrame, "test": DataFrame}``; both are independent
        copies that keep the row order and index labels of ``df``.
    """
    splitter = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
    train_index, test_index = next(splitter.split(X, y))

    # The splitter returns row POSITIONS. Select with iloc rather than by
    # matching index labels, which picks the wrong rows as soon as the index
    # is not 0..n-1 (e.g. after drop_duplicates or when df is itself a split).
    # Sorting keeps the rows in their original order.
    df_train = df.iloc[np.sort(train_index)].copy()
    df_test = df.iloc[np.sort(test_index)].copy()

    print("TRAIN:", len(df_train), "TEST:", len(df_test),   "TOTAL:", len(df_train) + len(df_test))
    print("==============")

    return {
        "train": df_train,
        "test": df_test,
    }

In [None]:
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit, MultilabelStratifiedKFold
import numpy as np
import pandas as pd

def cross_validation(X, y, df, n_splits, random_state=42, shuffle=True, validation_size=0.1):
    """
    Build stratified k-fold train/validation/test splits of ``df``.

    Each fold's held-out part becomes the test set; the remaining rows are
    split once more into train and validation.

    Parameters:
    X (list): One entry per row of ``df``, in row order.
    y (list): Multilabel indicator vector of each row, aligned with ``X``.
    df (pandas.DataFrame): Rows to split; row ``i`` must correspond to
        ``X[i]`` / ``y[i]``. Its index labels may be arbitrary.
    n_splits (int): Number of folds.
    random_state (int): Seed for the k-fold shuffling (when ``shuffle`` is
        true) and for the train/validation split.
    shuffle (bool): Whether the k-fold splitter shuffles before splitting.
    validation_size (float): ``test_size`` used for the train/validation
        split inside each fold.

    Returns:
    list: One dict per fold with the keys ``"train"``, ``"test"`` and
        ``"validation"``, each holding a DataFrame. Train and validation
        frames are re-indexed from 0 within the fold; the test frame keeps
        the index labels of ``df``.
    """
    kfold = MultilabelStratifiedKFold(
        n_splits=n_splits,
        shuffle=shuffle,
        # Seed the folds so they are reproducible. scikit-learn style k-fold
        # splitters reject a seed when shuffle is off, hence the guard.
        random_state=random_state if shuffle else None,
    )

    # One entry per fold
    train_validation_test = []

    for fold_train_index, test_index in kfold.split(X, y):
        # The splitter yields row POSITIONS; select with iloc so that gaps in
        # the index labels (e.g. after drop_duplicates) cannot misalign rows.
        fold_train_index = np.sort(fold_train_index)
        df_test = df.iloc[np.sort(test_index)].copy()

        # Rows, inputs and labels left for train + validation, kept aligned
        df_fold_train = df.iloc[fold_train_index].reset_index(drop=True)
        X_train = [X[i] for i in fold_train_index]
        y_train = [y[i] for i in fold_train_index]

        # Single stratified split of the fold's training rows
        splitter = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=validation_size, random_state=random_state)
        train_index, validation_index = next(splitter.split(X_train, y_train))

        df_train = df_fold_train.iloc[np.sort(train_index)].copy()
        df_validation = df_fold_train.iloc[np.sort(validation_index)].copy()

        train_validation_test.append({
            "train": df_train,
            "test": df_test,
            "validation": df_validation
        })
        print("TRAIN:", len(df_train), "TEST:", len(df_test),  "VALIDATION:", len(df_validation),  "TOTAL:", len(df_train) + len(df_test) + len(df_validation))
        print("==============")

    return train_validation_test

In [None]:
# Columns dropped from each split before it is written to disk.
remove = ['classes']

# Aplicação handout


In [None]:
# First hold-out split with test_size=0.2. Despite the "_list" suffix the
# result is a dict with the keys "train" and "test". n_splits is passed, but
# handout() always builds its splitter with n_splits=1.
handout_train_validation_test_list = handout(X, y, df, n_splits=2, test_size=0.2, random_state=random_state) # single hold-out split

In [None]:
import os

In [None]:
def pandas2txt(df, path):
    """
    Write ``df`` as a token-per-line text file and as a JSON file beside it.

    The text file gets one ``"<token> <tag>"`` line per token and an empty
    line after every row. The JSON file has the same name as ``path`` with
    its extension replaced by ``.json`` and holds all columns of ``df`` as a
    list of records.

    Parameters:
    df (pandas.DataFrame): Must have the columns ``tokens`` and
        ``ner_tokens``, each holding one sequence per row.
    path (str): Destination of the text file (e.g. ``.../train.txt``).
    """
    with open(path, "w", encoding="utf-8") as f_out:
        for tokens, tags in zip(df["tokens"], df["ner_tokens"]):
            for txt, tag in zip(tokens, tags):
                print("{} {}".format(txt, tag), file=f_out)
            # Blank line marks the end of the sentence
            print(file=f_out)

    # Swap only the file extension. Replacing every "txt" in the path would
    # also rewrite folder or corpus names that happen to contain it.
    json_path = os.path.splitext(path)[0] + ".json"
    with open(json_path, "w", encoding='utf-8') as outfile:
        outfile.write(df.to_json(orient="records"))

In [None]:
#pandas df
# Training part of the first hold-out split
df_train = handout_train_validation_test_list["train"]
# Recompute the 'classes' column on the training part from its BIO tags.
df_train['classes'] = df_train['ner_tokens'].apply(lambda x: bio_to_one_hot(x, entity_classes))
# Test part of the first hold-out split
df_test = handout_train_validation_test_list["test"]

In [None]:
# Second hold-out pass: the training part is split again to carve out a
# validation set, so X and y are rebuilt from df_train.
X = df_train.sentences.values.tolist()
y = df_train.classes.values.tolist()

# Same settings as before (k_folds is not passed to handout()).
k_folds = 5
random_state=42

# test_size=0.1 of the training part is held out. n_splits is passed, but
# handout() always builds its splitter with n_splits=1.
handout_train_validation_list = handout(X, y, df_train, n_splits=2, test_size=0.1, random_state=random_state) # single hold-out split

In [None]:
# Final training set after the second split
df_train = handout_train_validation_list["train"]
# handout() stores the held-out part under the key "test"; here it serves as
# the validation set.
df_validation = handout_train_validation_list["test"]

In [None]:
# Display the final training set.
df_train

In [None]:
# Output folder of the single hold-out split
path = f'../corpora/{corpus_name}/labeled/1folds/fold0'
# exist_ok=True keeps a re-run of the notebook from failing just because the
# folder was already created by an earlier run.
os.makedirs(path, exist_ok=True)

#Remove unnecessary columns
df_train = df_train.drop(remove, axis=1)
df_test = df_test.drop(remove, axis=1)
df_validation = df_validation.drop(remove, axis=1)

# Write each part as .txt plus the matching .json; the validation part is
# stored under the name "dev".
pandas2txt(df_train, f'{path}/train.txt')
pandas2txt(df_test, f'{path}/test.txt')
pandas2txt(df_validation, f'{path}/dev.txt')

In [None]:
# Display the validation set (after the columns listed in `remove` were dropped).
df_validation

# Cross validation

To run cross-validation on the hold-out training set instead of the full corpus, replace the DataFrame loaded below with that training set.

In [None]:
# Reload the full corpus and rebuild the 'classes' column with the
# entity_classes computed earlier in the notebook.
df = pd.read_json(f"../corpora/{corpus_name}/{corpus_name}.json")
df['classes'] = df['ner_tokens'].apply(lambda x: bio_to_one_hot(x, entity_classes))
# Drop repeated sentences (drop_duplicates keeps the first occurrence by default).
df.drop_duplicates(subset=['sentences'], inplace=True)
# Display the reloaded, de-duplicated DataFrame.
df

In [None]:
# Inputs for the cross-validation: the sentences and, aligned with them, the
# per-sentence class vectors used for stratification.
X = df.sentences.values.tolist()
y = df.classes.values.tolist()

# Number of folds and the seed passed to cross_validation().
k_folds = 5
random_state=42

In [None]:
# Build the k_folds splits; the result is a list with one dict per fold
# holding the "train", "test" and "validation" DataFrames.
train_validation_test_list = cross_validation(X, y, df, k_folds, random_state=random_state) #cross-validation

In [None]:
# Display the training set of the first fold.
train_validation_test_list[0]["train"]

In [None]:
# Sanity check on the first fold: sentences present in both the validation
# and the test set. An empty Series means the two sets share no sentence.
pd.Series(list(set(train_validation_test_list[0]["validation"]["sentences"]).intersection(set(train_validation_test_list[0]["test"]["sentences"]))))

In [None]:
# Columns dropped from each fold's DataFrames before they are written to disk.
remove = ['classes']

# Kfolds

In [None]:
# Root folder of the k-fold output. exist_ok=True keeps a re-run of the
# notebook from failing because the folders already exist.
os.makedirs(f'../corpora/{corpus_name}/labeled/{k_folds}folds/', exist_ok=True)

for idx, df_dict in enumerate(train_validation_test_list):
    # Folder that receives the files of this fold
    fold_dir = f'../corpora/{corpus_name}/labeled/{k_folds}folds/fold{idx}'
    os.makedirs(fold_dir, exist_ok=True)

    # Take the three parts of the fold without the unnecessary columns
    df_train = df_dict["train"].drop(remove, axis=1)
    df_test = df_dict["test"].drop(remove, axis=1)
    df_validation = df_dict["validation"].drop(remove, axis=1)

    # Write each part as .txt plus the matching .json; the validation part
    # is stored under the name "dev".
    pandas2txt(df_train, f'{fold_dir}/train.txt')
    pandas2txt(df_validation, f'{fold_dir}/dev.txt')
    pandas2txt(df_test, f'{fold_dir}/test.txt')