In [10]:
# Locations of the pre-processed dataset splits (JSON, one file per split).
TRAIN_PATH = 'data/process_data_no_split_words/train.json'
DEV_PATH = 'data/process_data_no_split_words/dev.json'
TEST_PATH = 'data/process_data_no_split_words/test.json'

In [28]:
import pandas as pd

# Load the train / test / dev splits, in that order, into one DataFrame each.
df_train, df_test, df_dev = (
    pd.read_json(split_path)
    for split_path in (TRAIN_PATH, TEST_PATH, DEV_PATH)
)

In [32]:
def _keep_aspect_only(label):
    """Return the spans of one row with everything after the first '#' removed from each tag."""
    return [(ap[0], ap[1], ap[2].split('#')[0]) for ap in label]


# Apply the same tag clean-up to every split.
for _frame in (df_train, df_test, df_dev):
    _frame.labels = _frame.labels.apply(_keep_aspect_only)

In [33]:
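# Each row's label is a list of character-offset spans [[start, end, tag], ...],
# e.g. (tags shown in their raw form, with the sentiment suffix):
#   [[0, 30, 'DESIGN#POSITIVE'], [37, 48, 'FEATURES#NEGATIVE'], [49, 81, 'SER&ACC#POSITIVE']]
# The text is the raw sentence the offsets refer to, e.g.:
#   'kiểu dáng thì đẹp cầm chắc tay nhưng loa nhỏ quá nhân viên phục vụ rất nhiệt tình'
# The function below converts such spans into per-token IOB labels.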

def convert_to_IOB_format(label, text):
    """Convert character-level span labels into per-token IOB labels.

    The text is split on whitespace and every token is located by its
    character offsets. A token is assigned to the first entry of ``label``
    whose span fully contains it; a token contained in no span is tagged 'O'.

    The first token assigned to a span is tagged ``'B-' + tag`` and the
    following tokens of that same span ``'I-' + tag``. This holds even when
    the span's start offset does not coincide with a token start (e.g. it
    points at the preceding whitespace), so an 'I-' tag is never emitted
    without a preceding 'B-' for the same span.

    Args:
        label: Sequence of ``(start, end, tag)`` entries; ``start``/``end``
            are character offsets into ``text`` (``end`` is exclusive).
        text: The sentence the offsets refer to.

    Returns:
        A list with one ``[token_start, token_end, iob_tag]`` entry per
        whitespace-separated token, in order of appearance.
    """
    iob_labels = []

    # Offset from which the next token is searched; advancing it past each
    # token keeps repeated words from resolving to an earlier occurrence.
    search_from = 0

    # Index into `label` of the span the previous token was assigned to
    # (None when the previous token was outside every span).
    previous_span = None

    for token in text.split():
        token_start = text.find(token, search_from)
        token_end = token_start + len(token)
        search_from = token_end

        iob_tag = 'O'  # Default tag is Outside
        matched_span = None

        for span_index, (start, end, tag) in enumerate(label):
            if token_start >= start and token_end <= end:
                # Continue the span only if the previous token belonged to
                # this very span; otherwise this token opens it.
                prefix = 'I-' if span_index == previous_span else 'B-'
                iob_tag = prefix + tag
                matched_span = span_index
                break

        previous_span = matched_span
        iob_labels.append([token_start, token_end, iob_tag])

    return iob_labels


In [34]:
def _row_to_iob(row):
    """Build the IOB labels for one DataFrame row from its `labels` and `text` columns."""
    return convert_to_IOB_format(row.labels, row.text)


# Replace the span labels of the train, test and dev splits with IOB labels.
# NOTE: `labels` is overwritten in place, so run this cell once per fresh load.
for _split in (df_train, df_test, df_dev):
    _split.labels = _split.apply(_row_to_iob, axis=1)

In [40]:
import os

def save_data_to_csv(df, folder, filename):
    """Write ``df`` as UTF-8 JSON to ``folder/filename``.

    Despite its name (kept so existing callers keep working), this function
    writes JSON via ``DataFrame.to_json``, not CSV.

    Args:
        df: Object exposing ``to_json(file, force_ascii=False)``, e.g. a
            pandas DataFrame.
        folder: Destination directory. It is created, including any missing
            parent directories, if it does not exist yet.
        filename: Name of the file to create inside ``folder``; an existing
            file is overwritten.
    """
    # makedirs handles nested paths and, with exist_ok, avoids the race
    # between checking for the directory and creating it.
    os.makedirs(folder, exist_ok=True)

    with open(os.path.join(folder, filename), 'w', encoding='utf-8') as file:
        # force_ascii=False keeps non-ASCII text readable instead of \u-escaped.
        df.to_json(file, force_ascii=False)

In [41]:
# Write each IOB-labelled split to its own JSON file (the helper emits JSON, not CSV).
for _split_frame, _split_file in (
    (df_train, 'train.json'),
    (df_test, 'test.json'),
    (df_dev, 'dev.json'),
):
    save_data_to_csv(_split_frame, 'data/span_detection_datasets_IOB', _split_file)

# End