In [10]:
# Input splits read by this notebook (files in the `process_data_split_words` folder).
TRAIN_PATH, TEST_PATH, DEV_PATH = (
    'data/process_data_split_words/train.jsonl',
    'data/process_data_split_words/test.jsonl',
    'data/process_data_split_words/dev.jsonl',
)

# Alternative inputs from the `process_data` folder, currently disabled:
# TRAIN_PATH = 'data/process_data/train.json'
# TEST_PATH = 'data/process_data/test.json'
# DEV_PATH = 'data/process_data/dev.json'

In [11]:
# Helper that reads a JSONL file into a pandas DataFrame
import pandas as pd
import json

def read_jsonl_to_dataframe(file_path):
    """Load a JSON Lines file into a pandas DataFrame.

    Each non-blank line is parsed as one JSON document and becomes one row.
    Blank lines (for example a trailing newline at the end of the file) are
    ignored silently. Lines that fail to parse are reported on stdout with
    their 1-based line number and then skipped, so a few corrupt records do
    not abort the whole load.

    Args:
        file_path: Path of the UTF-8 encoded file to read.

    Returns:
        pandas.DataFrame built from the successfully parsed records, in the
        order they appear in the file.

    Raises:
        OSError: If the file cannot be opened.
    """
    records = []

    with open(file_path, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, start=1):
            # Blank lines are not records; skipping them here avoids bogus
            # "invalid JSON" reports for padding or trailing newlines.
            if not line.strip():
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as e:
                print(f"Skipping invalid JSON on line {line_number}: {e}")

    return pd.DataFrame(records)

In [37]:
# Read every split into its own DataFrame (train, then test, then dev).
df_train = read_jsonl_to_dataframe(file_path=TRAIN_PATH)
df_test = read_jsonl_to_dataframe(file_path=TEST_PATH)
df_dev = read_jsonl_to_dataframe(file_path=DEV_PATH)

In [38]:
def strip_polarity_suffix(spans):
    """Keep only the part of each span tag that precedes the first '#'.

    Args:
        spans: Iterable of span records indexed as ``(start, end, tag)``,
            where ``tag`` is a string such as ``'DESIGN#POSITIVE'``.

    Returns:
        list of ``(start, end, tag)`` tuples with the tag truncated at the
        first '#' (``'DESIGN#POSITIVE'`` -> ``'DESIGN'``). Tags without '#'
        are returned unchanged.
    """
    return [(span[0], span[1], span[2].split('#')[0]) for span in spans]


# Apply the same clean-up to every split. Bracket indexing is used for the
# column assignment because attribute-style assignment only works when the
# column already exists and is discouraged by pandas.
for split_df in (df_train, df_test, df_dev):
    split_df['labels'] = split_df['labels'].apply(strip_polarity_suffix)

In [34]:
def convert_to_IOB_format(text, labels):
    """Convert character-offset span labels into per-token IOB tags.

    ``text`` is split on single spaces and every token is addressed by its
    character offsets inside ``text``. A token belongs to a span when it lies
    completely inside it (``span_start <= token_start`` and
    ``token_end <= span_end``). When several spans contain a token, the first
    one in ``labels`` wins. The first token tagged with a span receives
    ``'B-<tag>'``, the following tokens of that same span receive
    ``'I-<tag>'`` and every other token receives ``'O'``.

    The ``'B-'`` decision is based on whether the previous token belonged to
    the same span, not on the token starting exactly at the span offset, so a
    span whose start offset is slightly misaligned (e.g. it begins on the
    preceding space) still opens with ``'B-'`` instead of a dangling ``'I-'``.

    Example:
        text   = 'pin tot loa nho'
        labels = [[0, 7, 'BATTERY'], [8, 15, 'FEATURES']]
        result = {'0_3': 'B-BATTERY', '4_7': 'I-BATTERY',
                  '8_11': 'B-FEATURES', '12_15': 'I-FEATURES'}

    Args:
        text: Sentence whose tokens are separated by single spaces.
            Consecutive or trailing spaces produce empty tokens, which still
            get an entry (with ``start == end``).
        labels: Sequence of ``(start, end, tag)`` triples with character
            offsets into ``text``.

    Returns:
        dict mapping ``'<start>_<end>'`` (the token's character offsets) to
        its IOB tag, ordered by position in the text.
    """
    iob_labels = {}
    previous_span = None  # index in `labels` of the span holding the previous token
    start = 0

    for token in text.split(' '):
        end = start + len(token)
        # Offsets are joined into a string key: '<start>_<end>'.
        location_key = f'{start}_{end}'

        # First span (in the given order) that fully contains this token.
        span_idx, tag = next(
            (
                (candidate_idx, candidate_tag)
                for candidate_idx, (span_start, span_end, candidate_tag) in enumerate(labels)
                if span_start <= start and end <= span_end
            ),
            (None, None),
        )

        if span_idx is None:
            iob_labels[location_key] = 'O'
        elif span_idx == previous_span:
            iob_labels[location_key] = 'I-' + tag
        else:
            # First token of this span: always open it with 'B-'.
            iob_labels[location_key] = 'B-' + tag

        previous_span = span_idx
        start = end + 1  # skip the single separating space

    # Tokens are visited left to right, so the dict is already ordered by
    # start offset and needs no extra sorting.
    return iob_labels


In [35]:
# Convert the span labels of every split (train, test and dev) to IOB format,
# so that all three frames hold the same label representation.
# NOTE: run this once per load. convert_to_IOB_format expects (start, end, tag)
# triples, not the dicts it produces, so re-running it on converted frames fails.
for split_df in (df_train, df_test, df_dev):
    split_df['labels'] = split_df.apply(
        lambda row: convert_to_IOB_format(row['text'], row['labels']),
        axis=1,
    )

In [16]:
import os

def save_data_to_csv(df, folder, filename):
    """Write a DataFrame to ``folder/filename`` as UTF-8 encoded JSON.

    Despite its name, this function serialises with ``DataFrame.to_json``
    (not CSV); the name is kept so existing callers keep working.

    Args:
        df: pandas.DataFrame to serialise.
        folder: Output directory. It is created, including any missing
            parent directories, when it does not exist yet.
        filename: Name of the file to create inside ``folder``. An existing
            file with that name is overwritten.

    Raises:
        OSError: If the directory cannot be created or the file cannot be
            opened for writing.
    """
    # makedirs(exist_ok=True) handles nested folders and avoids the
    # check-then-create race of `if not exists: mkdir`.
    os.makedirs(folder, exist_ok=True)

    with open(os.path.join(folder, filename), 'w', encoding='utf-8') as file:
        # force_ascii=False keeps non-ASCII text readable instead of \u-escaping it.
        df.to_json(file, force_ascii=False)

In [17]:
# Write each split to disk as JSON (save_data_to_csv serialises with to_json).
# A previous variant of this cell targeted the folder
# 'data/span_detection_datasets_IOB' instead; it is currently disabled.
for split_frame, split_filename in (
    (df_train, 'train.json'),
    (df_test, 'test.json'),
    (df_dev, 'dev.json'),
):
    save_data_to_csv(split_frame, 'data/span_detection_datasets_split_word_IOB', split_filename)

In [41]:
# Show one training example: its text first, then its labels.
idx = 1
sample_row = df_train.iloc[idx]
print(sample_row['text'])
print(sample_row['labels'])

ao_pin 
[(0, 3, 'PERFORMANCE'), (8, 15, 'BATTERY'), (44, 104, 'GENERAL')]


# End