In [1]:
# Locations of the train / test / dev JSONL inputs (all in the same directory).
_SPLIT_DIR = 'data/process_data_split_words'
TRAIN_PATH = f'{_SPLIT_DIR}/train.jsonl'
TEST_PATH = f'{_SPLIT_DIR}/test.jsonl'
DEV_PATH = f'{_SPLIT_DIR}/dev.jsonl'

# TRAIN_PATH = 'data/process_data/train.json'
# TEST_PATH = 'data/process_data/test.json'
# DEV_PATH = 'data/process_data/dev.json'

In [2]:
# Helper: read a JSONL file into a pandas DataFrame
import pandas as pd
import json

def read_jsonl_to_dataframe(file_path):
    """Load a JSON Lines file into a pandas DataFrame.

    Every non-blank line is parsed with ``json.loads`` and the parsed objects
    become the rows of the returned frame, in file order.

    Args:
        file_path: Path of the UTF-8 encoded JSONL file to read.

    Returns:
        pandas.DataFrame built from the lines that parsed successfully
        (an empty frame if none did).

    Notes:
        Blank or whitespace-only lines are skipped silently. Lines that are
        not valid JSON are skipped as well and reported on stdout together
        with their 1-based line number in the file.
    """
    records = []

    with open(file_path, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, start=1):
            if not line.strip():
                # A blank line (typically a trailing newline at EOF) is not a
                # malformed record, so it must not be reported as one.
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as e:
                # The decoder's own position is relative to this single line,
                # so include the file line number to make the record findable.
                print(f"Skipping invalid JSON (line {line_number}): {e}")

    return pd.DataFrame(records)

In [36]:
# Read the three splits (train, test, dev — in that order) into DataFrames.
df_train, df_test, df_dev = (
    read_jsonl_to_dataframe(split_path)
    for split_path in (TRAIN_PATH, TEST_PATH, DEV_PATH)
)

In [37]:
# Aspect extraction only: every label other than 'O' is cut down to the part
# in front of its first '#'; 'O' is passed through unchanged.

def _keep_aspect_only(label):
    """Return the row's label list with each non-'O' entry truncated at '#'."""
    return [ap if ap == 'O' else ap.split('#')[0] for ap in label]

for _frame in (df_train, df_test, df_dev):
    _frame.labels = _frame.labels.apply(_keep_aspect_only)

In [48]:
def print_data(df, index):
    """Print one row of ``df``: a header, one ``column: value`` line per
    column, then a blank gap and a separator rule.

    Args:
        df: DataFrame to read from.
        index: Row key, looked up as ``df[column][index]``.
    """
    print(f'Index: {index}')

    for name in df.columns:
        value = df[name][index]
        print(f'{name}: {value}')

    # Spacer, then the rule that visually separates consecutive rows.
    print('\n')
    print("\n=============================================================================================\n")

def print_df(df):
    """Print every row of ``df`` by calling ``print_data`` once per row.

    Args:
        df: DataFrame whose rows are printed, in index order.
    """
    # print_data looks rows up with df[column][index], which treats the key as
    # an index label. Iterating the actual labels (instead of 0..len-1) keeps
    # this working for frames whose index is not a fresh RangeIndex, e.g.
    # after filtering or shuffling without reset_index().
    for label in df.index:
        print_data(df, label)
        
# check data
def check_data():
    """Report whether every row has as many labels as text entries.

    Reads the module-level ``df_train``, ``df_test`` and ``df_dev`` frames and
    compares ``len(text)`` with ``len(labels)`` row by row. Prints one OK line
    when no split has a mismatch, otherwise the names of the offending splits.
    """
    splits = (
        ('Train data', df_train),
        ('Test data', df_test),
        ('Dev data', df_dev),
    )

    # Evaluate all three splits first; reporting happens afterwards.
    failing = [
        split_name
        for split_name, frame in splits
        if (frame.text.map(len) != frame.labels.map(len)).any()
    ]

    if not failing:
        print('All data is ok')
        return

    print('Data is not ok at:')
    for split_name in failing:
        print(split_name)

# Sanity check on the loaded splits: each row needs one label per text entry.
check_data()

All data is ok


In [39]:
def convert_to_IOB_format_and_decay_to_token(text, labels):
    """Break labelled spans into tokens and tag every token in IOB format.

    Each span is split on single spaces. Tokens of a span labelled ``'O'``
    are all tagged ``'O'``; for any other label the first token is tagged
    ``'B-<label>'`` and the remaining ones ``'I-<label>'``.

    Args:
        text: Sequence of span strings.
        labels: Sequence of labels, paired with ``text`` position by
            position (pairing stops at the shorter of the two).

    Returns:
        Tuple ``(tokens, iob_labels)`` of two lists with equal length.
    """
    tokens = []
    iob_labels = []

    for span, label in zip(text, labels):
        # split(' ') produces '' for leading, trailing or repeated spaces and
        # for an empty span; drop those so no empty token ever receives a tag.
        current_tokens = [token for token in span.split(' ') if token]
        if not current_tokens:
            # Nothing to tag: emitting a 'B-' tag here would leave the two
            # output lists with different lengths.
            continue

        tokens.extend(current_tokens)
        if label == 'O':
            iob_labels.extend(['O'] * len(current_tokens))
        else:
            iob_labels.append(f'B-{label}')
            iob_labels.extend([f'I-{label}'] * (len(current_tokens) - 1))

    return tokens, iob_labels


In [40]:
# Expand every split from span level to token level with IOB tags.
# The text/labels columns are overwritten in place, so this cell is meant to
# run once per load: running it again would tag the already-tagged labels.
for _frame in (df_train, df_test, df_dev):
    _token_lists, _tag_lists = zip(*_frame.apply(
        lambda row: convert_to_IOB_format_and_decay_to_token(row['text'], row['labels']),
        axis=1,
    ))
    _frame.text, _frame.labels = _token_lists, _tag_lists

In [None]:
# Repeat the length check now that text holds tokens and labels hold IOB tags.
check_data()

In [49]:
# save data
import os

def save_data_to_jsonl(df, folder, filename):
    """Write ``df`` to ``folder/filename`` as JSON Lines, one object per row.

    Each row becomes a JSON object keyed by column name. Text is written as
    UTF-8 without ASCII escaping, and an existing file is overwritten.

    Args:
        df: DataFrame to serialise.
        folder: Output directory; created (including missing parents) when
            it does not exist. An empty string means the current directory.
        filename: Name of the file to create inside ``folder``.
    """
    if folder:
        # makedirs also creates missing parent directories and, with
        # exist_ok=True, has no check-then-create race on an existing folder.
        os.makedirs(folder, exist_ok=True)

    columns = df.columns
    with open(os.path.join(folder, filename), 'w', encoding='utf-8') as file:
        for _, row in df.iterrows():
            json_obj = {column: row[column] for column in columns}
            json.dump(json_obj, file, ensure_ascii=False)
            file.write('\n')

In [50]:
# Write the three splits as JSONL files into one output folder.
_OUTPUT_DIR = 'data/span_detection_datasets_split_word_IOB'
for _frame, _filename in (
    (df_train, 'train.jsonl'),
    (df_test, 'test.jsonl'),
    (df_dev, 'dev.jsonl'),
):
    save_data_to_jsonl(_frame, _OUTPUT_DIR, _filename)

In [51]:
# Spot-check one training row (by position): its tokens, then its tags.
idx = 1
_sample_row = df_train.iloc[idx]
print(_sample_row['text'])
print(_sample_row['labels'])

['lag', 'va', 'hao', 'pin', 'là', 'cái', 'tóm_tắt', 'về', 'máy', 'sam', 'làm', 'tệ', 'quá', 'không', 'bằng', 'mấy', 'con', 'tàu', 'cùng', 'phân_khúc']
['B-PERFORMANCE', 'O', 'B-BATTERY', 'I-BATTERY', 'O', 'O', 'O', 'O', 'O', 'B-GENERAL', 'I-GENERAL', 'I-GENERAL', 'I-GENERAL', 'I-GENERAL', 'I-GENERAL', 'I-GENERAL', 'I-GENERAL', 'I-GENERAL', 'I-GENERAL', 'I-GENERAL']


# End