In [1]:
# Locations of the original train / dev / test JSON Lines files.
TRAIN_PATH, DEV_PATH, TEST_PATH = (
    "data/original/train.jsonl",
    "data/original/dev.jsonl",
    "data/original/test.jsonl",
)

In [2]:
import pandas as pd
import json

def read_jsonl_to_dataframe(file_path):
    """Load a JSON Lines file into a pandas DataFrame.

    Every non-blank line is parsed with ``json.loads`` and becomes one record.
    Blank or whitespace-only lines are skipped silently; lines that fail to
    parse are reported on stdout (with their 1-based line number) and skipped.

    Args:
        file_path: Path of the UTF-8 encoded file to read.

    Returns:
        A ``pandas.DataFrame`` built from the parsed records. It is empty
        when no line could be parsed.

    Raises:
        OSError: If the file cannot be opened.
    """
    records = []

    with open(file_path, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, start=1):
            # A trailing newline or spacer line is not malformed data, so it
            # should not be reported as invalid JSON.
            if not line.strip():
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as e:
                print(f"Skipping invalid JSON on line {line_number}: {e}")

    return pd.DataFrame(records)

In [3]:
# Read the three splits in train, dev, test order.
df_train, df_dev, df_test = map(
    read_jsonl_to_dataframe,
    (TRAIN_PATH, DEV_PATH, TEST_PATH),
)

# Processing data

In [4]:
def get_all_span_aspect(text, label):
    """Yield an ``(aspect, aspect_sentiment)`` pair for every span in ``label``.

    Args:
        text: String that the span offsets index into.
        label: Iterable of 3-item spans ``(start, end, aspect_sentiment)``.

    Yields:
        Tuples of ``text[start:end]`` and the span's ``aspect_sentiment``
        value, in the order the spans appear in ``label``.
    """
    yield from (
        (text[begin:stop], tag)
        for begin, stop, tag in label
    )

# Attach the extracted (aspect, sentiment) pairs to each split, in the same
# train, test, dev order as before.
for _split_frame in (df_train, df_test, df_dev):
    _split_frame['all_span_aspect'] = _split_frame.apply(
        lambda row: list(get_all_span_aspect(row['text'], row['labels'])),
        axis=1,
    )

In [None]:
# Optional: save the span-annotated frames to CSV (disabled; uncomment to run)
# df_train.to_csv('data/extract_span/train.csv')
# df_test.to_csv('data/extract_span/test.csv')
# df_dev.to_csv('data/extract_span/dev.csv')

In [6]:
from underthesea import word_tokenize
import regex as re

def tokenize(text):
    """Run underthesea's ``word_tokenize`` on ``text`` with ``format="text"``.

    Returns whatever ``word_tokenize`` returns for that format (presumably a
    single string -- confirm against the underthesea documentation).
    """
    tokenized = word_tokenize(text, format="text")
    return tokenized

def preprocess(text):
    """Lowercase ``text`` and trim leading and trailing whitespace.

    Punctuation and inner whitespace are left untouched.
    """
    # Disabled normalisation steps, kept for reference:
    #   text = re.sub(r"[^\w\s]", " ", text)  # remove punctuation
    #   text = re.sub(r"\s+", " ", text)      # remove extra space
    return text.lower().strip()

def post_process(text):
    """Tokenize ``text`` and then normalise the tokenized result.

    Applies ``tokenize`` first and ``preprocess`` second, returning the
    output of ``preprocess``.
    """
    return preprocess(tokenize(text))

In [7]:
def process_data_frame(df):
    """Build a new frame with post-processed ``text`` and the original ``labels``.

    Args:
        df: DataFrame with ``text`` and ``labels`` columns.

    Returns:
        A new DataFrame with two columns: ``text`` (each value passed through
        ``post_process``) and ``labels`` (copied over unchanged).
    """
    # NOTE(review): labels are copied as-is while the text is rewritten. If
    # the labels hold character offsets into the raw text, they may no longer
    # line up with the tokenized text -- confirm what downstream code expects.
    return pd.DataFrame({
        'text': df['text'].apply(post_process),
        'labels': df['labels'],
    })

In [8]:
# Post-process the three splits in train, dev, test order.
new_train_df, new_dev_df, new_test_df = map(
    process_data_frame,
    (df_train, df_dev, df_test),
)

In [12]:
# save data
import os

def save_data_to_csv(df, folder, filename):
    """Write ``df`` as JSON to ``folder/filename``, creating ``folder`` if needed.

    Despite the name, the output is JSON (``DataFrame.to_json``), not CSV.
    Non-ASCII characters are written as-is in UTF-8.

    Args:
        df: DataFrame to serialise.
        folder: Target directory. Missing directories, including missing
            parents, are created. An empty string means the current directory.
        filename: Name of the file to write inside ``folder``.

    Raises:
        OSError: If the directory cannot be created or the file cannot be
            opened for writing.
    """
    # makedirs creates missing parent directories and, with exist_ok=True,
    # does not fail when the directory already exists.
    if folder:
        os.makedirs(folder, exist_ok=True)

    target_path = os.path.join(folder, filename)
    with open(target_path, 'w', encoding='utf-8') as file:
        df.to_json(file, force_ascii=False)

# Write each processed split to data/process_data, in train, dev, test order.
for _processed_frame, _output_name in (
    (new_train_df, 'train.json'),
    (new_dev_df, 'dev.json'),
    (new_test_df, 'test.json'),
):
    save_data_to_csv(_processed_frame, 'data/process_data', _output_name)

# End