In [1]:
# Locations of the train / dev / test JSON Lines files loaded further down.
TRAIN_PATH = "data/train.jsonl"
DEV_PATH = "data/dev.jsonl"
TEST_PATH = "data/test.jsonl"

In [2]:
import pandas as pd
import json

def read_jsonl_to_dataframe(file_path):
    """Load a JSON Lines file into a pandas DataFrame.

    Every non-blank line is parsed with ``json.loads`` and becomes one row.
    Blank lines are ignored silently. Lines that fail to parse are skipped
    with a printed warning that names the file and the 1-based line number,
    so a corrupt record can be located.

    Args:
        file_path: Path of the UTF-8 encoded file to read.

    Returns:
        pandas.DataFrame built from the parsed records, in file order
        (an empty frame when no line could be parsed).

    Raises:
        OSError: If the file cannot be opened.
    """
    records = []

    with open(file_path, 'r', encoding='utf-8') as file:
        for line_no, line in enumerate(file, start=1):
            if not line.strip():
                # A blank line (e.g. a trailing newline) is not a record and
                # should not be reported as broken JSON.
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as e:
                # Best effort: keep loading, but say exactly which line was dropped.
                print(f"Skipping invalid JSON at {file_path}:{line_no}: {e}")

    return pd.DataFrame(records)

In [3]:
# Read each split into its own DataFrame.
df_train = read_jsonl_to_dataframe(file_path=TRAIN_PATH)
df_dev = read_jsonl_to_dataframe(file_path=DEV_PATH)
df_test = read_jsonl_to_dataframe(file_path=TEST_PATH)

In [4]:
# Display the training frame (columns: text, labels) as this cell's output.
df_train

Unnamed: 0,text,labels
0,Pin Sài tầm 50h cho pin 100/100. Camera ổn ......,"[[0, 31, BATTERY#POSITIVE], [33, 42, CAMERA#PO..."
1,Lag và hao pin là cái tóm tắt về máy. S...,"[[0, 3, PERFORMANCE#NEGATIVE], [8, 15, BATTERY..."
2,Tất cả đều ổn ngoại trừ lúc máy nóng lên thì p...,"[[79, 109, CAMERA#NEUTRAL], [111, 169, BATTERY..."
3,"Ok mua máy ở TGDD chính sách đổi trả rất tốt,r...","[[18, 217, SER&ACC#POSITIVE]]"
4,"kiểu dáng thì đẹp,cầm chắc tay,nhưng loa nhỏ q...","[[0, 30, DESIGN#POSITIVE], [37, 48, FEATURES#N..."
...,...,...
7780,Máy mua đc 2thang bị lỗi camera trước có đổi đ...,"[[18, 37, CAMERA#NEGATIVE]]"
7781,"I bought this phone some hours ago, because I ...",[]
7782,Máy màn hình cứ tự sáng liên tục dù chẳng có t...,"[[0, 57, FEATURES#NEGATIVE]]"
7783,sau gần một tuần sử dụng máy cảm thấy máy dùng...,"[[25, 56, GENERAL#POSITIVE], [57, 123, FEATURE..."


# Processing data

In [7]:
def get_all_span_aspect(text, label):
    """Yield ``(span_text, aspect_sentiment)`` pairs for one annotated example.

    Args:
        text: The string that the annotations index into.
        label: Iterable of ``(start, end, aspect_sentiment)`` triples, where
            ``start`` and ``end`` are used as slice bounds on ``text``.

    Yields:
        A tuple of ``text[start:end]`` and the triple's third element, one
        per entry of ``label`` and in the same order.
    """
    for begin, stop, tag in label:
        yield text[begin:stop], tag

# Add an 'all_span_aspect' column (list of (span text, aspect#sentiment) pairs)
# to every split, in the order train, test, dev.
for _split_df in (df_train, df_test, df_dev):
    _split_df['all_span_aspect'] = _split_df.apply(
        lambda row: list(get_all_span_aspect(row['text'], row['labels'])),
        axis=1,
    )

In [9]:
# save to csv
import os

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing (no-op when it is already there).
os.makedirs('extract_span', exist_ok=True)

# The index is written as the first (unnamed) column because index=True is
# the to_csv default.
df_train.to_csv('extract_span/train.csv')
df_test.to_csv('extract_span/test.csv')
df_dev.to_csv('extract_span/dev.csv')

In [4]:
from underthesea import word_tokenize
import regex as re

def tokenize(text):
    """Word-segment ``text`` with underthesea and return the result.

    Thin wrapper around ``word_tokenize(text, format="text")``.

    NOTE(review): with ``format="text"`` underthesea is expected to return a
    single string in which the syllables of a multi-syllable word are joined
    by underscores - confirm against the installed version.
    """
    segmented = word_tokenize(text, format="text")
    return segmented

def preprocess(text):
    """Normalise a string: lowercase it, drop punctuation, tidy whitespace.

    Args:
        text: Input string.

    Returns:
        The lowercased string with every character that is neither a word
        character nor whitespace replaced by a space, whitespace runs
        collapsed to a single space, and leading/trailing whitespace removed.
    """
    lowered = text.lower()
    # "_" is a word character, so underscores survive this substitution.
    without_punct = re.sub(r"[^\w\s]", " ", lowered)
    # Collapse any whitespace run (including those just introduced) and trim.
    return re.sub(r"\s+", " ", without_punct).strip()

def post_process(text):
    """Run the full text pipeline: ``tokenize`` first, then ``preprocess``.

    Args:
        text: Raw input string.

    Returns:
        The result of ``preprocess`` applied to the output of ``tokenize``.
    """
    return preprocess(tokenize(text))

In [5]:
def process_data_frame(df):
    """Build a two-column frame with processed text and the original labels.

    Args:
        df: DataFrame with at least a ``text`` and a ``labels`` column.

    Returns:
        A new DataFrame whose ``text`` column is ``post_process`` applied to
        each value of ``df['text']`` and whose ``labels`` column is
        ``df['labels']`` unchanged. ``df`` itself is not modified.

    NOTE(review): ``labels`` is copied as-is while ``text`` is rewritten. If
    the labels hold character offsets into the original text (as the slicing
    in ``get_all_span_aspect`` suggests), they may no longer line up with the
    processed text - confirm before using them together.
    """
    return pd.DataFrame({
        'text': df['text'].apply(post_process),
        'labels': df['labels'],
    })

In [9]:
# Produce the processed (text, labels) frame for each split.
new_train_df = process_data_frame(df=df_train)
new_dev_df = process_data_frame(df=df_dev)
new_test_df = process_data_frame(df=df_test)

In [27]:
# Number of label entries per training row, then the largest such count.
len_labels = [len(spans) for spans in new_train_df['labels']]

max(len_labels)

11

In [12]:
# save data
import os

def save_data_to_csv(df, folder, filename):
    """Write ``df`` to ``folder/filename`` as UTF-8 CSV without the index.

    The target folder is created if needed, including any missing parent
    folders; an already existing folder is not an error.

    Args:
        df: DataFrame to save.
        folder: Output directory. An empty string means the current
            working directory.
        filename: Name of the CSV file inside ``folder``.

    Raises:
        OSError: If the folder cannot be created or the file cannot be written.
    """
    if folder:
        # makedirs handles nested paths and, with exist_ok=True, avoids the
        # check-then-create race of an exists()/mkdir() pair.
        os.makedirs(folder, exist_ok=True)

    df.to_csv(os.path.join(folder, filename), encoding='utf-8', index=False)

# Persist the processed splits under process_data/.
save_data_to_csv(df=new_train_df, folder='process_data', filename='train.csv')
save_data_to_csv(df=new_dev_df, folder='process_data', filename='dev.csv')
save_data_to_csv(df=new_test_df, folder='process_data', filename='test.csv')

# End