In [1]:
TRAIN_PATH = 'data/original/train.jsonl'
DEV_PATH = 'data/original/dev.jsonl'
TEST_PATH = 'data/original/test.jsonl'

In [2]:
import pandas as pd
import json

def read_jsonl_to_dataframe(file_path):
    data = []

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            try:
                json_obj = json.loads(line)
                data.append(json_obj)
            except json.JSONDecodeError as e:
                print(f"Skipping invalid JSON: {e}")

    df = pd.DataFrame(data)

    return df

In [3]:
df_train = read_jsonl_to_dataframe(TRAIN_PATH)
df_dev = read_jsonl_to_dataframe(DEV_PATH)
df_test = read_jsonl_to_dataframe(TEST_PATH)

In [4]:
def print_data(df, index):
    print(f'Index: {index}')

    columns = df.columns

    for column in columns:
        print(f'{column}: {df[column][index]}')

    print('\n')
    print("\n=============================================================================================\n")

def print_df(df):
    for i in range(len(df)):
        print_data(df, i)

# Processing data

In [5]:
from underthesea import word_tokenize
import regex as re

def tokenize(text):
    return word_tokenize(text, format="text")

def preprocess(text):
    text = text.lower()
    text = re.sub(r"[^\w\s]", " ", text) # remove punctuation
    text = re.sub(r"\s+", " ", text) # remove extra space
    text = text.strip()
    return text

def post_process(text):

    text = tokenize(text)
    text = preprocess(text)

    return text

In [6]:
def get_all_span_aspect(text, labels):
    # ex: "Lag và hao pin là cái tóm tắt về máy. Sam làm tệ quá, không bằng mấy con tàu cùng phân khúc"
    # "labels": [[0, 3, "PERFORMANCE#NEGATIVE"], [8, 15, "BATTERY#NEGATIVE"], [45, 106, "GENERAL#NEGATIVE"]]
    # Step 1: "labels": [[0, 3, "PERFORMANCE#NEGATIVE"], [4, 7, "O"], [8, 15, "BATTERY#NEGATIVE"], [16, 44, "0"], [45, 106, "GENERAL#NEGATIVE"], [107, -1, "O"]] 
    # Step 2: get all span aspect

    # Step 1
    text_len = len(text)
    process_labels = []

    # define start and end of label
    start_label, _, _ = labels[0] if labels else (None, None, None)

    # check if start_label is None mean labels is empty
    if start_label is None:
        return []

    # define start and end of O
    if start_label != 0:
        process_labels.append([0, start_label - 1, 'O'])

    next_start = 0
    next_end = 0
    for idx, label in enumerate(labels):
        start, end, aspect_sentiment = label
                
        process_labels.append([start, end, aspect_sentiment])

        if idx + 1 < len(labels):
            # define start and end of O of next label
            # just do it if next label is not the last label
            next_start = end + 1
            next_end = labels[idx + 1][0] - 1

            if next_start < next_end:
                process_labels.append([next_start, next_end - 1, 'O'])
        else:
            # if current label is the last label
            # define start and end of O of next label
            # if next label is the last label then end of O is end of text. If current label have end of text then skip
            if end + 1 < text_len:
                process_labels.append([end + 1, text_len - 1, 'O'])

        
    # result = []
    # Step 2
    for span in process_labels:
        start, end, aspect_sentiment = span
        aspect = text[start:end]
        
        # result.append((aspect, aspect_sentiment))
        yield aspect, aspect_sentiment
    
    # return result
    

df_train['all_span_aspect'] = df_train.apply(lambda row: list(get_all_span_aspect(row['text'], row['labels'])), axis=1)
df_test['all_span_aspect'] = df_test.apply(lambda row: list(get_all_span_aspect(row['text'], row['labels'])), axis=1)
df_dev['all_span_aspect'] = df_dev.apply(lambda row: list(get_all_span_aspect(row['text'], row['labels'])), axis=1)

In [7]:
# apply post process to all_span_aspect
df_train['all_span_aspect'] = df_train.apply(lambda row: [(post_process(aspect), sentiment) for aspect, sentiment in row['all_span_aspect']], axis=1)
df_test['all_span_aspect'] = df_test.apply(lambda row: [(post_process(aspect), sentiment) for aspect, sentiment in row['all_span_aspect']], axis=1)
df_dev['all_span_aspect'] = df_dev.apply(lambda row: [(post_process(aspect), sentiment) for aspect, sentiment in row['all_span_aspect']], axis=1)

In [9]:
# after post process, remove all empty aspect
df_train['all_span_aspect'] = df_train.apply(lambda row: [(aspect, sentiment) for aspect, sentiment in row['all_span_aspect'] if aspect != ""], axis=1)
df_test['all_span_aspect'] = df_test.apply(lambda row: [(aspect, sentiment) for aspect, sentiment in row['all_span_aspect'] if aspect != ""], axis=1)
df_dev['all_span_aspect'] = df_dev.apply(lambda row: [(aspect, sentiment) for aspect, sentiment in row['all_span_aspect'] if aspect != ""], axis=1)

In [11]:
# decay all_span_aspect span_aspect to one list, aspect_sentiment to one list
def decay_span_aspect(all_span_aspect):
    span_aspect = []
    aspect_sentiment = []
    for aspect, sentiment in all_span_aspect:
        span_aspect.append(aspect)
        aspect_sentiment.append(sentiment)
    
    return span_aspect, aspect_sentiment

df_train['text'], df_train['labels'] = zip(*df_train['all_span_aspect'].map(decay_span_aspect))
df_test['text'], df_test['labels'] = zip(*df_test['all_span_aspect'].map(decay_span_aspect))
df_dev['text'], df_dev['labels'] = zip(*df_dev['all_span_aspect'].map(decay_span_aspect))

In [13]:
# remove all value in text, labels is empty
df_train = df_train[df_train['text'].map(len) > 0]
df_dev = df_dev[df_dev['text'].map(len) > 0]
df_test = df_test[df_test['text'].map(len) > 0]

# reset index
df_train.reset_index(drop=True, inplace=True)
df_dev.reset_index(drop=True, inplace=True)
df_test.reset_index(drop=True, inplace=True)

In [16]:
# save data
import os

def save_data_to_jsonl(df, folder, filename):
    if not os.path.exists(folder):
        os.mkdir(folder)


    with open(os.path.join(folder, filename), 'w', encoding='utf-8') as file:

        # get all name of columns
        columns = df.columns
        for index, row in df.iterrows():
            json_obj = {}
            for column in columns:
                json_obj[column] = row[column]
            json.dump(json_obj, file, ensure_ascii=False)
            file.write('\n')

# Combine data

In [17]:
del df_train['all_span_aspect']
del df_dev['all_span_aspect']
del df_test['all_span_aspect']

In [18]:
save_data_to_jsonl(df_train, 'data/processed_data', 'train.jsonl')
save_data_to_jsonl(df_dev, 'data/processed_data', 'dev.jsonl')
save_data_to_jsonl(df_test, 'data/processed_data', 'test.jsonl')

In [19]:
df_train

Unnamed: 0,text,labels
0,"[pin sài_tầm 50 h cho pin 100 100, camera ổn, ...","[BATTERY#POSITIVE, CAMERA#POSITIVE, GENERAL#PO..."
1,"[lag, va, hao pin, là cái tóm_tắt về máy, sam ...","[PERFORMANCE#NEGATIVE, O, BATTERY#NEGATIVE, O,..."
2,[tất_cả đều ổn ngoại_trừ lúc máy nóng lên thì ...,"[O, CAMERA#NEUTRAL, BATTERY#POSITIVE, BATTERY#..."
3,"[ok mua máy ở tgdd, chính_sách đổi trả rất tốt...","[O, SER&ACC#POSITIVE]"
4,"[kiểu_dáng thì đẹp cầm_chắc tay, nhưn, loa nhỏ...","[DESIGN#POSITIVE, O, FEATURES#NEGATIVE, SER&AC..."
...,...,...
7620,"[mình vừa mua máy hôm_nay, bản màu ghi, cảm_nh...","[O, DESIGN#POSITIVE, GENERAL#POSITIVE, SCREEN#..."
7621,"[máy mua đc 2 thang, bị lỗi camera trước, có đ...","[O, CAMERA#NEGATIVE, O]"
7622,[máy màn_hình cứ tự sáng liên_tục dù chẳng có ...,"[FEATURES#NEGATIVE, O]"
7623,"[sau gần một tuần sử_dụng, máy cảm_thấy máy dù...","[O, GENERAL#POSITIVE, FEATURES#POSITIVE]"


In [73]:
for i in range(len(df_train)):
    print_data(df_train, i)

Index: 0
text: ['pin sài_tầm 50 h cho pin 100 100', 'camera ổn', 'tất_cả đều ok', 'nhân_viên thế_giới di_động trần_văn_thời cà_mau nhiệt_tình và vui_vẻ chúc các ae sức khỏe tốt và phục_ok hoài_nha']
labels: ['BATTERY#POSITIVE', 'CAMERA#POSITIVE', 'GENERAL#POSITIVE', 'SER&ACC#POSITIVE']




Index: 1
text: ['lag', 'va', 'hao pin', 'là cái tóm_tắt về máy', 'sam làm tệ quá không bằng mấy con tàu cùng phân_khúc']
labels: ['PERFORMANCE#NEGATIVE', 'O', 'BATTERY#NEGATIVE', 'O', 'GENERAL#NEGATIVE']




Index: 2
text: ['tất_cả đều ổn ngoại_trừ lúc máy nóng lên thì pin tụt nhanh hơn tụt quần_haizz', 'cam chụp cũng gọi_là tầm trung', 'nếu dùng lướt_web bình_thường thì có_thể dùng được cả ngày', 'máy nóng lên thì pin tụt nhanh hơn tụt quần', 'haizz cam_chụp cũng gọi_là tầm_trung nếu dùng lướt_web bình_thường thì có_thể dùng được cả ngà']
labels: ['O', 'CAMERA#NEUTRAL', 'BATTERY#POSITIVE', 'BATTERY#NEGATIVE', 'O']




Index: 3
text: ['ok mua máy ở tgdd', 'chính_sách đổi trả rất tốt rất yên_tâm khi m

# End