In [1]:
# Relative paths to the JSON Lines files holding the train / dev / test splits.
TRAIN_PATH = 'data/train.jsonl'
DEV_PATH = 'data/dev.jsonl'
TEST_PATH = 'data/test.jsonl'

In [2]:
import pandas as pd
import json

def read_jsonl_to_dataframe(file_path):
    """Load a JSON Lines file into a pandas DataFrame.

    Every non-blank line is parsed as one JSON document and becomes one row.
    Blank or whitespace-only lines (for example a trailing empty line at the
    end of the file) are skipped silently. Lines that are not valid JSON are
    reported on stdout and skipped, so one corrupt record does not abort the
    whole load.

    Args:
        file_path: Path to a UTF-8 encoded JSON Lines file.

    Returns:
        A DataFrame built from the parsed objects in file order; an empty
        DataFrame when no line could be parsed.
    """
    records = []

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # An empty line is not a malformed record; do not report it.
            if not line.strip():
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as e:
                print(f"Skipping invalid JSON: {e}")

    return pd.DataFrame(records)

In [8]:
# Read each dataset split from its JSON Lines file into its own DataFrame.
df_train = read_jsonl_to_dataframe(TRAIN_PATH)
df_dev = read_jsonl_to_dataframe(DEV_PATH)
df_test = read_jsonl_to_dataframe(TEST_PATH)

# Processing data

In [4]:
from underthesea import word_tokenize
import regex as re

def tokenize(text):
    """Run underthesea's ``word_tokenize`` on ``text`` and return its output.

    ``format="text"`` is passed through to the tokenizer; per the underthesea
    documentation this yields one string rather than a list of tokens
    (TODO confirm against the installed underthesea version).
    """
    segmented = word_tokenize(text, format="text")
    return segmented

def preprocess(text):
    """Normalize a string for downstream use.

    Steps, in order: lowercase, replace every character that is neither a
    word character nor whitespace with a space, collapse each whitespace run
    into a single space, and trim both ends. Underscores match ``\\w`` and are
    therefore kept.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    lowered = text.lower()
    without_punct = re.sub(r"[^\w\s]", " ", lowered)
    single_spaced = re.sub(r"\s+", " ", without_punct)
    return single_spaced.strip()

def post_process(text):
    """Tokenize ``text`` and then normalize the tokenized string.

    Tokenization runs first, so ``preprocess`` operates on the output of
    ``tokenize`` rather than on the raw input.
    """
    return preprocess(tokenize(text))

In [5]:
def process_data_frame(df):
    """Build a two-column frame with cleaned text and the original labels.

    Args:
        df: DataFrame with a ``text`` column and a ``labels`` column.

    Returns:
        A new DataFrame whose ``text`` column is ``post_process`` applied to
        each input text and whose ``labels`` column is taken from ``df``
        as-is. The input frame is not modified.
    """
    cleaned_text = df['text'].apply(post_process)
    return pd.DataFrame({'text': cleaned_text, 'labels': df['labels']})

In [9]:
# Clean the text column of every split; the labels column is carried over unchanged.
new_train_df = process_data_frame(df_train)
new_dev_df = process_data_frame(df_dev)
new_test_df = process_data_frame(df_test)

In [27]:
# Size of each training example's `labels` entry; the cell displays the largest.
# A comprehension keeps the loop variable out of the notebook namespace, and
# `default=0` avoids a ValueError when the training split is empty.
len_labels = [len(label) for label in new_train_df['labels']]

max(len_labels, default=0)

11

In [12]:
# save data
import os

def save_data_to_csv(df, folder, filename):
    """Write ``df`` to ``<folder>/<filename>`` as a UTF-8 CSV without the index.

    The target folder is created when missing, including any intermediate
    directories; an already existing folder is not an error. An empty
    ``folder`` writes into the current working directory.

    Args:
        df: DataFrame to persist.
        folder: Directory that should contain the file.
        filename: Name of the CSV file inside ``folder``.
    """
    if folder:
        # makedirs handles nested paths and avoids the exists-then-mkdir race.
        os.makedirs(folder, exist_ok=True)

    df.to_csv(os.path.join(folder, filename), encoding='utf-8', index=False)

# Persist the processed train / dev / test frames as CSV files under process_data/.
# NOTE(review): len() is taken of each `labels` entry earlier in the notebook, so
# they look like sequences; if they are lists, to_csv will presumably store them
# as their string repr. Confirm the downstream loader parses them back.
save_data_to_csv(new_train_df, 'process_data', 'train.csv')
save_data_to_csv(new_dev_df, 'process_data', 'dev.csv')
save_data_to_csv(new_test_df, 'process_data', 'test.csv')

# End