In [3]:
from data_handler import DataHandler, save_encoded_data, clean_text, load_json
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [4]:
# Load the two pre-filtered corpora. Non-travel documents get label 0 and
# travel documents get label 1 further down.
non_travel_related_data = load_json('filtered_data/non_travel_related.json')
travel_related_data = load_json('filtered_data/travel_related.json')

# Clean the 'content' field of every record, one corpus at a time.
cleaned_texts_non_travel = [
    clean_text(record['content'])
    for record in tqdm(non_travel_related_data, desc="Processing non-travel related data")
]
cleaned_texts_travel = [
    clean_text(record['content'])
    for record in tqdm(travel_related_data, desc="Processing travel related data")
]

# Full corpus, non-travel first, then travel.
cleaned_texts_all = cleaned_texts_non_travel + cleaned_texts_travel

test_size = 0.2

# Split each class separately so both classes keep the same train/test ratio.
(
    train_texts_non_travel,
    test_texts_non_travel,
    train_labels_non_travel,
    test_labels_non_travel,
) = train_test_split(
    cleaned_texts_non_travel,
    [0] * len(cleaned_texts_non_travel),
    test_size=test_size,
    random_state=42,
)

(
    train_texts_travel,
    test_texts_travel,
    train_labels_travel,
    test_labels_travel,
) = train_test_split(
    cleaned_texts_travel,
    [1] * len(cleaned_texts_travel),
    test_size=test_size,
    random_state=42,
)

# Recombine the per-class splits (still grouped by class at this point).
train_texts = train_texts_non_travel + train_texts_travel
test_texts = test_texts_non_travel + test_texts_travel

train_labels = train_labels_non_travel + train_labels_travel
test_labels = test_labels_non_travel + test_labels_travel

Processing non-travel related data: 100%|██████████| 14439/14439 [00:00<00:00, 25663.36it/s]
Processing travel related data: 100%|██████████| 39072/39072 [00:06<00:00, 6238.88it/s]


In [5]:
from sklearn.utils import shuffle

# Handler used by the encoding cells below; configured with the
# 'bert-base-chinese' tokenizer name.
data_handler = DataHandler(tokenizer_name='bert-base-chinese')

# Shuffle texts and labels together so the pairs stay aligned.
#
# The inputs are rebuilt from the per-class splits instead of reading
# train_texts/test_texts back in. Feeding the previous result into shuffle()
# made this cell non-idempotent: a second run re-shuffled already-shuffled
# data and produced a different order than the one any previously saved
# labels were written in. On a fresh run the result is the same as before.
train_texts, train_labels = shuffle(
    train_texts_non_travel + train_texts_travel,
    train_labels_non_travel + train_labels_travel,
    random_state=42,
)
test_texts, test_labels = shuffle(
    test_texts_non_travel + test_texts_travel,
    test_labels_non_travel + test_labels_travel,
    random_state=42,
)

In [6]:
# Encode the train split, the test split and the full corpus with
# max_length=512. Unpacking the generator runs the three encodings in order
# before anything is written.
train_encodings, test_encodings, all_encodings = (
    data_handler.gen_encoded_data(texts, max_length=512)
    for texts in (train_texts, test_texts, cleaned_texts_all)
)

# Persist encodings and their labels, in the same order as listed.
for target_path, payload in (
    ('encoded_data/train/encodings_512', train_encodings),
    ('encoded_data/train/labels', train_labels),
    ('encoded_data/test/encodings_512', test_encodings),
    ('encoded_data/test/labels', test_labels),
    ('encoded_data/all_512', all_encodings),
):
    save_encoded_data(target_path, payload)

In [7]:
# Re-encode the same train/test splits and the full corpus with
# max_length=256 (the previous cell used 512).
train_encodings = data_handler.gen_encoded_data(train_texts, max_length=256)
test_encodings = data_handler.gen_encoded_data(test_texts, max_length=256)
all_encodings = data_handler.gen_encoded_data(cleaned_texts_all, max_length=256)

save_encoded_data('encoded_data/train/encodings_256', train_encodings)
save_encoded_data('encoded_data/test/encodings_256', test_encodings)
# The full-corpus encodings were computed above but never written out.
# Save them next to 'encoded_data/all_512', using the same suffix convention.
save_encoded_data('encoded_data/all_256', all_encodings)

# Labels are intentionally not saved again: train_labels/test_labels are not
# inputs to the encoder, and they were already written to
# 'encoded_data/train/labels' and 'encoded_data/test/labels' together with
# the 512-token encodings.
# NOTE(review): this relies on train_texts/test_texts being in the same order
# as when those label files were written - confirm no reshuffle in between.