In [2]:
import re
import json
import pickle
from sklearn.model_selection import train_test_split
import json

def load_json(file_name):
    """Load a JSON file and return the ``'content'`` value of every record.

    Args:
        file_name: Path of a UTF-8 encoded JSON file. Its top-level value
            must be iterable, and each item must support ``item['content']``.

    Returns:
        list: The ``'content'`` values, in the order they appear in the file.

    Raises:
        KeyError: If a record has no ``'content'`` key.
    """
    with open(file_name, 'r', encoding='utf-8') as fp:
        records = json.load(fp)

    # The parsed records are fully in memory, so the handle can be closed
    # before the field is extracted.
    contents = []
    for record in records:
        contents.append(record['content'])
    return contents

def save_encoded_data(directory, encodings, labels):
    """Pickle ``encodings`` and ``labels`` into ``directory``.

    Two files are written with the highest available pickle protocol:
    ``{directory}/encodings.pickle`` and ``{directory}/labels.pickle``.
    The directory, including any missing parents, is created first so the
    function also works on a fresh checkout where it does not exist yet.

    Args:
        directory: Target directory path.
        encodings: Picklable object stored in ``encodings.pickle``.
        labels: Picklable object stored in ``labels.pickle``.

    Returns:
        None
    """
    import os  # local import keeps this function self-contained in the notebook

    # Without this, open(..., 'wb') raises FileNotFoundError when the
    # directory is missing. An empty path is skipped because
    # os.makedirs('') itself raises.
    if directory:
        os.makedirs(directory, exist_ok=True)

    for stem, payload in (('encodings', encodings), ('labels', labels)):
        with open(f'{directory}/{stem}.pickle', 'wb') as handle:
            pickle.dump(payload, handle, protocol=pickle.HIGHEST_PROTOCOL)

### 生成 encoded data

In [3]:
from transformers import BertTokenizer

def gen_encoded_data(texts, labels, tokenizer=None, max_length=256,
                     model_name='bert-base-chinese'):
    """Tokenize ``texts`` and return the encodings together with ``labels``.

    Args:
        texts: Texts passed as the first argument to the tokenizer.
        labels: Labels belonging to ``texts``; returned unchanged.
        tokenizer: Optional callable used as
            ``tokenizer(texts, truncation=True, padding=True, max_length=...)``.
            Pass an already-loaded tokenizer to avoid reloading it on every
            call (for example once for the train split and once for the
            test split). When ``None``, a ``BertTokenizer`` is loaded from
            ``model_name``.
        max_length: Value forwarded to the tokenizer as ``max_length``.
            Defaults to 256.
        model_name: Checkpoint name given to
            ``BertTokenizer.from_pretrained`` when ``tokenizer`` is ``None``.
            Defaults to ``'bert-base-chinese'``.

    Returns:
        tuple: ``(encodings, labels)`` where ``encodings`` is whatever the
        tokenizer call returns.
    """
    if tokenizer is None:
        # BertTokenizer is imported at the top of this notebook cell.
        tokenizer = BertTokenizer.from_pretrained(model_name)

    encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_length)

    return encodings, labels

### 刪除垃圾文字

In [4]:
def clean_texts(texts):
    """Strip forum boilerplate and unwanted characters from each text.

    Every text is passed through a fixed sequence of regex substitutions,
    each replacing its matches with the empty string. The order matters:
    for example, underscores survive the special-character pass (``\\w``
    includes ``_``) and are removed by the final pattern.

    Args:
        texts: Iterable of strings to clean.

    Returns:
        list: The cleaned strings, in input order.
    """
    # (pattern, flags) pairs, applied top to bottom.
    rules = (
        # "※ [reposted from ...] " notice
        (r'※ \[.*?\] ', 0),
        # author / board / title / timestamp header
        (r'作者: .*? \(.*?\) 看板: .*? 標題: .*? 時間:\s+\w{3}\s+\w{3}\s+\d{1,2}\s+\d{2}:\d{2}:\d{2}\s+\d{4}', 0),
        # URLs
        (r'http[s]?://\S+', 0),
        # whole "※ ... 之銘言：" quote-attribution lines
        (r'^※.*?之銘言：$', re.MULTILINE),
        # blog / illustrated-version labels plus up to 5 preceding characters
        (r'.{0,5}?(網誌：|網誌版：|圖文：|圖文版：)', 0),
        # "※ 發信站:" and everything after it, across newlines
        (r'※ 發信站:.*', re.DOTALL),
        # anything that is not a word character or one of 。，!?"
        (r'[^\w。，!?"]', re.UNICODE),
        # underscores, which the previous pattern keeps
        (r'_+', 0),
    )
    compiled_rules = [re.compile(pattern, flags) for pattern, flags in rules]

    def scrub(raw):
        # Run one text through every rule in order.
        for rule in compiled_rules:
            raw = rule.sub('', raw)
        return raw

    return [scrub(item) for item in texts]

In [5]:
from sklearn.utils import shuffle

# Load the raw post contents for each class from the JSON dumps.
texts_non_travel_related = load_json('output/non_travel_related.json')
texts_travel_related = load_json('output/travel_related.json')

# Clean the texts (strip boilerplate, URLs and special characters).
cleaned_texts_non_travel = clean_texts(texts_non_travel_related)
cleaned_texts_travel = clean_texts(texts_travel_related)

# Split each class separately into train (80%) and test (20%) sets, so
# both classes are split with the same test_size. random_state is fixed so
# the split is reproducible.
# Non-travel posts are labelled 0.
train_texts_non_travel, test_texts_non_travel, train_labels_non_travel, test_labels_non_travel = train_test_split(
    cleaned_texts_non_travel, [0]*len(cleaned_texts_non_travel), test_size=0.2, random_state=42)
# Travel posts are labelled 1.
train_texts_travel, test_texts_travel, train_labels_travel, test_labels_travel = train_test_split(
    cleaned_texts_travel, [1]*len(cleaned_texts_travel), test_size=0.2, random_state=42)

# Merge the per-class splits into one train set and one test set
# (non-travel first, then travel).
train_texts = train_texts_non_travel + train_texts_travel
train_labels = train_labels_non_travel + train_labels_travel
test_texts = test_texts_non_travel + test_texts_travel
test_labels = test_labels_non_travel + test_labels_travel

# Shuffle texts and labels together in one call so the two classes are
# interleaved; the fixed random_state keeps the order reproducible.
train_texts, train_labels = shuffle(train_texts, train_labels, random_state=42)
test_texts, test_labels = shuffle(test_texts, test_labels, random_state=42)

# Tokenize each split; gen_encoded_data returns the labels unchanged.
train_encodings, train_labels = gen_encoded_data(train_texts, train_labels)
test_encodings, test_labels = gen_encoded_data(test_texts, test_labels)

# Persist each split as encodings.pickle / labels.pickle under its own directory.
save_encoded_data('encoded_data/train', train_encodings, train_labels)
save_encoded_data('encoded_data/test', test_encodings, test_labels)

# Cell output: number of training and test examples.
len(train_labels), len(test_labels)