In [1]:
import glob
import json
import re

In [12]:
def write_json_data(path, data):
    with open(path, 'w', encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False, indent=4)

### 過濾原始資料

In [15]:
files = glob.glob("IR-data/*.json")

filtered_data = []

# 遍歷找到的檔案
for file_path in files:
    # 打開並讀取 JSON 檔案
    with open(file_path, 'r', encoding='utf-8') as file:
        try:
            # 解析 JSON 數據
            data_list = json.load(file)

            for data in data_list:
                if data['title'] != '文章已被刪除':
                    # 刪除不需要的屬性
                    del data['push']
                    del data['date']
                    del data['link']

                    filtered_data.append(data)

        except json.JSONDecodeError:
                print(f"Error decoding JSON from file {file_path}")

filtered_data_output_file = 'output/filtered_data.json'
write_json_data(filtered_data_output_file, filtered_data)

len(filtered_data)

### 檢視所有標題標籤

In [21]:
file = "output/filtered_data.json"

titles = {}
pattern = r"\[(.*?)\]"

with open(file, 'r', encoding='utf-8') as file:
    try:
        # 解析 JSON 數據
        data_list = json.load(file)

        for data in data_list:
            title = data['title']
            match = re.search(pattern, title)
            if match:
                word = match.group(1)
                if word not in titles:
                    titles[word] = 1
                else:
                    titles[word] += 1

    except json.JSONDecodeError:
        print(f"Error decoding JSON from file {file_path}")

filtered_titles = {key: value for key, value in titles.items() if value > 10}

sorted(filtered_titles.items(), key=lambda x:x[1], reverse=True)

[('遊記', 23002),
 ('交易', 13961),
 ('心得', 4870),
 ('問題', 3205),
 ('廣告', 2301),
 ('食記', 2108),
 ('徵伴', 1307),
 ('分享', 257),
 ('情報', 242),
 ('住宿', 230),
 ('問卷', 215),
 ('新聞', 132),
 ('資訊', 125),
 ('徵求', 124),
 ('公告', 108),
 ('講座', 104),
 ('揪團', 76),
 ('贈送', 70),
 ('黑特', 67),
 ('旅遊', 66),
 ('玩樂', 46),
 ('閒聊', 44),
 ('請益', 44),
 ('轉讓', 43),
 ('美食', 40),
 ('廣宣', 38),
 ('相簿', 32),
 ('活動', 30),
 ('已售出', 30),
 ('售出', 26),
 ('票券', 23),
 ('請問', 22),
 ('轉錄', 19),
 ('抱怨', 18),
 ('買賣', 18),
 ('推薦', 18),
 ('徵', 18)]

In [22]:
travel_related_titles = ['遊記', '心得', '問題', '食記', '徵伴', '分享', '住宿', '新聞', '黑特', '旅遊', '玩樂', '請益', '美食', '抱怨', '推薦']

# 徵求 揪團 廣宣 活動
non_travel_related_titles = ['交易', '廣告', '情報', '問卷', '資訊', '徵求', '公告', '講座', '揪團', '贈送', '閒聊', '轉讓', '廣宣', '活動', '已售出', '售出', '票券', '請問', '轉錄', '買賣', '徵']

### 根據標題標籤分類旅遊、非旅遊相關的文章

In [24]:
travel_pattern = re.compile(r'\[' + '|'.join(travel_related_titles) + r'\]')
non_travel_pattern = re.compile(r'\[' + '|'.join(non_travel_related_titles) + r'\]')


file = "output/filtered_data.json"

classified_data = {
    'travel-related' : [],
    'non-travel-related' : []
}

with open(file, 'r', encoding='utf-8') as file:
    try:
        # 解析 JSON 數據
        data_list = json.load(file)

        for data in data_list:
            title = data['title']
            if re.search(travel_pattern, title):
                # 添加到旅遊列表
                classified_data['travel-related'].append(data)
            elif re.search(non_travel_pattern, title):
                # 添加到非旅遊列表
                classified_data['non-travel-related'].append(data)

    except json.JSONDecodeError:
        print(f"Error decoding JSON from file {file_path}")


travel_related_output_file = 'output/travel_related.json'
non_travel_related_output_file = 'output/non_travel_related.json'

write_json_data(travel_related_output_file, classified_data['travel-related'])
write_json_data(non_travel_related_output_file, classified_data['non-travel-related'])

len(classified_data['travel-related']), len(classified_data['non-travel-related'])

(39072, 14439)