In [None]:
"""
天鳳平台載下來的牌譜是一堆壓縮後的html檔案，這個cell先解壓縮
"""
import os
import gzip
import shutil
# Folder holding the downloaded .gz archives, and the folder that receives
# the decompressed files.
input_folder = 'E:/scraw2009/2009'
output_folder = 'E:/專題/data/2009/html'

# Create the destination folder if it is not there yet.
os.makedirs(output_folder, exist_ok=True)

# Only names ending in '.gz' are handled; every other entry is ignored.
gz_names = [name for name in os.listdir(input_folder) if name.endswith('.gz')]

for filename in gz_names:
    file_path = os.path.join(input_folder, filename)
    # The output keeps the archive's name minus the trailing '.gz'.
    output_file_path = os.path.join(output_folder, filename[:-len('.gz')])

    # Stream the decompressed bytes straight into the output file.
    with gzip.open(file_path, 'rb') as f_in, open(output_file_path, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

    print(f"已成功解壓縮 '{file_path}' 到 '{output_file_path}'")

In [None]:
"""
將html檔案直接轉成txt檔
"""
import os
from bs4 import BeautifulSoup

# Folder with the decompressed .html files and the folder for the .txt copies.
input_folder = 'E:/專題/data/2009/html'
output_folder = 'E:/專題/data/2009/txt'

# Create the destination folder if it is not there yet.
os.makedirs(output_folder, exist_ok=True)

for filename in os.listdir(input_folder):
    # Anything that is not an .html file is left alone.
    if not filename.endswith('.html'):
        continue

    # Same base name, '.txt' extension, placed in the output folder.
    html_path = os.path.join(input_folder, filename)
    stem, _ext = os.path.splitext(filename)
    txt_filename = stem + '.txt'
    txt_path = os.path.join(output_folder, txt_filename)

    # The content is copied unchanged: read as UTF-8 text, written as UTF-8 text.
    with open(html_path, 'r', encoding='utf-8') as html_file:
        html_content = html_file.read()

    with open(txt_path, 'w', encoding='utf-8') as txt_file:
        txt_file.write(html_content)

print('所有 HTML 文件已成功轉 TXT 文件。')

In [None]:
"""
刪除log後綴的檔案
"""
import os

# Folder to clean up and the substring that marks a file for deletion.
directory = 'E:/專題/data/2008/html'
keyword = 'log'

for filename in os.listdir(directory):
    # The keyword may appear anywhere in the name, not only as a suffix.
    if keyword not in filename:
        continue

    file_path = os.path.join(directory, filename)
    # Sub-directories whose name happens to match are never removed.
    if not os.path.isfile(file_path):
        continue

    os.remove(file_path)
    print(f"已成功刪除檔案 '{file_path}'")

In [None]:
"""
保留四人麻將牌譜
"""
import os
# Years to process. The original `range(2009, 2008)` was empty, so the cell
# never did anything. The range below matches the years the later '?log='
# replacement cell iterates over, since that cell reads the
# 'processed txt' folders created here.
# NOTE(review): adjust if only a subset of years has been downloaded.
YEARS = range(2009, 2023)

# A line is kept when it contains at least one of these markers.
FOUR_PLAYER_KEYWORDS = ('四鳳', '四技', '四般')


def filter_four_player_lines(lines, keywords=FOUR_PLAYER_KEYWORDS):
    """Return the lines that contain at least one of *keywords*.

    Args:
        lines: Iterable of text lines (a list or an open text file).
        keywords: Substrings to look for; one match is enough to keep a line.

    Returns:
        A list with the matching lines, in their original order and unmodified.
    """
    return [line for line in lines if any(word in line for word in keywords)]


for i in YEARS:
    # Per-year input and output folders.
    input_folder = f'E:/專題/data/{i}/txt'
    output_folder = f'E:/專題/data/{i}/processed txt'

    # Skip years that have no txt folder instead of crashing in os.listdir.
    # Checked before makedirs so no empty output folder is left behind.
    if not os.path.isdir(input_folder):
        print(f"跳過 {i} 年：找不到資料夾 '{input_folder}'")
        continue

    os.makedirs(output_folder, exist_ok=True)

    for filename in os.listdir(input_folder):
        if not filename.endswith('.txt'):
            continue

        # The filtered file keeps the same name in the output folder.
        txt_path = os.path.join(input_folder, filename)
        processed_path = os.path.join(output_folder, filename)

        with open(txt_path, 'r', encoding='utf-8') as txt_file:
            reserved_lines = filter_four_player_lines(txt_file)

        with open(processed_path, 'w', encoding='utf-8') as txt_file:
            txt_file.writelines(reserved_lines)

    print('所有包含指定词的行已被保留並保存到新的文件中。')

In [None]:
"""
根據網路上的github提供的方法將連結內的'?log='替換成'log/?'
github網址:https://github.com/NotoOotori/notoootori.github.io/blob/master/_posts/2020-07-28-%E5%A4%A9%E5%87%A4%E7%89%8C%E8%B0%B1%E9%87%87%E9%9B%86%E5%8F%8A%E5%88%86%E6%9E%90.md
"""
import os
# Substring to look for in each link and the text that takes its place.
old_text = '?log='
new_text = 'log/?'

for i in range(2009, 2023):
    # Folder with the filtered listings for this year.
    folder_path = f'E:/專題/data/{i}/processed txt'

    for filename in os.listdir(folder_path):
        # Only .txt files are rewritten.
        if not filename.endswith('.txt'):
            continue

        file_path = os.path.join(folder_path, filename)

        # Load the whole file, then overwrite it with the substituted text.
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        new_content = content.replace(old_text, new_text)

        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(new_content)

        print(f"已成功處理檔案: {filename}")

In [None]:
"""
將每一局牌譜存成一個txt
"""
from bs4 import BeautifulSoup
import re
import urllib.request
import gzip
import os
from urllib.error import HTTPError


# Request headers sent with every download. They never change, so they are
# built once instead of once per listing file.
HEADER = {
    'Host': 'e.mjv.jp',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:65.0) Gecko/20100101 Firefox/65.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive'
}

# A single opener is reused for all requests.
opener = urllib.request.build_opener()

for times in range(2022, 2011, -1):
    # Folder with the filtered listing files for this year.
    folder_path = f'E:/專題/data/{times}/processed txt'

    # NOTE(review): the original cell used `output_dir` without ever defining
    # it, which raises NameError. The location below is an assumption; point
    # it at the folder where the per-game files should be stored.
    output_dir = f'E:/專題/data/{times}/paipu'
    os.makedirs(output_dir, exist_ok=True)

    for filename in os.listdir(folder_path):
        # The original had an `else:` with no matching `if` here (a syntax
        # error). The guard is restored as the same '.txt' filter the other
        # cells use.
        if not filename.endswith('.txt'):
            continue
        file_path = os.path.join(folder_path, filename)

        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        # Every double-quoted string in the listing is treated as a URL.
        text_with_quotes = re.findall(r'"(.*?)"', content)
        for text in text_with_quotes:
            print(text)

        # Output files are numbered by successful download, so a skipped
        # (404) URL does not leave a gap in the numbering.
        i = 1
        for url in text_with_quotes:
            req = urllib.request.Request(url=url, headers=HEADER)
            try:
                # The `with` closes the HTTP response even if reading fails.
                with opener.open(req) as response:
                    raw = response.read()
            except HTTPError as e:
                if e.code == 404:
                    print(f"跳過：{url} 因為 404 Not Found")
                    continue  # missing game log: move on to the next URL
                print(f"HTTP 錯誤 {e.code}：無法獲取 {url}")
                break  # any other HTTP error: give up on this listing file

            # The body is always gzip-decompressed and decoded as UTF-8,
            # exactly as the original cell did.
            log_text = gzip.decompress(raw).decode('utf-8')

            file_name = os.path.join(output_dir, os.path.splitext(filename)[0] + f'-{i}.txt')
            i += 1
            with open(file_name, 'w', encoding='utf-8') as f:
                f.write(log_text)
            print(f"已成功將內容寫入 '{file_name}'")