# In [10]:
import os
import requests
from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError
import hashlib


def get_filename_from_url(url):
    """Return the MD5 hex digest of ``url``.

    The URL string is encoded to bytes with the default ``str.encode()``
    codec and hashed; the result is the 32-character lowercase hexadecimal
    digest, which callers use as a file-name stem.
    """
    return hashlib.md5(url.encode()).hexdigest()


def download_html(url, save_dir, *, timeout=None):
    """Fetch ``url`` and save the response body as an HTML file in ``save_dir``.

    The file is named ``<digest>.html``, where the digest comes from
    ``get_filename_from_url`` applied to the URL with every ``/`` replaced
    by ``_``. The decoded response text is written as UTF-8, overwriting any
    file that already has that name.

    Args:
        url: Address of the page to fetch.
        save_dir: Directory to write the file into; it is not created here.
        timeout: Optional timeout in seconds forwarded to ``requests.get``.
            The default ``None`` keeps the previous behaviour of waiting
            without a limit; pass a number to avoid hanging on a stalled
            server.
    """
    response = requests.get(url, timeout=timeout)
    # The replace() is kept so file names stay identical to those produced
    # by earlier runs of this script.
    stem = get_filename_from_url(url.replace('/', '_'))
    filename = os.path.join(save_dir, stem + '.html')
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(response.text)


def get_links(url):
    """Return the ``href`` values on the page at ``url`` that start with ``http``.

    The page is fetched with ``requests.get`` and parsed with the
    ``html.parser`` backend. Only ``<a>`` tags that carry an ``href``
    attribute are considered; values are returned as written in the page,
    in document order.
    """
    page = requests.get(url)
    document = BeautifulSoup(page.text, 'html.parser')
    hrefs = []
    for anchor in document.find_all('a', href=True):
        target = anchor['href']
        if target.startswith('http'):
            hrefs.append(target)
    return hrefs

# Seed pages from which the crawl starts.
initial_urls = [
    'https://wiki.xn--rckteqa2e.com/wiki/%E3%83%9D%E3%82%B1%E3%83%A2%E3%83%B3%E4%B8%80%E8%A6%A7',
]

# Directory that receives the downloaded HTML files; created if missing.
save_dir = 'downloaded_html_files'
os.makedirs(save_dir, exist_ok=True)


# In [11]:
from urllib.parse import urljoin

def get_links(url, prefix="https://wiki.xn--rckteqa2e.com/wiki"):
    """Return absolute link targets on the page at ``url`` that start with ``prefix``.

    The page is fetched with ``requests.get`` and parsed with the ``lxml``
    backend. Each ``href`` is resolved against ``url`` with ``urljoin`` so
    relative links become absolute before filtering. Results are in document
    order and may contain duplicates.

    Args:
        url: Address of the page whose links are collected.
        prefix: Only resolved links starting with this string are kept.
            Defaults to the wiki article root this script crawls.

    Returns:
        A list of absolute URL strings.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')
    # href=True skips anchors that have no href attribute. Without it,
    # urljoin(url, None) returns ``url`` itself and the page would be
    # reported as linking to itself.
    resolved = (urljoin(url, a['href']) for a in soup.find_all('a', href=True))
    # Keep only links that live under the requested prefix.
    return [link for link in resolved if link.startswith(prefix)]

# Crawl one level deep: save each seed page, then every in-scope page it
# links to. Failures are reported and skipped so one bad URL does not stop
# the run.
#
# requests.exceptions.RequestException is the base class of the errors
# requests raises (connection errors, timeouts, redirect loops, invalid
# URLs, ...). Catching only ConnectionError let any of the others abort the
# whole crawl part-way through.
for initial_url in initial_urls:
    try:
        # Download the HTML of the seed page itself.
        download_html(initial_url, save_dir)
    except requests.exceptions.RequestException:
        print(f"Failed to connect to {initial_url}. Skipping...")
        continue

    try:
        # Collect the links found on the seed page.
        linked_urls = get_links(initial_url)
    except requests.exceptions.RequestException:
        print(f"Failed to get links from {initial_url}. Skipping...")
        continue

    # Drop duplicates; sorting makes the download order reproducible.
    linked_urls = sorted(set(linked_urls))

    # Download the HTML of every linked page.
    for url in linked_urls:
        try:
            download_html(url, save_dir)
        except requests.exceptions.RequestException:
            print(f"Failed to download HTML from {url}. Skipping...")