In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import json

# URLs the crawler is allowed to visit; crawl_website skips any link that
# does not match one of these entries.
allowed_url_parts = [
    'https://web.ubot.com.tw/UB/2023stocksandETF/',
    'https://www.ubot.com.tw/',
    'https://card.ubot.com.tw/',
    'https://cardweb.ubot.com.tw/',
    'https://mybank.ubot.com.tw/',
]

def get_page_content(url, timeout=10):
    """Fetch *url* with an HTTP GET and return the body as text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` waits indefinitely, so a single
            unresponsive host could stall the whole crawl.

    Returns:
        The decoded response body, or ``None`` when the request fails
        (connection error, timeout, or an unsuccessful HTTP status).
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Turn 4xx/5xx answers into an HTTPError so they are handled below.
        response.raise_for_status()
        # Use the encoding detected from the body rather than the one
        # derived from the headers.
        response.encoding = response.apparent_encoding
        return response.text
    except requests.RequestException as e:
        # Best effort: report the failure and let the caller skip this page.
        print(f"Error during request to {url}: {e}")
        return None

def parse_page_content(content, url):
    """Extract the title and meta description from an HTML document.

    Args:
        content: HTML markup of the page; a falsy value means there is
            nothing to parse.
        url: Address the markup was fetched from, echoed into the result.

    Returns:
        A dict with the keys ``'url'``, ``'title'`` and ``'description'``,
        or ``None`` when *content* is falsy. ``'No Title'`` and
        ``'No Description'`` stand in for a missing ``<title>`` element or
        ``<meta name="description">`` tag.
    """
    if not content:
        return None

    document = BeautifulSoup(content, 'html.parser')

    title_tag = document.title
    if title_tag:
        title = title_tag.text
    else:
        title = 'No Title'

    meta = document.find('meta', attrs={'name': 'description'})
    if meta:
        description = meta.get('content')
    else:
        description = 'No Description'

    return {'url': url, 'title': title, 'description': description}

def crawl_website(url, visited_urls=None):
    """Crawl *url* and every allowed page reachable from it.

    Each successfully fetched page is printed, passed to ``record_to_json``
    and added to *visited_urls*. Pages are processed depth-first in
    document order, using an explicit stack instead of recursion so that
    long link chains cannot exhaust the interpreter's recursion limit.

    Args:
        url: Address at which to start crawling.
        visited_urls: Optional set of URLs that have already been crawled.
            It is updated in place, so a caller can pass the same set to
            several calls to avoid re-crawling. When omitted, a fresh set
            is created for this call (a ``set()`` default would be shared
            between unrelated calls).

    Returns:
        None.
    """
    from urllib.parse import urldefrag

    if visited_urls is None:
        visited_urls = set()

    # URLs whose fetch failed during this call; remembered so that a broken
    # link referenced from many pages is only requested once.
    failed_urls = set()

    pending = [url]
    while pending:
        # "page#top" and "page" are the same document: drop the fragment so
        # it is fetched and recorded only once.
        current, _fragment = urldefrag(pending.pop())

        if current in visited_urls or current in failed_urls:
            continue

        # Prefix match rather than substring match: an off-site URL that
        # merely embeds an allowed URL (e.g. a share link carrying it in
        # the query string) must not be crawled.
        if not current.startswith(tuple(allowed_url_parts)):
            continue

        content = get_page_content(current)
        if not content:
            failed_urls.add(current)
            continue

        page_info = parse_page_content(content, current)
        print(f"URL: {page_info['url']}")
        print(f"Title: {page_info['title']}")
        print(f"Description: {page_info['description']}")

        # Persist the title, description and URL of this page.
        record_to_json(page_info)

        visited_urls.add(current)

        # Queue the links found on this page.
        soup = BeautifulSoup(content, 'html.parser')
        next_urls = [
            urljoin(current, link['href'])
            for link in soup.find_all('a', href=True)
        ]
        # Push in reverse so the first link on the page is popped first,
        # which keeps the visiting order of the recursive formulation.
        for next_url in reversed(next_urls):
            if next_url.startswith('http') and next_url not in visited_urls:
                pending.append(next_url)

def record_to_json(data):
    """Append *data* to the crawl output file as a single JSON line.

    The file is opened in append mode with UTF-8 encoding, and non-ASCII
    characters are written as-is instead of being escaped.

    Args:
        data: JSON-serialisable object to store.
    """
    output_path = '聯邦.2023stocksandETF.json'
    with open(output_path, mode='a', encoding='utf-8') as sink:
        json.dump(data, sink, ensure_ascii=False)
        # The trailing newline keeps every record on its own line.
        sink.write('\n')


# Start the crawl from the 2023stocksandETF section of web.ubot.com.tw.
# NOTE: this call runs as soon as the file (or notebook cell) is executed.

crawl_website('https://web.ubot.com.tw/UB/2023stocksandETF/')

