In [3]:
import requests
from bs4 import BeautifulSoup
import re
import csv
import os

In [4]:
def get_html(url, timeout=30):
    """Download a page and return its raw body.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds handed to ``requests`` as the request timeout.
            Without one, a server that stops responding would block the
            whole crawl indefinitely.

    Returns:
        ``response.content`` when the server answers with status 200,
        otherwise ``None`` (after printing the status code).
    """
    # Use the session as a context manager so its connections are released
    # when the call finishes instead of lingering for every page fetched.
    with requests.Session() as session:
        response = session.get(url, timeout=timeout)

        if response.status_code == 200:
            return response.content

        print(f'Failed to retrieve the page. Status code: {response.status_code}')
        return None


def parse_main_page(html, limit=200):
    """Extract distinct (title, link) pairs for the books on a listing page.

    Args:
        html: Markup of the listing page.
        limit: Maximum number of distinct books to return (default 200).

    Returns:
        A list of ``(title, url)`` tuples in page order, without exact
        duplicates, containing at most ``limit`` entries.
    """
    soup = BeautifulSoup(html, 'html.parser')

    book_data = []
    seen = set()  # constant-time duplicate check instead of scanning the list
    for item in soup.find_all('li'):
        if len(book_data) >= limit:
            break

        # Filter out non-book content: keep only <li> elements whose class
        # list is exactly ["pgdbetext"].
        if item.get("class") != ["pgdbetext"]:
            continue

        link_tag = item.find('a')
        if link_tag is None:
            continue

        href = link_tag.get('href')
        if href is None:
            # An anchor without a target cannot be crawled; skip it rather
            # than raising KeyError.
            continue

        entry = (link_tag.text.strip(), 'https://www.gutenberg.org' + href)
        if entry not in seen:  # Skip repeated links
            seen.add(entry)
            book_data.append(entry)

    return book_data

def save_to_csv(data, filename):
    """Write rows to ``filename`` as UTF-8 CSV beneath a ``Title,URL`` header.

    ``data`` is an iterable of rows (here ``(title, url)`` pairs). An
    existing file with the same name is overwritten.
    """
    header = ['Title', 'URL']
    with open(filename, mode='w', newline='', encoding='utf-8') as out_file:
        csv_out = csv.writer(out_file)
        csv_out.writerow(header)
        for record in data:
            csv_out.writerow(record)

def read_csv_and_get_links(filename):
    """Read the CSV written by the crawler and return the URL column.

    Args:
        filename: Path of a UTF-8 CSV file whose first row is a header and
            whose second column holds the URL.

    Returns:
        A list with the second field of every data row that has at least
        two fields. An empty file yields an empty list.
    """
    with open(filename, 'r', newline='', encoding='utf-8') as file:
        reader = csv.reader(file)
        # Skip the header row. The None default keeps a completely empty
        # file from leaking StopIteration out of this function.
        next(reader, None)
        # Rows with fewer than two fields (e.g. blank lines) carry no URL.
        return [row[1] for row in reader if len(row) > 1]

#--------Functions for extracting data from individual books -----------------------

def get_author(soup):
    """Return the author shown on a book page, or "" when none is listed.

    Looks up the first ``<a itemprop="creator">`` element in ``soup`` and
    returns its stripped text. When no such element exists a notice is
    printed and the empty string is returned.
    """
    creator_tag = soup.find('a', itemprop="creator")
    # Test for the missing element explicitly; the previous bare ``except``
    # also swallowed KeyboardInterrupt and any unrelated failure.
    if creator_tag is None:
        print("No author found")
        return ""
    return creator_tag.text.strip()
    
def get_title(soup):
    """Return the title shown on a book page, or "" when none is found.

    Looks up the first ``<td itemprop="headline">`` element in ``soup`` and
    returns its stripped text. When no such element exists a notice is
    printed and the empty string is returned.
    """
    headline_tag = soup.find('td', itemprop="headline")
    # Test for the missing element explicitly; the previous bare ``except``
    # also swallowed KeyboardInterrupt and any unrelated failure.
    if headline_tag is None:
        print("Did not find title")
        return ""
    return headline_tag.text.strip()

def get_pub_date(soup):
    """Return the publish date shown on a book page, or "" when absent.

    Looks up the first ``<td itemprop="datePublished">`` element in ``soup``
    and returns its stripped text. When no such element exists a notice is
    printed and the empty string is returned.
    """
    date_tag = soup.find("td", itemprop="datePublished")
    # Test for the missing element explicitly; the previous bare ``except``
    # also swallowed KeyboardInterrupt and any unrelated failure.
    if date_tag is None:
        print("No date found")
        return ""
    return date_tag.text.strip()

def extract_book_content(content_url):
    """Download the web version of an e-book and return its body text.

    The body is taken to be every text node that follows the parent of the
    "START OF THE PROJECT GUTENBERG EBOOK" marker span, up to the first text
    node containing the "END OF THE PROJECT GUTENBERG EBOOK" marker text.

    Args:
        content_url: Address of the book's read-online page.

    Returns:
        The non-blank text nodes, stripped and joined with newlines, or
        ``None`` when the page could not be downloaded or either marker is
        missing.
    """
    content_html = get_html(content_url)
    if content_html is None:
        # get_html has already reported the failed request; there is
        # nothing to parse.
        return None

    # Name the parser explicitly (the same one parse_main_page uses) so the
    # result does not depend on which optional parsers are installed.
    soup = BeautifulSoup(content_html, 'html.parser')

    def find_marker(phrase):
        # First <span> whose string contains the given marker phrase.
        return soup.find("span", string=lambda text: text and phrase in text)

    start_marker = find_marker("START OF THE PROJECT GUTENBERG EBOOK")
    end_marker = find_marker("END OF THE PROJECT GUTENBERG EBOOK")

    if start_marker is None or end_marker is None:
        print("Could not find the book markers in the text.")
        return None

    # Marker texts do not change while walking the document; read them once.
    start_text = start_marker.text
    end_text = end_marker.text

    content = []
    for element in start_marker.find_parent().find_all_next(string=True):
        if start_text in element:
            # The start marker itself is not part of the book.
            continue
        if end_text in element:
            break
        # Filter out whitespace and empty strings
        stripped = element.strip()
        if stripped:
            content.append(stripped)

    return "\n".join(content)

def get_book_data(book_url):
    """Collect the metadata and full text of one book.

    Args:
        book_url: Address of the book's landing page.

    Returns:
        A ``(book_info, book_content)`` tuple. ``book_info`` is a dict with
        the keys ``"title"``, ``"author"`` and ``"date"``. ``book_content``
        is the book text as a string, or ``None`` when the page has no
        "Read online" link or the text could not be extracted.
    """
    book_html = get_html(book_url)
    # NOTE(review): get_html returns None on a non-200 response; that value
    # is passed to BeautifulSoup unchanged, as before - presumably it raises.
    # Name the parser explicitly (the same one parse_main_page uses) so the
    # result does not depend on which optional parsers are installed.
    soup = BeautifulSoup(book_html, 'html.parser')

    # Get basic info (title, author, publish date)
    book_info = {
        "title": get_title(soup),
        "author": get_author(soup),
        "date": get_pub_date(soup),
    }

    # Get read online url
    read_link = soup.find("a", title="Read online")
    href = read_link.get("href") if read_link is not None else None
    if href is None:
        # Report the missing link instead of failing on None; a None
        # content is the same signal extract_book_content gives on failure.
        print("No read-online link found")
        return book_info, None

    # Extract content of book
    book_content = extract_book_content("https://www.gutenberg.org" + href)

    return book_info, book_content

def save_to_txt(book_info, book_content, output_dir='top_200_books'):
    """Write one book to a UTF-8 text file named after its title.

    The file starts with a three-line header (title, author, date) followed
    by a blank line and the book text. If a file with the same title already
    exists, ``_1``, ``_2``, ... is appended to the name (e.g. 比目魚_1.txt).
    The path of the written file is printed.

    Args:
        book_info: Dict with the keys ``"title"``, ``"author"``, ``"date"``.
        book_content: The book text as a string.
        output_dir: Folder to write into; created if missing. Defaults to
            ``"top_200_books"``.

    Raises:
        TypeError: If ``book_content`` is ``None`` (no text was extracted).
    """
    title = book_info["title"].replace("/", ",")  # If '/' exists in book title, replace with ','

    # Format the book info
    info = (f"書名: {title}\n"
            f"作者: {book_info['author']}\n"
            f"時間: {book_info['date']}\n\n")

    if book_content is None:
        # Fail before touching the filesystem, with a message that names the
        # real problem rather than a generic str-concatenation error.
        raise TypeError("book_content is None; there is no text to save")

    full_text = info + book_content

    # exist_ok avoids the separate exists() check and the window between
    # checking and creating.
    os.makedirs(output_dir, exist_ok=True)

    file_name = f'{output_dir}/{title}.txt'
    duplicate_num = 0

    while os.path.exists(file_name):  # Check if title already exists
        duplicate_num += 1
        file_name = f'{output_dir}/{title}_{duplicate_num}.txt'  # Alternative title for duplicates

    with open(file_name, 'w', encoding='utf-8') as f:
        f.write(full_text)

    print(file_name)

In [10]:
if __name__ == '__main__':

    # Crawl book links for the first 200 books
    main_page_url = "https://www.gutenberg.org/browse/languages/zh"
    main_page_html = get_html(main_page_url)

    top_200_data = parse_main_page(main_page_html)

    save_to_csv(top_200_data, "top_200_links.csv")

    # Retrieve the 200 links crawled
    links = read_csv_and_get_links("top_200_links.csv")

    # Crawl each book and save to txt file; links that fail are collected in
    # problem_links so they can be inspected or retried afterwards.
    problem_links = []
    for link_index, link in enumerate(links):
        try:
            info, content = get_book_data(link)
            save_to_txt(info, content)
        except Exception as err:
            # Catch Exception rather than everything so Ctrl+C can still
            # stop the crawl, and report why the book failed.
            print(f"Problem with {link_index}: {link} ({err!r})")
            problem_links.append(link)

top_200_books/豆棚閒話.txt
top_200_books/戲中戲.txt
top_200_books/比目魚.txt
top_200_books/比目魚_1.txt
top_200_books/Study of Inner Cultivation.txt
top_200_books/三字經.txt
top_200_books/山水情.txt
top_200_books/山海經.txt
top_200_books/施公案.txt
top_200_books/施公案_1.txt
top_200_books/易經.txt
top_200_books/木蘭奇女傳.txt
top_200_books/海公案.txt
top_200_books/燕丹子.txt
top_200_books/狄公案.txt
top_200_books/百家姓.txt
top_200_books/禮記.txt
top_200_books/綠牡丹.txt
top_200_books/詩經.txt
top_200_books/麟兒報.txt
top_200_books/Hu Die MeiYuan Yang Meng.txt
top_200_books/Qing Lou MengQi Hong Xiao Shi.txt
top_200_books/天豹圖.txt
top_200_books/梁公九諫.txt
top_200_books/長恨歌.txt
top_200_books/李娃傳.txt
top_200_books/玉樓春.txt
top_200_books/漢書.txt
top_200_books/引鳳蕭.txt
top_200_books/今古奇觀.txt
top_200_books/後西游記.txt
top_200_books/飛跎全傳.txt
top_200_books/佛說四十二章經.txt
top_200_books/紅樓夢.txt
top_200_books/洛神賦.txt
top_200_books/晁氏儒言 一卷.txt
top_200_books/水滸後傳.txt
top_200_books/幼學瓊林.txt
top_200_books/治世餘聞.txt
top_200_books/琵琶記.txt
top_200_books/雪月梅傳.txt
top_200_b