In [2]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

'''
Collect all the links that lead to the text content of the books.
By default links are followed two levels deep (the start page and the pages it links to);
only links that start with 'http://www.authorama.com/' are kept.
'''
def get_all_links(url, depth=0, max_depth=1):
    """Collect the unique on-site links reachable from ``url``.

    Pages are fetched level by level: ``url`` counts as level ``depth`` and
    newly discovered links are followed until level ``max_depth`` has been
    fetched, so the defaults fetch the start page and every page it links to.
    Each page is requested at most once.

    Args:
        url: Page the crawl starts from.
        depth: Level assigned to ``url``.
        max_depth: Deepest level whose pages are still fetched.

    Returns:
        A list of absolute URLs starting with 'http://www.authorama.com/',
        each listed once, in the order they were first discovered. Empty
        when ``depth`` is greater than ``max_depth``.

    Raises:
        requests.RequestException: If a page cannot be retrieved (including
            a timeout).
    """
    if depth > max_depth:
        return []

    site_prefix = 'http://www.authorama.com/'
    found = {}       # dict used as an insertion-ordered set of collected links
    fetched = set()  # pages already requested, so none is downloaded twice
    frontier = [url]
    level = depth

    while level <= max_depth and frontier:
        next_frontier = []
        for page_url in frontier:
            if page_url in fetched:
                continue
            fetched.add(page_url)

            # A timeout keeps one unresponsive page from hanging the crawl.
            response = requests.get(page_url, timeout=30)
            soup = BeautifulSoup(response.text, 'html.parser')

            for anchor in soup.find_all('a'):
                href = anchor.get('href')
                if not href:
                    # <a> without an href: urljoin() would hand back the
                    # page's own URL, which is not a real link.
                    continue
                absolute_url = urljoin(page_url, href)
                if not absolute_url.startswith(site_prefix):
                    continue
                if absolute_url in found:
                    continue
                found[absolute_url] = None
                next_frontier.append(absolute_url)
        frontier = next_frontier
        level += 1

    return list(found)

# Root page of the site; the crawl starts here.
url = 'http://www.authorama.com/'
# Runs the crawl immediately (network access) with the default depth settings.
# The resulting list is kept at module level and iterated later to download each page.
book_links = get_all_links(url)

In [5]:
import json
import os
import re

def extract_chapter_number(url):
    """Return the chapter token taken from the last path component of ``url``.

    The token is the text between the final '-' of the file name and the
    first '.' that follows it, e.g. '.../some-book-3.html' gives '3'.
    The value is returned as a string and is not checked to be numeric.
    """
    file_name = os.path.basename(url)
    # Everything after the last dash (the whole name when there is no dash).
    _, _, after_dash = file_name.rpartition('-')
    # Cut at the first dot to drop the extension.
    token, _, _ = after_dash.partition('.')
    return token

def get_book_details(book_url):
    """Download one page and extract its title, author, chapter and text.

    The page's <title> is expected to look like 'Some Title (by Some Author)'.
    When it does not match that pattern the whole title text is used as the
    title and the author is "Unknown"; when the page has no usable title text
    both are "Unknown".

    Args:
        book_url: URL of the page to download.

    Returns:
        A dict with the keys 'title', 'author', 'chapter' (the token returned
        by ``extract_chapter_number`` for ``book_url``) and 'content' (a list
        with the text of every <p> element, in document order).

    Raises:
        requests.RequestException: If the page cannot be retrieved (including
            a timeout).
    """
    # A timeout keeps one unresponsive page from hanging the whole run.
    response = requests.get(book_url, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')

    title = "Unknown"
    author = "Unknown"

    # soup.title is None when the page has no <title>; .string is None when
    # the tag is empty or contains nested markup. Both cases fall back to
    # "Unknown" instead of passing None to re.search().
    title_tag = soup.title
    title_author = title_tag.string if title_tag is not None else None
    if title_author:
        match = re.search(r'(.*)\s\(by\s(.*)\)', title_author)
        if match:
            title = match.group(1).strip()
            author = match.group(2).strip()
        else:
            title = title_author

    chapter = extract_chapter_number(book_url)
    content = [paragraph.text for paragraph in soup.find_all('p')]

    return {
        'title': title,
        'author': author,
        'chapter': chapter,
        'content': content,
    }

def save_book_as_json(book_details, output_dir):
    """Write ``book_details`` to '<title>_chapter_<chapter>.json' in ``output_dir``.

    Spaces in the title become underscores. Path separators and NUL
    characters in the resulting name are also replaced with underscores,
    because the title comes from scraped page content and must not be able to
    point at another directory. ``output_dir`` is created when it is missing.

    Args:
        book_details: Dict with at least the string entries 'title' and
            'chapter'; the whole dict is serialized.
        output_dir: Directory that receives the JSON file.
    """
    stem = book_details['title'].replace(' ', '_') + '_chapter_' + book_details['chapter']
    # Keep the file inside output_dir regardless of what the title contains.
    stem = stem.translate(str.maketrans('/\\\x00', '___'))

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    filename = os.path.join(output_dir, stem + '.json')

    with open(filename, 'w', encoding='utf-8') as file:
        json.dump(book_details, file, ensure_ascii=False, indent=4)

# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
output_dir = '/Users/chenxiliu/Documents/bd23spchenxi/bdfinalproject/crawl_text'
# Drop repeated URLs (keeping first-seen order) so each page is downloaded
# and written only once.
for book_link in dict.fromkeys(book_links):
    book_details = get_book_details(book_link)
    save_book_as_json(book_details, output_dir)
