In [25]:
import json
from pathlib import Path

import requests
from bs4 import BeautifulSoup

In [26]:
base_url = 'https://library.tarvalon.net'

# Index page that links to every per-book plot summary.
book_summaries_url = base_url + '/index.php?title=Book_Summaries'

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops us from parsing an HTTP error page as if it were the index.
summaries_page = requests.get(book_summaries_url, timeout=30)
summaries_page.raise_for_status()

# Name the parser explicitly: the generic 'html' feature lets BeautifulSoup pick
# whichever parser happens to be installed, so results could differ between
# environments (and from the per-book cell, which uses 'html.parser').
book_summary_soup = BeautifulSoup(summaries_page.text, 'html.parser')

# The first <ul> on the page is the list of long ("full") summaries.
# NOTE(review): a previously disabled variant read the second <ul> as the brief
# summaries; that path is not scraped here.
long_summaries = book_summary_soup.find('ul')
if long_summaries is None:
    raise RuntimeError(f'No <ul> of summary links found at {book_summaries_url}')

# Only anchors that actually carry an href can be turned into a URL.
full_summary_links_data = long_summaries.find_all('a', href=True)

# hrefs on the index page are site-relative, so prefix them with the site root.
book_summary_links = [base_url + link['href'] for link in full_summary_links_data]

print(book_summary_links)

['https://library.tarvalon.net/index.php?title=New_Spring:_Plot_Summary', 'https://library.tarvalon.net/index.php?title=The_Eye_of_the_World:_Plot_Summary', 'https://library.tarvalon.net/index.php?title=The_Great_Hunt:_Plot_Summary', 'https://library.tarvalon.net/index.php?title=The_Dragon_Reborn:_Plot_Summary', 'https://library.tarvalon.net/index.php?title=The_Shadow_Rising:_Plot_Summary', 'https://library.tarvalon.net/index.php?title=The_Fires_of_Heaven:_Plot_Summary', 'https://library.tarvalon.net/index.php?title=Lord_of_Chaos:_Plot_Summary', 'https://library.tarvalon.net/index.php?title=A_Crown_of_Swords:_Plot_Summary', 'https://library.tarvalon.net/index.php?title=The_Path_of_Daggers:_Plot_Summary', 'https://library.tarvalon.net/index.php?title=Winter%27s_Heart:_Plot_Summary', 'https://library.tarvalon.net/index.php?title=Crossroads_of_Twilight:_Plot_Summary', 'https://library.tarvalon.net/index.php?title=Knife_of_Dreams:_Plot_Summary', 'https://library.tarvalon.net/index.php?titl

In [27]:
def scrape_summary(soup):
    """Extract the title, author and sectioned plot summary from a parsed book page.

    The footer paragraphs, nested <div> elements and the author <i> tag are
    removed from ``soup`` in place, so the soup should not be reused afterwards.

    Args:
        soup: Parsed page (e.g. a BeautifulSoup object) for one plot-summary page.

    Returns:
        A ``(title, author, summaries)`` tuple. ``summaries`` is a list of
        ``{"section": str, "content": str}`` dicts in order of first appearance;
        text found before any header is collected under a "Summary" section.
        ``title`` and ``author`` fall back to ``"Unknown"`` when their tag is
        missing, and ``summaries`` is ``[]`` when the content div is missing.
    """
    # Extract title (fall back instead of crashing on an unexpected page layout)
    title_tag = soup.find('h1', class_='firstHeading')
    if title_tag is not None:
        title = title_tag.text.strip().replace(': Plot Summary', '')
    else:
        title = "Unknown"

    # Extract the main content div; without it there is nothing to scrape
    content_div = soup.find('div', class_='mw-content-ltr')
    if content_div is None:
        return title, "Unknown", []

    # Remove the last 3 <p> tags (they contain the footer)
    for footer_paragraph in content_div.find_all('p')[-3:]:
        footer_paragraph.extract()

    # Remove any nested <div> elements inside the main div (they contain an info box)
    for nested_div in content_div.find_all('div'):
        nested_div.extract()

    # Extract author: the first <i> in the content is taken as the author line
    author_tag = content_div.find('i')
    if author_tag is not None:
        author = author_tag.text.strip().replace('Author: ', '')
        author_tag.extract()  # Ensure the author tag is removed from the content
    else:
        author = "Unknown"  # Default if author tag is missing

    summaries = []       # ordered list of section entries (the returned value)
    section_index = {}   # section name -> entry in `summaries`, for O(1) lookup

    def _section(name):
        """Return the entry for section `name`, creating and registering it if new."""
        entry = section_index.get(name)
        if entry is None:
            entry = {"section": name, "content": ""}
            section_index[name] = entry
            summaries.append(entry)
        return entry

    current = None  # entry that body paragraphs are currently appended to

    for tag in content_div.find_all(['p', 'b', 'h2']):
        stripped_text = tag.get_text(strip=True)
        # Skip empty tags (e.g. a <p> holding only a <br>): they are neither
        # a usable header nor content.
        if not stripped_text:
            continue

        if tag.name == 'p':
            bold = tag.find('b')
            heading = bold.get_text(strip=True) if bold is not None else ''
            if heading:
                # A <p> containing a non-empty <b> is a section header.
                # NOTE(review): any other text in that paragraph is not captured.
                current = _section(heading)
            else:
                # Plain paragraph (or one whose <b> is empty): body text.
                if current is None:
                    # No header seen yet, so collect under a default section.
                    current = _section("Summary")
                current["content"] += tag.get_text(strip=False)
        else:
            # Standalone <b> or <h2> starts (or re-enters) a section. A <b> nested
            # in a header <p> is visited again here and resolves to the same entry.
            current = _section(stripped_text)

    return title, author, summaries


In [28]:
# Scrape every book page and collect one record per book.
data = []

for link in book_summary_links:
    # Fetch the page; time out rather than hang, and stop on an HTTP error so an
    # error page is never scraped as if it were a summary.
    page = requests.get(link, timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'html.parser')

    # Scrape the summary
    title, author_name, summary = scrape_summary(soup)

    # Append the structured data
    data.append({
        "title": title,
        "author": author_name,
        "summary": summary  # list of {"section", "content"} dicts
    })

# Write the data to a JSON file. Create the target directory first so a fresh
# checkout does not fail only after every page has already been downloaded.
output_path = Path('data/test/book_summaries.json')
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, 'w', encoding='utf-8') as json_file:
    json.dump(data, json_file, indent=4)