In [1]:
# Source locations for the four poems processed in this notebook.
patience_url = "https://rpo.library.utoronto.ca/content/patience"
sir_gawain_url = "https://rpo.library.utoronto.ca/content/sir-gawain-and-green-knight"
cleanness_url = "https://rpo.library.utoronto.ca/content/cleanness"
# Pearl is read from a local file rather than fetched over HTTP: download the
# page listed below by hand and save it as "pearl.html" in the working
# directory before running the Pearl section.
pearl_url = "pearl.html"
# Online source for Pearl:
#   https://quod.lib.umich.edu/c/cme/Pearl?rgn=main;view=fulltext

In [2]:
import requests
from bs4 import BeautifulSoup

def url_to_soup(url, timeout=30):
    """Download ``url`` and parse the response body as HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled server.

    Returns:
        A ``BeautifulSoup`` tree built from the response body with the
        built-in ``html.parser``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    page = requests.get(url, timeout=timeout)
    # Fail loudly on an error response instead of silently parsing an error
    # page, which would otherwise surface later as an empty stanza list.
    page.raise_for_status()
    return BeautifulSoup(page.content, "html.parser")

# Pearl

In [3]:
# Read the manually downloaded Pearl page. The path comes from `pearl_url`
# so the file location is configured in exactly one place.
with open(pearl_url, "r", encoding="utf-8") as pearl_file:
    pearl_html = pearl_file.read()

# Parse once the file has been closed; only the text is needed from here on.
pearl_soup = BeautifulSoup(pearl_html, "html.parser")

In [4]:
# One string per Pearl stanza: concatenate the text of every <span class="line">
# inside each stanza <div>, then collapse doubled newlines into single ones.
pearl_stanzas = [
    "".join(
        line_span.text for line_span in stanza_div.find_all("span", "line")
    ).replace("\n\n", "\n")
    for stanza_div in pearl_soup.find_all('div', "lg lg--stanza")
]

# Gawain

In [5]:
# Fetch and parse the Sir Gawain page (network request).
sir_gawain_soup = url_to_soup(sir_gawain_url)

In [6]:
def soup_to_stanzas(soup):
    """Group the poem lines found in ``soup`` into stanza strings.

    Every ``<div class="poemline">`` is visited in document order. A div that
    also carries the ``stanza`` class opens a new stanza; the text of each
    div's ``<span class="line-text">`` is stripped and added to the stanza
    currently being built. Lines without such a span, lines that are blank
    after stripping, and lines for which ``find('nonum')`` returns a match
    are skipped.

    Args:
        soup: Parsed page exposing the BeautifulSoup ``find_all`` API.

    Returns:
        A list of strings, one per non-empty stanza, with the stanza's lines
        joined by newlines.
    """
    # The last inner list is always the stanza under construction.
    grouped_lines = [[]]
    for poemline in soup.find_all('div', 'poemline'):
        opens_stanza = 'stanza' in poemline.get('class', [])
        # Only start a fresh group when the current one already holds lines,
        # so consecutive stanza markers never produce empty stanzas.
        if opens_stanza and grouped_lines[-1]:
            grouped_lines.append([])

        text_span = poemline.find('span', 'line-text')
        # NOTE(review): find('nonum') searches for a tag *named* "nonum";
        # confirm whether a class lookup was intended.
        if not text_span or text_span.find('nonum'):
            continue

        stripped = text_span.text.strip()
        if stripped:
            grouped_lines[-1].append(stripped)

    # Drop the (possibly empty) trailing group and join the rest.
    return ['\n'.join(group) for group in grouped_lines if group]

In [7]:
def url_to_stanzas(url):
    """Fetch the page at ``url`` and return its stanzas as a list of strings.

    Convenience wrapper that chains ``url_to_soup`` and ``soup_to_stanzas``.
    """
    return soup_to_stanzas(url_to_soup(url))

In [8]:
# Split the already-downloaded Gawain page into stanza strings.
gawain_stanzas = soup_to_stanzas(sir_gawain_soup)

# Cleanness

In [9]:
import numpy as np

In [10]:
# Fetch the Cleanness page and split it into stanzas as marked up on the page
# (network request).
cleanness_stanzas = url_to_stanzas(cleanness_url)

In [11]:
def poem_chunker(poem, num_chunks):
    """Re-divide a poem into ``num_chunks`` consecutive chunks of lines.

    The stanzas are flattened into one list of lines, which is then cut into
    ``num_chunks`` contiguous pieces whose sizes differ by at most one line.
    The first ``len(lines) % num_chunks`` pieces receive the extra line,
    which is the same sizing rule ``numpy.array_split`` uses.

    Args:
        poem: Sequence of stanza strings, each with lines separated by "\n".
        num_chunks: Number of chunks to produce; must be positive.

    Returns:
        A list of exactly ``num_chunks`` strings, each a chunk's lines joined
        by "\n". Chunks are empty strings when there are fewer lines than
        chunks.

    Raises:
        ValueError: If ``num_chunks`` is not positive.
    """
    num_chunks = int(num_chunks)
    if num_chunks <= 0:
        raise ValueError("num_chunks must be a positive integer")

    # Flatten stanzas into individual lines in a single pass; building the
    # list with sum(..., []) re-copies the accumulator for every stanza.
    poem_lines = [line for stanza in poem for line in stanza.split("\n")]

    # Slice plain Python lists so the text never has to be converted to and
    # from a fixed-width NumPy string array.
    base_size, extra = divmod(len(poem_lines), num_chunks)
    poem_chunks = []
    start = 0
    for chunk_index in range(num_chunks):
        stop = start + base_size + (1 if chunk_index < extra else 0)
        poem_chunks.append("\n".join(poem_lines[start:stop]))
        start = stop
    return poem_chunks

In [12]:
# Replace the scraped stanza list with 180 near-equal chunks of lines.
# NOTE(review): 180 is a magic number — presumably picked to give
# stanza-sized chunks; confirm and consider naming it as a constant.
cleanness_stanzas = poem_chunker(cleanness_stanzas, 180)

# Patience

In [13]:
# Fetch the Patience page and split it into stanzas as marked up on the page
# (network request).
patience_stanzas = url_to_stanzas(patience_url)

In [14]:
# Replace the scraped stanza list with 53 near-equal chunks of lines.
# NOTE(review): 53 is a magic number — presumably picked to give
# stanza-sized chunks; confirm and consider naming it as a constant.
patience_stanzas = poem_chunker(patience_stanzas, 53)

In [15]:
# Combine all four poems into one list, in the order
# Pearl, Sir Gawain, Cleanness, Patience.
pearl_ms_stanzas = pearl_stanzas + gawain_stanzas + cleanness_stanzas + patience_stanzas

In [16]:
# Sanity check: total number of stanzas/chunks across the four poems.
len(pearl_ms_stanzas)

465

## JSON conversion

In [17]:
import json

In [18]:
def stanzas_to_jsons(stanzas, json_output_path="stanzas.json"):
    """Write the stanzas to a JSON file and return the written mapping.

    The mapping has two keys, "user" and "assistant", and both refer to the
    same ``stanzas`` list that was passed in.

    Args:
        stanzas: List of stanza strings to serialise.
        json_output_path: Destination file; created or overwritten.

    Returns:
        The dictionary that was serialised to ``json_output_path``.
    """
    stanza_dictionary = {
        "user": stanzas,
        "assistant": stanzas
    }
    with open(json_output_path, "w", encoding="utf-8") as json_output_file:
        # The file is UTF-8, so keep non-ASCII letters as real characters
        # instead of \uXXXX escapes; the parsed data is identical either way.
        json.dump(stanza_dictionary, json_output_file, ensure_ascii=False)
    return stanza_dictionary

In [20]:
# Write the combined stanzas to the default "stanzas.json" and keep the
# resulting mapping in memory.
stanza_dictionary = stanzas_to_jsons(pearl_ms_stanzas)