In [9]:
import csv
import re
import time
from urllib.parse import urljoin, urlparse
import requests
from bs4 import BeautifulSoup
import pandas as pd


In [3]:
url = "https://www.allmusicals.com/e/epicthemusical.htm"
html = requests.get(url).text
soup = BeautifulSoup(html, "html.parser")

In [None]:
""" 
# ðŸ”¹ Step 1: find the section containing all the song links
lyric_section = soup.find(class_="lyrics-list")

ol = lyric_section.find("ol") # where the actual links are

# ðŸ”¹ Step 2: extract <a> tags within that section
song_links = []
current_act = None

# Iterate over direct <li> children of <ol>
for li in ol.find_all("li", recursive = False):
    classes = li.get("class", [])

    # Act header like <li class="act"><strong><span>Act I</span></strong></li>
    if "act" in classes:
        current_act = li.get_text(" ", strip=True) or None
        print(current_act)
        continue

    for a in lyric_section.find_all("a", href=True):
        href = a["href"].strip()
        abs_url = urljoin(url, href)
        title = a.get_text(strip=True)
        # Optional: only keep Epic: The Musical song pages
        if re.search(r"/epicthemusical/.+\.htm$", abs_url, re.I):
            song_links.append((title, abs_url, current_act))

# ðŸ”¹ Step 3: remove duplicates while preserving order
seen = set()
unique_links = []
for title, link, act in song_links:
    if link not in seen:
        unique_links.append((title, link, act))
        seen.add(link)

# ðŸ”¹ Step 4: print or save the result
# print(f"Found {len(unique_links)} songs:")
#for t, u, a in unique_links:
    #print(f"- {a} {t} -> {u}") """

In [90]:
# ðŸ”¹ Step 1: find the section containing all the song links
lyric_section = soup.find("section", class_="lyrics-list")
ol = lyric_section.find("ol")  # where the actual links are

# ðŸ”¹ Step 2: extract links, tracking the current act label
song_links = []
current_act = None

for li in ol.find_all("li", recursive=False):
    classes = li.get("class", [])

    # Act header like <li class="act">Act I</li>
    if "act" in classes:
        current_act = li.get_text(" ", strip=True) or None
        # print(current_act)
        continue

    # âœ… FIXED: search anchors inside THIS li
    for a in li.find_all("a", href=True):
        href = a["href"].strip()
        abs_url = urljoin(url, href)
        title = a.get_text(strip=True)
        if re.search(r"/epicthemusical/.+\.htm$", abs_url, re.I):
            song_links.append((title, abs_url, current_act))

# ðŸ”¹ De-dup
seen = set()
unique_links = []
for title, link, act in song_links:
    key = link.lower()
    if key not in seen:
        unique_links.append((title, link, act))
        seen.add(key)


## Getting Song Lines

In [91]:
SPEAKER_LINE = re.compile(r'^\s*\[(.+?)\]\s*$')   # e.g., [ODYSSEUS, CREW]

def split_speakers(s: str):
    """
    Turn 'ODYSSEUS, CREW & NARRATOR' into ['ODYSSEUS','CREW','NARRATOR'].
    """
    # normalize separators
    s = re.sub(r'\s*(?:,|&|and|/|\+)\s*', ',', s, flags=re.I)
    parts = [p.strip() for p in s.split(',') if p.strip()]
    return parts or ["UNKNOWN"]

def parse_song_page(title: str, url: str, act: str) -> pd.DataFrame:
    """
    Return a DataFrame with columns: song, speaker, line
    (one row per speaker per line).
    """
    r = requests.get(url, timeout=20)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    # ---- locate the content ----
    page = soup.find(id="page")
    if not page:
        raise ValueError("Could not find <div id='page'> on this page.")
    
    # song title (prefer the printed h2; fall back to <title>)
    h2 = page.find("h2")
    song_title = title;
    #song_title = (h2.get_text(strip=True) if h2
    #              else (soup.title.get_text(strip=True) if soup.title else "Unknown Song"))

    # Get the page text with explicit line breaks at <br>
    text = page.get_text(separator="\n", strip=True)

    rows = []
    current_speakers = ["UNKNOWN"]
    prev_speaker_groups = []   # <- keep a list of past speaker lists
    for raw in text.splitlines():
        line = raw.strip()

        # ignore boilerplate/metadata lines
        if not line or line.lower().startswith("last update"):
            continue

        # speaker header like [ODYSSEUS, CREW]
        m = SPEAKER_LINE.match(line)
        if m:
            speaker_text = m.group(1).strip()
            # Handle special [BOTH] keyword
            if speaker_text.lower() == "both":
                flat_prev = [s for grp in prev_speaker_groups[-2:] for s in grp]
                current_speakers = list(dict.fromkeys(flat_prev)) or ["UNKNOWN"]
            else:
                current_speakers = split_speakers(speaker_text)
            prev_speaker_groups.append(current_speakers)
            continue

        # otherwise it's a lyric line â†’ one row per speaker
        for spk in current_speakers:
            rows.append({"act": act, "song": song_title, "speaker": spk, "line": line})

    return pd.DataFrame(rows)

In [92]:
## Old one
dfs = [] # apparently its faster to create a list of dataframes with the lines from each song and concat at end
for t, u, a in unique_links:
    df = parse_song_page(t, u, a) #url for each song
    dfs.append(df)
    # print(df.head(11))
df_overall = pd.concat(dfs, ignore_index = True)
df_overall.to_csv("epic_all_songs_lines.csv", index=False, encoding="utf-8")



In [93]:
df_overall

Unnamed: 0,act,song,speaker,line
0,The Troy Saga,The Horse and the Infant,UNKNOWN,The Horse and the Infant
1,The Troy Saga,The Horse and the Infant,ODYSSEUS,"Alright, my brothers, listen closely"
2,The Troy Saga,The Horse and the Infant,SOLDIERS,"Alright, my brothers, listen closely"
3,The Troy Saga,The Horse and the Infant,ODYSSEUS,"Tonight, we make the Trojans pay"
4,The Troy Saga,The Horse and the Infant,SOLDIERS,"Tonight, we make the Trojans pay"
...,...,...,...,...
2461,The Ithaca Saga,Would You Fall In Love With Me Again,PENELOPE,How long has it been?
2462,The Ithaca Saga,Would You Fall In Love With Me Again,ODYSSEUS,Twenty years
2463,The Ithaca Saga,Would You Fall In Love With Me Again,PENELOPE,I love you
2464,The Ithaca Saga,Would You Fall In Love With Me Again,PENELOPE,I love you


## Each Stanza as a Line

In [18]:
SPEAKER_LINE = re.compile(r'^\s*\[(.+?)\]\s*$')   # e.g. [ODYSSEUS, CREW]

def split_speakers(s: str):
    # normalize common separators: "A, B & C / D + E and F" â†’ ['A','B','C','D','E','F']
    s = re.sub(r'\s*(?:,|&|and|/|\+)\s*', ',', s, flags=re.I)
    return [p.strip() for p in s.split(',') if p.strip()] or ["UNKNOWN"]

def parse_song_page_stanzas(url: str) -> pd.DataFrame:
    """
    Returns a DataFrame with columns: song, stanza_idx, speaker, stanza
    - stanza = all consecutive lines a speaker (or speaker group) says until the next [SPEAKER] header
    - duplicates one row per speaker if multiple speakers are listed
    """
    r = requests.get(url, timeout=20)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    page = soup.find(id="page")
    if not page:
        raise ValueError("No <div id='page'> found")

    # Prefer the visible-print h2 as the song title; fall back to <title>
    h2 = page.find("h2")
    song_title = (h2.get_text(strip=True) if h2
                  else (soup.title.get_text(strip=True) if soup.title else "Unknown Song"))

    # Turn <br> into line breaks
    text = page.get_text(separator="\n", strip=True)

    rows = []
    stanza_lines = []
    stanza_idx = 0
    current_speakers = ["UNKNOWN"]

    def flush_stanza():
        nonlocal stanza_lines, stanza_idx
        if not stanza_lines:
            return
        stanza = "\n".join(stanza_lines).strip()
        # compress 3+ blank lines to at most one blank line
        stanza = re.sub(r'\n{3,}', '\n\n', stanza)
        if stanza:
            stanza_idx += 1
            for spk in current_speakers:
                rows.append({
                    "song": song_title,
                    "stanza_idx": stanza_idx,
                    "speaker": spk,
                    "stanza": stanza
                })
        stanza_lines = []

    for raw in text.splitlines():
        line = raw.strip()

        # skip site footer noise
        if not line or line.lower().startswith("last update"):
            # Keep blank lines as stanza separators (optional):
            # stanza_lines.append("")  # uncomment to preserve blank lines inside stanza
            continue

        m = SPEAKER_LINE.match(line)
        if m:
            # new speaker header â†’ finish previous stanza, switch speakers
            flush_stanza()
            current_speakers = split_speakers(m.group(1))
        else:
            stanza_lines.append(line)

    # flush the trailing stanza at EOF
    flush_stanza()

    return pd.DataFrame(rows)


In [None]:
dfs = [] # apparently its faster to create a list of dataframes with the lines from each song and concat at end
for t, u, a in unique_links:
    df = parse_song_page_stanzas(u) #url for each song
    dfs.append(df)
    # print(df.head(11))
df_stanza = pd.concat(dfs, ignore_index = True)
df_stanza.to_csv("epic_all_songs_stanza.csv", index=False, encoding="utf-8")



ValueError: too many values to unpack (expected 2)

## Clean Up

In [94]:
df_overall["speaker"] = df_overall["speaker"].str.lower()
df_overall = df_overall[df_overall["speaker"] != "UNKNOWN"]
df_overall = df_overall[df_overall["speaker"] != "spoken"] #sometimes the label is [Oddesues, spoken]
# or easier way:
#drop_speakers = ["UNKNOWN", "spoken"]
# df_overall = df_overall[~df_overall["speaker"].isin(drop_speakers)]

In [97]:
print(df_overall["speaker"].unique())

['unknown' 'odysseus' 'soldiers' 'zeus' 'ensemble' 'all' 'crew'
 'eurylochus' 'polites' 'odyssseus' 'lotus eaters' 'athena' 'polyphemus'
 'soldier' 'cyclopes' 'perimedes' 'elpenor' 'aeolus' 'winions' 'penelope'
 'telemachus' 'poseidon' 'laestrygonians' 'circe' 'hermes'
 'fallen soldiers' 'tiresias' 'sirens' 'siren' 'scylla' 'antinuous'
 'the suitors' 'telemahcus' 'calypso' 'apollo' 'hephaestus' 'aphrodite'
 'ares' 'hera' 'suitors' 'amphinomus']


In [96]:
# print(df_overall[df_overall["speaker"] == "spoken"])
print(df_overall[df_overall["speaker"] == "both"])

Empty DataFrame
Columns: [act, song, speaker, line]
Index: []
