In [3]:
import csv
import re
import time
from urllib.parse import urljoin, urlparse
import requests
from bs4 import BeautifulSoup
import pandas as pd


In [4]:
# Index page on allmusicals.com from which the song links are collected.
url = "https://www.allmusicals.com/e/epicthemusical.htm"

# requests has no default timeout, so bound the call explicitly, and fail
# loudly on a 4xx/5xx response instead of silently parsing an error page.
# Same settings as the per-song fetch in parse_song_page.
index_response = requests.get(url, timeout=20)
index_response.raise_for_status()
html = index_response.text
soup = BeautifulSoup(html, "html.parser")

In [None]:
# Disabled first draft of the link-extraction cell, kept only for reference.
# The whole cell body is wrapped in a string literal, so none of it executes.
# Known problem in this draft: on every non-act <li> the inner loop scans
# `lyric_section` (all anchors in the section) rather than the current `li`.
# The active cell below (In [5]) searches `li` instead.
""" 
# ðŸ”¹ Step 1: find the section containing all the song links
lyric_section = soup.find(class_="lyrics-list")

ol = lyric_section.find("ol") # where the actual links are

# ðŸ”¹ Step 2: extract <a> tags within that section
song_links = []
current_act = None

# Iterate over direct <li> children of <ol>
for li in ol.find_all("li", recursive = False):
    classes = li.get("class", [])

    # Act header like <li class="act"><strong><span>Act I</span></strong></li>
    if "act" in classes:
        current_act = li.get_text(" ", strip=True) or None
        print(current_act)
        continue

    for a in lyric_section.find_all("a", href=True):
        href = a["href"].strip()
        abs_url = urljoin(url, href)
        title = a.get_text(strip=True)
        # Optional: only keep Epic: The Musical song pages
        if re.search(r"/epicthemusical/.+\.htm$", abs_url, re.I):
            song_links.append((title, abs_url, current_act))

# ðŸ”¹ Step 3: remove duplicates while preserving order
seen = set()
unique_links = []
for title, link, act in song_links:
    if link not in seen:
        unique_links.append((title, link, act))
        seen.add(link)

# ðŸ”¹ Step 4: print or save the result
# print(f"Found {len(unique_links)} songs:")
#for t, u, a in unique_links:
    #print(f"- {a} {t} -> {u}") """

In [5]:
# Step 1: find the section containing all the song links.
# Fail with a clear message if the page layout is not what we expect, rather
# than with an AttributeError on None a line later.
lyric_section = soup.find("section", class_="lyrics-list")
if lyric_section is None:
    raise ValueError("Could not find <section class='lyrics-list'> on the index page.")
ol = lyric_section.find("ol")  # where the actual links are
if ol is None:
    raise ValueError("Could not find the <ol> of songs inside the lyrics-list section.")

# Only URLs that look like song pages of this musical are kept.
# Compiled once instead of being re-evaluated for every anchor.
SONG_URL_RE = re.compile(r"/epicthemusical/.+\.htm$", re.I)

# Step 2: extract links, tracking the current act label.
# Each entry is (title, absolute_url, act); act stays None until the first
# act header is seen.
song_links = []
current_act = None

# Only direct <li> children of the <ol> are visited (recursive=False).
for li in ol.find_all("li", recursive=False):
    classes = li.get("class", [])

    # Act header like <li class="act">Act I</li>: remember it and move on.
    if "act" in classes:
        current_act = li.get_text(" ", strip=True) or None
        continue

    # Search anchors inside THIS <li> only, so each song is attributed to the
    # act header that most recently preceded it.
    for a in li.find_all("a", href=True):
        href = a["href"].strip()
        abs_url = urljoin(url, href)  # resolve relative hrefs against the index URL
        title = a.get_text(strip=True)
        if SONG_URL_RE.search(abs_url):
            song_links.append((title, abs_url, current_act))

# Step 3: de-duplicate by URL (case-insensitively), keeping the first
# occurrence and the original order.
seen = set()
unique_links = []
for title, link, act in song_links:
    key = link.lower()
    if key not in seen:
        unique_links.append((title, link, act))
        seen.add(key)


## Getting Song Lines

In [19]:
# Matches a line that consists solely of a bracketed speaker header (optional
# surrounding whitespace); group 1 captures the text between the brackets.
SPEAKER_LINE = re.compile(r'^\s*\[(.+?)\]\s*$')   # e.g., [ODYSSEUS, CREW]

def split_speakers(s: str) -> list:
    """
    Split the text of a speaker header into individual speaker names.

    Turns 'ODYSSEUS, CREW & NARRATOR' into ['ODYSSEUS', 'CREW', 'NARRATOR'].
    Recognised separators are ',', '&', '/', '+' and the whole word 'and'
    (case-insensitive). Names are returned stripped, in their original case
    and order.

    Parameters
    ----------
    s : str
        Header text without the surrounding brackets.

    Returns
    -------
    list of str
        The speaker names, or ["UNKNOWN"] when no non-empty name is found.
    """
    # Normalize every separator to a comma. `and` must be a whole word:
    # without the \b anchors a name that merely contains "and"
    # (e.g. CASSANDRA) was cut into two bogus speakers.
    s = re.sub(r'\s*(?:,|&|/|\+|\band\b)\s*', ',', s, flags=re.I)
    parts = [p.strip() for p in s.split(',') if p.strip()]
    return parts or ["UNKNOWN"]

def _lyric_rows(text: str, song_title: str, act) -> list:
    """Turn the plain text of one song page into row dicts, one per speaker per lyric line."""
    rows = []
    current_speakers = ["UNKNOWN"]   # used for lines that precede any speaker header
    prev_speaker_groups = []         # history of speaker lists, consulted by [BOTH]
    for raw in text.splitlines():
        line = raw.strip()

        # ignore blank lines and boilerplate/metadata lines
        if not line or line.lower().startswith("last update"):
            continue

        # speaker header like [ODYSSEUS, CREW]
        m = SPEAKER_LINE.match(line)
        if m:
            speaker_text = m.group(1).strip()
            if speaker_text.lower() == "both":
                # [BOTH] resolves to the speakers of the two most recent
                # headers, de-duplicated with first-seen order preserved.
                flat_prev = [s for grp in prev_speaker_groups[-2:] for s in grp]
                current_speakers = list(dict.fromkeys(flat_prev)) or ["UNKNOWN"]
            else:
                current_speakers = split_speakers(speaker_text)
            # The resolved group (including a resolved [BOTH]) joins the history.
            prev_speaker_groups.append(current_speakers)
            continue

        # otherwise it's a lyric line -> one row per current speaker
        for spk in current_speakers:
            rows.append({"act": act, "song": song_title, "speaker": spk, "line": line})
    return rows


def parse_song_page(title: str, url: str, act: str) -> pd.DataFrame:
    """
    Download one song page and return its lyrics as a DataFrame.

    Parameters
    ----------
    title : str
        Song title to store in the "song" column (the title passed in is
        used as-is; the page's own heading is not consulted).
    url : str
        Absolute URL of the song page.
    act : str
        Act label to store in the "act" column. The caller may pass None
        when no act header preceded the song.

    Returns
    -------
    pd.DataFrame
        Columns: act, song, speaker, line — one row per speaker per lyric
        line. The columns are present even when the page yields no rows.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page has no element with id="page".
    """
    r = requests.get(url, timeout=20)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    # ---- locate the content ----
    page = soup.find(id="page")
    if page is None:
        raise ValueError("Could not find <div id='page'> on this page.")

    # Flatten to text with a newline between text nodes, so each lyric line
    # and each [SPEAKER] header ends up on its own line.
    text = page.get_text(separator="\n", strip=True)

    rows = _lyric_rows(text, title, act)
    # Explicit columns keep the schema stable for a page with no lyric rows.
    return pd.DataFrame(rows, columns=["act", "song", "speaker", "line"])

In [None]:
## Running Data Collection
# Fail early with a clear message if link extraction produced nothing.
if not unique_links:
    raise ValueError("No song links were found; nothing to download.")

# Seconds to pause after each page fetch so the site is not hit with a burst
# of back-to-back requests.
REQUEST_DELAY_S = 0.5

# Collect one DataFrame per song and concatenate once at the end; growing a
# single DataFrame inside the loop would re-copy all earlier rows each time.
dfs = []
for song_title, song_url, act_label in unique_links:
    dfs.append(parse_song_page(song_title, song_url, act_label))
    time.sleep(REQUEST_DELAY_S)
df_overall = pd.concat(dfs, ignore_index=True)


In [26]:
#df_overall

## Clean Up

In [None]:
# Labels that are not real characters. "unknown" is the placeholder used when
# no speaker header applies; "spoken" comes from headers such as
# [ODYSSEUS, spoken]. Both are lowercase because the comparison happens after
# the speaker column has been lowercased.
DROP_SPEAKERS = ["unknown", "spoken"]

# `.assign` returns a new frame instead of writing into `df_overall`, so
# re-running this cell on the already-filtered frame does not trigger a
# SettingWithCopyWarning.
df_overall = df_overall.assign(speaker=df_overall["speaker"].str.lower())
df_overall = df_overall[~df_overall["speaker"].isin(DROP_SPEAKERS)]
# NOTE(review): the speaker listing printed further down shows what look like
# typos from the source pages (e.g. 'odyssseus', 'telemahcus'); consider a
# normalization map here — confirm the intended spellings first.

In [None]:
# Testing Data Clean Up
print(df_overall["speaker"].unique())
# print(df_overall[df_overall["speaker"] == "spoken"])
# print(df_overall[df_overall["speaker"] == "both"])

['odysseus' 'soldiers' 'zeus' 'ensemble' 'all' 'crew' 'eurylochus'
 'polites' 'odyssseus' 'lotus eaters' 'athena' 'polyphemus' 'soldier'
 'cyclopes' 'perimedes' 'elpenor' 'aeolus' 'winions' 'penelope'
 'telemachus' 'poseidon' 'laestrygonians' 'circe' 'hermes'
 'fallen soldiers' 'tiresias' 'sirens' 'siren' 'scylla' 'antinuous'
 'the suitors' 'telemahcus' 'calypso' 'apollo' 'hephaestus' 'aphrodite'
 'ares' 'hera' 'suitors' 'amphinomus']


In [27]:

# Persist the cleaned lines as a UTF-8 CSV, leaving the DataFrame index out.
df_overall.to_csv(
    path_or_buf="epic_all_songs_lines.csv",
    encoding="utf-8",
    index=False,
)
