In [1]:
import csv
import re
import time
from urllib.parse import urljoin, urlparse
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os

In [2]:
# Index page listing every song of the musical.
url = "https://www.allmusicals.com/e/epicthemusical.htm"
# Use a timeout so a stalled connection cannot hang the notebook, and fail
# loudly on an HTTP error instead of silently parsing an error page.
response = requests.get(url, timeout=20)
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, "html.parser")

# Scraping the Data

### Getting All Song Links

In [3]:
# Collect (title, absolute URL, act) for every song linked from the index page.

# Find the section containing all the song links
lyric_section = soup.find(class_="lyrics-list")
if lyric_section is None:
    raise ValueError("Could not find the 'lyrics-list' section on the index page.")

ol = lyric_section.find("ol")  # the list that holds act headers and song links
if ol is None:
    raise ValueError("Could not find the <ol> of songs inside the 'lyrics-list' section.")

# Only keep links that point at a song page of this musical.
song_url_pattern = re.compile(r"/epicthemusical/.+\.htm$", re.I)

song_links = []
current_act = None  # stays None until the first act header is seen

# Iterate over direct <li> children of <ol> only, so nested lists are not
# visited twice.
for li in ol.find_all("li", recursive=False):
    classes = li.get("class", [])

    # Act header like <li class="act"><strong><span>Act I</span></strong></li>
    if "act" in classes:
        current_act = li.get_text(" ", strip=True) or None
        continue

    for a in li.find_all("a", href=True):
        href = a["href"].strip()
        abs_url = urljoin(url, href)  # resolve relative hrefs against the index URL
        title = a.get_text(strip=True)
        if song_url_pattern.search(abs_url):
            song_links.append((title, abs_url, current_act))

# Remove duplicate URLs while preserving first-seen order
seen = set()
unique_links = []
for title, link, act in song_links:
    if link not in seen:
        unique_links.append((title, link, act))
        seen.add(link)

### Getting Song Lines From Each Link

In [4]:
SPEAKER_LINE = re.compile(r'^\s*\[(.+?)\]\s*$')   # e.g., [ODYSSEUS, CREW]

def split_speakers(s: str):
    """
    Turn 'ODYSSEUS, CREW & NARRATOR' into ['ODYSSEUS','CREW','NARRATOR'].

    Recognised separators are ',', '&', '/', '+' and the standalone word
    'and' (case-insensitive). Returns ['UNKNOWN'] when no name is left after
    splitting (e.g. for an empty string).
    """
    # Normalize every separator to a comma. 'and' is anchored with word
    # boundaries so that names containing those letters (CASSANDRA,
    # ANDROMACHE, ...) are not split in the middle.
    s = re.sub(r'\s*(?:,|&|/|\+|\band\b)\s*', ',', s, flags=re.I)
    parts = [p.strip() for p in s.split(',') if p.strip()]
    return parts or ["UNKNOWN"]

def parse_song_page(title: str, url: str, act: str) -> pd.DataFrame:
    """
    Download one song page and return its lyrics as a DataFrame.

    Parameters
    ----------
    title : song title, stored as-is in the ``song`` column.
    url   : address of the song page to fetch.
    act   : act label, stored as-is in the ``act`` column.

    Returns
    -------
    DataFrame with columns ``act``, ``song``, ``speaker``, ``line`` holding
    one row per speaker per lyric line. Lines that appear before the first
    speaker header are attributed to ``"UNKNOWN"``. The columns are present
    even when no lyric line was found.

    Raises
    ------
    requests.HTTPError : the server answered with an error status.
    ValueError         : the page has no element with ``id="page"``.
    """
    r = requests.get(url, timeout=20)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    # ---- locate the content ----
    page = soup.find(id="page")
    if not page:
        raise ValueError("Could not find <div id='page'> on this page.")

    # The title from the index page is used as the song name.
    song_title = title

    # Flatten the container to text, one text fragment per line.
    text = page.get_text(separator="\n", strip=True)

    rows = []
    current_speakers = ["UNKNOWN"]
    prev_speaker_groups = []   # history of speaker groups, used to resolve [BOTH]
    for raw in text.splitlines():
        line = raw.strip()

        # ignore blank lines and the "last update" metadata line
        if not line or line.lower().startswith("last update"):
            continue

        # speaker header like [ODYSSEUS, CREW]
        m = SPEAKER_LINE.match(line)
        if m:
            speaker_text = m.group(1).strip()
            if speaker_text.lower() == "both":
                # [BOTH] means the union of the two most recent speaker
                # groups, de-duplicated while keeping their order.
                flat_prev = [s for grp in prev_speaker_groups[-2:] for s in grp]
                current_speakers = list(dict.fromkeys(flat_prev)) or ["UNKNOWN"]
            else:
                current_speakers = split_speakers(speaker_text)
            prev_speaker_groups.append(current_speakers)
            continue

        # otherwise it's a lyric line → one row per speaker
        for spk in current_speakers:
            rows.append({"act": act, "song": song_title, "speaker": spk, "line": line})

    # Explicit columns keep the schema stable even when `rows` is empty.
    return pd.DataFrame(rows, columns=["act", "song", "speaker", "line"])

In [5]:
## Running Data Collection
# Parse every song into its own DataFrame first and concatenate once at the
# end, rather than growing one frame song by song.
dfs = [
    parse_song_page(song_title, song_url, song_act)
    for song_title, song_url, song_act in unique_links
]
df_overall = pd.concat(dfs, ignore_index=True)

In [6]:
# Sanity check on the scrape: total row count and the raw speaker labels.
# A bare expression in the middle of a cell is discarded, so print the count.
print(len(df_overall))
print(df_overall["speaker"].unique())

['UNKNOWN' 'ODYSSEUS' 'SOLDIERS' 'ZEUS' 'ENSEMBLE' 'ALL' 'CREW'
 'EURYLOCHUS' 'POLITES' 'ODYSSSEUS' 'LOTUS EATERS' 'ATHENA' 'spoken'
 'POLYPHEMUS' 'SOLDIER' 'CYCLOPES' 'PERIMEDES' 'ELPENOR' 'AEOLUS'
 'WINIONS' 'PENELOPE' 'TELEMACHUS' 'POSEIDON' 'LAESTRYGONIANS' 'CIRCE'
 'HERMES' 'FALLEN SOLDIERS' 'TIRESIAS' 'SIRENS' 'SIREN' 'SCYLLA'
 'ANTINUOUS' 'THE SUITORS' 'TELEMAHCUS' 'CALYPSO' 'APOLLO' 'HEPHAESTUS'
 'APHRODITE' 'ARES' 'HERA' 'Poseidon' 'SUITORS' 'AMPHINOMUS']


# Clean Up Data

In [7]:
# Normalise the scraped speaker labels:
#   1. lowercase everything so 'POSEIDON' and 'Poseidon' become one label,
#   2. drop placeholder labels that are not characters,
#   3. fix misspellings found in the source pages,
#   4. title-case the names for presentation.
drop_speakers = ["unknown", "spoken"]  # 'spoken' comes from labels like [ODYSSEUS, spoken]
corrections = {  # misspelled label -> canonical label
    "odyssseus": "odysseus",
    "telemahcus": "telemachus",
    "the suitors": "suitors"
}

speaker_lower = df_overall["speaker"].str.lower()
is_real_speaker = ~speaker_lower.isin(drop_speakers)

# Build the cleaned frame in one step with .loc + .assign rather than assigning
# a column on a filtered slice, which triggers SettingWithCopyWarning.
df_overall = df_overall.loc[is_real_speaker].assign(
    speaker=speaker_lower[is_real_speaker].replace(corrections).str.title()
)

# Lines per speaker, used below to drop rare speakers.
counts = df_overall["speaker"].value_counts()


In [8]:
# Speakers with fewer than `min_lines` lines would form tiny classes, so
# keep only those at or above the threshold.
min_lines = 5
keep_speakers = counts.loc[counts >= min_lines].index
has_enough_lines = df_overall["speaker"].isin(keep_speakers)
df_finaldata = df_overall.loc[has_enough_lines].reset_index(drop=True)
print("Speakers kept:", len(keep_speakers))

Speakers kept: 33


In [9]:
# Write two CSVs: one with every speaker, one with only the speakers kept
# for training.
csv_exports = {
    "epic_all_songs_lines_allspeakers.csv": df_overall,
    "epic_all_songs_lines_trainingdata.csv": df_finaldata,
}
for csv_name, frame in csv_exports.items():
    frame.to_csv(csv_name, index=False, encoding="utf-8")


## Creating a Document for Each Speaker (instead of the single CSV of all lines written in the cell above)

In [17]:
from sklearn.model_selection import train_test_split

## Train/Test Split, stratified by speaker
# 80/20 split with a fixed seed; the speaker labels are both the target and
# the stratification key.
lyric_lines = df_finaldata["line"]
speaker_labels = df_finaldata["speaker"]
X_train, X_test, y_train, y_test = train_test_split(
    lyric_lines,
    speaker_labels,
    test_size=0.2,
    random_state=42,
    stratify=speaker_labels,
)

In [None]:
## Write one text file per speaker for each of the train and test splits
# Step 1: pair every line with its speaker label again, one frame per split
train_df = pd.DataFrame({"speaker": y_train.values, "line": X_train.values})
test_df = pd.DataFrame({"speaker": y_test.values, "line": X_test.values})

# Step 2: output layout is <base_dir>/<split>/<speaker>.txt
base_dir = "speaker_texts"
splits = {"train": train_df, "test": test_df}

# Characters replaced with "_" when turning a speaker name into a file name
unsafe_chars = str.maketrans({"/": "_", "\\": "_", " ": "_"})

for split_name, split_df in splits.items():
    split_dir = os.path.join(base_dir, split_name)
    os.makedirs(split_dir, exist_ok=True)

    # One file per speaker, holding that speaker's lines separated by newlines
    for speaker, speaker_rows in split_df.groupby("speaker"):
        speaker_lines = speaker_rows["line"].dropna().tolist()
        out_path = os.path.join(
            split_dir, f"{str(speaker).translate(unsafe_chars)}.txt"
        )

        with open(out_path, "w", encoding="utf-8") as out_file:
            out_file.write("\n".join(speaker_lines))

        print(f"✅ [{split_name}] {speaker}: {len(speaker_lines)} lines written")


✅ [train] Aeolus: 30 lines written
✅ [train] All: 32 lines written
✅ [train] Antinuous: 66 lines written
✅ [train] Aphrodite: 5 lines written
✅ [train] Apollo: 4 lines written
✅ [train] Ares: 8 lines written
✅ [train] Athena: 84 lines written
✅ [train] Calypso: 71 lines written
✅ [train] Circe: 57 lines written
✅ [train] Crew: 44 lines written
✅ [train] Cyclopes: 4 lines written
✅ [train] Ensemble: 87 lines written
✅ [train] Eurylochus: 76 lines written
✅ [train] Fallen Soldiers: 4 lines written
✅ [train] Hephaestus: 4 lines written
✅ [train] Hera: 7 lines written
✅ [train] Hermes: 76 lines written
✅ [train] Laestrygonians: 33 lines written
✅ [train] Lotus Eaters: 7 lines written
✅ [train] Odysseus: 575 lines written
✅ [train] Penelope: 69 lines written
✅ [train] Perimedes: 4 lines written
✅ [train] Polites: 39 lines written
✅ [train] Polyphemus: 24 lines written
✅ [train] Poseidon: 88 lines written
✅ [train] Scylla: 7 lines written
✅ [train] Sirens: 4 lines written
✅ [train] Soldiers:

In [18]:
# Make a folder to hold all character text files
os.makedirs("speaker_texts", exist_ok=True)

# Group lines by speaker and save each speaker's lines to its own file,
# one lyric line per text line.
for speaker, group in df_finaldata.groupby("speaker"):
    lines = group["line"].dropna().tolist()
    text = "\n".join(lines)

    # Sanitize the file name with the same rules as the per-split export:
    # both kinds of path separator and spaces become underscores.
    filename = (
        str(speaker)
        .replace("/", "_")
        .replace("\\", "_")
        .replace(" ", "_")
    )
    path = os.path.join("speaker_texts", f"{filename}.txt")

    with open(path, "w", encoding="utf-8") as f:
        f.write(text)

    print(f"✅ {speaker}: {len(lines)} lines written")


✅ Aeolus: 37 lines written
✅ All: 40 lines written
✅ Antinuous: 83 lines written
✅ Aphrodite: 6 lines written
✅ Apollo: 5 lines written
✅ Ares: 10 lines written
✅ Athena: 105 lines written
✅ Calypso: 89 lines written
✅ Circe: 71 lines written
✅ Crew: 55 lines written
✅ Cyclopes: 5 lines written
✅ Ensemble: 109 lines written
✅ Eurylochus: 95 lines written
✅ Fallen Soldiers: 5 lines written
✅ Hephaestus: 5 lines written
✅ Hera: 8 lines written
✅ Hermes: 95 lines written
✅ Laestrygonians: 41 lines written
✅ Lotus Eaters: 9 lines written
✅ Odysseus: 719 lines written
✅ Penelope: 86 lines written
✅ Perimedes: 5 lines written
✅ Polites: 49 lines written
✅ Polyphemus: 30 lines written
✅ Poseidon: 110 lines written
✅ Scylla: 9 lines written
✅ Sirens: 5 lines written
✅ Soldiers: 180 lines written
✅ Suitors: 83 lines written
✅ Telemachus: 69 lines written
✅ Tiresias: 13 lines written
✅ Winions: 24 lines written
✅ Zeus: 57 lines written
