In [6]:
from sentence_transformers import SentenceTransformer, util
import pandas as pd




In [7]:
# Load the CSV that holds every lyric line (read below: the 'line' column,
# plus a 'speaker' column used by the grouping step)
df = pd.read_csv("epic_all_songs_lines.csv")

# Load a pretrained embedding model (any checkpoint listed at
# https://huggingface.co/sentence-transformers can be substituted)
model = SentenceTransformer('all-MiniLM-L6-v2')

# Compute one embedding per line.
# The texts are passed as a plain Python list: encode() is documented for a
# string or a list of strings, whereas a pandas Series is not a documented
# input type (NOTE(review): passing the Series directly appears to work only
# while its index is the default 0..n-1 -- confirm before relying on it).
# Wrapping the result in list() stores one vector per DataFrame row.
df['embedding'] = list(
    model.encode(df['line'].tolist(), convert_to_numpy=True, show_progress_bar=True)
)

# 1️⃣ Build one vector per speaker (mean of all their lines)
# Two passes over the per-speaker groups:
#   1. gather every embedding that belongs to a speaker into a Python list
#   2. average that list element-wise (sum of vectors divided by their count)
# The result is a Series indexed by speaker, holding one mean vector each.
speaker_vecs = (
    df.groupby('speaker')['embedding']
      .apply(list)
      .apply(lambda vectors: sum(vectors) / len(vectors))
)

# 2️⃣ Given a new line, compute its vector
# Query text whose speaker we want to guess
test_line = "We must keep rowing through the storm"
# Embed the query with the same model that embedded the corpus lines, so the
# vectors are directly comparable
test_vec = model.encode(sentences=test_line, convert_to_numpy=True)

# 3️⃣ Compare with each speaker using cosine similarity
import numpy as np

def cosine(a, b):
    """Return the cosine similarity of two 1-D vectors.

    Args:
        a: First vector (array-like of numbers).
        b: Second vector, same length as ``a``.

    Returns:
        ``a . b / (||a|| * ||b||)``. If either vector has zero norm the
        similarity is undefined; ``0.0`` is returned in that case instead of
        NaN so the score can still be ranked and formatted.
    """
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0:
        # 0/0 would yield NaN (and a RuntimeWarning); NaN compares false with
        # everything, which makes max()/sorted() over the scores unreliable.
        return 0.0
    return np.dot(a, b) / denom

# Similarity of the query vector to every speaker's mean vector
scores = dict(
    (name, cosine(test_vec, centroid)) for name, centroid in speaker_vecs.items()
)
# The prediction is the speaker with the highest similarity
pred_speaker = max(scores, key=lambda name: scores[name])

print("Predicted speaker:", pred_speaker)
# Show the five best candidates, highest score first
top_five = sorted(scores.items(), key=lambda pair: -pair[1])[:5]
for spk, s in top_five:
    print(f"{spk:15s}  {s:.3f}")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/73 [00:00<?, ?it/s]

Predicted speaker: soldiers
soldiers         0.552
eurylochus       0.535
odysseus         0.466
scylla           0.433
poseidon         0.427


In [14]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Read the table of lyric lines
df = pd.read_csv("epic_all_songs_lines.csv")

# Text column to classify: "line" when the file has it, otherwise "stanza"
TEXT_COL = "stanza" if "line" not in df.columns else "line"

# (Optional) strip surrounding whitespace from the speaker labels
df["speaker"] = df["speaker"].str.strip()

# (Optional) keep only speakers with at least `min_lines` rows, so that very
# small classes do not end up in the train/test split
min_lines = 5
counts = df["speaker"].value_counts()
keep_speakers = counts.loc[counts.ge(min_lines)].index
df = df.loc[df["speaker"].isin(keep_speakers)].reset_index(drop=True)
print("Speakers kept:", len(keep_speakers))


Speakers kept: 34


In [15]:
# Hold out 20% of the rows for evaluation. The split is stratified on the
# speaker label and uses a fixed seed so it is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df[TEXT_COL],
    df["speaker"],
    stratify=df["speaker"],
    test_size=0.2,
    random_state=42,
)


In [16]:
# Sentence-embedding model used for both splits
model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode the training texts first, then the test texts, with identical
# settings; the generator is unpacked into the two embedding arrays.
X_train_emb, X_test_emb = (
    model.encode(texts.tolist(), convert_to_numpy=True, show_progress_bar=True)
    for texts in (X_train, X_test)
)


Batches:   0%|          | 0/58 [00:00<?, ?it/s]

Batches:   0%|          | 0/15 [00:00<?, ?it/s]

In [17]:
# One row per training example: its speaker, its position in the training
# split, and its embedding vector
train_df = pd.DataFrame(
    {
        "speaker": y_train.values,
        "idx": np.arange(len(y_train)),
        "vec": list(X_train_emb),
    }
)

# Centroid per speaker = element-wise mean of that speaker's training vectors
centroids = {
    name: np.stack(group["vec"].tolist()).mean(axis=0)
    for name, group in train_df.groupby("speaker")
}

# Speaker names and their centroids in the same order, ready for vectorized
# scoring: row i of centroid_mat belongs to speakers[i]
speakers = np.array(list(centroids))
centroid_mat = np.stack(list(centroids.values()))  # shape: (S, D)


In [18]:
# cosine(a,b) = a·b / (||a|| ||b||)
def cosine_matrix(A, B):
    """Pairwise cosine similarity between the rows of ``A`` and the rows of ``B``.

    Args:
        A: 2-D array, one vector per row.
        B: 2-D array, one vector per row, same number of columns as ``A``.

    Returns:
        Array of shape ``(len(A), len(B))`` whose entry ``[i, j]`` is the
        cosine similarity of ``A[i]`` and ``B[j]``. A row with zero norm has
        no defined direction; its similarities are reported as 0 instead of NaN.
    """
    A_len = np.linalg.norm(A, axis=1, keepdims=True)
    B_len = np.linalg.norm(B, axis=1, keepdims=True)
    # Dividing a zero row by its zero norm would fill the output with NaN
    # (which also poisons argmax); dividing by 1 keeps the row all-zero, so
    # its dot products -- and therefore its similarities -- come out as 0.
    A_len[A_len == 0] = 1
    B_len[B_len == 0] = 1
    return (A / A_len) @ (B / B_len).T  # shape: (len(A), len(B))

# Similarity of every test embedding to every speaker centroid: (N_test, S)
S = cosine_matrix(X_test_emb, centroid_mat)
# Column index of the best-scoring centroid for each test row
top1_idx = np.argmax(S, axis=1)
# Translate those column indices back into speaker names
y_pred = np.take(speakers, top1_idx)


In [19]:
# Overall accuracy: share of test rows whose best-scoring speaker is the true one
acc = (y_pred == y_test.values).mean()
print(f"Top-1 accuracy: {acc:.3f}")

# Top-k accuracy (e.g., k=3). k is capped at the number of centroid columns
# because np.argpartition raises when kth is out of bounds.
k = min(3, S.shape[1])
# argpartition places the k highest-scoring columns of each row in the last
# k positions (in no particular order), which is enough for a membership test.
topk_idx = np.argpartition(S, -k, axis=1)[:, -k:]
# Compare each row's k candidate speakers with its true speaker in a single
# broadcasted comparison instead of a Python loop over the test rows.
topk_hits = (speakers[topk_idx] == y_test.values[:, None]).any(axis=1).mean()
print(f"Top-{k} accuracy: {topk_hits:.3f}")

# Per-class precision/recall/F1.
# zero_division=0 reports 0 for a speaker that is never predicted without
# emitting an UndefinedMetricWarning for each such class.
print("\nClassification report:")
print(classification_report(y_test, y_pred, digits=3, zero_division=0))

# Confusion matrix, with rows/columns in the same order as `speakers`
cm = confusion_matrix(y_test, y_pred, labels=speakers)
cm_df = pd.DataFrame(
    cm,
    index=[f"true:{s}" for s in speakers],
    columns=[f"pred:{s}" for s in speakers],
)
print("\nConfusion matrix (head):")
print(cm_df.iloc[:10, :10])


Top-1 accuracy: 0.186
Top-3 accuracy: 0.510

Classification report:
                 precision    recall  f1-score   support

         aeolus      0.200     0.143     0.167         7
            all      0.105     0.250     0.148         8
      antinuous      0.000     0.000     0.000        17
      aphrodite      0.000     0.000     0.000         1
         apollo      0.000     0.000     0.000         1
           ares      0.000     0.000     0.000         2
         athena      0.077     0.048     0.059        21
        calypso      0.345     0.556     0.426        18
          circe      0.154     0.286     0.200        14
           crew      0.111     0.091     0.100        11
       cyclopes      0.333     1.000     0.500         1
       ensemble      0.000     0.000     0.000        22
     eurylochus      0.156     0.263     0.196        19
fallen soldiers      0.000     0.000     0.000         1
     hephaestus      0.000     0.000     0.000         1
           hera    