In [30]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import classification_report

#Visualization Libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [2]:
df = pd.read_csv("../data_new/epic_all_songs_lines_trainingdata.csv")

# Name of the text column we're using: "line" when present, otherwise "stanza".
# Fail here, with the columns that were actually found, instead of silently
# picking a column that does not exist and hitting a bare KeyError in a later cell.
if "line" in df.columns:
    TEXT_COL = "line"
elif "stanza" in df.columns:
    TEXT_COL = "stanza"
else:
    raise KeyError(
        f"Expected a 'line' or 'stanza' text column; found columns: {list(df.columns)}"
    )

In [3]:
## Hold out 20% of the rows for testing. Stratifying on speaker keeps each
## speaker's share of rows approximately equal across the two splits.
X_train, X_test, y_train, y_test = train_test_split(
    df[TEXT_COL],
    df["speaker"],
    stratify=df["speaker"],
    test_size=0.2,
    random_state=42,
)

In [4]:
## Embeddings
# A single pretrained sentence encoder is shared by both splits.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode the training texts first, then the test texts, with identical settings.
X_train_emb, X_test_emb = (
    model.encode(texts.tolist(), convert_to_numpy=True, show_progress_bar=True)
    for texts in (X_train, X_test)
)


Batches: 100%|██████████| 58/58 [00:03<00:00, 19.16it/s]
Batches: 100%|██████████| 15/15 [00:00<00:00, 18.86it/s]


In [5]:
## Speaker centroids: the mean training embedding of each speaker

# Line up every training embedding with its speaker label (row order follows X_train_emb)
train_df = pd.DataFrame({"speaker": y_train.values})
train_df["idx"] = np.arange(len(train_df))
train_df["vec"] = list(X_train_emb)

# Iterating the grouped column yields (speaker, vectors) pairs; average each group
centroids = {
    name: np.mean(np.stack(list(vectors)), axis=0)
    for name, vectors in train_df.groupby("speaker")["vec"]
}

# Fixed speaker order and the matching centroid matrix for vectorized scoring
speakers = np.array(list(centroids))
centroid_mat = np.stack(list(centroids.values()))  # shape: (S, D)


In [6]:
## Predict by cosine similarity to centroids

# cosine(a,b) = a·b / (||a|| ||b||)
def cosine_matrix(A, B, eps=1e-12):
    """Pairwise cosine similarity between the rows of ``A`` and the rows of ``B``.

    cosine(a, b) = a·b / (||a|| ||b||)

    Args:
        A: 2-D array-like of shape (n_a, d).
        B: 2-D array-like of shape (n_b, d).
        eps: Lower bound applied to every row norm before dividing. Rows whose
            norm is above ``eps`` are normalized exactly as before; an all-zero
            row gets similarity 0 to everything instead of NaN.

    Returns:
        Array of shape (n_a, n_b) whose entry (i, j) is the cosine similarity
        between ``A[i]`` and ``B[j]``.
    """
    A = np.asarray(A)
    B = np.asarray(B)
    # Clamp the norms so a zero vector cannot trigger a 0/0 division. NaN rows
    # would otherwise flow into argmax and yield an arbitrary prediction.
    A_norm = A / np.maximum(np.linalg.norm(A, axis=1, keepdims=True), eps)
    B_norm = B / np.maximum(np.linalg.norm(B, axis=1, keepdims=True), eps)
    return A_norm @ B_norm.T  # shape: (len(A), len(B))

# Score every test embedding against every speaker centroid -> (N_test, S),
# then label each row with the speaker whose centroid scores highest.
S = cosine_matrix(X_test_emb, centroid_mat)
top1_idx = np.argmax(S, axis=1)
y_pred = np.take(speakers, top1_idx)


In [32]:
## Evaluation: top-1 / top-k accuracy, per-class report, confusion matrix

# Top-1: share of test rows whose best-scoring centroid is the true speaker
is_correct = y_pred == y_test.values
acc = is_correct.mean()
print(f"Top-1 accuracy: {acc:.3f}")

# Top-k: a row is a hit when the true speaker is among its k best-scoring
# centroids. argpartition places those k column indices in the last k slots
# of each row (their internal order is not guaranteed, which is fine here).
k = 3
topk_idx = np.argpartition(S, -k, axis=1)[:, -k:]
row_hits = [
    truth in speakers[candidates]
    for truth, candidates in zip(y_test.values, topk_idx)
]
topk_hits = np.array(row_hits).mean()
print(f"Top-{k} accuracy: {topk_hits:.3f}")

# Precision / recall / F1 for every speaker
cr = classification_report(y_test, y_pred, digits=3)
print("\nClassification report:")
print(cr)

# Confusion matrix with rows and columns in the same order as `speakers`
cm = confusion_matrix(y_test, y_pred, labels=speakers)
true_labels = [f"true:{s}" for s in speakers]
pred_labels = [f"pred:{s}" for s in speakers]
cm_df = pd.DataFrame(cm, index=true_labels, columns=pred_labels)
print("\nConfusion matrix (head):")
print(cm_df.iloc[:10, :10])


Top-1 accuracy: 0.188
Top-3 accuracy: 0.516

Classification report:
                 precision    recall  f1-score   support

         Aeolus      0.000     0.000     0.000         7
            All      0.095     0.250     0.138         8
      Antinuous      0.125     0.059     0.080        17
      Aphrodite      0.000     0.000     0.000         1
         Apollo      0.000     0.000     0.000         1
           Ares      0.000     0.000     0.000         2
         Athena      0.067     0.048     0.056        21
        Calypso      0.355     0.611     0.449        18
          Circe      0.167     0.286     0.211        14
           Crew      0.091     0.091     0.091        11
       Cyclopes      0.333     1.000     0.500         1
       Ensemble      0.111     0.045     0.065        22
     Eurylochus      0.156     0.263     0.196        19
Fallen Soldiers      0.000     0.000     0.000         1
     Hephaestus      0.000     0.000     0.000         1
           Hera    

## Graphs

In [None]:
## Confusion matrix rendered as a heatmap
# Both axes are labelled with `speakers`, the same ordering used to build `cm`.
heatmap_options = dict(
    x=speakers,
    y=speakers,
    color_continuous_scale="Blues",
    title="Confusion Matrix — Speaker Classification",
)
fig = px.imshow(cm, **heatmap_options)

# Rows of the matrix are true labels (y axis), columns are predictions (x axis)
fig.update_layout(
    height=800,
    xaxis_title="Predicted label",
    yaxis_title="True label",
)
fig.show()


In [48]:
## Per-class F1 scores as a bar chart, best-scoring speakers first
cr = classification_report(y_test, y_pred, output_dict=True)
report_df = pd.DataFrame(cr).transpose()
# Keep only the per-speaker rows: the trailing three rows are the summary
# entries (accuracy, macro avg, weighted avg).
report_df = report_df.iloc[:-3]

# Sort once and take BOTH the bar heights and the bar labels from the sorted
# frame. Passing the unsorted `report_df.index` as x pairs labels with values
# by position, which attaches the wrong speaker name to each bar.
f1_sorted = report_df.sort_values("f1-score", ascending=False)

fig = px.bar(
    f1_sorted,
    x=f1_sorted.index,
    y="f1-score",
    title="Per-Class F1 Score",
    width=650,
)
fig.update_layout(xaxis_tickangle=-90, xaxis_title="Speaker")
fig.show()
