In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

  from .autonotebook import tqdm as notebook_tqdm
The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.
0it [00:00, ?it/s]


In [4]:
# Load the per-line training data produced by the data-cleaning step.
df = pd.read_csv("../data/datacleaning_2/epic_all_songs_lines_trainingdata.csv")

# Pick the text column: prefer "line" when the file has one, otherwise
# fall back to "stanza".
if "line" in df.columns:
    TEXT_COL = "line"
else:
    TEXT_COL = "stanza"

In [5]:
## Train/Test Split, stratified by speaker
# Features are the raw text column; labels are the speaker names.
# Passing the labels to `stratify` asks the splitter to stratify on speaker.
X_train, X_test, y_train, y_test = train_test_split(
    df[TEXT_COL],
    df["speaker"],
    stratify=df["speaker"],
    test_size=0.2,
    random_state=42,
)

In [6]:
## Embeddings
model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode both splits with identical settings. The generator is consumed in
# order, so the training texts are embedded first, then the test texts.
X_train_emb, X_test_emb = (
    model.encode(texts.tolist(), convert_to_numpy=True, show_progress_bar=True)
    for texts in (X_train, X_test)
)


Batches: 100%|██████████| 58/58 [00:02<00:00, 28.72it/s]
Batches: 100%|██████████| 15/15 [00:00<00:00, 29.54it/s]


In [7]:
## Build speaker centroids from the training set

# One row per training example: its speaker label, its row position in
# X_train_emb, and the embedding vector itself (stored as an object cell).
train_df = pd.DataFrame(
    {
        "speaker": y_train.values,
        "idx": np.arange(len(y_train)),
        "vec": list(X_train_emb),
    }
)

# Centroid of a speaker = element-wise mean of that speaker's embeddings.
centroids = {
    name: np.stack(list(vecs)).mean(axis=0)
    for name, vecs in train_df.groupby("speaker")["vec"]
}

# Parallel arrays for vectorized scoring: speakers[i] labels centroid_mat[i].
speakers = np.array(list(centroids))
centroid_mat = np.stack(list(centroids.values()))  # shape: (S, D)


In [8]:
## Predict by cosine similarity to centroids

# cosine(a,b) = a·b / (||a|| ||b||)
def cosine_matrix(A, B):
    """Return the pairwise cosine similarity between the rows of A and B.

    Parameters
    ----------
    A : array-like, shape (n_a, d)
    B : array-like, shape (n_b, d)

    Returns
    -------
    numpy.ndarray, shape (n_a, n_b)
        Entry [i, j] is the cosine similarity of A[i] and B[j]. A row with
        zero norm has no direction, so its similarity to everything is
        reported as 0 instead of NaN.
    """
    A = np.asarray(A)
    B = np.asarray(B)

    A_len = np.linalg.norm(A, axis=1, keepdims=True)
    B_len = np.linalg.norm(B, axis=1, keepdims=True)

    # Guard against division by zero: an all-zero row divided by 1 stays
    # all-zero, which yields a similarity of 0 rather than NaN (NaN would
    # poison any later argmax/argpartition over the scores).
    A_len[A_len == 0] = 1
    B_len[B_len == 0] = 1

    return (A / A_len) @ (B / B_len).T  # shape: (len(A), len(B))

# Score every test embedding against every speaker centroid.
S = cosine_matrix(X_test_emb, centroid_mat)     # (N_test, S)

# The predicted speaker is the one whose centroid scores highest in each row.
top1_idx = np.argmax(S, axis=1)
y_pred = np.take(speakers, top1_idx)


In [9]:
## Accuracy and detailed metrics

# Plain ndarray view of the true labels, aligned row-for-row with S / y_pred.
y_true = y_test.to_numpy()

# Overall accuracy
acc = (y_pred == y_true).mean()
print(f"Top-1 accuracy: {acc:.3f}")

# Top-k accuracy (e.g., k=3). k is clamped to the number of centroids so
# argpartition is never asked for more columns than S has.
k = min(3, S.shape[1])
topk_idx = np.argpartition(S, -k, axis=1)[:, -k:]           # (N_test, k), unordered
# A row is a hit when its true speaker is among its k best-scoring speakers.
topk_hits = (speakers[topk_idx] == y_true[:, None]).any(axis=1).mean()
print(f"Top-{k} accuracy: {topk_hits:.3f}")

# Per-class precision/recall/F1.
# zero_division=0 reports 0 for classes that are never predicted, without
# emitting an UndefinedMetricWarning for each of them.
print("\nClassification report:")
print(classification_report(y_test, y_pred, digits=3, zero_division=0))

# Confusion matrix, with rows/columns in the same order as `speakers`
cm = confusion_matrix(y_test, y_pred, labels=speakers)
cm_df = pd.DataFrame(cm, index=[f"true:{s}" for s in speakers], columns=[f"pred:{s}" for s in speakers])
print("\nConfusion matrix (head):")
print(cm_df.iloc[:10, :10])


Top-1 accuracy: 0.186
Top-3 accuracy: 0.510

Classification report:
                 precision    recall  f1-score   support

         aeolus      0.200     0.143     0.167         7
            all      0.105     0.250     0.148         8
      antinuous      0.000     0.000     0.000        17
      aphrodite      0.000     0.000     0.000         1
         apollo      0.000     0.000     0.000         1
           ares      0.000     0.000     0.000         2
         athena      0.077     0.048     0.059        21
        calypso      0.345     0.556     0.426        18
          circe      0.154     0.286     0.200        14
           crew      0.111     0.091     0.100        11
       cyclopes      0.333     1.000     0.500         1
       ensemble      0.000     0.000     0.000        22
     eurylochus      0.156     0.263     0.196        19
fallen soldiers      0.000     0.000     0.000         1
     hephaestus      0.000     0.000     0.000         1
           hera    