In [2]:
import numpy as np
import pandas as pd

# Load the dataset from a path relative to the notebook's working directory.
# Later cells read its `embedding`, `team1_name`/`team2_name`,
# `team1_champions`/`team2_champions` and `team1_players`/`team2_players` columns.
df = pd.read_parquet("../transcripts_with_embeddings.parquet")

In [16]:
def top_k_cosine_neighbors(df: pd.DataFrame, query_vec: np.ndarray, k: int = 5, emb_col: str = "embedding") -> pd.Index:
    """
    Return the index labels of the k rows whose embeddings are most cosine-similar to query_vec.

    No row is filtered out: if query_vec was taken from a row of df, that row
    is itself a candidate and will be part of the result.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame whose ``emb_col`` column holds one embedding per row
        (list or np.ndarray, all of the same length D).
    query_vec : np.ndarray
        The embedding vector to compare against, shape (D,). Any array-like
        is accepted.
    k : int
        Number of neighbors to return. Values larger than ``len(df)`` are
        clamped to ``len(df)``; ``k <= 0`` yields an empty result.
    emb_col : str
        Column name containing embeddings.

    Returns
    -------
    pd.Index
        Labels taken from ``df.index`` (look rows up with ``df.loc``, not
        ``df.iloc``), ordered from highest to lowest cosine similarity.
    """
    # Clamp k so argpartition never receives an out-of-range kth, and so that
    # an empty frame or a non-positive k returns an empty index instead of raising.
    k = min(int(k), len(df))
    if k <= 0:
        return df.index[:0]

    # Stack the per-row embeddings into a single (N, D) matrix.
    X = np.vstack(df[emb_col].to_numpy())
    q = np.asarray(query_vec)

    # Scale rows and query to unit length; the 1e-9 term keeps the division
    # finite when a vector is all zeros.
    X_norm = X / (np.linalg.norm(X, axis=1, keepdims=True) + 1e-9)
    q = q / (np.linalg.norm(q) + 1e-9)

    # Dot product of unit vectors == cosine similarity, shape (N,).
    sims = X_norm @ q

    # Partition once at position k-1 to isolate the k largest similarities
    # (unordered), then sort just those k in descending order.
    topk_idx = np.argpartition(-sims, k - 1)[:k]
    topk_idx = topk_idx[np.argsort(-sims[topk_idx], kind="stable")]

    # Translate positional indices back to the frame's own labels.
    return df.index[topk_idx]

def display_key_info_match(sample_match: pd.Series):
    """Print a match's team names, champions and players.

    One line is printed per field, with team 1's value followed by team 2's.
    The values are read from the ``team1_*`` / ``team2_*`` attributes of
    ``sample_match``.
    """
    for field in ("name", "champions", "players"):
        first_team = getattr(sample_match, f"team1_{field}")
        second_team = getattr(sample_match, f"team2_{field}")
        print(first_team, second_team)

In [10]:
# Use the 100th row from the end of the frame as the query match and keep its
# embedding for the similarity search below.
sample_match = df.iloc[-100]
sample_embedding = sample_match["embedding"]
display_key_info_match(sample_match)

Bilibili Gaming JD Gaming
['Kennen', 'Maokai', 'Tristana', 'Miss Fortune', 'Rell'] ['Twisted Fate', 'Brand', 'Corki', 'Ezreal', 'Braum']
['Bin', 'Wei', 'Knight', 'Elk', 'ON'] ['sheer', 'Kanavi', 'Yagao', 'Ruler', 'MISSING']


In [23]:
# Find the 20 rows of df most similar to the sample embedding. The sample row
# is itself part of df, so it is one of the candidates being ranked.
closest = top_k_cosine_neighbors(df=df, query_vec=sample_embedding, k=20)
closest

Index([1989, 1988, 1990, 1817, 1567, 1818, 1819, 1566, 1290, 1291, 1570,  969,
       1292, 1569,  971, 1784, 1964, 1293, 1774, 1785],
      dtype='int64')

In [30]:
# 1293 is one of the values returned by top_k_cosine_neighbors, which are index
# *labels* (taken from df.index). Look the row up with .loc; .iloc is positional
# and only agrees with .loc while df has a default 0..N-1 index.
display_key_info_match(df.loc[1293])

JD Gaming Bilibili Gaming
['Gragas', 'Sejuani', 'Jayce', 'Varus', 'Tahm Kench'] ['Jax', 'Wukong', 'Veigar', 'Xayah', 'Rakan']
['369', 'Kanavi', 'Knight', 'Ruler', 'MISSING'] ['Bin', 'Xun', 'Yagao', 'Elk', 'ON']
