*🚧 NOTICE: This is a W.I.P.*

# Character Cluster Coverage Analysis

**Purpose**: To provide a standard evaluation tool for measuring the scope and diversity of the characters represented in our character database without over-reliance on dimensional coverage. Specifically: which clusters do the characters cover, how broadly, and what is not covered?

## 1. Setup

*Note: if additional packages are needed, add them to the `analysis` optional-dependency group in pyproject.toml using:*
```sh
uv add <package> --optional analysis
```

In [None]:
# Install analysis extras
!uv sync --extra analysis

In [None]:
# Import required packages
import os, json, textwrap
import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

### 1. Load characters -> get hid, long_description

Load character data from characters.json

In [None]:
from pathlib import Path

CHARACTER_DATA_FPATH = Path("database_seeds") / "characters.json"

# Parse the whole seed file up front.
with open(CHARACTER_DATA_FPATH, "r", encoding="utf-8") as fh:
    raw = json.load(fh)

# The seed data is either already a list of records, or a mapping of
# hid -> record. In the mapping case, copy each record and fill in its "hid"
# from the mapping key unless the record already carries one.
records = raw
if isinstance(raw, dict):
    records = [dict(obj) for obj in raw.values()]
    for hid, record in zip(raw, records):
        record.setdefault("hid", hid)

df = pd.DataFrame(records)

# Quick look at the columns that were loaded.
df.columns

Extract what we need: hid, long_description

In [None]:
# Adjust if your schema differs (e.g. "id" instead of "hid", "description" instead of "long_description")
HID_COL = "hid"
DESC_COL = "long_description"

# Fail early, listing what is available, if the expected columns are absent.
missing = [c for c in [HID_COL, DESC_COL] if c not in df.columns]
if missing:
    raise KeyError(
        f"Missing expected columns: {missing}. Available columns: {list(df.columns)}"
    )

# Keep only the id and description columns. Rows with a missing description
# are dropped BEFORE the string cast: astype(str) would otherwise turn
# NaN/None into the literal text "nan"/"None", which is non-empty and would
# slip through the blank filter below.
df = df[[HID_COL, DESC_COL]].dropna(subset=[DESC_COL]).copy()

# Drop blanks (empty or whitespace-only descriptions)
df[DESC_COL] = df[DESC_COL].astype(str).str.strip()
df = df[df[DESC_COL].str.len() > 0].reset_index(drop=True)

df.head()

Normalize descriptions

In [None]:
import re


def normalize_text(s: str) -> str:
    """Return *s* with uniform line endings and condensed whitespace.

    CRLF pairs and lone CRs become LF, runs of spaces/tabs shrink to a single
    space, three or more consecutive newlines shrink to two, and the result is
    stripped of leading and trailing whitespace.
    """
    text = s.replace("\r\n", "\n").replace("\r", "\n")
    # Applied in order: horizontal whitespace first, then surplus blank lines.
    substitutions = (
        (r"[ \t]+", " "),
        (r"\n{3,}", "\n\n"),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()


# Store the cleaned text in a new column; the raw description column is left as-is.
df["long_description_norm"] = df[DESC_COL].map(normalize_text)

Optional: add a stable header so embeddings “know” the entity id (useful when descriptions are short-ish):

In [None]:
# Prefix every normalized description with its HID so the embedded text
# carries the entity id. Built with a comprehension over the two columns
# instead of a row-wise DataFrame.apply: no per-row Series is created, and the
# assignment still works when df has no rows (apply would hand back a
# DataFrame there, which cannot be assigned to a single column).
df["embed_text"] = [
    f"HID: {hid}\nDESCRIPTION:\n{text}"
    for hid, text in zip(df[HID_COL], df["long_description_norm"])
]

### 2. Get OpenAI embeddings for long_description

In [None]:
from openai import OpenAI

# Check for the key BEFORE building the client so a missing or empty key is
# reported with the message below rather than by the client constructor
# (which, per the openai-python docs, rejects a missing key itself).
if not os.environ.get("OPENAI_API_KEY"):
    raise EnvironmentError("OPENAI_API_KEY not set in environment.")

client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

In [None]:
EMBED_MODEL = "text-embedding-3-large"  # or "text-embedding-3-small"


def embed_texts(texts, model=EMBED_MODEL, batch_size=128):
    """Embed *texts* through the module-level OpenAI ``client``, batch by batch.

    Args:
        texts: Sequence of strings (must support ``len()`` and slicing).
        model: Embedding model name forwarded to the API.
        batch_size: Number of texts sent per request; must be at least 1.

    Returns:
        A float32 NumPy array with one embedding per input text, in input
        order. An empty input returns an empty array without touching the API.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # A zero step makes range() fail with an unrelated message, and a negative
    # step yields an empty range -- i.e. silently no embeddings at all.
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Nothing to embed: skip the progress bar and the network entirely.
    if len(texts) == 0:
        return np.array([], dtype=np.float32)

    vectors = []
    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i : i + batch_size]
        resp = client.embeddings.create(model=model, input=batch)
        # Each returned item reports the index of the input it belongs to;
        # order by it so vectors line up with `batch` even if the response
        # order were ever to differ.
        ordered = sorted(resp.data, key=lambda d: d.index)
        vectors.extend(d.embedding for d in ordered)
    return np.array(vectors, dtype=np.float32)

In [None]:
# Embed every prepared text (embed_texts issues one API request per batch),
# then display the array shape as a quick check of the result.
emb = embed_texts(df["embed_text"].tolist())
emb.shape

In [None]:
# Keep each row's vector on the frame for in-notebook use.
df["embedding"] = list(emb)  # convenient but large
# Better: save separately -- the embedding matrix as .npy, and the ids plus
# normalized texts as parquet (written without the index).
# NOTE(review): both files follow the current row order of df; presumably they
# are meant to be re-joined by position -- confirm before relying on it.
np.save("character_embeddings.npy", emb)
df[[HID_COL, "long_description_norm"]].to_parquet(
    "character_texts.parquet", index=False
)

Sanity check: do the embeddings look reasonable? (e.g. nearest neighbor search for a few examples)

In [None]:
# Pairwise cosine similarities between all embedding rows: S[a, b] compares
# row a with row b of emb.
S = cosine_similarity(emb)


def top_neighbors(i, k=10):
    """Return the rows most similar to row *i*, most similar first.

    Args:
        i: Positional row index into ``S`` (and therefore into ``df``).
        k: Maximum number of neighbours to return.

    Returns:
        A DataFrame with columns ``hid``, ``similarity`` and ``preview`` (the
        first 120 characters of the normalized description). The query row
        itself is never included.
    """
    sims = S[i].copy()
    # Mask the query row with -inf rather than -1: -1 is a legitimate cosine
    # value, so a real neighbour could tie with (or fall below) the mask.
    sims[i] = -np.inf
    # Cap k at the number of other rows so the masked query row cannot be
    # returned when k is at least the number of rows.
    k = min(k, len(sims) - 1)
    nn = np.argsort(-sims)[:k]
    # argsort yields positions, so select by position; label-based .loc is
    # only equivalent while df happens to have a default 0..n-1 index.
    rows = df.iloc[nn]
    return pd.DataFrame(
        {
            "hid": rows[HID_COL].values,
            "similarity": sims[nn],
            "preview": rows["long_description_norm"].str[:120].values,
        }
    )


top_neighbors(0, k=10)