In [None]:
# Takahashi embeddings analysis
# Cell 1: intro (stands in for a markdown header).
# The notebook is organised into cells covering: environment, imports, params,
# loading, projection, clustering, plotting, and exports.
intro_message = 'Notebook prepared: split cells will follow'
print(intro_message)


In [1]:
# Environment & metadata: record the interpreter version and a sample of the
# installed packages so the run can be reproduced later.
import sys
import subprocess

py_version = sys.version.replace('\n', ' ')
print('Python:', py_version)

freeze_cmd = [sys.executable, '-m', 'pip', 'freeze']
try:
    # Capture stdout only and raise on a non-zero exit status.
    completed = subprocess.run(freeze_cmd, stdout=subprocess.PIPE, check=True)
    freeze_lines = completed.stdout.decode('utf-8').splitlines()
    print('\nInstalled packages (top 30 lines):')
    print('\n'.join(freeze_lines[:30]))
except Exception as e:
    # Best effort: the listing is informational, so never fail the notebook here.
    print('Could not list packages:', e)


Python: 3.12.3 (main, Aug 14 2025, 17:47:21) [GCC 13.3.0]

Installed packages (top 30 lines):
anyio==4.11.0
argon2-cffi==25.1.0
argon2-cffi-bindings==25.1.0
arrow==1.4.0
asttokens==3.0.1
async-lru==2.0.5
attrs==25.4.0
babel==2.17.0
beautifulsoup4==4.14.2
bleach==6.3.0
certifi==2025.11.12
cffi==2.0.0
charset-normalizer==3.4.4
comm==0.2.3
contourpy==1.3.3
cycler==0.12.1
debugpy==1.8.17
decorator==5.2.1
defusedxml==0.7.1
executing==2.2.1
fastjsonschema==2.21.2
fonttools==4.60.1
fqdn==1.5.1
gensim==4.4.0
h11==0.16.0
httpcore==1.0.9
httpx==0.28.1
idna==3.11
ipykernel==7.1.0
ipython==9.7.0

Installed packages (top 30 lines):
anyio==4.11.0
argon2-cffi==25.1.0
argon2-cffi-bindings==25.1.0
arrow==1.4.0
asttokens==3.0.1
async-lru==2.0.5
attrs==25.4.0
babel==2.17.0
beautifulsoup4==4.14.2
bleach==6.3.0
certifi==2025.11.12
cffi==2.0.0
charset-normalizer==3.4.4
comm==0.2.3
contourpy==1.3.3
cycler==0.12.1
debugpy==1.8.17
decorator==5.2.1
defusedxml==0.7.1
executing==2.2.1
fastjsonschema==2.21.2
fontt

In [2]:
# Imports
import os
import json
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Optional imports: record availability in a flag instead of failing the cell.
# The broad `except Exception` is kept on purpose so that any import-time
# failure of an optional backend is treated as "not available".
try:
    import umap
except Exception:
    _has_umap = False
else:
    _has_umap = True

try:
    from sklearn.manifold import TSNE
except Exception:
    _has_tsne = False
else:
    _has_tsne = True

print('Imports ready. umap:', _has_umap, 'tsne:', _has_tsne)


Imports ready. umap: True tsne: True


In [3]:
# Parameters (tweakable)
# Resolve the repository root first so every path works whether the kernel was
# started in the repo root or inside notebooks/. With plain relative paths,
# the mkdir below would create a stray notebooks/notebooks/outputs directory
# when the working directory is notebooks/.
_cwd = Path.cwd()
ROOT = _cwd.parent if _cwd.name == 'notebooks' else _cwd

OUTDIR = ROOT / 'notebooks' / 'outputs'
TERMS_P = ROOT / 'models' / 'minimal' / 'voynich_takahashi' / 'terms.npy'
EMB_P = ROOT / 'models' / 'minimal' / 'voynich_takahashi' / 'embeddings.npy'
JSONL_P = ROOT / 'data' / 'processed' / 'voynich_takahashi.jsonl'
OUTDIR.mkdir(parents=True, exist_ok=True)

# Projection params
USE_UMAP = True          # fall back to t-SNE when False or when umap is missing
UMAP_N_NEIGHBORS = 15
UMAP_MIN_DIST = 0.1
TSNE_PERPLEXITY = 30
PCA_DIM = 50             # upper bound; the PCA cell caps it by the data shape

# Clustering
K = 10                   # number of KMeans clusters

print('Parameters set. OUTDIR:', OUTDIR)


Parameters set. OUTDIR: notebooks/outputs


In [5]:
# Load data (paths are resolved against the repository root so the cell works
# from either the repo root or the notebooks/ directory).
# `Path`, `np` and `json` come from the imports cell.
cwd = Path.cwd()
ROOT = cwd.parent if cwd.name == 'notebooks' else cwd
TERMS_P = ROOT / 'models' / 'minimal' / 'voynich_takahashi' / 'terms.npy'
EMB_P = ROOT / 'models' / 'minimal' / 'voynich_takahashi' / 'embeddings.npy'
JSONL_P = ROOT / 'data' / 'processed' / 'voynich_takahashi.jsonl'
OUTDIR = ROOT / 'notebooks' / 'outputs'
OUTDIR.mkdir(parents=True, exist_ok=True)

print('Resolved ROOT:', ROOT)
print('Terms path:', TERMS_P)
print('Embeddings path:', EMB_P)
print('JSONL path:', JSONL_P)

# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects, which
# can execute code. Only load terms.npy files produced by this project.
terms = np.load(TERMS_P, allow_pickle=True)
emb = np.load(EMB_P)
print('Loaded terms:', terms.shape, 'embeddings:', emb.shape)

# Later cells index `terms` by embedding row, so the two must be aligned.
if len(terms) != emb.shape[0]:
    raise ValueError(
        f'terms ({len(terms)}) and embeddings ({emb.shape[0]}) have different lengths'
    )

# The JSONL corpus is optional: a missing file leaves `lines` empty.
lines = []
n_bad_lines = 0
if JSONL_P.exists():
    with open(JSONL_P, 'r', encoding='utf-8') as fh:
        for ln in fh:
            if not ln.strip():
                continue  # blank lines carry no record
            try:
                lines.append(json.loads(ln))
            except json.JSONDecodeError:
                # Keep loading, but count the failures so they are not invisible.
                n_bad_lines += 1
print('JSONL lines:', len(lines))
if n_bad_lines:
    print('Skipped malformed JSONL lines:', n_bad_lines)


Resolved ROOT: /home/tiago/Documents/GitHub/voynich-decoder
Terms path: /home/tiago/Documents/GitHub/voynich-decoder/models/minimal/voynich_takahashi/terms.npy
Embeddings path: /home/tiago/Documents/GitHub/voynich-decoder/models/minimal/voynich_takahashi/embeddings.npy
JSONL path: /home/tiago/Documents/GitHub/voynich-decoder/data/processed/voynich_takahashi.jsonl
Loaded terms: (5861,) embeddings: (5861, 20)
JSONL lines: 5208


In [None]:
# Preprocess / normalize embeddings
# Standardise every embedding dimension before PCA.
scaler = StandardScaler()
E = scaler.fit_transform(emb)

# PCA accepts at most min(n_samples, n_features) components. Keep the original
# cap of "one fewer than the feature count", but also respect the sample count
# and never request zero components (which happens for 1-D embeddings).
n_samples, n_features = E.shape
n_pca_components = max(1, min(PCA_DIM, n_features - 1, n_samples))

# random_state only matters for the randomized/arpack solvers; pinning it keeps
# the reduction reproducible whichever solver scikit-learn selects.
pca = PCA(n_components=n_pca_components, random_state=42)
E_p = pca.fit_transform(E)
print('PCA reduced shape:', E_p.shape)


In [None]:
# Projection (UMAP or t-SNE) of the PCA-reduced vectors down to 2-D.
if USE_UMAP and _has_umap:
    reducer = umap.UMAP(
        n_components=2,
        n_neighbors=UMAP_N_NEIGHBORS,
        min_dist=UMAP_MIN_DIST,
        metric='cosine',
        random_state=42,
    )
    method = 'umap'
elif _has_tsne:
    tsne_kwargs = dict(n_components=2, perplexity=TSNE_PERPLEXITY, random_state=42)
    try:
        # scikit-learn >= 1.5 calls the iteration budget `max_iter`; the old
        # `n_iter` keyword was deprecated there and later removed.
        reducer = TSNE(max_iter=1000, **tsne_kwargs)
    except TypeError:
        # Older scikit-learn releases only accept the previous name.
        reducer = TSNE(n_iter=1000, **tsne_kwargs)
    method = 'tsne'
else:
    # Without this guard the t-SNE branch would fail with an opaque NameError.
    raise RuntimeError(
        'No projection backend available: install umap-learn or scikit-learn (TSNE).'
    )

proj = reducer.fit_transform(E_p)
print('Projection method:', method, 'proj shape:', proj.shape)


In [None]:
# Clustering: KMeans on the PCA-reduced vectors, then a silhouette sanity check.
km = KMeans(n_clusters=K, n_init=10, random_state=42)
labels = km.fit_predict(E_p)
cluster_ids = np.unique(labels)
print('Clusters:', cluster_ids)

# The silhouette is only defined when at least two distinct labels exist.
if cluster_ids.size > 1:
    try:
        sil = silhouette_score(E_p, labels)
        print('Silhouette score:', sil)
    except Exception as e:
        # Best effort: the score is diagnostic, so report and carry on.
        print('Could not compute silhouette:', e)


In [None]:
# Plot clusters (2D): one scatter series per KMeans label on the projection.
fig, ax = plt.subplots(figsize=(10, 8))
palette = sns.color_palette('tab10', n_colors=K)
for cluster_id in range(K):
    members = labels == cluster_id
    ax.scatter(
        proj[members, 0],
        proj[members, 1],
        s=12,
        color=palette[cluster_id % len(palette)],
        label=f'cluster {cluster_id} ({members.sum()})',
        alpha=0.8,
    )
# Legend sits outside the axes, to the right of the plot.
ax.legend(bbox_to_anchor=(1.02, 1), loc='upper left')
ax.set_title(f'Takahashi embeddings projection (method={method})')
fig.tight_layout()

fig_path = OUTDIR / f'projection_{method}.png'
fig.savefig(fig_path, dpi=150)
plt.show()
print('Saved figure to', fig_path)


In [None]:
# Top terms per cluster and export
# `norm` is used by the cosine-similarity helper defined below in this cell.
from numpy.linalg import norm
# Cluster centres of the KMeans model `km`, which the clustering cell fitted on
# E_p, so the centroids live in the same PCA space as the term vectors.
centroids = km.cluster_centers_

def top_terms_for_centroid(c, terms_vecs, terms, topk=10):
    """Rank terms by cosine similarity between their vectors and a centroid.

    Parameters
    ----------
    c : 1-D array
        Centroid, in the same vector space as the rows of ``terms_vecs``.
    terms_vecs : 2-D array
        One row per term.
    terms : sequence
        Term labels, aligned with the rows of ``terms_vecs``.
    topk : int, default 10
        Maximum number of results to return.

    Returns
    -------
    list of (term, similarity) tuples
        Highest similarity first. NumPy scalar labels are converted to plain
        Python objects and similarities to ``float`` so the result can be
        serialised to JSON.
    """
    # Floor the row norms so an all-zero vector scores 0.0 instead of NaN (0/0).
    row_norms = np.maximum(norm(terms_vecs, axis=1), 1e-12)
    sims = (terms_vecs @ c) / (row_norms * (norm(c) + 1e-12))
    # A stable sort keeps tied terms in input order, so the ranking is reproducible.
    order = np.argsort(-sims, kind='stable')[:topk]
    return [
        (terms[i].item() if hasattr(terms[i], 'item') else terms[i], float(sims[i]))
        for i in order
    ]

# Use E_p (PCA-reduced) as term vectors: rank the terms against each centroid.
top_by_cluster = {
    cluster_id: top_terms_for_centroid(centroids[cluster_id], E_p, terms, topk=10)
    for cluster_id in range(K)
}

# Persist the full ranking (JSON turns the integer cluster ids into string keys).
out_terms_p = OUTDIR / 'top_terms_by_cluster.json'
with open(out_terms_p, 'w', encoding='utf-8') as fh:
    json.dump(top_by_cluster, fh, ensure_ascii=False, indent=2)
print('Saved top terms per cluster to', out_terms_p)

# Console preview: cluster size plus its five best-ranked terms.
for cluster_id, ranked in top_by_cluster.items():
    cluster_size = (labels == cluster_id).sum()
    preview = ', '.join(term for term, _score in ranked[:5])
    print(f'Cluster {cluster_id} ({cluster_size} terms):', preview)


In [None]:
# Save summary and notes: list whatever the run left in the output directory.
output_names = [entry.name for entry in sorted(OUTDIR.glob('*'))]

print('Outputs saved in', OUTDIR)
print('Files:')
for name in output_names:
    print(f' - {name}')

print('\nNotebook complete. Next: run gensim training for richer embeddings or try HDBSCAN for non-parametric clusters.')
