# Traverse: End-to-End (Records CSV)

Build a genre/style co-occurrence graph directly from a records CSV.
The timeline is based on **release year** rather than listening history.

**Prerequisites:**
```bash
pip install -e ".[dev]"
cd src/traverse/cosmograph/app && npm install && npm run build
```

## 1. Configuration

Update these to match your local setup.

In [11]:
from pathlib import Path

# Location of the records CSV that the scan cell reads; edit for your machine.
RECORDS_CSV = Path(r"C:\Users\xtrem\Documents\Datasets\records.csv")
# Release years outside [YEAR_MIN, YEAR_MAX] are discarded (treated as
# missing), not clipped to the bounds. Note that year_to_ts() only produces a
# timestamp for 1970 and later, so earlier years never reach the timeline.
YEAR_MIN = 1860   # clamp: drop years before this
YEAR_MAX = 2025   # clamp: drop years after this
# Passed to pd.read_csv(chunksize=...): number of rows read per chunk.
CHUNKSIZE = 200_000
# A tag pair becomes an edge only if it co-occurs in at least this many rows.
MIN_COOCCURRENCE = 2
# Keep only the nodes with the highest summed edge weight.
MAX_NODES = 5_000  # 0 = no cap
# Keep only the heaviest edges (applied after the node cap).
MAX_EDGES = 150_000 # 0 = no cap

## 2. Scan Records CSV

Stream through the CSV in chunks, extract the genres/styles and the release year
from each row, and accumulate co-occurrence counts along with a year-based timeline.

In [None]:
import datetime, re
from collections import Counter, defaultdict
from itertools import combinations
from typing import Dict, List, Optional, Tuple

import pandas as pd
from tqdm import tqdm

from traverse.processing.normalize import split_tags, pretty_label

# ── Year parsing ─────────────────────────────────────────────────────
# A standalone run of exactly four digits: not preceded or followed by a digit.
YR4 = re.compile(r"(?:^|[^0-9])(\d{4})(?:[^0-9]|$)")


def parse_year(v: object) -> Optional[int]:
    """Best-effort extraction of a year from an arbitrary cell value.

    The value is stringified and stripped. If the whole string parses as a
    number whose integer part is in 1..9999, that integer is returned.
    Otherwise the first standalone four-digit run in the string is used
    (e.g. ``"1999-05-01"`` -> ``1999``).

    Returns:
        The year as an int in 1..9999, or None when *v* is None or no
        plausible year can be found.
    """
    if v is None:
        return None
    s = str(v).strip()

    try:
        y = int(float(s))
    except (ValueError, OverflowError):
        # ValueError: non-numeric text or NaN; OverflowError: +/- infinity.
        y = None
    if y is not None and 0 < y < 10000:
        return y

    # Fall back to pulling a four-digit year out of free-form text.
    m = YR4.search(s)
    if m is None:
        return None
    y = int(m.group(1))
    return y if 0 < y < 10000 else None

def clamp_year(y: Optional[int]) -> Optional[int]:
    """Return *y* if it lies within [YEAR_MIN, YEAR_MAX], otherwise None.

    Despite the name, out-of-range years are discarded rather than clipped
    to the nearest bound. None is passed through unchanged.
    """
    if y is None:
        return None
    out_of_range = y < YEAR_MIN or y > YEAR_MAX
    return None if out_of_range else y

def year_to_ts(y: Optional[int]) -> Optional[int]:
    """
    Convert a calendar year to Unix epoch **milliseconds** for Jan 1, 00:00 UTC.

    Yields None when *y* is None, is not an int, or falls outside 1970..9999.
    """
    # None is not an int, so this single check also covers the None case.
    if not isinstance(y, int):
        return None
    if not 1970 <= y <= 9999:
        return None
    jan_first = datetime.datetime(
        year=y, month=1, day=1, tzinfo=datetime.timezone.utc
    )
    epoch_seconds = jan_first.timestamp()
    return int(epoch_seconds * 1000)

def detect_col(colmap: Dict[str, str], *candidates: str) -> Optional[str]:
    """Return the value mapped to the first candidate key found in *colmap*.

    Candidates are tried in the order given; None is returned when none of
    them is a key of *colmap* (or when no candidates are supplied).
    """
    return next((colmap[name] for name in candidates if name in colmap), None)

# ── Accumulate co-occurrences ─────────────────────────────────────────
# Accumulators filled by the scan below.
# counts:              (tag_a, tag_b) with tag_a < tag_b -> rows containing both
# edge_first_year:     pair -> earliest in-range release year seen for the pair
# point_first_year:    tag  -> earliest in-range release year seen for the tag
# first_label:         tag  -> display label from pretty_label()
# tag_category_counts: tag  -> Counter of "genre"/"style" occurrences
counts: Counter[Tuple[str, str]] = Counter()
edge_first_year: Dict[Tuple[str, str], int] = {}
point_first_year: Dict[str, int] = {}
first_label: Dict[str, str] = {}
tag_category_counts: Dict[str, Counter] = defaultdict(Counter)
total_rows = 0
tagged_rows = 0

# The chunked reader is used as a context manager so the underlying file
# handle is released even if the scan is interrupted part-way through.
with pd.read_csv(RECORDS_CSV, chunksize=CHUNKSIZE, dtype="string",
                 keep_default_na=True, na_filter=True) as reader:
    for chunk in tqdm(reader, desc="Reading records", unit="chunk"):
        n_rows = len(chunk)
        total_rows += n_rows

        # Lowercased name -> actual column name, for case-insensitive lookup.
        colmap = {c.lower(): c for c in chunk.columns}

        gcol = detect_col(colmap, "genres", "genre")
        scol = detect_col(colmap, "styles", "style")
        ycol = detect_col(colmap, "release_year", "year", "releaseyear",
                          "released_year", "release year", "released")

        # detect_col() already returns the real column name, so index the
        # chunk with it directly. Looking it up in colmap again would raise
        # KeyError for any column that is not all-lowercase.
        for gval, sval, yval in zip(
            chunk[gcol] if gcol else [""] * n_rows,
            chunk[scol] if scol else [""] * n_rows,
            chunk[ycol] if ycol else [None] * n_rows,
        ):
            genre_tags = split_tags(gval)
            style_tags = split_tags(sval)
            tags = genre_tags + style_tags
            if not tags:
                continue
            tagged_rows += 1
            y = clamp_year(parse_year(yval))

            # De-duplicate once per row; sorted so each pair has a canonical
            # (a, b) orientation with a < b.
            row_tags = sorted(set(tags))

            for t in row_tags:
                if t not in first_label:
                    first_label[t] = pretty_label(t)
                if y is not None:
                    cur = point_first_year.get(t)
                    if cur is None or y < cur:
                        point_first_year[t] = y

            # Track genre vs style category per tag
            for t in set(genre_tags):
                tag_category_counts[t]["genre"] += 1
            for t in set(style_tags):
                tag_category_counts[t]["style"] += 1

            for a, b in combinations(row_tags, 2):
                counts[(a, b)] += 1
                if y is not None:
                    cur = edge_first_year.get((a, b))
                    if cur is None or y < cur:
                        edge_first_year[(a, b)] = y

print(f"Scanned {total_rows:,} rows, {tagged_rows:,} with tags, "
      f"{len(counts):,} unique edge keys")

## 3. Build the Graph

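Apply thresholds and caps, then assemble points and links with a
release-year timeline (`first_seen_ts`, in Unix epoch milliseconds).
Points and links whose earliest year is before 1970 get no `first_seen_ts`.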

In [None]:
# Keep only pairs that co-occur often enough, heaviest first.
edges = sorted(
    ((a, b, w) for (a, b), w in counts.items() if w >= MIN_COOCCURRENCE),
    key=lambda edge: edge[2],
    reverse=True,
)

# Weighted degree of each node: the sum of the weights of its edges.
strength: Dict[str, int] = defaultdict(int)
for src, dst, weight in edges:
    strength[src] += weight
    strength[dst] += weight

# Node cap: retain the strongest nodes and only the edges joining two of them.
if MAX_NODES > 0:
    ranked = sorted(strength, key=strength.get, reverse=True)
    keep = set(ranked[:MAX_NODES])
    edges = [edge for edge in edges if edge[0] in keep and edge[1] in keep]

# Edge cap: the list is already ordered by weight, so a prefix is the heaviest.
if 0 < MAX_EDGES < len(edges):
    edges = edges[:MAX_EDGES]

# Points: every node that still has at least one edge.
node_ids = {nid for src, dst, _ in edges for nid in (src, dst)}

points = []
for nid in sorted(node_ids):
    point = {"id": nid, "label": first_label.get(nid, nid)}
    first_year = point_first_year.get(nid)
    first_ts = None if first_year is None else year_to_ts(first_year)
    if first_ts is not None:
        point["first_seen_ts"] = first_ts
    # Category is whichever of the tag's recorded categories was seen most.
    votes = tag_category_counts.get(nid)
    if votes:
        point["category"] = votes.most_common(1)[0][0]
    points.append(point)

# Links: one per surviving edge, with a timestamp when one is available.
links = []
for src, dst, weight in edges:
    link = {"source": src, "target": dst, "weight": weight}
    first_year = edge_first_year.get((src, dst))
    first_ts = None if first_year is None else year_to_ts(first_year)
    if first_ts is not None:
        link["first_seen_ts"] = first_ts
    links.append(link)

graph = {"points": points, "links": links}

# Summary of what ended up in the graph.
pts_with_ts = sum("first_seen_ts" in pt for pt in points)
lks_with_ts = sum("first_seen_ts" in ln for ln in links)
pts_with_cat = sum("category" in pt for pt in points)
cat_values = {pt.get("category") for pt in points} - {None}
print(f"Graph: {len(points)} nodes ({pts_with_ts} with timeline, {pts_with_cat} with category), "
      f"{len(links)} edges ({lks_with_ts} with timeline)")
print(f"Categories: {cat_values}")

## 4. Export JSON and Serve

Write the graph to the frontend's `dist/` directory, then start the
built-in static server. Open the printed URL in your browser.

In [None]:
import json
from traverse.cosmograph.server import serve, _default_dist_dir

# Single source of truth for the output file name and the port, so the URL
# printed below always matches what is actually written and served.
DATA_FILENAME = "cosmo_genres_records_timeline.json"
SERVE_PORT = 8080

# Build output with meta for cluster field; the meta entry is only emitted
# when at least one point carries a category.
has_category = any("category" in p for p in points)
output = {}
if has_category:
    output["meta"] = {"clusterField": "category"}
output["points"] = graph["points"]
output["links"] = graph["links"]

out_path = _default_dist_dir() / DATA_FILENAME
out_path.parent.mkdir(parents=True, exist_ok=True)
out_path.write_text(json.dumps(output, indent=2), encoding="utf-8")
print(f"Wrote {out_path} ({len(points)} nodes, {len(links)} edges)")
if has_category:
    print("  (includes meta.clusterField = 'category')")
print()
print("Starting server — open in browser:")
print(f"  http://127.0.0.1:{SERVE_PORT}/?data=/{DATA_FILENAME}")
print()
print("Press Ctrl+C (or interrupt the kernel) to stop.")

# Last statement of the cell: per the message above, this runs until interrupted.
serve(port=SERVE_PORT)