# Imports

In [1]:
from sms_norm import normalize_and_hash_series, dedupe_by_hash
from sms_embed import embed_dedup_dataframe, save_embeddings, load_embeddings
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import umap.umap_ as umap
import hdbscan
from plots import *
from sklearn.preprocessing import normalize
import joblib
import importlib
import os
import umap
import hdbscan
from plots import plot_campaigns

# Stage 1. Reference Profile

## Normalize and embed

In [2]:
# Load the ready-made synthetic data
df_raw = pd.read_csv("./artifacts/synthetic_one_originator.csv")  

# Normalize + dedupe
norm = normalize_and_hash_series(df_raw["raw_text"], seed=0)
norm.insert(0, "originator_id", df_raw["originator_id"])
norm.insert(1, "message_id", df_raw["message_id"])
dedup_df, _ = dedupe_by_hash(norm)

# Embed (offline MiniLM) and save CSV+NPY
LOCAL_MODEL = r"C:/models/all-MiniLM-L6-v2"  # local model folder

meta_df, X = embed_dedup_dataframe(
    dedup_df, text_col="normalized_text", id_col="template_hash_xx64",
    batch_size=64, normalize=True, model_name=LOCAL_MODEL
)
csv_path, npy_path = save_embeddings(meta_df, X, out_dir="./artifacts", prefix="week_synth")
print("Saved:", csv_path, npy_path)

Saved: ./artifacts\week_synth.csv ./artifacts\week_synth.npy


## UMAP+HDBSCAN, save cetroids + exemplars (LLM for campaign name) 

In [3]:
# --- Cell: build reference, compute centroids, nearest samples, call LLM, persist ---
import os
import numpy as np
import pandas as pd
import joblib
import umap
import hdbscan
from datetime import datetime
from typing import Any, Dict


ART_DIR   = "./artifacts"
PREFIX    = "week_synth"  # from save_embeddings(...)
META_CSV  = os.path.join(ART_DIR, f"{PREFIX}.csv")
VEC_NPY   = os.path.join(ART_DIR, f"{PREFIX}.npy")

assert os.path.exists(META_CSV) and os.path.exists(VEC_NPY), "Run Stage 4 first to create CSV+NPY."

# local helpers (write these files next to the notebook)
from llm_client import summarize_samples
from persist_utils import save_campaign_footprint

# PARAMETERS
N_NEAREST = 5              # take top K = min(N_NEAREST, cluster_size)
UMAP_PARAMS: Dict[str, Any] = dict(
    n_neighbors=15,
    min_dist=0.1,
    metric="cosine",
    random_state=42,
    force_approximation_algorithm=False,
    transform_seed=42,
)

HDBSCAN_PARAMS: Dict[str, Any] = dict(
    min_cluster_size=5,
    min_samples=2,
    metric="euclidean",
)

# Load meta + embeddings
meta = pd.read_csv(META_CSV, encoding="utf-8")
X = np.load(VEC_NPY)  # shape [M, D]

# Safety: ensure unit-norm (cosine-ready)
X = X.astype(np.float32)
X = X / np.maximum(np.linalg.norm(X, axis=1, keepdims=True), 1e-12)

# UMAP to 2D (for visualization only)
umap_model = umap.UMAP(**UMAP_PARAMS)
X_2d = umap_model.fit_transform(X)
joblib.dump(umap_model, os.path.join(ART_DIR, "umap_model.pkl"))

# HDBSCAN clustering
clusterer = hdbscan.HDBSCAN(**HDBSCAN_PARAMS)
labels = clusterer.fit_predict(X)

# attach labels into meta so saved points include label column
meta = meta.copy()
meta["label"] = labels

unique, counts = np.unique(labels, return_counts=True)
print("Cluster counts:")
for k, v in zip(unique, counts):
    print(f"  C{k:>3}: {v:>5d}")

# Compute centroids and select exemplars (nearest samples)
cluster_labels = sorted([c for c in set(labels) if c != -1])
centroids = []
campaign_rows = []
examples_rows = []  # will hold campaign_examples (one row per exemplar)
for row_idx, cl in enumerate(cluster_labels):
    member_idx = np.where(labels == cl)[0]
    if member_idx.size == 0:
        continue
    # centroid in original embedding space (384-D), unit-norm
    cvec = X[member_idx].mean(axis=0)
    cvec = cvec / (np.linalg.norm(cvec) + 1e-12)
    centroids.append(cvec.astype(np.float32))

    # sims & ordering (cosine because unit-norm)
    sims = X[member_idx] @ cvec
    order = np.argsort(-sims)
    ordered_idx = member_idx[order]

    k = min(N_NEAREST, ordered_idx.size)
    top_idx = ordered_idx[:k]

    # collect exemplars rows
    for rank, global_i in enumerate(top_idx, start=1):
        sim_value = float(sims[order][rank-1])  # matches rank order
        examples_rows.append({
            "campaign_row_index": row_idx,
            "campaign_label": int(cl),
            "rank": int(rank),
            "message_id": meta.iloc[global_i].get("message_id", ""),
            "template_hash_xx64": meta.iloc[global_i].get("template_hash_xx64", ""),
            # Prefer normalized_text (masked). Use raw_text only if you understand privacy/PII.
            "text_sample": meta.iloc[global_i].get("normalized_text", meta.iloc[global_i].get("raw_text", "")),
            "sim_score": sim_value,
            "count_in_window": int(meta.iloc[global_i].get("count_in_window", 1))
        })

    # campaign-level metadata
    proto_count = int(member_idx.size)
    msg_count = int(meta.iloc[member_idx]["count_in_window"].sum()) if "count_in_window" in meta.columns else proto_count
    # date range: try to use timestamp fields if present else use current build date
    if {"window_start", "window_end"}.issubset(meta.columns):
        dr_start = meta.iloc[member_idx]["window_start"].min()
        dr_end = meta.iloc[member_idx]["window_end"].max()
    elif {"timestamp"}.issubset(meta.columns):
        dr_start = meta.iloc[member_idx]["timestamp"].min()
        dr_end = meta.iloc[member_idx]["timestamp"].max()
    else:
        dr_start = dr_end = datetime.utcnow().isoformat()

    campaign_rows.append({
        "row_index": row_idx,                   # index into centroids array
        "cluster_label": int(cl),               # original HDBSCAN label
        "proto_count": proto_count,
        "msg_count": msg_count,
        "date_range_start": dr_start,
        "date_range_end": dr_end,
        "campaign_name": "",                    # filled later by LLM/human
        "status": "Known"
    })

# Stack centroids
if len(centroids) > 0:
    C = np.vstack(centroids)
else:
    C = np.zeros((0, X.shape[1]), dtype=np.float32)

# Call LLM summarizer for each campaign using the nearest samples (deterministic)
campaigns_df = pd.DataFrame(campaign_rows)
examples_df = pd.DataFrame(examples_rows)

# For each campaign_row_index gather sorted samples and call LLM
campaign_names = []
for row in campaigns_df.itertuples(index=False):
    row_idx = int(row.row_index) # type: ignore
    samples = examples_df[examples_df["campaign_row_index"] == row_idx].sort_values("rank")["text_sample"].tolist()
    # Ensure samples are masked/normalized. summarize_samples will double-check.
    if len(samples) == 0:
        summary = ""
        raw_resp = None
    else:
        # Use the llm_client default model (gpt-4o-mini) or specify explicitly
        summary, raw_resp = summarize_samples(samples, max_words=5, model="gpt-4o-mini", temperature=0.0)
    campaign_names.append(summary or "")

campaigns_df["campaign_name"] = campaign_names

# Persist everything using helper (creates campaigns.csv, campaigns.npy, campaign_examples.csv, points.csv)
save_campaign_footprint(
    out_dir=ART_DIR,
    prefix=PREFIX,
    campaigns_df=campaigns_df,
    centroids=C,
    campaign_examples_df=examples_df,
    points_meta=meta,
    points_2d=X_2d, # type: ignore
)

print("Reference build complete. Artifacts saved under:", ART_DIR)


dotenv load error: No module named 'dotenv'


  warn(
  dr_start = dr_end = datetime.utcnow().isoformat()


Cluster counts:
  C -1:     3
  C  0:     5
  C  1:     6
  C  2:     6
  C  3:     6
  C  4:    11
Saved artifacts:
 - campaigns_csv: ./artifacts\week_synth_campaigns.csv
 - centroids_npy: ./artifacts\week_synth_campaign_centroids.npy
 - campaign_examples_csv: ./artifacts\week_synth_campaign_examples.csv
 - points_csv: ./artifacts\week_synth_points.csv
Reference build complete. Artifacts saved under: ./artifacts


## Plot UMAP of reference

In [None]:
plot_path     = os.path.join(ART_DIR, "umap_hdbscan_campaigns.png")
cluster_ids = campaigns_df["cluster_label"].tolist()
plot_campaigns(X_2d, labels, cluster_ids, plot_path)
print(" - plot          :", plot_path)

# Stage 2. New data

In [None]:
# ---------------- Stage 2: assign new batch to saved campaign centroids ----------------
import os
import numpy as np
import pandas as pd

# exact Stage-1 helpers (must exist)
from sms_norm import normalize_and_hash_series, dedupe_by_hash
from sms_embed import embed_dedup_dataframe

ART_DIR   = "./artifacts"
PREFIX    = "week_synth"
#NEW_BATCH = "./artifacts/mixed_160_orig__40_new.csv"  
NEW_BATCH = "./artifacts/synthetic_one_originator.csv"
SIM_THRESHOLD = 0.8  # per SOW

# --- validate artifacts ---
centroids_path = os.path.join(ART_DIR, f"{PREFIX}_campaign_centroids.npy")
campaigns_csv   = os.path.join(ART_DIR, f"{PREFIX}_campaigns.csv")
meta_csv        = os.path.join(ART_DIR, f"{PREFIX}.csv")   # Stage-1 prototypes meta

assert os.path.exists(centroids_path), f"Missing centroids: {centroids_path}"
assert os.path.exists(campaigns_csv), f"Missing campaigns CSV: {campaigns_csv}"
assert os.path.exists(NEW_BATCH), f"Missing new batch: {NEW_BATCH}"
assert os.path.exists(meta_csv), f"Missing reference meta CSV: {meta_csv}"

centroids = np.load(centroids_path).astype(np.float32)  # [K, D]
campaigns_df = pd.read_csv(campaigns_csv)
ref_meta_df  = pd.read_csv(meta_csv, encoding="utf-8")

# normalize centroids for cosine
centroids = centroids / np.maximum(np.linalg.norm(centroids, axis=1, keepdims=True), 1e-12)

# --- load new batch and produce deduplicated prototypes (exactly as Stage-1) ---
df_new = pd.read_csv(NEW_BATCH, encoding="utf-8")

# Choose raw text column (Stage-1 used "raw_text")
if "raw_text" in df_new.columns:
    raw_col = "raw_text"
elif "text" in df_new.columns:
    raw_col = "text"
else:
    raw_col = [c for c in df_new.columns if df_new[c].dtype == object][0]

# Normalize + compute template hashes (identical function used in Stage-1)
norm_df = normalize_and_hash_series(df_new[raw_col].astype(str), seed=0)

# Preserve identifiers if present
if "message_id" in df_new.columns:
    norm_df.insert(0, "message_id", df_new["message_id"].values) # type: ignore
if "originator_id" in df_new.columns and "originator_id" not in norm_df.columns:
    norm_df.insert(0, "originator_id", df_new["originator_id"].values) # type: ignore

# Deduplicate by hash -> dedup_df is prototypes (one row per unique normalized_text)
dedup_df, _ = dedupe_by_hash(norm_df)

# Diagnostics about dedup
n_total_rows = len(df_new)
n_prototypes = len(dedup_df)
print(f"New batch rows = {n_total_rows}; dedup prototypes = {n_prototypes}")

# Also show dedup counts inside the first / last slices (prototype counts)
first_slice_count = dedup_df.iloc[:min(160, n_total_rows)]['normalized_text'].nunique()
last_slice_count  = dedup_df.iloc[min(160, n_total_rows):]['normalized_text'].nunique()
print(f"Dedup prototypes in first slice (<=160 rows): {first_slice_count}")
print(f"Dedup prototypes in last slice (~new campaign): {last_slice_count}")

# --- Embed prototypes only (same function used in Stage-1) ---
# embed_dedup_dataframe returns (meta_df, X) where meta_df contains 'template_hash_xx64' and 'normalized_text'
meta_df, X_proto = embed_dedup_dataframe(
    dedup_df,
    text_col="normalized_text",
    id_col="template_hash_xx64",
    batch_size=64,
    normalize=True
)

# Ensure dtype and normalize
X_proto = np.asarray(X_proto, dtype=np.float32)
X_proto = X_proto / np.maximum(np.linalg.norm(X_proto, axis=1, keepdims=True), 1e-12)

# Save dedup prototype embeddings & meta for traceability (optional)
out_meta_prototypes = os.path.join(ART_DIR, f"{PREFIX}_new_prototypes.csv")
out_npy_prototypes  = os.path.join(ART_DIR, f"{PREFIX}_new_prototypes.npy")
meta_df.to_csv(out_meta_prototypes, index=False, encoding="utf-8")
np.save(out_npy_prototypes, X_proto.astype(np.float32))

# --- Assign prototypes to nearest campaign centroids (prototype-level assignment only) ---
sims_proto = X_proto @ centroids.T     # [P, K]
assigned_proto_idx = np.argmax(sims_proto, axis=1)
assigned_proto_score = np.max(sims_proto, axis=1)
proto_status = np.where(assigned_proto_score >= SIM_THRESHOLD, "Known", "Unknown")

# Attach assignment results to meta_df (prototype-level output)
meta_df["assigned_campaign_row_index"] = assigned_proto_idx
meta_df["assigned_campaign_score"] = assigned_proto_score
meta_df["assigned_campaign_label"] = [campaigns_df.loc[i,"cluster_label"] if (i < len(campaigns_df)) else None for i in assigned_proto_idx]
meta_df["assigned_campaign_name"] = [campaigns_df.loc[i,"campaign_name"] if (i < len(campaigns_df)) else "" for i in assigned_proto_idx]
meta_df["status"] = proto_status

# Save prototype-level assignments 
proto_out_path = os.path.join(ART_DIR, f"{PREFIX}_new_prototypes_assignments.csv")
meta_df.to_csv(proto_out_path, index=False, encoding="utf-8")

# --- Prototype-level diagnostics (what you care about) ---
print("\nPrototype-level assignment (status counts):")
print(meta_df["status"].value_counts())

print("\nUnique prototype counts by status:")
print(meta_df.groupby("status")["template_hash_xx64"].nunique().reset_index(name="unique_prototypes"))

# Overlap with Stage-1 prototypes (ref_meta_df)
ref_norms = set(ref_meta_df["normalized_text"].astype(str).unique())
new_norms = set(meta_df["normalized_text"].astype(str).unique())
overlap = new_norms & ref_norms
print(f"\nOverlap with reference prototypes: {len(overlap)} prototypes present in new prototypes")
print("Sample of new normalized_texts NOT in reference (up to 10):")
print(list(new_norms - ref_norms)[:10])

# Status counts in first / last prototype slices (prototype-space)
P = len(meta_df)
first_p_slice = meta_df.iloc[:min(160, P)]
last_p_slice  = meta_df.iloc[min(160, P):]
print(f"\nStatus counts in first {len(first_p_slice)} prototypes (expected original):")
print(first_p_slice["status"].value_counts())

print(f"\nStatus counts in last {len(last_p_slice)} prototypes (expected new campaign prototypes):")
print(last_p_slice["status"].value_counts())

print(f"\nSaved prototype assignments to: {proto_out_path}")
print(f"Saved prototype embeddings to: {out_npy_prototypes}")


New batch rows = 200; dedup prototypes = 37
Dedup prototypes in first slice (<=160 rows): 37
Dedup prototypes in last slice (~new campaign): 0

Prototype-level assignment (status counts):
status
Known      26
Unknown    11
Name: count, dtype: int64

Unique prototype counts by status:
    status  unique_prototypes
0    Known                 26
1  Unknown                 11

Overlap with reference prototypes: 37 prototypes present in new prototypes
Sample of new normalized_texts NOT in reference (up to 10):
[]

Status counts in first 37 prototypes (expected original):
status
Known      26
Unknown    11
Name: count, dtype: int64

Status counts in last 0 prototypes (expected new campaign prototypes):
Series([], Name: count, dtype: int64)

Saved prototype assignments to: ./artifacts\week_synth_new_prototypes_assignments.csv
Saved prototype embeddings to: ./artifacts\week_synth_new_prototypes.npy


# debug plot of new data

In [None]:
# Debug plot (aligned with Stage 5 UMAP) ===
# - Uses saved reducer (umap_model.pkl) with deterministic transform
# - Stars = mean of saved 2D coords (points.csv)
# - New points: colored by assigned centroid's LABEL; noise in gray


# load artifacts
umap_model = joblib.load("./artifacts/umap_model.pkl")          # same reducer from Stage 5
points_df  = pd.read_csv("./artifacts/points.csv")              # has 'umap_x','umap_y','label'
X_ref      = np.load("./artifacts/week_synth.npy")              # optional (not used if X_ref_2d passed)
X_ref_2d   = points_df[["umap_x","umap_y"]].to_numpy()
ref_labels = points_df["label"].to_numpy()

# row -> label mapping (used to color new points by campaign label)
cmap_df = pd.read_csv("./artifacts/campaigns.csv").sort_values("row_index")
centroid_labels = cmap_df["cluster_label"].to_numpy(dtype=int)

# plotting function (reloaded to ensure latest version with X_ref_2d=)
import plots
importlib.reload(plots)
from plots import plot_ref_stars_mean2d_with_new

# call the plotter (reuses a_idx, noise_mask, C, X from Cell 1)
plot_ref_stars_mean2d_with_new(
    reducer=umap_model,
    X_ref=X_ref,                 # not used if X_ref_2d provided; kept for API symmetry
    ref_labels=ref_labels,
    C=C,
    X_new=X,
    a_idx=a_idx,
    noise_mask=noise_mask,
    centroid_labels=centroid_labels,
    X_ref_2d=X_ref_2d,           # ensures stars match Stage-5 exactly
    title="Stage 6 Debug: New vs Reference (aligned UMAP)",
    save_path="./artifacts/stage6_debug_umap.png",
)
