## MCCV Lite Demo Notebook

This notebook demonstrates the **end-to-end MCCV prototype** using the **lite (pure-Python)** implementation:

- Generate **multi-level synthetic data** (beneficiary → diagnosis → multi-modality evidence)
- Score **clinical coherence** (0 to 1; scores below the scorer's high-risk threshold are flagged)
- Compare **fraud vs. non-fraud** cases (ground truth from the generator)
- Produce a **human-readable audit report** for a flagged case

> Note: This uses `mccv.lite` so it runs even when `numpy` / `pandas` are unavailable or unstable.


In [None]:
import os
import sys
from statistics import mean

# Make the local `mccv` package importable. The project root is taken to be
# the parent of the current working directory, and it is put at the front of
# sys.path only if it is not already listed.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

from mccv.lite.synthetic_generator import MedicareSyntheticGeneratorLite, HCC_DEFINITIONS_LITE
from mccv.lite.knowledge_graph import ClinicalKnowledgeGraphLite
from mccv.lite.rule_based_scorer import RuleBasedCoherenceScorerLite
from mccv.lite.audit_report import AuditReportGeneratorLite

print("Imports OK")


In [None]:
# 1) Generate synthetic data (multi-level)
gen = MedicareSyntheticGeneratorLite(n_beneficiaries=500, fraud_rate=0.18, seed=42)
data = gen.generate()

# `meta` is the first entry of the "meta" table, or an empty dict when that
# table is missing or empty.
meta_rows = data.get("meta") or [{}]
meta = meta_rows[0]

window_start = meta.get("measurement_start_date")
window_end = meta.get("measurement_end_date")
print("Measurement window:", window_start, "→", window_end)
print("Tables:", sorted(data.keys()))

# Row count of each table, printed in a fixed order.
table_captions = (
    ("Beneficiaries:", "beneficiaries"),
    ("Diagnosis records:", "diagnosis_records"),
    ("Pharmacy claims:", "pharmacy_claims"),
    ("Lab claims:", "lab_claims"),
    ("Specialist visits:", "specialist_visits"),
    ("Procedure claims:", "procedure_claims"),
    ("HRA records:", "hra_records"),
    ("Labels:", "labels"),
)
for caption, table_name in table_captions:
    print(caption, len(data[table_name]))


In [None]:
# 2) Build knowledge graph + score coherence (rule-based)
# Each HCC definition object is handed to the graph as its attribute dict.
hcc_definitions = {code: vars(definition) for code, definition in HCC_DEFINITIONS_LITE.items()}
kg = ClinicalKnowledgeGraphLite(hcc_definitions=hcc_definitions)

weights_path = os.path.join(PROJECT_ROOT, "mccv", "configs", "hcc_weights.yaml")

# The scorer receives the measurement window recorded in `meta`.
window_start = meta.get("measurement_start_date")
window_end = meta.get("measurement_end_date")
scorer = RuleBasedCoherenceScorerLite(
    knowledge_graph=kg,
    weights_config_path=weights_path,
    measurement_start_date=window_start,
    measurement_end_date=window_end,
)

preds = scorer.score_dataset(data)  # list[dict]
labels = data["labels"]

# Index ground-truth labels by (beneficiary_id, hcc_code); if a key repeats,
# the last label wins.
label_by_key = {}
for label in labels:
    label_by_key[(label["beneficiary_id"], label["hcc_code"])] = label

# Merge predictions with ground truth labels. A prediction without a matching
# label is treated as non-fraud with an empty fraud_type.
joined = []
for pred in preds:
    truth = label_by_key.get((pred["beneficiary_id"], pred["hcc_code"]), {})
    row = dict(pred)
    row["is_fraudulent"] = truth.get("is_fraudulent", False)
    row["fraud_type"] = truth.get("fraud_type", "")
    joined.append(row)

# Summary stats: split coherence scores by ground-truth fraud flag.
fraud_scores = []
legit_scores = []
for row in joined:
    bucket = fraud_scores if row["is_fraudulent"] else legit_scores
    bucket.append(row["coherence_score"])

print("Scored rows:", len(joined))
# A mean is reported as None when its group is empty.
mean_fraud = round(mean(fraud_scores), 3) if fraud_scores else None
print("Mean coherence (fraud):", mean_fraud)
mean_legit = round(mean(legit_scores), 3) if legit_scores else None
print("Mean coherence (non-fraud):", mean_legit)

# Breakdown of high-risk flags by ground-truth fraud_type
# A row is high-risk when its score is strictly below the scorer's threshold.
threshold = scorer.high_risk_threshold
high_risk = [row for row in joined if row["coherence_score"] < threshold]

# Count only the truly fraudulent high-risk rows; a missing or empty
# fraud_type is tallied under "unknown".
counts = {}
for row in high_risk:
    if row["is_fraudulent"]:
        fraud_type = row.get("fraud_type") or "unknown"
        counts[fraud_type] = counts.get(fraud_type, 0) + 1

print(f"High-risk (score < {threshold:.2f}):", len(high_risk), "/", len(joined))
if counts:
    print("High-risk fraud breakdown:")
    for fraud_type, n_flagged in sorted(counts.items()):
        print(f"  - {fraud_type}: {n_flagged}")


In [None]:
# 3) Show example cases: fraud vs non-fraud

def pick_examples(rows, *, is_fraudulent: bool, coherence_min=None, coherence_max=None, n=5):
    """Return up to ``n`` rows with the requested fraud flag, lowest score first.

    Rows are visited in ascending ``coherence_score`` order. A row is kept
    when the truthiness of its ``is_fraudulent`` value matches
    ``is_fraudulent`` and its score lies within the optional bounds.

    Args:
        rows: Iterable of dicts with ``coherence_score`` and
            ``is_fraudulent`` keys.
        is_fraudulent: Which ground-truth class to select.
        coherence_min: Inclusive lower score bound; ``None`` disables it.
        coherence_max: Inclusive upper score bound; ``None`` disables it.
        n: Maximum number of rows to return. Zero or a negative value
            yields an empty list.

    Returns:
        A list of at most ``n`` of the original row dicts.
    """
    # Without this guard the size check below only runs after an append, so
    # n <= 0 would still return one matching row.
    if n <= 0:
        return []

    want_fraud = bool(is_fraudulent)
    out = []
    for r in sorted(rows, key=lambda x: x["coherence_score"]):
        if bool(r["is_fraudulent"]) != want_fraud:
            continue
        s = r["coherence_score"]
        if coherence_min is not None and s < coherence_min:
            continue
        if coherence_max is not None and s > coherence_max:
            continue
        out.append(r)
        if len(out) >= n:
            break
    return out

# Up to five fraud rows at or below the high-risk threshold, and up to five
# non-fraud rows scoring at least 0.70.
fraud_low = pick_examples(joined, is_fraudulent=True, coherence_max=scorer.high_risk_threshold, n=5)
legit_high = pick_examples(joined, is_fraudulent=False, coherence_min=0.70, n=5)


def _modality_scores(row):
    """Format the four per-modality scores of a row as a parenthesised list."""
    return (
        f"(ph={row['pharmacy_score']:.2f}, lab={row['lab_score']:.2f}, "
        f"spec={row['specialist_score']:.2f}, proc={row['procedure_score']:.2f})"
    )


print("\nFraud examples (low coherence):")
for row in fraud_low:
    headline = (
        f"- {row['beneficiary_id']} {row['hcc_code']} fraud_type={row.get('fraud_type')} "
        f"score={row['coherence_score']:.2f} "
    )
    print(headline + _modality_scores(row))

print("\nNon-fraud examples (high coherence):")
for row in legit_high:
    headline = f"- {row['beneficiary_id']} {row['hcc_code']} score={row['coherence_score']:.2f} "
    print(headline + _modality_scores(row))


In [None]:
# 4) Generate an audit report for one fraud case (with evidence)

# Flatten the four evidence tables into one combined claims list; a table
# missing from `data` contributes nothing.
evidence_tables = ("pharmacy_claims", "lab_claims", "specialist_visits", "procedure_claims")
all_claims = [claim for table in evidence_tables for claim in data.get(table, [])]

if not fraud_low:
    print("No low-coherence fraud examples found. Try increasing fraud_rate.")
else:
    # Report on the first entry of fraud_low.
    sample = fraud_low[0]
    report_gen = AuditReportGeneratorLite(kg)
    weights_for_report = scorer.get_weights_for_report(sample["hcc_code"])

    # Placeholder provenance: the synthetic data carries no real origin.
    diagnosis_origin = {"source": "Synthetic", "date": "(see diagnosis_records)", "provider": "Synthetic"}
    report = report_gen.generate_report(
        beneficiary_id=sample["beneficiary_id"],
        hcc_code=sample["hcc_code"],
        coherence_score=sample["coherence_score"],
        all_claims=all_claims,
        diagnosis_origin=diagnosis_origin,
        weights=weights_for_report,
    )

    print(report_gen.format_report(report))
