## MCCV Lite Demo Notebook

This notebook demonstrates the **end-to-end MCCV prototype** using the **lite (pure-Python)** implementation:

- Generate **multi-level synthetic data** (beneficiary → diagnosis → multi-modality evidence)
- Score **clinical coherence** (0 to 1)
- Compare **fraud vs. non-fraud** cases (ground truth from the generator)
- Produce a **human-readable audit report** for a flagged case

> Note: This uses `mccv.lite` so it runs even when `numpy` / `pandas` are unavailable or unstable.


In [6]:
import os
import sys
from statistics import mean

# Ensure we can import the local package
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

from mccv.lite.synthetic_generator import MedicareSyntheticGeneratorLite, HCC_DEFINITIONS_LITE
from mccv.lite.knowledge_graph import ClinicalKnowledgeGraphLite
from mccv.lite.rule_based_scorer import RuleBasedCoherenceScorerLite
from mccv.lite.audit_report import AuditReportGeneratorLite

print("Imports OK")


Imports OK


In [7]:
# 1) Generate synthetic data (multi-level)
#
# For *very large* synthetic datasets, use streaming generation to disk:
#   gen.generate_to_disk("./outputs/synth_big", keep_in_memory=False)
#
# This notebook keeps a moderate dataset in memory so we can inspect cases.

N_BENEFICIARIES = 5_000
FRAUD_RATE = 0.18
SEED = 42

# Optional: generate a very large dataset to disk (do NOT keep in memory)
GENERATE_LARGE_TO_DISK = False
LARGE_N = 200_000
LARGE_OUTDIR = os.path.join(PROJECT_ROOT, "outputs", "synthetic_big")

if GENERATE_LARGE_TO_DISK:
    gen_big = MedicareSyntheticGeneratorLite(n_beneficiaries=LARGE_N, fraud_rate=FRAUD_RATE, seed=SEED)
    gen_big.generate_to_disk(LARGE_OUTDIR, keep_in_memory=False, progress_every=10_000)
    print("Wrote large synthetic dataset to:", LARGE_OUTDIR)

# In-memory demo dataset
gen = MedicareSyntheticGeneratorLite(n_beneficiaries=N_BENEFICIARIES, fraud_rate=FRAUD_RATE, seed=SEED)
data = gen.generate()

meta = (data.get("meta") or [{}])[0]
print("Measurement window:", meta.get("measurement_start_date"), "→", meta.get("measurement_end_date"))
print("Tables:", sorted(data.keys()))
print("Beneficiaries:", len(data["beneficiaries"]))
print("Diagnosis records:", len(data["diagnosis_records"]))
print("Pharmacy claims:", len(data["pharmacy_claims"]))
print("Lab claims:", len(data["lab_claims"]))
print("Specialist visits:", len(data["specialist_visits"]))
print("Procedure claims:", len(data["procedure_claims"]))
print("HRA records:", len(data["hra_records"]))
print("Labels:", len(data["labels"]))


Measurement window: 2023-01-01 → 2024-12-31
Tables: ['beneficiaries', 'diagnosis_records', 'hra_records', 'lab_claims', 'labels', 'meta', 'pharmacy_claims', 'procedure_claims', 'specialist_visits']
Beneficiaries: 5000
Diagnosis records: 8707
Pharmacy claims: 120196
Lab claims: 48549
Specialist visits: 35054
Procedure claims: 32592
HRA records: 8707
Labels: 8707


In [8]:
data

{'meta': [{'measurement_start_date': '2023-01-01',
   'measurement_end_date': '2024-12-31',
   'seed': 42}],
 'beneficiaries': [{'beneficiary_id': 'BENE_1BD834F24C',
   'age': 71,
   'sex': 'M',
   'state': 'TX',
   'hcc_codes': ['HCC19', 'HCC18'],
   'is_fraudulent': True,
   'fraud_type': 'hra_only'},
  {'beneficiary_id': 'BENE_B8A24B3D02',
   'age': 70,
   'sex': 'F',
   'state': 'NC',
   'hcc_codes': ['HCC85', 'HCC19', 'HCC18', 'HCC111'],
   'is_fraudulent': False,
   'fraud_type': ''},
  {'beneficiary_id': 'BENE_78428FFBCA',
   'age': 75,
   'sex': 'M',
   'state': 'CA',
   'hcc_codes': ['HCC19'],
   'is_fraudulent': False,
   'fraud_type': ''},
  {'beneficiary_id': 'BENE_7354EFD389',
   'age': 69,
   'sex': 'M',
   'state': 'IL',
   'hcc_codes': ['HCC111'],
   'is_fraudulent': False,
   'fraud_type': ''},
  {'beneficiary_id': 'BENE_95F1F114CF',
   'age': 79,
   'sex': 'F',
   'state': 'IL',
   'hcc_codes': ['HCC111', 'HCC19', 'HCC18'],
   'is_fraudulent': True,
   'fraud_type': '

In [9]:
# 2) Build knowledge graph + score coherence (rule-based)
kg = ClinicalKnowledgeGraphLite(hcc_definitions={k: v.__dict__ for k, v in HCC_DEFINITIONS_LITE.items()})
weights_path = os.path.join(PROJECT_ROOT, "mccv", "configs", "hcc_weights.yaml")

scorer = RuleBasedCoherenceScorerLite(
    knowledge_graph=kg,
    weights_config_path=weights_path,
    measurement_start_date=meta.get("measurement_start_date"),
    measurement_end_date=meta.get("measurement_end_date"),
)

preds = scorer.score_dataset(data)  # list[dict]
labels = data["labels"]
label_by_key = {(l["beneficiary_id"], l["hcc_code"]): l for l in labels}

# Merge predictions with ground truth labels
joined = []
for p in preds:
    l = label_by_key.get((p["beneficiary_id"], p["hcc_code"]), {})
    joined.append({**p, **{"is_fraudulent": l.get("is_fraudulent", False), "fraud_type": l.get("fraud_type", "")}})

# Summary stats
fraud_scores = [r["coherence_score"] for r in joined if r["is_fraudulent"]]
legit_scores = [r["coherence_score"] for r in joined if not r["is_fraudulent"]]
print("Scored rows:", len(joined))
print("Mean coherence (fraud):", round(mean(fraud_scores), 3) if fraud_scores else None)
print("Mean coherence (non-fraud):", round(mean(legit_scores), 3) if legit_scores else None)

# Breakdown of high-risk flags by ground-truth fraud_type
high_risk = [r for r in joined if r["coherence_score"] < scorer.high_risk_threshold]
counts = {}
for r in high_risk:
    if not r["is_fraudulent"]:
        continue
    ft = r.get("fraud_type") or "unknown"
    counts[ft] = counts.get(ft, 0) + 1

print(f"High-risk (score < {scorer.high_risk_threshold:.2f}):", len(high_risk), "/", len(joined))
if counts:
    print("High-risk fraud breakdown:")
    for k in sorted(counts.keys()):
        print(f"  - {k}: {counts[k]}")


Scored rows: 8707
Mean coherence (fraud): 0.184
Mean coherence (non-fraud): 0.768
High-risk (score < 0.30): 1070 / 8707
High-risk fraud breakdown:
  - coding_lag: 226
  - hra_only: 96
  - paper_diagnosis: 533
  - upcoding: 215


In [10]:
# 3) Better case presentation (audit-style)

def fmt_pct(x):
    return f"{x:.2f}"

def weights_for(hcc_code: str):
    return scorer.get_weights_for_report(hcc_code)

def contribs(row):
    w = weights_for(row["hcc_code"])
    return {
        "med": w["medication"] * row["pharmacy_score"],
        "lab": w["lab"] * row["lab_score"],
        "spec": w["specialist"] * row["specialist_score"],
        "proc": w["procedure"] * row["procedure_score"],
    }

def dx_record(beneficiary_id: str, hcc_code: str):
    for r in data["diagnosis_records"]:
        if r["beneficiary_id"] == beneficiary_id and r["hcc_code"] == hcc_code:
            return r
    return None

def as_table(rows, limit=10):
    cols = [
        "beneficiary_id",
        "hcc_code",
        "fraud_type",
        "score",
        "source_type",
        "dx_date",
        "wMed*ph",
        "wLab*lab",
        "wSpec*spec",
        "wProc*proc",
    ]

    out = []
    for r in rows[:limit]:
        dx = dx_record(r["beneficiary_id"], r["hcc_code"]) or {}
        c = contribs(r)
        out.append(
            {
                "beneficiary_id": r["beneficiary_id"],
                "hcc_code": r["hcc_code"],
                "fraud_type": r.get("fraud_type") or "",
                "score": fmt_pct(r["coherence_score"]),
                "source_type": dx.get("source_type", ""),
                "dx_date": dx.get("diagnosis_date", ""),
                "wMed*ph": fmt_pct(c["med"]),
                "wLab*lab": fmt_pct(c["lab"]),
                "wSpec*spec": fmt_pct(c["spec"]),
                "wProc*proc": fmt_pct(c["proc"]),
            }
        )

    # simple fixed-width table (no pandas)
    widths = {k: max(len(k), max((len(str(r.get(k, ""))) for r in out), default=0)) for k in cols}
    header = " | ".join(k.ljust(widths[k]) for k in cols)
    sep = "-+-".join("-" * widths[k] for k in cols)
    lines = [header, sep]
    for r in out:
        lines.append(" | ".join(str(r.get(k, "")).ljust(widths[k]) for k in cols))
    return "\n".join(lines)

# pick representative fraud cases: 2 per fraud_type, lowest coherence
fraud_cases = [r for r in joined if r["is_fraudulent"]]
fraud_cases.sort(key=lambda x: x["coherence_score"])

by_type = {}
for r in fraud_cases:
    ft = r.get("fraud_type") or "unknown"
    by_type.setdefault(ft, [])
    if len(by_type[ft]) < 2:
        by_type[ft].append(r)

picked_fraud = []
for ft in sorted(by_type.keys()):
    picked_fraud.extend(by_type[ft])

# pick coherent non-fraud: highest coherence
legit_cases = [r for r in joined if not r["is_fraudulent"]]
legit_cases.sort(key=lambda x: x["coherence_score"], reverse=True)
picked_legit = legit_cases[:10]

print("\nFraud cases (representative; lowest coherence per fraud_type):")
print(as_table(picked_fraud, limit=50))

print("\nNon-fraud cases (highest coherence):")
print(as_table(picked_legit, limit=10))



Fraud cases (representative; lowest coherence per fraud_type):
beneficiary_id  | hcc_code | fraud_type      | score | source_type | dx_date    | wMed*ph | wLab*lab | wSpec*spec | wProc*proc
----------------+----------+-----------------+-------+-------------+------------+---------+----------+------------+-----------
BENE_95F1F114CF | HCC111   | coding_lag      | 0.00  | EDS         | 2019-09-18 | 0.00    | 0.00     | 0.00       | 0.00      
BENE_95F1F114CF | HCC19    | coding_lag      | 0.00  | Encounter   | 2020-08-05 | 0.00    | 0.00     | 0.00       | 0.00      
BENE_52FB83FD1B | HCC85    | hra_only        | 0.07  | HRA         | 2024-10-04 | 0.04    | 0.04     | 0.00       | 0.00      
BENE_7B7F42361E | HCC18    | hra_only        | 0.08  | HRA         | 2023-03-12 | 0.04    | 0.04     | 0.00       | 0.00      
BENE_A29159334E | HCC111   | paper_diagnosis | 0.00  | EDS         | 2024-09-18 | 0.00    | 0.00     | 0.00       | 0.00      
BENE_A29159334E | HCC18    | paper_diagnosis | 

In [11]:
# 4) Full explainability report for one selected case

# build one combined claims list
all_claims = []
for k in ["pharmacy_claims", "lab_claims", "specialist_visits", "procedure_claims"]:
    all_claims.extend(data.get(k, []))

# Choose a case to explain: prefer a coding_lag/paper_diagnosis example if present
case_to_explain = None
for ft in ["coding_lag", "paper_diagnosis", "hra_only", "upcoding"]:
    for r in picked_fraud:
        if r.get("fraud_type") == ft:
            case_to_explain = r
            break
    if case_to_explain:
        break

if case_to_explain:
    dx = dx_record(case_to_explain["beneficiary_id"], case_to_explain["hcc_code"]) or {}

    report_gen = AuditReportGeneratorLite(kg)
    weights_for_report = scorer.get_weights_for_report(case_to_explain["hcc_code"])

    report = report_gen.generate_report(
        beneficiary_id=case_to_explain["beneficiary_id"],
        hcc_code=case_to_explain["hcc_code"],
        coherence_score=case_to_explain["coherence_score"],
        all_claims=all_claims,
        diagnosis_origin={
            "source": dx.get("source_type", "Unknown"),
            "date": dx.get("diagnosis_date", "Unknown"),
            "provider": dx.get("provider_npi", "Unknown"),
            "claim_id": dx.get("claim_id", ""),
        },
        weights=weights_for_report,
        measurement_start_date=meta.get("measurement_start_date"),
        measurement_end_date=meta.get("measurement_end_date"),
    )

    print(report_gen.format_report(report))
else:
    print("No fraud cases found to explain (try increasing fraud_rate).")


TypeError: AuditReportGeneratorLite.generate_report() got an unexpected keyword argument 'measurement_start_date'