In [None]:
import os
from dotenv import load_dotenv
import pandas as pd

# -----------------------------------------------------------
# Load environment + imports
# -----------------------------------------------------------
load_dotenv()

from params import (
    DATA_DIR,
    LOCAL_MODEL,
    HDBSCAN_STAGE1,
    adaptive_hdbscan_params,
)

from ref_data_helpers import (
    build_reference_embeddings_from_csv,
    build_reference_profile,
    export_campaign_names_csv,
)

# -----------------------------------------------------------
# List of ORIGINATORS to process
# -----------------------------------------------------------
# Campaign export whose 'business_number' column lists the originators to process.
df = pd.read_csv('./data/data_raw_input/2025-11-13/CAMPAIGNS.csv')

# Originator IDs are interpolated into file names (ref_<id>.csv), so they must
# be clean strings. If the column contains a missing value pandas loads it as
# float64, and a plain str() would yield "692484.0" / "nan" - neither of which
# matches any reference file. Drop missing values and render integral floats
# without the trailing ".0"; every other value is stringified as-is.
ORIGINATORS = [
    str(int(o)) if isinstance(o, float) and o.is_integer() else str(o)
    for o in df['business_number'].dropna().unique().tolist()
]
# Debug override: uncomment to process a single originator.
#ORIGINATORS = ["692484"]


# -----------------------------------------------------------
# MAIN LOOP OVER ORIGINATORS
# -----------------------------------------------------------
# Directories are identical for every originator, so resolve and create them
# once up front instead of repeating the work on each loop iteration.
data_dir = DATA_DIR
input_dir = os.path.join(data_dir, "input")
output_dir = os.path.join(data_dir, "output")
os.makedirs(output_dir, exist_ok=True)

for ORIGINATOR in ORIGINATORS:

    print("\n" + "="*70)
    print(f"Processing ORIGINATOR {ORIGINATOR}")
    print("="*70)

    # Input CSV: one reference file per originator.
    REF_CSV = os.path.join(input_dir, f"ref_{ORIGINATOR}.csv")

    # Output prefix: all artifacts for this run are named after the originator.
    RUN_PREFIX = ORIGINATOR

    print("REF_CSV :", REF_CSV)

    # Originators without a reference file are skipped rather than aborting
    # the whole batch.
    if not os.path.exists(REF_CSV):
        print(f"  !!! WARNING: REF_CSV does not exist; skipping ORIGINATOR {ORIGINATOR}.")
        continue

    # -------------------------------------------------------
    # Stage 1a — embeddings + prototypes
    #     Embeds the "raw_text" column with the local model; the helper
    #     saves its results under output_dir using RUN_PREFIX.
    # -------------------------------------------------------
    meta_df, X = build_reference_embeddings_from_csv(
        csv_path=REF_CSV,
        data_dir=output_dir,
        prefix=RUN_PREFIX,
        text_col="raw_text",
        model_path=LOCAL_MODEL,
    )

    # Adaptive params: derive the HDBSCAN settings from the base config and
    # the number of embedded points (min_cluster_frac=0.01; presumably the
    # minimum cluster size as a fraction of the points - confirm in params).
    n_stage1 = len(X)
    hdbscan_params_stage1 = adaptive_hdbscan_params(
        HDBSCAN_STAGE1,
        n_points=n_stage1,
        min_cluster_frac=0.01,
    )
    print("HDBSCAN params:", hdbscan_params_stage1)

    # -------------------------------------------------------
    # Stage 1b — cluster → campaigns
    #     write_outputs=True → the helper also saves its own artifacts
    #     (campaigns CSV, centroids, campaign examples) under output_dir.
    #     NOTE(review): earlier comments here claimed these outputs were
    #     suppressed; pass write_outputs=False if only the campaign-names
    #     CSV below is wanted.
    # -------------------------------------------------------
    campaigns_df, examples_df, C = build_reference_profile(
        data_dir=output_dir,
        prefix=RUN_PREFIX,
        hdbscan_params=hdbscan_params_stage1,
        write_outputs=True,        # internal artifacts ARE written
    )

    # -------------------------------------------------------
    # Export the campaign-names CSV for this originator
    # -------------------------------------------------------
    export_campaign_names_csv(
        campaigns_df=campaigns_df,
        out_dir=output_dir,
        originator=RUN_PREFIX,
        filename=f"ORIGINATOR_{ORIGINATOR}_campaign_names.csv",
    )

print("\nDone processing all ORIGINATORS.")



Processing ORIGINATOR 692484
REF_CSV : ./data/stage1\input\ref_692484.csv
[Stage 1] rows = 4191; prototypes = 2160
[Stage 1] Saved embeddings to: ./data/stage1\output\692484.csv, ./data/stage1\output\692484.npy
HDBSCAN params: {'min_cluster_size': 21, 'min_samples': 1, 'cluster_selection_epsilon': 0.0, 'cluster_selection_method': 'eom', 'metric': 'euclidean'}
[HDBSCAN] Using parameters:
  min_cluster_size = 21
  min_samples = 1
  cluster_selection_epsilon = 0.0
  cluster_selection_method = eom
  metric = euclidean




[Stage 1] HDBSCAN: total=2160, noise=1693, clusters=6
Saved artifacts:
 - campaigns_csv: ./data/stage1\output\692484_campaigns.csv
 - centroids_npy: ./data/stage1\output\692484_campaign_centroids.npy
 - campaign_examples_csv: ./data/stage1\output\692484_campaign_examples.csv
[Stage 1] Reference build complete. Saved artifacts for prefix=692484 under ./data/stage1\output
[Stage 1] Saved campaign-name CSV to: data\stage1\output\ORIGINATOR_692484_campaign_names.csv

Done processing all ORIGINATORS.
