In [None]:
import os
from dotenv import load_dotenv

# -----------------------------------------------------------
# Load environment + imports
#
# load_dotenv() runs before the project-local imports that follow.
# NOTE(review): presumably `params` reads environment variables at
# import time, which would explain why this call sits between the
# import groups -- confirm before moving it.
# -----------------------------------------------------------
load_dotenv()

from params import (
    DATA_DIR,
    LOCAL_MODEL,
    HDBSCAN_STAGE1,
    adaptive_hdbscan_params,
)

from ref_data_helpers import (
    build_reference_embeddings_from_csv,
    build_reference_profile,
    export_campaign_names_csv,
)

# -----------------------------------------------------------
# Originator ids to process
# -----------------------------------------------------------
originators = [
    73981, 7535, 692632, 266278, 53849, 95773, 227898, 88022,
    62297, 347268, 89700, 36682, 39747, 74843, 98900, 22395,
    57513, 82539, 61746, 21398, 87844, 74454, 24255, 86753,
    51270, 65821, 66458, 52927, 28777, 34799, 80607, 3538,
    36794, 35213, 28732, 34646, 898287, 70392, 22948, 24273,
    454545, 85166, 93729, 26266, 692484,
]

# Same ids in the same order, as strings (they are interpolated into
# file names and used as the run prefix further down).
ORIGINATORS = list(map(str, originators))

#ORIGINATORS = ["24273", "61746"]   # << change as needed


# -----------------------------------------------------------
# MAIN LOOP OVER ORIGINATORS
#
# For every originator id in ORIGINATORS:
#   1a. embed the reference CSV  (build_reference_embeddings_from_csv)
#   --  derive HDBSCAN parameters from the number of embedded points
#   1b. cluster into campaigns   (build_reference_profile)
#   --  export only the campaign-name CSV (export_campaign_names_csv)
# -----------------------------------------------------------

# Directories are identical for every originator, so resolve them and
# create the output directory once instead of on every iteration.
data_dir = DATA_DIR
input_dir = os.path.join(data_dir, "input")
output_dir = os.path.join(data_dir, "output")
os.makedirs(output_dir, exist_ok=True)

for ORIGINATOR in ORIGINATORS:

    print("\n" + "="*70)
    print(f"Processing ORIGINATOR {ORIGINATOR}")
    print("="*70)

    # Input CSV: one reference file per originator, named ref_<id>.csv
    REF_CSV = os.path.join(input_dir, f"ref_{ORIGINATOR}.csv")

    # Output prefix: the originator id is passed as `prefix` to both
    # stage-1 helpers and as `originator` to the exporter.
    RUN_PREFIX = ORIGINATOR

    print("REF_CSV :", REF_CSV)

    # -------------------------------------------------------
    # Stage 1a — embeddings + prototypes
    #     Texts are read from the "raw_text" column of REF_CSV.
    # -------------------------------------------------------
    meta_df, X = build_reference_embeddings_from_csv(
        csv_path=REF_CSV,
        data_dir=output_dir,
        prefix=RUN_PREFIX,
        text_col="raw_text",
        model_path=LOCAL_MODEL,
    )

    # Adaptive params: start from the HDBSCAN_STAGE1 defaults and let
    # adaptive_hdbscan_params adjust them for this originator's point
    # count (min_cluster_frac=0.02).
    n_stage1 = len(X)
    hdbscan_params_stage1 = adaptive_hdbscan_params(
        HDBSCAN_STAGE1,
        n_points=n_stage1,
        min_cluster_frac=0.02,
    )
    print("HDBSCAN params:", hdbscan_params_stage1)

    # -------------------------------------------------------
    # Stage 1b — cluster → campaigns
    #     write_outputs=False → suppress internal CSVs
    #     NOTE(review): called with the same data_dir/prefix as
    #     Stage 1a, so it presumably reloads the artifacts saved
    #     there — confirm in ref_data_helpers.
    # -------------------------------------------------------
    campaigns_df, examples_df, C = build_reference_profile(
        data_dir=output_dir,
        prefix=RUN_PREFIX,
        hdbscan_params=hdbscan_params_stage1,
        write_outputs=False,        # << suppress all internal outputs
    )

    # -------------------------------------------------------
    # Write ONLY campaign names
    # -------------------------------------------------------
    export_campaign_names_csv(
        campaigns_df=campaigns_df,
        out_dir=output_dir,
        originator=RUN_PREFIX,
        filename=f"ORIGINATOR_{ORIGINATOR}_campaign_names.csv",
    )

print("\nDone processing all ORIGINATORS.")


  from .autonotebook import tqdm as notebook_tqdm


.env loaded from: C:\Users\ee7823\OneDrive - AT&T Services, Inc\Documents\campaign_drift\.env

Processing ORIGINATOR 24273
REF_CSV : ./data\input\ref_24273.csv
[Stage 1] rows = 4854; prototypes = 1903
[Stage 1] Saved embeddings to: ./data\output\24273.csv, ./data\output\24273.npy
HDBSCAN params: {'min_cluster_size': 38, 'min_samples': 1, 'cluster_selection_epsilon': 0.0, 'cluster_selection_method': 'eom', 'metric': 'euclidean'}
[HDBSCAN] Using parameters:
  min_cluster_size = 38
  min_samples = 1
  cluster_selection_epsilon = 0.0
  cluster_selection_method = eom
  metric = euclidean




[Stage 1] HDBSCAN: total=1903, noise=105, clusters=5
[Stage 1] Saved campaign-name CSV to: data\output\ORIGINATOR_24273_campaign_names.csv

Processing ORIGINATOR 61746
REF_CSV : ./data\input\ref_61746.csv
[Stage 1] rows = 4953; prototypes = 3695
[Stage 1] Saved embeddings to: ./data\output\61746.csv, ./data\output\61746.npy
HDBSCAN params: {'min_cluster_size': 73, 'min_samples': 1, 'cluster_selection_epsilon': 0.0, 'cluster_selection_method': 'eom', 'metric': 'euclidean'}
[HDBSCAN] Using parameters:
  min_cluster_size = 73
  min_samples = 1
  cluster_selection_epsilon = 0.0
  cluster_selection_method = eom
  metric = euclidean




[Stage 1] HDBSCAN: total=3695, noise=2640, clusters=4
[Stage 1] Saved campaign-name CSV to: data\output\ORIGINATOR_61746_campaign_names.csv

Done processing all ORIGINATORS.
