In [6]:
# -----------------------------------------------------------------
# Build & persist 5 separate LabelEncoders for downstream inference
# -----------------------------------------------------------------

import os
import joblib
import pandas as pd
from sklearn.preprocessing import LabelEncoder

###############################################################################
# 1. Configuration – adjust paths / columns only in this block
###############################################################################

# Destination folder for the pickled encoders (one .pkl per target column).
ENCODER_OUT_DIR = r"\\vi240c060002.woc.prod\e$\Model_Label_Encoders"

# Labelled dataset to read. Keep exactly one DATA_PATH assignment active; the
# commented lines are ready-made alternatives for the other fields.
# DATA_PATH = r"\\vi240c060002.woc.prod\e$\datasets\Fields\1ST Event_Of_Incident\WCMLDataset12_30_EVENT_INCIDENT.xlsx"
# DATA_PATH = r"\\vi240c060002.woc.prod\e$\datasets\Fields\2ND Source_Of_Incident\WCMLDataset12_23.xlsx"
# DATA_PATH = r"\\vi240c060002.woc.prod\e$\datasets\Fields\3RD Source_Of_Injury\WCMLDataset12_23.xlsx"
# DATA_PATH = r"\\vi240c060002.woc.prod\e$\datasets\Fields\5TH Event_Of_Injury\WCMLDataset12_30_EVENT_INJURY.xlsx"
DATA_PATH = r"\\vi240c060002.woc.prod\e$\datasets\Fields\4TH EDI CAUSE\ML_EDI_CAUSE_DATA.xlsx"

# Label columns to encode. Names are matched case-sensitively against the
# dataset header, and only the uncommented entries are processed.
TARGET_COLUMNS = [
    # "Event of Incident Desc",
    # "Source of Incident Desc",
    # "Source of Injury Desc",
    # "Event of Injury Desc",
    "EDI Cause Desc",
]

###############################################################################
# 2. Ensure output directory exists
###############################################################################

# Create the encoder output folder (and any missing parents); a folder that
# already exists is not an error.
os.makedirs(ENCODER_OUT_DIR, exist_ok=True)

###############################################################################
# 3. Load the labelled data
###############################################################################

print(f"Loading data from: {DATA_PATH}")
# Choose the reader from the file extension, compared case-insensitively so
# that e.g. "DATA.XLSX" or a legacy ".xls" workbook is not handed to the CSV
# parser. Anything that is not an Excel workbook is read as CSV, as before.
if os.path.splitext(DATA_PATH)[1].lower() in (".xlsx", ".xlsm", ".xls"):
    df = pd.read_excel(DATA_PATH)
else:
    df = pd.read_csv(DATA_PATH)
print(f"Loaded {len(df):,} rows.")

###############################################################################
# 4. Fit & save an encoder for each target column
###############################################################################

# Validate every requested column up front, so a typo in TARGET_COLUMNS aborts
# the run before any encoder file has been written (no partial output set).
missing_cols = [col for col in TARGET_COLUMNS if col not in df.columns]
if missing_cols:
    raise ValueError(f"Column ‘{missing_cols[0]}’ not found in the dataset!")

# Map spaces, both path separators and the characters Windows forbids in file
# names to "_". For names containing only letters and spaces the result is the
# same as the previous space/slash replacement.
unsafe_filename_chars = str.maketrans({ch: "_" for ch in ' /\\:*?"<>|'})

for col in TARGET_COLUMNS:
    # astype(str) below stringifies missing cells as well, so they end up as
    # an ordinary class in the encoder. Surface that instead of hiding it; the
    # fitted encoder itself is left unchanged.
    n_missing = int(df[col].isna().sum())
    if n_missing:
        print(f"⚠  “{col}” has {n_missing:,} missing value(s); "
              "they are encoded as a literal string class (e.g. 'nan').")

    le = LabelEncoder()
    le.fit(df[col].astype(str))                        # fit on string values

    # One pickle per column: <sanitized column name>_encoder.pkl
    safe_name = col.translate(unsafe_filename_chars)
    out_file = os.path.join(ENCODER_OUT_DIR, f"{safe_name}_encoder.pkl")

    joblib.dump(le, out_file)
    print(f"✔  Saved encoder for “{col}” →  {out_file}")

print("\nAll encoders saved successfully.")

###############################################################################
# 5. Optional: quick sanity print of classes
###############################################################################
if __name__ == "__main__":
    print("\nSample class counts:")
    for col in TARGET_COLUMNS:
        # Count over the same string view the encoders are fitted on
        # (df[col].astype(str)). A plain nunique() skips missing values, so it
        # could report fewer labels than the saved encoder actually holds.
        n_labels = df[col].astype(str).nunique()
        print(f"  {col:27s}:  {n_labels} unique labels")


Loading data from: \\vi240c060002.woc.prod\e$\datasets\Fields\4TH EDI CAUSE\ML_EDI_CAUSE_DATA.xlsx
Loaded 5,105 rows.
✔  Saved encoder for “EDI Cause Desc” →  \\vi240c060002.woc.prod\e$\Model_Label_Encoders\EDI_Cause_Desc_encoder.pkl

All encoders saved successfully.

Sample class counts:
  EDI Cause Desc             :  10 unique labels
