In [1]:
import pandas as pd
import ast
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np
from transformers import Trainer, TrainingArguments
import torch.nn as nn
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, hamming_loss
import evaluate # Hugging Face Evaluate library
from sklearn.metrics import classification_report
import torch.nn.functional as F # For sigmoid if needed manually

In [2]:
# Load the heuristic-labelled training data; the file is tab-separated.
df_train = pd.read_csv("multi_labels/train_heuristics.tsv",sep="\t")

In [3]:
# Parse the stringified label lists (e.g. "['a', 'b']") into real Python lists.
# Values that are not strings are passed through unchanged.
df_train['heuristic_labels'] = df_train['heuristic_labels'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)

# Keep only rows that carry at least one heuristic label.
# `.copy()` detaches the filtered frame from the original so that later in-place
# edits / column assignments on `df_train` do not operate on a slice and trigger
# pandas' SettingWithCopyWarning.
df_train = df_train[df_train['heuristic_labels'].apply(lambda x: len(x) > 0)].copy()

In [4]:
# Remove the 'label' column. A non-inplace drop returns a fresh DataFrame (no
# mutation of a filtered slice), and errors='ignore' keeps the cell idempotent:
# running it a second time is a no-op instead of a KeyError.
df_train = df_train.drop(columns=['label'], errors='ignore')

In [5]:
# Label vocabulary: every distinct intent that appears in any row, in sorted order.
all_intents = sorted({intent for labels in df_train['heuristic_labels'] for intent in labels})
num_labels = len(all_intents)

# Lookup tables between a label's position in the sorted vocabulary and its name.
id2label = dict(enumerate(all_intents))
label2id = {name: idx for idx, name in id2label.items()}

print("Intents:", label2id)
print("Num Labels:", num_labels)

# Multi-hot encode
def encode_labels(row):
    """Return the multi-hot target vector for one row.

    The result is a list of ``num_labels`` floats: 1.0 at ``label2id[intent]``
    for every intent listed in ``row['heuristic_labels']`` and 0.0 elsewhere.
    Intents that are not keys of ``label2id`` are ignored. Floats (not ints)
    are produced because the vector is meant for BCEWithLogitsLoss.
    """
    active_positions = {
        label2id[intent] for intent in row['heuristic_labels'] if intent in label2id
    }
    return [1.0 if position in active_positions else 0.0 for position in range(num_labels)]

# Attach the multi-hot target vector to every row as a new 'labels' column.
df_train['labels'] = df_train.apply(encode_labels, axis=1)

# Split data (important!)
# 80/20 random split; the fixed random_state makes the split reproducible.
# NOTE(review): the held-out 20% is later passed to the Trainer as eval_dataset, where it
# drives best-checkpoint selection AND produces the reported metrics, so those numbers are
# likely optimistic — consider carving out a separate validation split.
train_df, test_df = train_test_split(df_train, test_size=0.2, random_state=42) # Add validation split if needed

# Convert to Hugging Face Dataset object
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

Intents: {'express_apology': 0, 'express_greetings': 1, 'inform_attachment': 2, 'mark_calendar': 3, 'meeting_location': 4, 'meeting_time': 5, 'offer_assistance': 6, 'propose_meeting': 7, 'provide_link': 8, 'purchase_order': 9, 'request_access': 10, 'request_add_cc': 11, 'request_addition_to_list': 12, 'request_attendance': 13, 'request_attendance_info': 14, 'request_availability': 15, 'request_bug_report': 16, 'request_call': 17, 'request_close_bug': 18, 'request_conference_call': 19, 'request_confidentiality': 20, 'request_contact_information': 21, 'request_coordination': 22, 'request_create_baseline': 23, 'request_deletion': 24, 'request_disregard_of_previous_request': 25, 'request_error_details': 26, 'request_fax': 27, 'request_follow_up': 28, 'request_further_information': 29, 'request_holding_off': 30, 'request_inclusion': 31, 'request_instructions_on_how_to_proceed': 32, 'request_link': 33, 'request_login_credentials': 34, 'request_meeting': 35, 'request_print': 36, 'request_remi

In [6]:
# Name of the pretrained checkpoint to fine-tune. The tokenizer is loaded from the
# same name that is later used to instantiate the classification model.
model_name = "distilbert-base-uncased" 
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [7]:
def tokenize_function(examples):
    """Tokenize a batch of examples and carry their targets along.

    ``examples["text"]`` is run through the notebook-level ``tokenizer`` with
    truncation and padding to a fixed length of 128 tokens. The tokenizer output
    is returned with ``examples["labels"]`` stored under the ``"labels"`` key.
    """
    encoded = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=128,  # Adjust max_length
    )
    encoded["labels"] = examples["labels"]
    return encoded

# Columns that must not reach the model: the raw text, the un-encoded label lists and
# the '__index_level_0__' column present on the datasets built from the pandas frames.
columns_to_drop = ['text', 'heuristic_labels', '__index_level_0__'] # Adjust if using different loading

# Tokenize each split in batches, then strip the columns listed above.
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True).remove_columns(columns_to_drop)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True).remove_columns(columns_to_drop)

# Hand out PyTorch tensors when the datasets are indexed.
for tokenized_split in (tokenized_train_dataset, tokenized_test_dataset):
    tokenized_split.set_format("torch")

Map:   0%|          | 0/2267 [00:00<?, ? examples/s]

Map:   0%|          | 0/567 [00:00<?, ? examples/s]

In [8]:
# Calculate frequencies from the multi-hot encoded training labels
labels_array = np.array(tokenized_train_dataset['labels'])
num_samples = labels_array.shape[0]
pos_counts = np.sum(labels_array, axis=0)   # per-label number of positive samples
neg_counts = num_samples - pos_counts       # per-label number of negative samples

# pos_weight = (#negatives / #positives) per label, so rare labels get a larger
# weight on their positive term in BCEWithLogitsLoss.
# Labels that never occur keep the neutral weight 1.0 (avoids division by zero).
pos_weight = np.ones_like(pos_counts, dtype=np.float32)
mask = pos_counts > 0
pos_weight[mask] = neg_counts[mask] / pos_counts[mask]

# Clamp weights to avoid extreme values if desired (optional)
# max_weight = 100
# pos_weight = np.clip(pos_weight, a_min=1.0, a_max=max_weight)

# Pick the best available device: CUDA, then Apple MPS (only if it is actually
# available), otherwise CPU. Falling back to "mps" unconditionally fails on
# machines that have neither CUDA nor MPS.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# Convert to a Tensor and move to the selected device
pos_weight_tensor = torch.tensor(pos_weight, dtype=torch.float32)
pos_weight_tensor = pos_weight_tensor.to(device)

print("Calculated pos_weights for BCE:", pos_weight_tensor)

Calculated pos_weights for BCE: tensor([376.8333, 250.8889,  11.5944, 112.3500, 124.9444,  18.7130,  16.5736,
          8.8565, 187.9167,  49.3778, 322.8571,  33.3485, 187.9167,  42.5962,
        132.3529,  60.2703, 160.9286,   5.3324, 132.3529,  69.8438,  72.1290,
         30.4861,  72.1290, 160.9286, 150.1333,  77.1724,  54.2927,  19.9907,
         13.9145,  15.9179, 173.3846,  18.2119, 187.9167,  77.1724, 102.0455,
         79.9643,  74.5667, 322.8571, 118.3158, 376.8333,  30.4861,  20.7981,
         30.0548,  28.4416, 205.0909,  28.4416, 150.1333, 132.3529, 754.6667,
        565.7500, 118.3158,  65.6765,  14.7431], device='mps:0')


In [9]:
# Build the classifier from the pretrained checkpoint with one output per intent.
# The label maps are passed in so they are part of the model's configuration.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    problem_type="multi_label_classification", # Important!
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)
model = model.to(device) # Move model to device

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
class WeightedBCETrainer(Trainer):
    """Trainer that optimises a class-weighted ``BCEWithLogitsLoss`` (multi-label).

    Args:
        *args, **kwargs: forwarded unchanged to ``Trainer.__init__``.
        pos_weight (torch.Tensor, optional): one positive-class weight per label.
            When omitted, the notebook-level ``pos_weight_tensor`` is looked up at
            loss time, so existing ``WeightedBCETrainer(model=..., args=..., ...)``
            call sites keep working without modification.
    """

    def __init__(self, *args, pos_weight=None, **kwargs):
        super().__init__(*args, **kwargs)
        # Stored as given; it is moved to the logits' device on every loss call.
        self.pos_weight = pos_weight

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """
        Override the compute_loss method to use BCEWithLogitsLoss with pos_weight.

        Args:
            model: the model being trained; called as ``model(**inputs)``.
            inputs (dict): batch dictionary. Its ``"labels"`` entry is popped, so
                the model itself never receives the labels.
            return_outputs (bool): when True, return ``(loss, outputs)``.
            **kwargs: extra keyword arguments passed by the caller; accepted and ignored.

        Returns:
            The scalar loss, or ``(loss, outputs)`` when ``return_outputs`` is True.
        """
        labels = inputs.pop("labels")

        # Get model outputs
        outputs = model(**inputs)
        logits = outputs.logits

        # Prefer the weight injected through __init__; fall back to the global one.
        weight = self.pos_weight if self.pos_weight is not None else pos_weight_tensor

        # Weight and labels are both aligned to the device the logits live on, so
        # the loss never mixes devices regardless of where the model was placed.
        loss_fct = nn.BCEWithLogitsLoss(pos_weight=weight.to(logits.device))

        # BCE needs float targets.
        loss = loss_fct(logits, labels.to(logits.device).float())

        return (loss, outputs) if return_outputs else loss

In [11]:
# Define metrics suitable for multi-label
# Using sigmoid for multi-label prediction
def multi_label_metrics(predictions, labels, threshold=0.5):
    """Score raw multi-label logits against multi-hot ground truth.

    Args:
        predictions: logits, one row per sample and one column per label.
        labels: ground-truth indicator matrix with the same layout.
        threshold (float): a label is predicted when its sigmoid probability
            is greater than or equal to this value.

    Returns:
        dict: ``'f1'`` (macro-averaged F1, zero_division=0), ``'accuracy'``
        (subset accuracy) and ``'hamming_loss'``.
    """
    # Logits -> probabilities -> 0/1 float matrix.
    probabilities = torch.nn.Sigmoid()(torch.Tensor(predictions))
    y_pred = (probabilities >= threshold).numpy().astype(np.float64)

    # ROC-AUC is intentionally not reported: it needs probabilities, not thresholded predictions.
    return {
        'f1': f1_score(y_true=labels, y_pred=y_pred, average='macro', zero_division=0),
        'accuracy': accuracy_score(labels, y_pred),  # Subset accuracy
        'hamming_loss': hamming_loss(labels, y_pred),
    }

def compute_metrics(eval_pred):
    """Unpack ``eval_pred`` into (predictions, labels) and score them with ``multi_label_metrics``."""
    logits, gold_labels = eval_pred
    return multi_label_metrics(logits, gold_labels)

In [12]:
# Training configuration: evaluate and checkpoint once per epoch, then reload the
# checkpoint with the highest macro-F1 ('f1' as returned by compute_metrics) when training ends.
training_args = TrainingArguments(
    output_dir='./results_heuristics_weighted_bce',
    num_train_epochs=10, # Adjust as needed
    per_device_train_batch_size=16, # Adjust based on GPU memory
    per_device_eval_batch_size=16,
    # NOTE(review): the training log shows 1420 steps in total, so a 500-step warm-up
    # covers roughly the first third of training — confirm this is intended.
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs_weighted_bce',
    logging_steps=50,
    # NOTE(review): this keyword name may differ between transformers releases — verify against the installed version.
    evaluation_strategy="epoch", # Evaluate at the end of each epoch
    save_strategy="epoch",       # Save checkpoint at the end of each epoch
    load_best_model_at_end=True, # Load the best model found during training
    metric_for_best_model="f1",  # Use F1 macro score to determine the best model
    greater_is_better=True,
    fp16=torch.cuda.is_available(), # Enable mixed precision if GPU supports it
    # Add other arguments as needed
)

# Instantiate the custom Trainer
# The held-out split doubles as the evaluation set used for checkpoint selection.
trainer = WeightedBCETrainer( # Use the custom trainer
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    # NOTE(review): the captured cell output shows a warning pointing at this constructor
    # call; the message text is not captured, so its cause still needs to be confirmed.
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = WeightedBCETrainer( # Use the custom trainer


In [13]:
# Start training
trainer.train()

# Evaluate the best model
# (training was configured with load_best_model_at_end=True, so this scores the
# checkpoint selected by the 'f1' metric, on the trainer's eval_dataset.)
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)

  0%|          | 0/1420 [00:00<?, ?it/s]

{'loss': 1.3515, 'grad_norm': 0.9440268278121948, 'learning_rate': 5e-06, 'epoch': 0.35}
{'loss': 1.3521, 'grad_norm': 2.06913685798645, 'learning_rate': 1e-05, 'epoch': 0.7}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 1.2485315799713135, 'eval_f1': 0.11359917906909195, 'eval_accuracy': 0.0, 'eval_hamming_loss': 0.31027253668763105, 'eval_runtime': 3.0265, 'eval_samples_per_second': 187.347, 'eval_steps_per_second': 11.895, 'epoch': 1.0}
{'loss': 1.2963, 'grad_norm': 1.5546590089797974, 'learning_rate': 1.5e-05, 'epoch': 1.06}
{'loss': 1.1877, 'grad_norm': 3.407956123352051, 'learning_rate': 2e-05, 'epoch': 1.41}
{'loss': 1.1853, 'grad_norm': 4.6357879638671875, 'learning_rate': 2.5e-05, 'epoch': 1.76}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.8985627889633179, 'eval_f1': 0.2051530858641974, 'eval_accuracy': 0.0, 'eval_hamming_loss': 0.15370536754184552, 'eval_runtime': 2.8255, 'eval_samples_per_second': 200.67, 'eval_steps_per_second': 12.741, 'epoch': 2.0}
{'loss': 0.9759, 'grad_norm': 1.2032235860824585, 'learning_rate': 3e-05, 'epoch': 2.11}
{'loss': 0.8199, 'grad_norm': 1.6396270990371704, 'learning_rate': 3.5e-05, 'epoch': 2.46}
{'loss': 0.6987, 'grad_norm': 2.2360751628875732, 'learning_rate': 4e-05, 'epoch': 2.82}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.5166792869567871, 'eval_f1': 0.35657488452669683, 'eval_accuracy': 0.08994708994708994, 'eval_hamming_loss': 0.07726864330637916, 'eval_runtime': 2.977, 'eval_samples_per_second': 190.463, 'eval_steps_per_second': 12.093, 'epoch': 3.0}
{'loss': 0.5692, 'grad_norm': 1.9430384635925293, 'learning_rate': 4.5e-05, 'epoch': 3.17}
{'loss': 0.4444, 'grad_norm': 2.3931643962860107, 'learning_rate': 5e-05, 'epoch': 3.52}
{'loss': 0.3475, 'grad_norm': 4.718079566955566, 'learning_rate': 4.7282608695652177e-05, 'epoch': 3.87}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.3245914876461029, 'eval_f1': 0.5304295969367079, 'eval_accuracy': 0.18871252204585537, 'eval_hamming_loss': 0.03976573158963096, 'eval_runtime': 3.1414, 'eval_samples_per_second': 180.494, 'eval_steps_per_second': 11.46, 'epoch': 4.0}
{'loss': 0.2986, 'grad_norm': 0.9996858835220337, 'learning_rate': 4.456521739130435e-05, 'epoch': 4.23}
{'loss': 0.2544, 'grad_norm': 0.799519419670105, 'learning_rate': 4.1847826086956525e-05, 'epoch': 4.58}
{'loss': 0.2418, 'grad_norm': 1.2546864748001099, 'learning_rate': 3.91304347826087e-05, 'epoch': 4.93}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.2524603605270386, 'eval_f1': 0.6100229287359769, 'eval_accuracy': 0.345679012345679, 'eval_hamming_loss': 0.025623107384113674, 'eval_runtime': 3.089, 'eval_samples_per_second': 183.556, 'eval_steps_per_second': 11.654, 'epoch': 5.0}
{'loss': 0.181, 'grad_norm': 0.9965406060218811, 'learning_rate': 3.641304347826087e-05, 'epoch': 5.28}
{'loss': 0.1561, 'grad_norm': 0.48984572291374207, 'learning_rate': 3.369565217391305e-05, 'epoch': 5.63}
{'loss': 0.1586, 'grad_norm': 3.839911937713623, 'learning_rate': 3.0978260869565215e-05, 'epoch': 5.99}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.2426837533712387, 'eval_f1': 0.6759171271472227, 'eval_accuracy': 0.49382716049382713, 'eval_hamming_loss': 0.015972846161525407, 'eval_runtime': 3.0424, 'eval_samples_per_second': 186.366, 'eval_steps_per_second': 11.833, 'epoch': 6.0}
{'loss': 0.1202, 'grad_norm': 0.7410796284675598, 'learning_rate': 2.826086956521739e-05, 'epoch': 6.34}
{'loss': 0.1045, 'grad_norm': 0.8197495937347412, 'learning_rate': 2.554347826086957e-05, 'epoch': 6.69}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.2499128133058548, 'eval_f1': 0.7226787342774348, 'eval_accuracy': 0.582010582010582, 'eval_hamming_loss': 0.01291138398056637, 'eval_runtime': 2.8693, 'eval_samples_per_second': 197.61, 'eval_steps_per_second': 12.547, 'epoch': 7.0}
{'loss': 0.117, 'grad_norm': 0.5175654888153076, 'learning_rate': 2.282608695652174e-05, 'epoch': 7.04}
{'loss': 0.0954, 'grad_norm': 0.29954877495765686, 'learning_rate': 2.0108695652173915e-05, 'epoch': 7.39}
{'loss': 0.0875, 'grad_norm': 0.6638591885566711, 'learning_rate': 1.739130434782609e-05, 'epoch': 7.75}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.2477291375398636, 'eval_f1': 0.7383224444036468, 'eval_accuracy': 0.6031746031746031, 'eval_hamming_loss': 0.011280822601577319, 'eval_runtime': 2.8761, 'eval_samples_per_second': 197.14, 'eval_steps_per_second': 12.517, 'epoch': 8.0}
{'loss': 0.076, 'grad_norm': 0.5936986207962036, 'learning_rate': 1.4673913043478263e-05, 'epoch': 8.1}
{'loss': 0.0784, 'grad_norm': 0.24234621226787567, 'learning_rate': 1.1956521739130435e-05, 'epoch': 8.45}
{'loss': 0.0687, 'grad_norm': 0.6689178943634033, 'learning_rate': 9.239130434782608e-06, 'epoch': 8.8}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.25666719675064087, 'eval_f1': 0.7550458652489768, 'eval_accuracy': 0.654320987654321, 'eval_hamming_loss': 0.009317493594223155, 'eval_runtime': 2.9785, 'eval_samples_per_second': 190.365, 'eval_steps_per_second': 12.087, 'epoch': 9.0}
{'loss': 0.0719, 'grad_norm': 0.423505961894989, 'learning_rate': 6.521739130434783e-06, 'epoch': 9.15}
{'loss': 0.066, 'grad_norm': 0.7690907120704651, 'learning_rate': 3.804347826086957e-06, 'epoch': 9.51}
{'loss': 0.066, 'grad_norm': 0.8439431190490723, 'learning_rate': 1.0869565217391306e-06, 'epoch': 9.86}


  0%|          | 0/36 [00:00<?, ?it/s]

{'eval_loss': 0.24874401092529297, 'eval_f1': 0.7546069033971352, 'eval_accuracy': 0.6701940035273368, 'eval_hamming_loss': 0.009117833017204086, 'eval_runtime': 2.8104, 'eval_samples_per_second': 201.754, 'eval_steps_per_second': 12.81, 'epoch': 10.0}
{'train_runtime': 409.6108, 'train_samples_per_second': 55.345, 'train_steps_per_second': 3.467, 'train_loss': 0.44000075660960775, 'epoch': 10.0}


  0%|          | 0/36 [00:00<?, ?it/s]

Evaluation Results: {'eval_loss': 0.25666719675064087, 'eval_f1': 0.7550458652489768, 'eval_accuracy': 0.654320987654321, 'eval_hamming_loss': 0.009317493594223155, 'eval_runtime': 2.7984, 'eval_samples_per_second': 202.618, 'eval_steps_per_second': 12.865, 'epoch': 10.0}


In [14]:
def detailed_multi_label_metrics(predictions, labels, id2label, threshold=0.5):
    """Compute overall multi-label metrics and print a per-label report.

    Args:
        predictions: logits, one row per sample and one column per label.
        labels: ground-truth indicator matrix with the same layout.
        id2label (dict): maps each label index ``0..len(id2label)-1`` to its name;
            used to name the rows of the printed report.
        threshold (float): a label is predicted when its sigmoid probability
            is greater than or equal to this value.

    Returns:
        dict: ``'f1'`` (macro F1), ``'accuracy'`` (subset accuracy), ``'hamming_loss'``.
    """
    # Logits -> probabilities -> 0/1 float matrix.
    probabilities = torch.nn.Sigmoid()(torch.Tensor(predictions))
    y_pred = (probabilities >= threshold).numpy().astype(np.float64)

    # Overall scores.
    f1_macro = f1_score(y_true=labels, y_pred=y_pred, average='macro', zero_division=0)
    subset_accuracy = accuracy_score(labels, y_pred)
    ham_loss = hamming_loss(labels, y_pred)

    # Per-label breakdown, requested as a dict so each entry can be formatted below.
    target_names = [id2label[i] for i in range(len(id2label))]
    report = classification_report(
        labels, y_pred, target_names=target_names, zero_division=0, output_dict=True
    )

    print("\n--- Classification Report ---")
    for label, metrics in report.items():
        # Only dict-valued entries carry precision/recall/F1/support; skip the rest.
        if not isinstance(metrics, dict):
            continue
        print(f"Label: {label:<20} F1: {metrics.get('f1-score', 0):.3f} | Precision: {metrics.get('precision', 0):.3f} | Recall: {metrics.get('recall', 0):.3f} | Support: {metrics.get('support', 0)}")
    print("---------------------------\n")

    # Per-label F1 values could be added to this dict from `report` if a caller needs them.
    return {'f1': f1_macro, 'accuracy': subset_accuracy, 'hamming_loss': ham_loss}

def compute_metrics_detailed(eval_pred):
    """Unpack ``eval_pred`` and delegate to ``detailed_multi_label_metrics``.

    The label-name mapping is taken from the notebook-level ``id2label``.
    """
    logits, gold_labels = eval_pred
    return detailed_multi_label_metrics(logits, gold_labels, id2label)

# Use compute_metrics_detailed in your Trainer setup
# trainer = WeightedBCETrainer(..., compute_metrics=compute_metrics_detailed)

In [15]:
def predict_intents(text, model, tokenizer, id2label, device, threshold=0.5):
    """
    Predicts intents for a single text using a multi-label model.

    Args:
        text (str): The input text (one example; batches are not supported).
        model: Callable model; ``model(**inputs).logits`` must have shape
            ``[1, num_labels]`` for a single input text.
        tokenizer: Callable tokenizer returning a dict of tensors.
        id2label (dict): Mapping from integer label index to label name.
        device: The torch device the inputs are moved to (must match the model's).
        threshold (float): A label is predicted when its sigmoid probability is
            greater than or equal to this value.

    Returns:
        tuple: ``(predicted_labels, probabilities_dict)`` where
            ``predicted_labels`` is the list of label names at or above the
            threshold (in index order) and ``probabilities_dict`` maps every
            label name to its probability as a Python float.

    Note:
        The function does not switch the model to evaluation mode; callers are
        expected to have called ``model.eval()`` themselves.
    """
    # Tokenize the input text (same max_length as used for training).
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Forward pass without tracking gradients.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Drop ONLY the batch dimension. A bare .squeeze() would also collapse the
    # label dimension of a single-label model to a 0-d tensor, which can be
    # neither iterated nor indexed below.
    probabilities = torch.sigmoid(logits).squeeze(0).cpu()

    # Thresholding stays in torch; .tolist() yields plain Python ints that match
    # the integer keys of id2label.
    predicted_indices = torch.nonzero(probabilities >= threshold, as_tuple=True)[0].tolist()
    predicted_labels = [id2label[idx] for idx in predicted_indices]

    # Full label -> probability table.
    probabilities_dict = {id2label[i]: prob for i, prob in enumerate(probabilities.tolist())}

    return predicted_labels, probabilities_dict

In [51]:


# --- Configuration ---
# NOTE(review): step 1420 is the final training step, while the training log shows the
# highest eval_f1 at epoch 9 — confirm this is the checkpoint you intend to load.
model_path = './results_heuristics_weighted_bce/checkpoint-1420' # Or the specific checkpoint path like './results_weighted_bce/checkpoint-XYZ'

# Pick the best available device: CUDA, then Apple MPS (only if it is actually
# available), otherwise CPU. Falling back to "mps" unconditionally fails on
# machines that have neither CUDA nor MPS.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

threshold = 0.7 # Default threshold, tune this if necessary!

# --- Load Tokenizer and Model ---
print(f"Loading tokenizer from: {model_path}")
tokenizer = AutoTokenizer.from_pretrained(model_path)

print(f"Loading model from: {model_path}")
model = AutoModelForSequenceClassification.from_pretrained(model_path)
model.to(device) # Move model to the selected device
model.eval() # IMPORTANT: Set model to evaluation mode (disables dropout etc.)

# --- Load id2label mapping ---
# The model config should store id2label if provided during initialization
if hasattr(model.config, 'id2label'):
    id2label = model.config.id2label
    # Convert keys to integers if they are loaded as strings
    id2label = {int(k): v for k, v in id2label.items()}
    print("Loaded id2label from model config:", id2label)
else:
    # !! If not saved in config, you MUST load/recreate the exact same mapping used during training !!
    # Example: Recreate it manually (replace with your actual mapping)
    # all_intents = [...] # List of your intents in the correct order
    # id2label = {i: label for i, label in enumerate(all_intents)}
    raise ValueError("Could not load id2label from model config. Please load or recreate it manually.")

num_labels = len(id2label)

Loading tokenizer from: ./results_heuristics_weighted_bce/checkpoint-1420
Loading model from: ./results_heuristics_weighted_bce/checkpoint-1420
Loaded id2label from model config: {0: 'express_apology', 1: 'express_greetings', 2: 'inform_attachment', 3: 'mark_calendar', 4: 'meeting_location', 5: 'meeting_time', 6: 'offer_assistance', 7: 'propose_meeting', 8: 'provide_link', 9: 'purchase_order', 10: 'request_access', 11: 'request_add_cc', 12: 'request_addition_to_list', 13: 'request_attendance', 14: 'request_attendance_info', 15: 'request_availability', 16: 'request_bug_report', 17: 'request_call', 18: 'request_close_bug', 19: 'request_conference_call', 20: 'request_confidentiality', 21: 'request_contact_information', 22: 'request_coordination', 23: 'request_create_baseline', 24: 'request_deletion', 25: 'request_disregard_of_previous_request', 26: 'request_error_details', 27: 'request_fax', 28: 'request_follow_up', 29: 'request_further_information', 30: 'request_holding_off', 31: 'reques

In [52]:
# Smoke test: run the reloaded model on one hand-written sentence, using the
# threshold configured above, and show both the selected intents and all probabilities.
new_text_1 = "who is participating in the meeting."
predicted_labels_1, probabilities_1 = predict_intents(new_text_1, model, tokenizer, id2label, device, threshold)
print(f"Text: '{new_text_1}'")
print(f"Predicted Intents (Threshold={threshold}): {predicted_labels_1}")
print(f"Probabilities: {probabilities_1}\n")

Text: 'who is participating in the meeting.'
Predicted Intents (Threshold=0.7): ['request_attendance', 'request_attendance_info']
Probabilities: {'express_apology': 0.008848713710904121, 'express_greetings': 0.006704783998429775, 'inform_attachment': 0.14822807908058167, 'mark_calendar': 0.023072104901075363, 'meeting_location': 0.05554280802607536, 'meeting_time': 0.02029978483915329, 'offer_assistance': 0.008224942721426487, 'propose_meeting': 0.4431264102458954, 'provide_link': 0.011614724062383175, 'purchase_order': 0.032264336943626404, 'request_access': 0.00207375455647707, 'request_add_cc': 0.08847587555646896, 'request_addition_to_list': 0.011693660169839859, 'request_attendance': 0.9190656542778015, 'request_attendance_info': 0.8433091640472412, 'request_availability': 0.16333596408367157, 'request_bug_report': 0.0017878497019410133, 'request_call': 0.0025272679049521685, 'request_close_bug': 0.008892985992133617, 'request_conference_call': 0.011782544665038586, 'request_confi

In [62]:
import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np
from tqdm.auto import tqdm # For progress bar

# Texts to embed, in DataFrame row order.
texts = df_train['text'].tolist()

BATCH_SIZE = 32  # Adjust based on your GPU memory
POOLING_STRATEGY = 'cls' # Options: 'cls', 'mean'

# Pick the best available device: CUDA, then Apple MPS (only if it is actually
# available), otherwise CPU. Falling back to "mps" unconditionally fails on
# machines that have neither CUDA nor MPS.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    DEVICE = torch.device("mps")
else:
    DEVICE = torch.device("cpu")
print(f"Using device: {DEVICE}")


def get_embeddings(texts_batch, strategy='cls'):
    """
    Embed a batch of texts with the notebook-level ``tokenizer`` and ``model``.

    The model is called with ``output_hidden_states=True`` and the last entry of
    ``outputs.hidden_states`` is pooled into one vector per text.

    Args:
        texts_batch (list): A list of strings.
        strategy (str): 'cls' takes the vector at sequence position 0;
                        'mean' averages the token vectors, weighting each
                        position by the attention mask so padding is ignored.

    Returns:
        torch.Tensor: One embedding row per input text.

    Raises:
        ValueError: If ``strategy`` is neither 'cls' nor 'mean' (raised after
            the forward pass has run).
    """
    # Tokenize: pad to the longest sequence in the batch, truncate at 512 tokens.
    encoded = tokenizer(
        texts_batch,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )
    # Inputs go to the notebook-level DEVICE.
    encoded = {name: tensor.to(DEVICE) for name, tensor in encoded.items()}

    # Forward pass without gradients, asking for all hidden states.
    with torch.no_grad():
        outputs = model(output_hidden_states=True, **encoded)

    # hidden_states is a tuple; its last element is the final layer's token states.
    token_states = outputs.hidden_states[-1]

    if strategy == 'cls':
        return token_states[:, 0, :]

    if strategy == 'mean':
        # Broadcast the attention mask over the hidden dimension and use it as weights.
        weights = encoded['attention_mask'].unsqueeze(-1).expand(token_states.size()).float()
        weighted_sum = (token_states * weights).sum(1)
        token_counts = weights.sum(1).clamp(min=1e-9)  # Avoid division by zero
        return weighted_sum / token_counts

    raise ValueError(f"Unknown pooling strategy: {strategy}")
# --- Process Data in Batches ---
# Accumulator for the per-batch embedding arrays.
all_embeddings = []
print(f"Generating embeddings for {len(texts)} texts...")


Using device: mps
Generating embeddings for 2834 texts...


In [63]:
# Start from an empty accumulator every time this cell runs. The list is also
# created in an earlier cell, so without this reset a re-run of just this cell
# would append a second copy of every batch and silently double the rows.
all_embeddings = []
for i in tqdm(range(0, len(texts), BATCH_SIZE)):
    batch_texts = texts[i : i + BATCH_SIZE]
    batch_embeddings = get_embeddings(batch_texts, strategy=POOLING_STRATEGY)
    # Move embeddings to CPU and convert to NumPy (optional, but common for clustering)
    all_embeddings.append(batch_embeddings.cpu().numpy())

# Concatenate embeddings from all batches
final_embeddings = np.concatenate(all_embeddings, axis=0)

# Invariant relied on downstream: exactly one embedding row per input text.
assert final_embeddings.shape[0] == len(texts), (
    f"Expected {len(texts)} embeddings, got {final_embeddings.shape[0]}"
)

print(f"Finished generating embeddings. Shape: {final_embeddings.shape}")

  0%|          | 0/89 [00:00<?, ?it/s]

Finished generating embeddings. Shape: (2834, 768)


In [39]:
import umap
import plotly.express as px
import pandas as pd
import numpy as np

# --- Your Data (Make sure these variables are loaded correctly) ---
# final_embeddings = ... # Your (num_samples, embedding_dim) numpy array
# texts = ...            # Your list of text strings (length num_samples)
# heuristic_labels = ... # Optional: List/array of your heuristic multi-labels (length num_samples)
# original_labels = ...  # Optional: List/array of original single labels (length num_samples)



# Sanity check: the embedding matrix and the text list should describe the same samples.
print(f"Original embedding shape: {final_embeddings.shape}")
print(f"Number of texts: {len(texts)}")
# (If label lists are added below for colouring/hover, print their lengths here too.)


# --- Configure and Run UMAP ---
# These hyper-parameters may need tuning for a different data size or desired look.
reducer = umap.UMAP(
    n_components=2,     # 2-D output for plotting
    n_neighbors=15,     # local vs. global structure (lower = more local)
    min_dist=0.1,       # how tightly points are packed (lower = tighter)
    metric='cosine',    # cosine distance is often a good fit for text embeddings
    random_state=42,    # reproducible projection
)

print("Running UMAP...")
umap_embeddings = reducer.fit_transform(final_embeddings)
print(f"UMAP reduced embedding shape: {umap_embeddings.shape}")


# --- Prepare Data for Plotting with Pandas ---
# One row per text: its 2-D coordinates plus the text itself for the hover box.
# Extra label columns (e.g. heuristic or original labels) can be added to this dict.
# List-valued labels should first be turned into strings, e.g. by joining the list
# with ' | ' or by keeping only its first element.
df_plot = pd.DataFrame(
    {
        'umap_x': umap_embeddings[:, 0],
        'umap_y': umap_embeddings[:, 1],
        'text': texts,
    }
)

# *** Column used for colouring; None draws every point in the same colour. ***
# When set to a name, that column must exist in df_plot.
COLOR_COLUMN = None


# --- Create Interactive Plot with Plotly ---
print("Creating plot...")
fig = px.scatter(
    df_plot,
    x='umap_x',
    y='umap_y',
    color=COLOR_COLUMN,
    hover_data=['text'],
    opacity=0.7,
    title="UMAP Projection of Text Embeddings",
    labels={'umap_x': 'UMAP Dimension 1', 'umap_y': 'UMAP Dimension 2'},
)

# Small markers keep a crowded plot readable.
fig.update_traces(marker={'size': 5})

# Hover box styling.
fig.update_layout(hoverlabel={'bgcolor': "white", 'font_size': 12, 'namelength': -1})

print("Displaying plot...")
fig.show()

Original embedding shape: (2834, 768)
Number of texts: 2834
Running UMAP...



n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



UMAP reduced embedding shape: (2834, 2)
Creating plot...
Displaying plot...


In [65]:
# -*- coding: utf-8 -*-
import umap
import plotly.express as px
import pandas as pd
import numpy as np
import plotly.colors

# --- Assume these variables are pre-loaded and available ---
# df_train: pandas DataFrame containing your data, including a column named 'heuristic_labels'
#           where each entry is expected to be a list of labels.
# final_embeddings: A list of NumPy arrays OR a single 2D NumPy array.
#                   If a list: each array is the embedding for a sample, len(final_embeddings) == len(df_train),
#                   and all inner arrays must have the same shape (embedding_dimension,).
#                   If an array: shape must be (len(df_train), embedding_dimension).
# -----------------------------------------------------------

# --- Input Validation and Checks ---
# Purpose of this section: make sure `df_train` and `final_embeddings` exist and
# agree with each other, then expose the embeddings as one 2D NumPy array named
# `final_embeddings_np` (rows = samples) for the UMAP step further down.
#
# Every branch below either assigns `final_embeddings_np` or raises, so no
# "is the name defined?" fallback is needed afterwards. Such a check would be
# unreliable in a notebook anyway, because a previous run of the cell leaves
# the name behind in the kernel namespace.
if 'df_train' not in locals():
    raise NameError("DataFrame 'df_train' is not defined. Please ensure it's loaded.")
if 'final_embeddings' not in locals():
    raise NameError("Variable 'final_embeddings' is not defined. Please ensure it's loaded.")
if not isinstance(df_train, pd.DataFrame):
    raise TypeError("'df_train' must be a pandas DataFrame.")

# Check 'heuristic_labels' column existence
if 'heuristic_labels' not in df_train.columns:
    raise KeyError("DataFrame 'df_train' must contain the column 'heuristic_labels'.")

if isinstance(final_embeddings, list):
    # --- List input: validate, then stack into a 2D array ---
    print("Info: 'final_embeddings' detected as a list.")
    if len(df_train) != len(final_embeddings):
        raise ValueError(f"DataFrame length ({len(df_train)}) does not match embeddings list length ({len(final_embeddings)})!")

    print(f"\n'final_embeddings' is a list. Attempting conversion to a 2D NumPy array...")
    if not final_embeddings:
        raise ValueError("'final_embeddings' list is empty. Cannot proceed.")
    # Only the first element is type-checked; mixed content surfaces during conversion below
    if not isinstance(final_embeddings[0], np.ndarray):
        raise TypeError(f"Expected elements within 'final_embeddings' list to be NumPy arrays, but the first element is type: {type(final_embeddings[0])}")

    try:
        final_embeddings_np = np.array(final_embeddings)

        # Anything other than 2D here means the inner arrays did not stack into
        # (samples, features); the ValueError is handled by the block below.
        if final_embeddings_np.ndim != 2:
            raise ValueError(f"Converted embeddings array has {final_embeddings_np.ndim} dimensions. Expected 2 (samples, features). Check if inner arrays in the list have consistent shapes/lengths.")
        print(f"Successfully converted 'final_embeddings' list to NumPy array with shape: {final_embeddings_np.shape}")

    except ValueError as e:
        print("\n--- ERROR ---")
        print("Failed to convert the list of embeddings into a single 2D NumPy array.")
        print("This commonly occurs if the NumPy arrays within the 'final_embeddings' list have *different lengths* (embedding dimensions).")
        print(f"Original error message: {e}")
        # Best-effort debugging aid: show the shapes of the first few elements
        try:
            print("Shapes of first 5 elements in 'final_embeddings' list:")
            for i in range(min(5, len(final_embeddings))):
                if isinstance(final_embeddings[i], np.ndarray):
                    print(f"  Element {i} shape: {final_embeddings[i].shape}")
                else:
                    print(f"  Element {i} is not a NumPy array (type: {type(final_embeddings[i])}).")
        except Exception as dbg_e:
            print(f"  (Could not reliably get shapes: {dbg_e})")
        raise ValueError("Inconsistent embedding dimensions or types in 'final_embeddings' list.") from e
    except Exception as e:
        print(f"\n--- UNEXPECTED ERROR during embedding conversion ---: {e}")
        raise

elif isinstance(final_embeddings, np.ndarray):
    # --- Array input: must already be (samples, features) and match df_train ---
    print("Info: 'final_embeddings' detected as a NumPy array.")
    if final_embeddings.ndim != 2:
        raise ValueError(f"Expected 'final_embeddings' NumPy array to be 2D (samples, features), but got {final_embeddings.ndim} dimensions.")
    if len(df_train) != final_embeddings.shape[0]:
        raise ValueError(f"DataFrame length ({len(df_train)}) does not match embeddings array rows ({final_embeddings.shape[0]})!")
    final_embeddings_np = final_embeddings  # Use the existing array directly (no copy)

else:
    raise TypeError(f"'final_embeddings' must be a list of NumPy arrays or a single 2D NumPy array. Got type: {type(final_embeddings)}")


# Final confirmation of the array UMAP will use
print(f"Using embeddings array 'final_embeddings_np' with shape for UMAP: {final_embeddings_np.shape}")
if len(df_train) != final_embeddings_np.shape[0]:
    # Safety check: rows must line up one-to-one with df_train
    raise ValueError(f"Mismatch after processing: DataFrame length ({len(df_train)}) vs final embeddings array rows ({final_embeddings_np.shape[0]})")


# --- 1. Process Heuristic Labels for Unique Combinations ---
def create_label_combination_string(label_list):
    """
    Build a canonical string identifying a *set* of labels.

    The result does not depend on the order of the labels or on how often a
    label is repeated, so two samples carrying the same labels always map to
    the same combination string.

    Parameters
    ----------
    label_list : list, tuple, set or frozenset
        Labels attached to one sample. Items are converted with ``str()``
        before sorting. Tuples and sets are accepted as well as lists because
        label columns parsed from text may contain them.

    Returns
    -------
    str
        - ``"Invalid/NA Labels"`` if ``label_list`` is not one of the accepted
          collection types (None, NaN, a bare string, ...).
        - ``"No Labels"`` if the collection is empty.
        - ``"Processing Error"`` if converting or sorting the items fails
          (a warning is printed in that case).
        - Otherwise the unique labels, sorted as strings and joined by ``" | "``.
    """
    # A bare string is deliberately rejected: iterating it would yield characters.
    if not isinstance(label_list, (list, tuple, set, frozenset)):
        return "Invalid/NA Labels"
    if not label_list:
        return "No Labels"
    try:
        # A set comprehension removes duplicates; sorting gives order invariance.
        unique_labels = sorted({str(lbl) for lbl in label_list})
        return " | ".join(unique_labels)
    except Exception as e:
        # Best effort: one odd row should not abort processing of the whole column
        print(f"Warning: Could not process label list: {label_list}. Error: {e}")
        return "Processing Error"

# Plotting columns are added to a copy, so df_train itself is left untouched
df_plot_data = df_train.copy()
df_plot_data['label_combination'] = df_plot_data['heuristic_labels'].apply(
    create_label_combination_string
)

# Frequency table of the combinations; its index holds each distinct combination once
_combination_counts = df_plot_data['label_combination'].value_counts()

# Sorted list -> stable colour assignment and legend order
unique_combinations = sorted(_combination_counts.index)
n_combinations = len(unique_combinations)

print(f"\nFound {n_combinations} unique label combinations based on 'heuristic_labels'.")
print("Top 15 combinations by frequency:")
print(_combination_counts.head(15))
if n_combinations > 15:
    print(f"... and {n_combinations - 15} more.")


# --- 2. Assign a Color to Each Unique Combination ---
# Builds `color_map` = {combination_string: colour string}. It stays an empty
# dict when there are no combinations, in which case the plotting code falls
# back to Plotly's automatic colouring.
print("\nAssigning distinct colors to each unique label combination...")
color_map = {}
if n_combinations == 0:
    print("Warning: No valid label combinations found. Plot will likely have uniform color.")
else:
    # Pick the smallest qualitative palette that has one colour per combination.
    # (A former "Plotly + G10" tier sat after Alphabet but offers fewer colours
    # than Alphabet, so it could never be selected and has been dropped.)
    if n_combinations <= len(px.colors.qualitative.Plotly):
        color_scale = px.colors.qualitative.Plotly
    elif n_combinations <= len(px.colors.qualitative.Alphabet):
        color_scale = px.colors.qualitative.Alphabet
    else:
        # More combinations than any single palette: cycle through several
        # qualitative palettes. Colours repeat once the combined pool runs out.
        print(f"Warning: High number of combinations ({n_combinations}). Generating colors; visual distinctness may vary.")
        base_colors = px.colors.qualitative.Plotly + px.colors.qualitative.Set3 + px.colors.qualitative.G10 + px.colors.qualitative.Alphabet
        color_scale = [base_colors[i % len(base_colors)] for i in range(n_combinations)]

    # unique_combinations is sorted, so the same combination gets the same
    # colour on every run. The modulo is only a guard; color_scale already has
    # at least n_combinations entries on every branch above.
    color_map = {combination: color_scale[i % len(color_scale)]
                 for i, combination in enumerate(unique_combinations)}
    print(f"Generated specific color map for {len(color_map)} combinations.")


# --- 3. Run UMAP ---
# All reducer settings in one place so they are easy to tweak
_umap_settings = {
    'n_neighbors': 15,      # local vs global structure trade-off
    'min_dist': 0.1,        # minimum spacing between points in the projection
    'n_components': 2,      # 2D output for the scatter plot
    'metric': 'cosine',     # distance used between embeddings
    'random_state': 42,     # fixed seed for a reproducible layout
    'verbose': True,        # print UMAP progress messages
}
reducer = umap.UMAP(**_umap_settings)

print("\nRunning UMAP reduction on 'final_embeddings_np'...")
# Fit on the prepared 2D array and project it in one step
umap_embeddings = reducer.fit_transform(final_embeddings_np)
print(f"UMAP reduction complete. Output shape: {umap_embeddings.shape}")


# --- 4. Prepare Final Data Structure for Plotting ---
# Column 0 / column 1 of the UMAP output become the x / y coordinates
df_plot_data['umap_x'], df_plot_data['umap_y'] = umap_embeddings[:, 0], umap_embeddings[:, 1]

# Hover columns: at most one free-text column first ('text' is preferred,
# 'content' is used only when 'text' is absent), followed by the label columns.
# Further columns of df_plot_data (e.g. an 'original_label' column, if one
# exists) can be appended to this list.
hover_data_cols = [
    text_col for text_col in ('text', 'content') if text_col in df_plot_data.columns
][:1] + ['label_combination', 'heuristic_labels']
print(f"\nColumns included in hover data: {hover_data_cols}")


# --- 5. Create Interactive Plot with Plotly ---
print("Creating Plotly scatter plot...")
fig = px.scatter(
    data_frame=df_plot_data,
    x='umap_x',
    y='umap_y',
    color='label_combination',               # one colour group per label combination
    color_discrete_map=color_map or None,    # empty/missing map -> Plotly picks the colours
    hover_data=hover_data_cols,
    title="UMAP Projection Colored by Heuristic Label Combination",
    labels={
        'umap_x': 'UMAP Dimension 1',
        'umap_y': 'UMAP Dimension 2',
        'label_combination': 'Label Combination',
    },
    # Legend follows the (sorted) order of unique_combinations
    category_orders={'label_combination': unique_combinations},
)

# --- 6. Optional Visual Enhancements ---
# Small, semi-transparent markers; applied to marker-mode traces only
fig.update_traces(
    marker={'size': 5, 'opacity': 0.7},
    selector={'mode': 'markers'},
)

# Legend and hover-box styling
fig.update_layout(
    legend_title_text='Label Combination',
    legend={
        'itemsizing': 'constant',   # legend symbols keep one size
        'traceorder': 'normal',     # keep the trace order given above
    },
    hoverlabel={
        'bgcolor': "white",
        'font_size': 12,
        'namelength': -1,
    },
)
print("Plot object created and styled.")

# --- 7. Display Plot ---
print("Displaying interactive plot...")
fig.show()

Info: 'final_embeddings' detected as a NumPy array.
Using embeddings array 'final_embeddings_np' with shape for UMAP: (2834, 768)

Found 269 unique label combinations based on 'heuristic_labels'.
Top 15 combinations by frequency:
label_combination
request_call                       335
inform_attachment                  191
propose_meeting                    170
offer_assistance                   101
request_run_test                    85
request_follow_up                   81
request_add_cc                      76
meeting_time                        68
request_inclusion                   66
sign_and_approval                   66
request_fax                         61
request_send_document               58
request_further_information         56
request_fax | sign_and_approval     50
request_send_feedback               48
Name: count, dtype: int64
... and 254 more.

Assigning distinct colors to each unique label combination...
Generated specific color map for 269 combinations.

Running 


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



Thu Apr  3 12:21:56 2025 Finding Nearest Neighbors
Thu Apr  3 12:21:56 2025 Finished Nearest Neighbor Search
Thu Apr  3 12:21:57 2025 Construct embedding


Epochs completed:   0%|            0/500 [00:00]

	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Thu Apr  3 12:22:00 2025 Finished embedding
UMAP reduction complete. Output shape: (2834, 2)

Columns included in hover data: ['text', 'label_combination', 'heuristic_labels']
Creating Plotly scatter plot...
Plot object created and styled.
Displaying interactive plot...
