# Experiments on Benchmark Intent Datasets

In [1]:
from collections import Counter, defaultdict

import pandas as pd
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

tqdm.pandas()
import itertools

import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import nltk
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import spacy
import umap.umap_ as umap
from nltk.corpus import stopwords
from sklearn.metrics import pairwise_distances, silhouette_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize

In [2]:
# Run this in your environment in order load the pre-trained model
# python -m spacy download en_core_web_sm


# Load SpaCy model
nlp = spacy.load("en_core_web_sm")

## Load Data and obtain Embeddings

In [3]:
dataset_name = "banking" # 'stackoverflow' 'banking' 'clinc'
model = SentenceTransformer("all-MiniLM-L6-v2")

In [4]:
# Ensure stopwords are downloaded for lexical analysis
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/padeck/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
df_train = pd.read_csv(f"../data/{dataset_name}/train.tsv",sep="\t")
df_eval = pd.read_csv(f"../data/{dataset_name}/dev.tsv",sep="\t")
df_test = pd.read_csv(f"../data/{dataset_name}/test.tsv",sep="\t")
df = pd.concat([df_train, df_eval, df_test])
del df_train
del df_eval
del df_test

In [6]:
def calc_embeddings(text):
    return model.encode(text,normalize_embeddings=True)

df['text_embds'] = df['text'].progress_apply(calc_embeddings)

100%|██████████| 13083/13083 [02:27<00:00, 88.56it/s]


## UMAP Projection

In [7]:
# Stack embeddings into a 2D array
embedding_matrix = np.vstack(df['text_embds'].values)

# UMAP dimensionality reduction
reducer = umap.UMAP(n_neighbors=30, min_dist=0.3, metric='cosine', random_state=42)
embedding_2d = reducer.fit_transform(embedding_matrix)

# Add 2D projection to original dataframe
df['x'] = embedding_2d[:, 0]
df['y'] = embedding_2d[:, 1]

# Compute label-wise centroids in original embedding space
label_centroids = df.groupby('label')['text_embds'].apply(
    lambda x: np.mean(np.vstack(x.values), axis=0)
)

# Transform centroids to 2D using UMAP
centroid_embeddings = np.vstack(label_centroids.values)
centroid_2d = reducer.transform(centroid_embeddings)
centroids_df = pd.DataFrame(centroid_2d, columns=['x', 'y'])
centroids_df['label'] = label_centroids.index

fig = go.Figure()

# Add data points grouped by label
for label in df['label'].unique():
    subset = df[df['label'] == label]
    fig.add_trace(
        go.Scatter(
            x=subset['x'],
            y=subset['y'],
            mode='markers',
            name=str(label),
            marker=dict(size=4, opacity=1.0),
            text=subset['text'],
            hoverinfo='text'
        )
    )

# Add centroids last — so they are on top
fig.add_trace(
    go.Scatter(
        x=centroids_df['x'],
        y=centroids_df['y'],
        mode='markers+text',
        name='Centroids',
        marker=dict(
            color='black',
            size=10,
            symbol='x',
            line=dict(width=2, color='white')
        ),
        text=centroids_df['label'],
        textposition='top center',
        textfont=dict(size=14, color='black'),
        hoverinfo='text'
    )
)

# Layout
fig.update_layout(
    title=f'UMAP Projection for the {dataset_name} dataset',
    width=900,
    height=700,
    showlegend=True
)

fig.show()

  warn(
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


## Lexical Analysis

In [8]:
# Function to extract nouns and verbs
def extract_nouns_verbs(text):
    doc = nlp(text)
    nouns = [token.lemma_.lower() for token in doc if token.pos_ == 'NOUN' and not token.is_stop and token.is_alpha]
    verbs = [token.lemma_.lower() for token in doc if token.pos_ == 'VERB' and not token.is_stop and token.is_alpha]
    return nouns, verbs

# Initialize result dictionary
label_stats = {}

# Enable tqdm for the outer loop over labels
for label, group in tqdm(df.groupby('label'), desc="Processing Labels"):
    all_nouns = []
    all_verbs = []

    # tqdm for inner loop over texts in each label
    for text in tqdm(group['text'], desc=f"Texts in '{label}'", leave=False):
        nouns, verbs = extract_nouns_verbs(text)
        all_nouns.extend(nouns)
        all_verbs.extend(verbs)

    noun_counts = Counter(all_nouns).most_common(3)
    verb_counts = Counter(all_verbs).most_common(3)

    label_stats[label] = {
        'top_nouns': [word for word, _ in noun_counts],
        'top_verbs': [word for word, _ in verb_counts]
    }

# Convert results to DataFrame
result_df = pd.DataFrame.from_dict(label_stats, orient='index').reset_index()
result_df.columns = ['label', 'top_nouns', 'top_verbs']
result_df

Processing Labels: 100%|██████████| 77/77 [01:07<00:00,  1.14it/s]


Unnamed: 0,label,top_nouns,top_verbs
0,Refund_not_showing_up,"[refund, statement, account]","[show, request, check]"
1,activate_my_card,"[card, activation, process]","[activate, need, get]"
2,age_limit,"[account, age, child]","[open, need, use]"
3,apple_pay_or_google_pay,"[apple, pay, watch]","[work, use, pay]"
4,atm_support,"[card, atm, money]","[use, accept, withdraw]"
...,...,...,...
72,virtual_card_not_working,"[card, work, payment]","[work, reject, use]"
73,visa_or_mastercard,"[card, visa, mastercard]","[choose, use, like]"
74,why_verify_identity,"[identity, account, verification]","[verify, need, use]"
75,wrong_amount_of_cash_received,"[cash, money, app]","[give, receive, withdraw]"


## Clustering Metrics

### Intra-Class Similarities

In [9]:
# Convert embeddings into a matrix
embedding_matrix = np.vstack(df["text_embds"].values)

# Compute cosine similarity matrix
similarity_matrix = cosine_similarity(embedding_matrix)

# Store in DataFrame for better readability
similarity_df = pd.DataFrame(similarity_matrix, index=df.index, columns=df.index)

In [10]:
intra_label_stats = defaultdict(lambda: {"Avg Similarity": 0, "Variance": 0})

labels = df['label'].unique()

for label in labels:
    indices = df[df["label"] == label].index  # Get indices of this intent
    sims = similarity_df.loc[indices, indices].values  # Extract similarity scores
    np.fill_diagonal(sims, np.nan)  # Ignore self-similarity (diagonal values)
    
    avg_sim = np.nanmean(sims)  # Compute mean similarity
    variance_sim = np.nanvar(sims)  # Compute variance
    
    intra_label_stats[label]["Avg Similarity"] = avg_sim
    intra_label_stats[label]["Variance"] = variance_sim

# Convert to DataFrame
intra_df = pd.DataFrame.from_dict(intra_label_stats, orient="index").reset_index()
intra_df.columns = ["Intent", "Avg Intra Similarity", "Variance"]

# Sort by average similarity
intra_df = intra_df.sort_values(by="Avg Intra Similarity", ascending=False)
intra_df

Unnamed: 0,Intent,Avg Intra Similarity,Variance
72,activate_my_card,0.425552,0.078733
75,cash_withdrawal_charge,0.394563,0.052797
61,card_payment_wrong_exchange_rate,0.393004,0.061460
35,card_payment_fee_charged,0.392541,0.054812
37,reverted_card_payment?,0.386511,0.047020
...,...,...,...
53,lost_or_stolen_phone,0.267720,0.045084
24,edit_personal_details,0.263792,0.050983
60,age_limit,0.262392,0.059358
19,automatic_top_up,0.248708,0.065560


In [11]:
# Averaging over all avg. intra class similarities for an overall metric
intra_df['Avg Intra Similarity'].mean()

np.float32(0.3324929)

### Inter-Category Similarities

In [12]:
# List of all unique intent labels
intent_pairs = list(itertools.combinations(labels, 2))  # All possible intent pairs

# Function to compute inter-intent similarity
def compute_inter_intent_similarity(df, intent_a, intent_b):
    # Extract embeddings for each intent
    emb_a = np.stack(df[df["label"] == intent_a]["text_embds"].values)
    emb_b = np.stack(df[df["label"] == intent_b]["text_embds"].values)
    
    # Compute cosine similarity between all utterances
    return np.mean(cosine_similarity(emb_a, emb_b))

# Compute similarity for all intent pairs and store in a dict
inter_intent_similarities = {
    (intent_a, intent_b): compute_inter_intent_similarity(df, intent_a, intent_b)
    for intent_a, intent_b in intent_pairs
}

# Create a symmetric DataFrame for the similarity matrix
intent_sim_matrix = pd.DataFrame(index=labels, columns=labels, dtype=float)

# Populate the upper triangle of the matrix (no need to fill the lower triangle)
for (intent_a, intent_b), sim in inter_intent_similarities.items():
    intent_sim_matrix.loc[intent_a, intent_b] = sim
    intent_sim_matrix.loc[intent_b, intent_a] = sim  # Symmetric

# Optionally, fill diagonal values with NaN (since comparing same intents is not necessary)
np.fill_diagonal(intent_sim_matrix.values, np.nan)

# Display the resulting similarity matrix
intent_sim_matrix

Unnamed: 0,exchange_charge,balance_not_updated_after_cheque_or_cash_deposit,beneficiary_not_allowed,pending_card_payment,top_up_limits,get_disposable_virtual_card,pending_top_up,supported_cards_and_currencies,transfer_into_account,verify_source_of_funds,...,terminate_account,get_physical_card,pin_blocked,receiving_money,virtual_card_not_working,activate_my_card,card_acceptance,atm_support,cash_withdrawal_charge,compromised_card
exchange_charge,,0.145167,0.156918,0.121305,0.086375,0.089109,0.042258,0.328414,0.293649,0.188610,...,0.103914,0.079822,0.014842,0.294437,0.018159,0.067660,0.173363,0.187416,0.282950,0.097837
balance_not_updated_after_cheque_or_cash_deposit,0.145167,,0.269969,0.372855,0.022865,0.077505,0.212218,0.198464,0.312096,0.358022,...,0.187047,0.264073,0.219082,0.226886,0.173853,0.185078,0.188935,0.249622,0.378936,0.255386
beneficiary_not_allowed,0.156918,0.269969,,0.227016,0.031706,0.053347,0.124210,0.159095,0.354014,0.210845,...,0.209143,0.129029,0.184383,0.242346,0.196983,0.200749,0.168418,0.168420,0.264472,0.221368
pending_card_payment,0.121305,0.372855,0.227016,,0.000498,0.139319,0.281746,0.199379,0.161625,0.200655,...,0.159229,0.278887,0.221339,0.153213,0.239790,0.318492,0.258724,0.220952,0.338794,0.301922
top_up_limits,0.086375,0.022865,0.031706,0.000498,,0.026241,0.357483,0.150411,0.120142,0.047859,...,0.068317,0.012163,0.070819,0.065697,-0.000697,0.054915,0.083536,0.106386,0.068382,0.049110
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
activate_my_card,0.067660,0.185078,0.200749,0.318492,0.054915,0.370095,0.190337,0.261398,0.232973,0.127782,...,0.238651,0.400994,0.354135,0.140029,0.430022,,0.440403,0.292080,0.212053,0.417200
card_acceptance,0.173363,0.188935,0.168418,0.258724,0.083536,0.374580,0.098261,0.367565,0.199796,0.162445,...,0.139489,0.360515,0.260077,0.198029,0.334096,0.440403,,0.398596,0.205923,0.367024
atm_support,0.187416,0.249622,0.168420,0.220952,0.106386,0.251309,0.092986,0.325746,0.286549,0.239574,...,0.159458,0.348529,0.269545,0.208232,0.201430,0.292080,0.398596,,0.313662,0.258718
cash_withdrawal_charge,0.282950,0.378936,0.264472,0.338794,0.068382,0.144417,0.154380,0.188886,0.282143,0.278875,...,0.212823,0.236425,0.219807,0.210203,0.167699,0.212053,0.205923,0.313662,,0.296642


In [13]:
# Assuming centroid_sim_df is already defined as a similarity matrix
# Mask the diagonal (self-similarity) by setting it to NaN
np.fill_diagonal(intent_sim_matrix.values, np.nan)

# Replace values that are 1 or 0 with NaN (self-similarity or exact dissimilarity)
intent_sim_df = intent_sim_matrix.replace({1: np.nan, 0: np.nan})

# Keep only the upper triangle to remove redundant pairs
centroid_sim_df = intent_sim_df.where(np.triu(np.ones(intent_sim_df.shape), k=1).astype(bool))

# Flatten the matrix, sort values, and get the top 100 highest similarity pairs
most_similar = centroid_sim_df.stack().nlargest(100)


# Display top 5 highest similarity pairs
print("Most Similar Pairs:")
for (pair, similarity) in most_similar.items():
    print(f"The pair '{pair[0]}' and '{pair[1]}' has a cosine similarity of {similarity:.4f}")


Most Similar Pairs:
The pair 'why_verify_identity' and 'verify_my_identity' has a cosine similarity of 0.5756
The pair 'change_pin' and 'get_physical_card' has a cosine similarity of 0.5719
The pair 'getting_virtual_card' and 'virtual_card_not_working' has a cosine similarity of 0.5706
The pair 'wrong_exchange_rate_for_cash_withdrawal' and 'card_payment_wrong_exchange_rate' has a cosine similarity of 0.5505
The pair 'get_disposable_virtual_card' and 'getting_virtual_card' has a cosine similarity of 0.5388
The pair 'card_linking' and 'activate_my_card' has a cosine similarity of 0.5347
The pair 'declined_card_payment' and 'reverted_card_payment?' has a cosine similarity of 0.5345
The pair 'exchange_charge' and 'exchange_rate' has a cosine similarity of 0.5316
The pair 'Refund_not_showing_up' and 'request_refund' has a cosine similarity of 0.5296
The pair 'failed_transfer' and 'declined_transfer' has a cosine similarity of 0.5287
The pair 'verify_my_identity' and 'unable_to_verify_identi

In [14]:
# Function to find the most similar utterance-pair from the most similar cluster pair
def find_most_similar_utterance_pair(df, top_clusters):
    most_similar_utterances = []
    
    # Loop through the most similar cluster pairs
    for cluster_1, cluster_2 in top_clusters:
        # Extract the data for both clusters
        cluster_1_entries = df[df['label'] == cluster_1]
        cluster_2_entries = df[df['label'] == cluster_2]
        
        # Get the embeddings for the utterances in these clusters
        cluster_1_embeddings = np.array(cluster_1_entries['text_embds'].tolist())
        cluster_2_embeddings = np.array(cluster_2_entries['text_embds'].tolist())
        
        # Compute cosine similarities between the utterances from the two clusters
        similarity_matrix = cosine_similarity(cluster_1_embeddings, cluster_2_embeddings)
        
        # Find the index of the most similar utterance-pair
        idx_1, idx_2 = np.unravel_index(np.argmax(similarity_matrix), similarity_matrix.shape)
        
        # Get the most similar utterance-pair and its similarity score
        similarity_score = similarity_matrix[idx_1, idx_2]
        
        # Add to the results list with cluster labels
        most_similar_utterances.append({
            'cluster_1_label': cluster_1,  # Label of cluster 1
            'cluster_2_label': cluster_2,  # Label of cluster 2
            'cluster_1_utterance': cluster_1_entries.iloc[idx_1]['text'],  # Original text from cluster 1
            'cluster_2_utterance': cluster_2_entries.iloc[idx_2]['text'],  # Original text from cluster 2
            'similarity': similarity_score
        })
    
    return most_similar_utterances

# Get the top similar clusters (using the `top_5_similar` from previous code)
# Assuming `top_5_similar.index` is a list of tuples: (cluster_1, cluster_2)
top_clusters = most_similar.index

# Find the most similar utterance-pair for the most similar clusters
most_similar_utterances = find_most_similar_utterance_pair(df, top_clusters)

# Display the most similar utterance-pair(s) for each cluster pair, including cluster labels
print("\nMost Similar Utterance Pair Between the Most Similar Clusters:")
for entry in most_similar_utterances:
    print(f"Cluster 1 (Label: '{entry['cluster_1_label']}') Utterance: '{entry['cluster_1_utterance']}'")
    print(f"Cluster 2 (Label: '{entry['cluster_2_label']}') Utterance: '{entry['cluster_2_utterance']}'")
    print(f"Cosine Similarity: {entry['similarity']:.4f}")
    print("-" * 50)



Most Similar Utterance Pair Between the Most Similar Clusters:
Cluster 1 (Label: 'why_verify_identity') Utterance: 'How to verify my identity'
Cluster 2 (Label: 'verify_my_identity') Utterance: 'How do I verify my identity?'
Cosine Similarity: 0.9854
--------------------------------------------------
Cluster 1 (Label: 'change_pin') Utterance: 'How do I change my card PIN?'
Cluster 2 (Label: 'get_physical_card') Utterance: 'How do I set-up my PIN for the new card?'
Cosine Similarity: 0.8897
--------------------------------------------------
Cluster 1 (Label: 'getting_virtual_card') Utterance: 'What do I have to do to get my virtual card?'
Cluster 2 (Label: 'virtual_card_not_working') Utterance: 'What do I have to do to get the virtual card to work?'
Cosine Similarity: 0.8843
--------------------------------------------------
Cluster 1 (Label: 'wrong_exchange_rate_for_cash_withdrawal') Utterance: 'The exchange rate applied was incorrect when I was traveling outside the country.'
Cluster

### DBI and Silhouette Score

In [15]:
# Function to calculate cosine distance
def cosine_distance(v1, v2):
    return 1 - np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

# Function to compute centroids of clusters
def compute_centroids(df, label_col, emb_col):
    centroids = []
    labels = df[label_col].unique()
    for label in labels:
        cluster = np.vstack(df[df[label_col] == label][emb_col])
        centroid = np.mean(cluster, axis=0)
        centroid = normalize(centroid.reshape(1, -1))[0]
        centroids.append(centroid)
    return np.array(centroids), labels

# Function to compute Davies-Bouldin Index using cosine distance
def dbi_cosine(df, label_col, emb_col):
    centroids, labels = compute_centroids(df, label_col, emb_col)
    n_clusters = len(centroids)
    S = np.zeros(n_clusters)
    M = np.full((n_clusters, n_clusters), np.inf)

    # Compute S_i and M_ij
    for i in range(n_clusters):
        cluster_i = np.vstack(df[df[label_col] == labels[i]][emb_col])
        for x in cluster_i:
            S[i] += cosine_distance(x, centroids[i])
        S[i] /= len(cluster_i)

        for j in range(i + 1, n_clusters):
            M[i, j] = M[j, i] = cosine_distance(centroids[i], centroids[j])

    # Compute DBI
    DBI = np.mean([max((S[i] + S[j]) / M[i, j] for j in range(n_clusters) if i != j) for i in range(n_clusters)])
    return DBI

In [16]:
# Calculate DBI
dbi_value = dbi_cosine(df, 'label', 'text_embds')
print(f"Davies-Bouldin Index (cosine): {dbi_value}")

Davies-Bouldin Index (cosine): 2.469843970385049


In [17]:
# Silhouette Score already supports cosine as a distance metric
silhouette_score(df['text_embds'].tolist(),df['label'].tolist(),metric='cosine')

np.float32(0.15642716)