# NLP Clustering for Modal Spaces
Author: Eli Hecht. Adapted from Alina Dracheva's code
Purpose: Compute text embeddings and clusters for possibility generation and decision responses.

## Imports

In [None]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import pandas as pd


from sentence_transformers import SentenceTransformer #load the model
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

import umap
from sklearn.cluster import KMeans
from scipy.spatial import distance_matrix
from sklearn.cluster import DBSCAN
# from sklearn.metrics import silhouette_score
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import numpy as np
import plotly.graph_objects as go
import plotly.express as px
# import kaleido
import plotly.io as pio
import matplotlib.pyplot as plt

## Define analysis functions

In [None]:
# Strip pronouns and caller-specified words from a sentence
def filter_words(sentence, words_to_remove):
    """Return `sentence` with pronouns and the given words removed.

    The sentence is tokenized with `word_tokenize` and tagged with
    `nltk.pos_tag`. A token is dropped when its tag is 'PRP' or 'PRP$', or
    when its lowercase form equals the lowercase form of any entry in
    `words_to_remove`. The surviving tokens are joined with single spaces,
    so the original spacing around punctuation is not preserved.
    """
    tagged_tokens = nltk.pos_tag(word_tokenize(sentence))

    # Case-insensitive comparison against the words supplied by the caller
    banned = [entry.lower() for entry in words_to_remove]

    kept = []
    for token, pos in tagged_tokens:
        if pos in ('PRP', 'PRP$'):
            continue
        if token.lower() in banned:
            continue
        kept.append(token)

    return ' '.join(kept)

In [None]:
# Embed the (cleaned) responses belonging to one context
def compute_embeddings(df, text_column, context_number, words_to_remove):
    """Compute sentence embeddings for the rows of `df` in a single context.

    Rows whose 'context' column equals `context_number` are selected and
    re-indexed (the previous index is kept as a column by `reset_index`).
    The values of `text_column` are cast to `str`, passed through
    `filter_words` with `words_to_remove`, and written back to that column.
    The cleaned texts are then encoded with the module-level `model`.

    Returns a tuple `(text_embeddings, texts_df)`: the output of
    `model.encode` and the filtered data frame holding the cleaned texts.
    """
    in_context = df['context'] == context_number
    texts_df = df[in_context].reset_index()

    def _clean(raw_text):
        return filter_words(raw_text, words_to_remove)

    cleaned = texts_df[text_column].astype(str).apply(_clean)
    texts_df[text_column] = cleaned

    text_embeddings = model.encode(cleaned.tolist(), show_progress_bar=True)
    return text_embeddings, texts_df



In [None]:
# Project the embeddings with UMAP: once for clustering, once in 2-D for plotting
def reduce_dimensions(texts_df, text_embeddings, umap_params):
    """Reduce `text_embeddings` with UMAP and record the result on `texts_df`.

    Two independent UMAP fits are run with the same 'n_neighbors', 'metric',
    'min_dist' and 'random_state' taken from `umap_params`:

    * one with `umap_params['n_components']` dimensions, stored as columns
      'umap_dim_<i>' plus a per-row list in 'umap_embedding_list';
    * one with 2 dimensions, stored as columns 'umap_dim2_0' and
      'umap_dim2_1' (used for plotting).

    `texts_df` is modified in place. Only the first (n_components) embedding
    array is returned.
    """
    shared_kwargs = {
        key: umap_params[key]
        for key in ('n_neighbors', 'metric', 'min_dist', 'random_state')
    }

    # Embedding used for clustering
    reducer = umap.UMAP(n_components=umap_params['n_components'], **shared_kwargs)
    umap_embeddings = reducer.fit_transform(text_embeddings)

    dim_columns = []
    for dim in range(umap_embeddings.shape[1]):
        column = f'umap_dim_{dim}'
        texts_df[column] = umap_embeddings[:, dim]
        dim_columns.append(column)

    # One list of coordinates per row, for the centroid distance calculation
    texts_df['umap_embedding_list'] = texts_df[dim_columns].apply(lambda row: row.tolist(), axis=1)

    # Separate two-dimensional embedding, used only for plotting
    plot_reducer = umap.UMAP(n_components=2, **shared_kwargs)
    umap_embeddings_2 = plot_reducer.fit_transform(text_embeddings)
    for dim in range(umap_embeddings_2.shape[1]):
        texts_df[f'umap_dim2_{dim}'] = umap_embeddings_2[:, dim]

    return umap_embeddings

In [None]:
# computes clusters on reduced embeddings
def compute_clusters(umap_embeddings, clustering_params):
    """Fit a KMeans or DBSCAN model to `umap_embeddings`.

    `clustering_params['algorithm_name']` selects the algorithm and must be
    exactly 'KMeans' or 'DBSCAN' (the comparison is case-sensitive):

    * 'KMeans' reads `clustering_params['num_clusters']`;
    * 'DBSCAN' reads `clustering_params['eps']` and
      `clustering_params['min_samples']`.

    Returns a tuple `(labels, clustering_model)`: the fitted model's
    `labels_` attribute and the fitted model itself.

    Raises ValueError for any other algorithm name.
    """
    algorithm = clustering_params['algorithm_name']

    if algorithm == 'KMeans':
        # initialize KMeans model
        clustering_model = KMeans(n_clusters=clustering_params['num_clusters'], n_init='auto')
    elif algorithm == 'DBSCAN':
        # initialize DBSCAN model
        clustering_model = DBSCAN(eps=clustering_params['eps'], min_samples=clustering_params['min_samples'])
    else:
        # Name the accepted spellings exactly as they are compared above.
        raise ValueError(
            f"Invalid clustering method {algorithm!r}. "
            "Supported methods are 'KMeans' and 'DBSCAN' (case-sensitive)."
        )

    # perform clustering using specified parameters
    clustering_model.fit(umap_embeddings)

    return clustering_model.labels_, clustering_model

In [None]:
# function to print clusters for row by row evaluation
def print_clusters(texts_df, text_column_name, cluster_assignment, num_clusters):
    """Print the texts of each cluster, one list per cluster.

    `cluster_assignment` is paired positionally with the rows of
    `texts_df[text_column_name]`. For each text, everything up to and
    including the first ':' is removed (the shared context prefix) and the
    remainder is stripped of surrounding whitespace.

    Clusters 0..num_clusters-1 are printed as "Cluster  1", "Cluster  2", ...
    Negative labels (DBSCAN marks noise with -1) are collected separately
    and printed under "Noise (unclustered)" instead of being appended to the
    last cluster through negative list indexing.

    Raises IndexError if a non-negative label is >= `num_clusters`.
    """
    texts = texts_df[text_column_name].tolist()

    clustered_sentences = [[] for _ in range(num_clusters)]
    noise = []
    for text, cluster_id in zip(texts, cluster_assignment):
        # removes identical context that is included in text of each result
        response = text.split(":", 1)[-1].strip()
        if cluster_id < 0:
            noise.append(response)
        else:
            clustered_sentences[cluster_id].append(response)

    for i, cluster in enumerate(clustered_sentences):
        print("Cluster ", i+1)
        print(cluster)
        print("")

    if noise:
        print("Noise (unclustered)")
        print(noise)
        print("")

In [None]:
# Elbow plot of KMeans inertia, used to pick a number of clusters
def k_plot(text_embeddings, umap_params, directory):
    """Save an elbow plot of KMeans inertia for k = 1..17.

    `text_embeddings` is first reduced with UMAP using the 'n_neighbors',
    'n_components', 'metric', 'min_dist' and 'random_state' entries of
    `umap_params`. A KMeans model is then fitted for every k in the range
    and its `inertia_` recorded. The resulting curve is written with
    `plt.savefig(directory)` (so `directory` is the output path handed to
    matplotlib) and the current figure is cleared afterwards.
    """
    K_range = range(1, 18)

    reducer = umap.UMAP(
        n_neighbors=umap_params['n_neighbors'],
        n_components=umap_params['n_components'],
        metric=umap_params['metric'],
        min_dist=umap_params['min_dist'],
        random_state=umap_params['random_state'],
    )
    umap_embeddings = reducer.fit_transform(text_embeddings)

    # One fitted model per k; keep only its inertia
    distortions = [
        KMeans(n_clusters=k, n_init='auto').fit(umap_embeddings).inertia_
        for k in K_range
    ]

    # Draw the elbow curve, save it, then reset the figure for the next call
    plt.plot(K_range, distortions, marker='o')
    plt.title('Elbow Method For Optimal k')
    plt.xlabel('Number of Clusters (k)')
    plt.ylabel('Sum of Squared Distances')
    plt.savefig(directory)
    plt.clf()

In [None]:
# create a df with clusters, centroids and umap embeddings
def compute_centroid(texts_df, text_column_name, original_text_column_name, clustering_params, cluster_assignment, clustering_model):
    """Attach cluster labels, centroid distances and cluster names to `texts_df`.

    `texts_df` is modified in place. Columns written:

    * 'cluster' - the entries of `cluster_assignment`, aligned to the rows;
    * 'centroid' - only when `clustering_params['algorithm_name']` is
      'KMeans': the row's entry of `clustering_model.cluster_centers_`;
    * 'distance_from_centroid' - for KMeans, the distance between the row's
      'umap_embedding_list' and its centroid (via `distance_matrix`);
      NaN for any other algorithm;
    * 'cluster_name' - the `original_text_column_name` value of the
      cluster's representative row.

    The representative of a cluster is its row with the smallest
    'distance_from_centroid'. A stable sort is used, so ties (and the
    all-NaN non-KMeans case) resolve to the first such row in `texts_df`.

    Returns a dict mapping each representative's `text_column_name` value to
    its cluster label. If two clusters have representatives with identical
    text, the dict keeps the first one only (dict keys must be unique); the
    'cluster_name' column is still filled for every cluster because it is
    keyed by cluster label rather than by text.
    """
    # attach the cluster assignments to the dataframe
    texts_df['cluster'] = pd.Series(cluster_assignment, index=texts_df.index)

    # DBSCAN has no applicable concept of centroids so this is just performed on clusters generated by kMeans
    if clustering_params['algorithm_name'] == 'KMeans':
        # In sklearn, the cluster centers are available directly via clustering_model.cluster_centers_
        texts_df['centroid'] = texts_df['cluster'].apply(lambda x: clustering_model.cluster_centers_[x])

        def distance_from_centroid(row):
            return distance_matrix([row['umap_embedding_list']], [row['centroid']])[0][0]
        texts_df['distance_from_centroid'] = texts_df.apply(distance_from_centroid, axis=1)
    else:
        texts_df['distance_from_centroid'] = np.nan

    # One representative row per cluster: the closest to the centroid.
    # mergesort is stable, which makes tie-breaking deterministic.
    representatives = (
        texts_df.sort_values('distance_from_centroid', ascending=True, kind='mergesort')
        .groupby('cluster')
        .head(1)
        .sort_index()
    )

    # Read label and texts off the representative rows directly instead of
    # looking them up again by text equality, which mixes clusters up when
    # the same text occurs in more than one cluster.
    clusters = {}
    id_to_original_response = {}
    for summary_text, cluster_id, original_response in zip(
        representatives[text_column_name],
        representatives['cluster'],
        representatives[original_text_column_name],
    ):
        clusters.setdefault(summary_text, cluster_id)
        id_to_original_response[cluster_id] = original_response

    # Map original response names of centroids to cluster names
    texts_df['cluster_name'] = texts_df['cluster'].map(id_to_original_response)

    return clusters

In [None]:
# Function to plot clusters on the joint embedding space

def plot_clusters(component1, component2, cluster, name, response,
                   data_source, 
                   umap_params, clustering_params,
                   plot_location):
    """Write an interactive scatter plot of the clusters to an HTML file.

    `component1`, `component2`, `cluster`, `name`, `response` and
    `data_source` are used as aligned pandas Series (they are indexed with a
    boolean mask and `.unique()` / `.iloc` are called on them): the x and y
    coordinates, the cluster label, the cluster display name, the response
    shown on hover and the data source of every point.

    Points are coloured by cluster and shaped by data source. One source is
    drawn with squares; with two sources the first (in sorted order) is a
    diamond and the second a square; with more sources symbols are taken in
    sorted order from a fixed cycle starting with diamond and square.

    The title is built from `umap_params` ('n_neighbors', 'n_components',
    'min_dist') and `clustering_params` ('algorithm_name', 'num_clusters',
    plus 'eps' and 'min_samples' when the algorithm is 'DBSCAN').

    Side effects: sets `pio.renderers.default` to "browser" and writes the
    figure to `plot_location` with `fig.write_html`.
    """
    pio.renderers.default = "browser"
    color_palette = px.colors.qualitative.Light24

    fig = go.Figure()

    title_str = f"UMAP Parameters: n_neighbors={umap_params['n_neighbors']}, n_components={umap_params['n_components']}, min_dist={umap_params['min_dist']} | Clustering: {clustering_params['algorithm_name']} with k={clustering_params['num_clusters']} clusters"
    if clustering_params['algorithm_name'] == 'DBSCAN':
        title_str += f" at EPS={clustering_params['eps']} and min_samples={clustering_params['min_samples']}"

    # Get unique clusters and data sources
    unique_clusters = sorted(cluster.unique())
    unique_data_sources = sorted(data_source.unique())

    color_map = {uc: color_palette[i % len(color_palette)] for i, uc in enumerate(unique_clusters)}

    if len(unique_data_sources) == 1:
        marker_symbols = {
            unique_data_sources[0]: 'square'
        }
    elif len(unique_data_sources) == 2:
        marker_symbols = {
            unique_data_sources[1]: 'square',
            unique_data_sources[0]: 'diamond'
        }
    else:
        # Any other number of sources: cycle through a fixed symbol list so
        # marker_symbols is always defined (it is empty when there is no data).
        symbol_cycle = ['diamond', 'square', 'circle', 'cross', 'x',
                        'triangle-up', 'triangle-down', 'star']
        marker_symbols = {
            ds: symbol_cycle[i % len(symbol_cycle)]
            for i, ds in enumerate(unique_data_sources)
        }

    # Add an empty trace for each source so its marker shape appears in the legend
    for ds, symbol in marker_symbols.items():
        fig.add_trace(go.Scatter(
            x=[None],
            y=[None],
            mode='markers',
            marker=dict(
                size=10,
                symbol=symbol,
            ),
            name=f'{ds.capitalize()} (shape)'
        ))

    # Each cluster name gets a single legend entry even when the cluster
    # contains points from several data sources.
    added_cluster_names = set()
    for ds in unique_data_sources:
        for uc in unique_clusters:
            mask = (cluster == uc) & (data_source == ds)
            if not mask.any():
                continue

            cluster_label = name[mask].iloc[0]
            show_in_legend = cluster_label not in added_cluster_names
            added_cluster_names.add(cluster_label)

            fig.add_trace(go.Scatter(
                x=component1[mask],
                y=component2[mask],
                mode='text+markers',
                name=cluster_label if show_in_legend else None,
                legendgroup=f'group{uc}',
                showlegend=show_in_legend,
                hovertext=str(uc) + ": " + response[mask],
                marker=dict(
                    size=12,
                    color=color_map[uc],
                    symbol=marker_symbols[ds],  # Use the marker symbol based on the data source
                    line_width=1,
                    opacity=1
                ),
                textfont=dict(
                    size=10,
                    color='black'
                )
            ))

    fig.update_layout(
        margin=dict(l=100, r=100, b=100, t=100),
        width=2000,
        height=1200,
        showlegend=True,
        title=title_str,
        paper_bgcolor='white',  # White background for the entire plot area
        plot_bgcolor='white',
        legend=dict(
            yanchor="top",
            y=1,
            xanchor="left",
            x=0.01,
            bgcolor='rgba(255,255,255,0)'
        )
    )

    fig.layout.template = 'ggplot2'
    fig.write_html(plot_location)

## Load in data

In [None]:
# Agent names, one per scenario; the run loop reads entry [scenario_number - 1]
agent_list = [
    'Heinz', 'Josh', 'Brian', 'Liz', 'Mary', 'Brad',
    'Darya', 'Eunice', 'Eamon', 'Cameron', 'Erica', 'Carl',
    'Daniel', 'Andy', 'Ahmed', 'Eva', 'Jeff', 'Shania',
]

In [None]:
# load first-person decision study data
df_decision = pd.read_csv('../data/decision.csv')

# add id column: 1-based row number in the raw file, assigned before any
# filtering so that exclude_ids below refers to these original row numbers
df_decision = df_decision.reset_index().rename(columns={'index': 'id'})
df_decision['id'] += 1

# select only participants who finished
df_decision = df_decision[df_decision['finished']]

# exclude ids of participants who gave non-sensical responses
# (the filtered frame must be assigned back, otherwise nothing is excluded)
exclude_ids = [21, 64, 72, 74, 84, 86, 89]
df_decision = df_decision[~df_decision['id'].isin(exclude_ids)].reset_index(drop=True)

# Select columns 'id', 'S1_1' to 'S18_1'
df_decision = df_decision[['id'] + [f'S{i}_1' for i in range(1, 19)]]

# Melt the DataFrame to long format
df_decision = pd.melt(df_decision, id_vars=['id'], var_name='context', value_name='decision')

# Extract the scenario number (first run of digits) from names such as 'S12_1'.
# Raw string avoids the invalid-escape warning; expand=False yields a Series.
df_decision['context'] = df_decision['context'].str.extract(r'(\d+)', expand=False).astype(int)

# drop empty responses and use the column name shared with the other study
df_decision = df_decision.dropna(subset=['decision']).rename(columns={'decision': 'response'})

# add source column indicating that this is from the decision study
df_decision['source'] = 'decision'

In [None]:
# Third-person possibility generation study: read the coded file, keep the
# columns used downstream, call the response text 'response' and tag the source
df_pg = (
    pd.read_csv('../manualCoding/pg_coded_final.csv', index_col=0)
    .loc[:, ['context', 'id', 'answer', 'text', 'value']]
    .rename(columns={"text": "response"})
)
df_pg['source'] = 'pg'

In [None]:
# Combine the decision study and possibility generation study into a single
# data frame for clustering (outer join on the columns both frames share,
# so rows from either study are kept)
df = df_decision.merge(df_pg, how='outer')

In [None]:
# Prefix every response with its full scenario text ('merged_text') so the
# language model sees the context the response was given in
contexts_list = pd.read_csv('../materials/contextsTable.csv', index_col=0)['text']

# Look up the scenario text by context number (the index of contexts_list)
df_merge = df.merge(contexts_list, left_on='context', right_index=True)
df_merge['merged_text'] = df_merge['text'] + ' : ' + df_merge['response']

# Keep the untouched response and the scenario text under explicit names
df_merge = df_merge.rename(columns={"response": "response_original", "text": "scenario_text"})

df = df_merge

In [None]:
# Notebook display of the combined data frame
df

## Compute embeddings and Analyze clusters

In [None]:
### To run the analyses edit the values in this cell then run this cell and the one below


# directory to send resulting plots and clusters
# NOTE(review): `dir` shadows the Python builtin of the same name; the name is
# kept because the run cell below reads it.
dir = "pg_decision_clusters/"

## data sources list: "pg" (just Study 1 possibility generation data), "decision" (just Study 2 decision data)
## For convergence of participant responses, just "pg" should be in sources.
## For clustering to model decision likelihood both "pg" and "decision" should be included in sources.
sources = ["pg", "decision"]
df_clustering = df[df['source'].isin(sources)]

## text_column_name is the column the embeddings will be performed on
## text_column_name options: 'response_original' or 'merged_text'
# merged_text gives context by merging response with the scenario text and generally gives better results
text_column_name = 'merged_text'
original_text_column_name = 'response_original' # this is used so you can keep track of original texts for labelling


# Define UMAP and clustering parameters
# random_state is None, so the UMAP projection differs from run to run
umap_params = {'n_neighbors': 100, 'n_components': 10, 'metric': 'cosine', 'min_dist': 0.05, 'random_state': None}


# Define range of k values to test kMeans clustering at
k_range = range(1, 19)

# Define range of epsilon and min_samples values to test DBSCAN clustering at
# Each eps value is listed once: a repeated value only reruns the same DBSCAN
# settings and overwrites the same output folder.
# NOTE(review): 0.6 is not in the list - confirm whether that is intended.
eps_list = [0.2, 0.3, 0.4, 0.5, 0.7, 0.8, 0.9]
samples_list = [3,4,5,6,7,8,9,10]

In [None]:
# Create directories to store results in
os.makedirs(dir, exist_ok=True)


# loop through each context
for scenario_number in range(1,19):
    print(f"Computing embeddings for scenario {scenario_number}")
    # select appropriate agent name
    agent_name = agent_list[scenario_number-1]
    # words that will be removed from every response before embedding
    words_to_remove = [agent_name, 'should', 'would', 'could']

    # Strip each response for context of words_to_remove and compute SBERT embeddings
    text_embeddings, texts_df = compute_embeddings(df_clustering, text_column_name, scenario_number, words_to_remove)
    # reduce dimensionality of embeddings according to predetermined parameters
    umap_embeddings = reduce_dimensions(texts_df, text_embeddings, umap_params)



    ### compute DBSCAN clustering on embeddings ###
    print("Performing DBSCAN clustering on embeddings")

    for eps in eps_list:
        for min_samples in samples_list:
            # create directories to store resulting plots and tables
            # (the trailing "" keeps a path separator at the end of tempdir)
            tempdir = os.path.join(dir, f"eps{eps}_samp{min_samples}", "")
            plots_dir = os.path.join(tempdir, "plots")
            tables_dir = os.path.join(tempdir, "tables")
            os.makedirs(plots_dir, exist_ok=True)
            os.makedirs(tables_dir, exist_ok=True)

            clustering_params = {'algorithm_name': 'DBSCAN', 'eps': eps, 'min_samples': min_samples}

            # compute clusters on text embeddings
            cluster_assignment, clustering_model = compute_clusters(umap_embeddings, clustering_params)

            # store number of clusters generated by DBSCAN, not including those marked as outliers
            num_clusters = len(set(cluster_assignment)) - (1 if -1 in cluster_assignment else 0)
            clustering_params['num_clusters'] = num_clusters

            # Work on a fresh copy for every parameter combination so that a
            # combination yielding <= 1 cluster is not saved with the cluster
            # columns left over from the previous combination.
            result_df = texts_df.copy()
            result_df['cluster'] = cluster_assignment

            if num_clusters > 1:
                # name each cluster
                clusters = compute_centroid(result_df, text_column_name, original_text_column_name, clustering_params, cluster_assignment, clustering_model)

                # Plot clusters with the appropriate DataFrame columns
                plot_clusters(
                    result_df['umap_dim2_0'],
                    result_df['umap_dim2_1'],
                    result_df['cluster'],
                    result_df["cluster_name"],
                    result_df["response_original"],
                    result_df["source"],
                    umap_params,
                    clustering_params,
                    os.path.join(plots_dir, f"S{scenario_number}.html")
                )
            # save the table for this scenario and parameter combination
            result_df.to_csv(os.path.join(tables_dir, f"S{scenario_number}.csv"))
