## Head
Author: Eli Hecht
Purpose: adapt Alina's sentence embedding code to group decision-study responses

## Imports

In [None]:
import os

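# Turn off parallelism in the Hugging Face tokenizers library; set before the embedding model is loaded below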
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
import pandas as pd
import json

In [None]:
# Load the pretrained sentence-embedding model once; compute_embeddings() below
# reads it through the module-level name `model`.

from sentence_transformers import SentenceTransformer  # SBERT model wrapper
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')


In [None]:
import umap
from sklearn.cluster import KMeans
from scipy.spatial import distance_matrix

In [None]:
## filtering for personal words

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

In [None]:
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
# import kaleido
import plotly.io as pio
import matplotlib.pyplot as plt

## Define analysis functions

In [None]:
# function to build directories at specified location
def make_directories(dir):
    """Create the output folder plus its ``plots/`` and ``cluster_tables/`` subfolders.

    Args:
        dir: Output location as a string. The subfolder names are appended by
            plain string concatenation, so it is expected to end with a path
            separator (e.g. ``"numManual/"``).

    Folders that already exist are left untouched; missing parent folders are
    created as well.
    """
    # NOTE: `dir` shadows the builtin, but the parameter name is kept so that
    # existing callers (including keyword callers) keep working.
    for folder in (dir, dir + 'plots/', dir + 'cluster_tables/'):
        # makedirs(exist_ok=True) replaces the check-then-mkdir pairs: it also
        # builds missing parents and does not fail if the folder already exists.
        os.makedirs(folder, exist_ok=True)

In [None]:
# A function to filter specified words  and pronouns

def filter_words(sentence, words_to_remove):
    """Return `sentence` without pronouns and without the given words.

    The sentence is tokenized and part-of-speech tagged with NLTK. A token is
    dropped when its tag is 'PRP' or 'PRP$', or when its lower-cased form
    matches (case-insensitively) an entry of `words_to_remove`. The surviving
    tokens are joined back together with single spaces.
    """
    tagged_tokens = nltk.pos_tag(word_tokenize(sentence))

    # POS tags treated as personal words
    pronoun_tags = {'PRP', 'PRP$'}

    # Lower-cased copy of the caller's list so the comparison ignores case
    blocked = [entry.lower() for entry in words_to_remove]

    kept = []
    for token, tag in tagged_tokens:
        if tag in pronoun_tags:
            continue
        if token.lower() in blocked:
            continue
        kept.append(token)

    return ' '.join(kept)

In [None]:
# function to compute embeddings for a single vignette

def compute_embeddings(df, text_column, context_number, words_to_remove):
    """Embed the responses of one scenario with the module-level `model`.

    Rows of `df` whose 'context' equals `context_number` are selected and
    re-indexed (the previous index is kept as a column by `reset_index()`).
    The `text_column` values are cast to str, cleaned with `filter_words`,
    and encoded.

    Returns:
        (text_embeddings, texts_df): the output of `model.encode` and the
        filtered frame whose `text_column` now holds the cleaned text.
    """
    in_context = df['context'] == context_number
    texts_df = df[in_context].reset_index()

    # Cast every response to str, then strip pronouns and the given words
    as_text = texts_df[text_column].astype(str)
    texts_df[text_column] = as_text.map(
        lambda raw: filter_words(raw, words_to_remove)
    )

    # One embedding per cleaned response
    text_embeddings = model.encode(
        texts_df[text_column].tolist(),
        show_progress_bar=True,
    )
    return text_embeddings, texts_df



In [None]:
# dimensionality reduction with pre-defined parameters
def compute_clusters(text_embeddings, clustering_params, umap_params):
    """Reduce the embeddings with UMAP, then cluster the reduced points with KMeans.

    Args:
        text_embeddings: Embeddings handed to ``UMAP.fit_transform``.
        clustering_params: Dict with ``'num_clusters'``. An optional
            ``'random_state'`` entry seeds KMeans; when absent KMeans is left
            unseeded, as before.
        umap_params: Dict with ``'n_neighbors'``, ``'n_components'``,
            ``'metric'``, ``'min_dist'`` and ``'random_state'``.

    Returns:
        (umap_embeddings, labels, clustering_model): the reduced embeddings,
        the per-row cluster labels and the fitted KMeans model.
    """
    # reduce dimensionality of text embeddings
    reducer = umap.UMAP(n_neighbors=umap_params['n_neighbors'],
                        n_components=umap_params['n_components'],
                        metric=umap_params['metric'],
                        min_dist=umap_params['min_dist'],
                        random_state=umap_params['random_state'])
    umap_embeddings = reducer.fit_transform(text_embeddings)

    # .get() keeps existing parameter dicts valid: a missing key means
    # random_state=None, which is the KMeans default.
    clustering_model = KMeans(n_clusters=clustering_params['num_clusters'],
                              n_init='auto',
                              random_state=clustering_params.get('random_state'))

    # clustering is done in the reduced space, not on the raw embeddings
    clustering_model.fit(umap_embeddings)
    return umap_embeddings, clustering_model.labels_, clustering_model

In [None]:
# function to print clusters for row by row evaluation
def print_clusters(texts_df, text_column_name, cluster_assignment, numb_clusters):
    """Print the members of every cluster, one cluster per paragraph.

    `cluster_assignment[i]` is the cluster of the row with label `i` in
    `texts_df[text_column_name]`. Before printing, everything up to and
    including the first ':' of each text is cut off (this removes the shared
    context prefix); texts without a ':' are printed whole.
    """
    # One bucket per cluster id, filled in row order
    buckets = [[] for _ in range(numb_clusters)]
    for row_label, bucket_id in enumerate(cluster_assignment):
        buckets[bucket_id].append(texts_df[text_column_name][row_label])

    for number, members in enumerate(buckets, start=1):
        print("Cluster ", number)
        print([member.split(":", 1)[-1].strip() for member in members])
        print("")

In [None]:
# function to create elbow-plot to identify ideal number of clusters
def k_plot(text_embeddings, umap_params, directory, max_k=17):
    """Save an elbow plot (KMeans inertia against k) for one set of embeddings.

    Args:
        text_embeddings: Embeddings handed to ``UMAP.fit_transform``.
        umap_params: Dict with ``'n_neighbors'``, ``'n_components'``,
            ``'metric'``, ``'min_dist'`` and ``'random_state'``.
        directory: Path of the image file to write (passed to ``plt.savefig``).
        max_k: Largest number of clusters to try. The default of 17 matches
            the range that was previously hard-coded.
    """
    umap_embeddings = (umap.UMAP(n_neighbors=umap_params['n_neighbors'],
                                 n_components=umap_params['n_components'],
                                 metric=umap_params['metric'],
                                 min_dist=umap_params['min_dist'],
                                 random_state=umap_params['random_state'])
                       .fit_transform(text_embeddings))

    # KMeans cannot fit more clusters than there are samples, so cap the range
    # instead of failing on scenarios with few responses.
    n_samples = umap_embeddings.shape[0]
    K_range = range(1, min(max_k, n_samples) + 1)

    # perform KMeans for each k value and record its inertia
    distortions = []
    for k in K_range:
        clustering_model = KMeans(n_clusters=k, n_init='auto')
        clustering_model.fit(umap_embeddings)
        distortions.append(clustering_model.inertia_)

    # Plotting the elbow curve
    plt.plot(K_range, distortions, marker='o')
    plt.title('Elbow Method For Optimal k')
    plt.xlabel('Number of Clusters (k)')
    plt.ylabel('Sum of Squared Distances')
    # saves figure to the specified file
    plt.savefig(directory)
    # clear the current figure so the next call starts from an empty canvas
    plt.clf()

In [None]:
# create a df with clusters, centroids and umap embeddings
def compute_centroid(texts_df, text_column_name, cluster_assignment, clustering_model, umap_embeddings):
    """Annotate `texts_df` with cluster data and pick one representative text per cluster.

    Columns added to `texts_df` in place: 'cluster', 'centroid', one
    'umap_dim_<i>' column per UMAP dimension, 'umap_embedding_list' and
    'distance_from_centroid'.

    Args:
        texts_df: DataFrame with one row per text, in the same order as
            `cluster_assignment` and `umap_embeddings`.
        text_column_name: Column holding the text used as the cluster summary.
        cluster_assignment: Cluster label of every row.
        clustering_model: Fitted model exposing `cluster_centers_`.
        umap_embeddings: 2-D array of reduced embeddings, one row per text.

    Returns:
        dict mapping the text closest to each cluster centroid to that
        cluster's number. If the same text is the representative of more than
        one cluster, the dict can hold only one of them.
    """
    # attach the cluster assignments to the dataframe
    texts_df['cluster'] = pd.Series(cluster_assignment, index=texts_df.index)

    # attach each row's cluster centre (sklearn exposes them as cluster_centers_)
    centers = clustering_model.cluster_centers_
    texts_df['centroid'] = texts_df['cluster'].apply(lambda x: centers[x])

    # Split the UMAP embeddings into individual columns for easier processing later
    umap_list = [f'umap_dim_{i}' for i in range(umap_embeddings.shape[1])]
    for i, column in enumerate(umap_list):
        texts_df[column] = umap_embeddings[:, i]

    # Collect the per-dimension columns back into one list per row for the distance calculation
    texts_df['umap_embedding_list'] = texts_df[umap_list].apply(lambda row: row.tolist(), axis=1)

    # Distance of each embedding from its own cluster's centroid
    def distance_from_centroid(row):
        return distance_matrix([row['umap_embedding_list']], [row['centroid']])[0][0]
    texts_df['distance_from_centroid'] = texts_df.apply(distance_from_centroid, axis=1)

    # The row closest to each centroid serves as that cluster's summary
    closest = (texts_df.sort_values('distance_from_centroid', ascending=True)
               .groupby('cluster')
               .head(1)
               .sort_index())

    # Take the cluster number from the selected row itself. Looking it up again
    # by text would return the first row with that text, which may sit in a
    # different cluster when responses are duplicated.
    return dict(zip(closest[text_column_name].tolist(), closest['cluster'].to_numpy()))

In [None]:
# reduce dimensionality of text vector embeddings to 2 for visualization
def umap_2(texts_df, text_embeddings, clusters, umap_params):
    """Add a 2-D UMAP projection and a readable cluster name to `texts_df` (in place).

    New columns: 'umap_dim2_0' and 'umap_dim2_1' hold the projection, and
    'cluster_name' holds the representative text of the row's cluster with
    everything up to the first ':' removed. `clusters` maps representative
    text -> cluster number; `texts_df` must already have a 'cluster' column.
    Nothing is returned.
    """
    projector = umap.UMAP(
        n_neighbors=umap_params['n_neighbors'],
        n_components=2,  # fixed at two dimensions for plotting
        metric=umap_params['metric'],
        min_dist=umap_params['min_dist'],
        random_state=umap_params['random_state'],
    )
    umap_embeddings_2 = projector.fit_transform(text_embeddings)

    for axis in range(umap_embeddings_2.shape[1]):
        texts_df[f'umap_dim2_{axis}'] = umap_embeddings_2[:, axis]

    # Invert the mapping: cluster number -> representative text
    id_to_name = dict(zip(clusters.values(), clusters.keys()))

    def strip_context(label):
        # keep only what follows the first ':'
        return label.split(":", 1)[-1].strip()

    texts_df['cluster_name'] = texts_df['cluster'].map(id_to_name)
    texts_df['cluster_name'] = texts_df['cluster_name'].apply(strip_context)

In [None]:
# Function to plot clusters on the joint embedding space

def plot_clusters(component1, component2, cluster, name, response,
                   data_source, 
                   umap_params, clustering_params,
                   plot_location):
    """Write an interactive scatter plot of the clustered responses to an HTML file.

    Colour encodes the cluster and marker shape encodes the data source.

    Args:
        component1, component2: x and y coordinates of every point.
        cluster: Cluster number of every point.
        name: Display name of each point's cluster (used as the legend entry).
        response: Text shown on hover, prefixed with the cluster number.
        data_source: Source label of every point; selects the marker shape.
        umap_params: Dict with 'n_neighbors', 'n_components' and 'min_dist' (shown in the title).
        clustering_params: Dict with 'algorithm_name' and 'num_clusters' (shown in the title).
        plot_location: Path of the HTML file to write.

    The first six arguments are used with `.unique()`, boolean masks and
    `.iloc`, so they are expected to be pandas Series sharing one index.
    """
    pio.renderers.default = "browser"
    color_palette = px.colors.qualitative.Light24

    fig = go.Figure()

    title_str = (
        f"UMAP Parameters: n_neighbors={umap_params['n_neighbors']}, "
        f"n_components={umap_params['n_components']}, "
        f"min_dist={umap_params['min_dist']} | "
        f"Clustering: {clustering_params['algorithm_name']} "
        f"with k={clustering_params['num_clusters']} clusters"
    )

    # Get unique clusters and data sources
    unique_clusters = sorted(cluster.unique())
    unique_data_sources = sorted(data_source.unique())

    color_map = {uc: color_palette[i % len(color_palette)] for i, uc in enumerate(unique_clusters)}

    # One marker shape per data source. The first two entries reproduce the
    # original assignment (first source 'diamond', second 'square'); cycling
    # through the list means one source, or more than two, also works.
    symbol_cycle = ['diamond', 'square', 'circle', 'cross', 'x', 'triangle-up', 'star']
    marker_symbols = {
        ds: symbol_cycle[i % len(symbol_cycle)]
        for i, ds in enumerate(unique_data_sources)
    }

    # The shape legend historically listed the second source before the first;
    # keep that order so existing plots look the same.
    legend_order = list(unique_data_sources)
    if len(legend_order) >= 2:
        legend_order[0], legend_order[1] = legend_order[1], legend_order[0]

    # Add an empty trace per source purely to show its marker shape in the legend
    for ds in legend_order:
        fig.add_trace(go.Scatter(
            x=[None],
            y=[None],
            mode='markers',
            marker=dict(
                size=10,
                symbol=marker_symbols[ds],
            ),
            name=f'{str(ds).capitalize()} (shape)'
        ))

    added_cluster_names = set()
    for ds in unique_data_sources:
        for uc in unique_clusters:
            mask = (cluster == uc) & (data_source == ds)
            if not mask.any():
                # no points for this cluster/source combination
                continue

            # each cluster name appears in the legend once, even when the
            # cluster contains points from several sources
            legend_name = name[mask].iloc[0]
            show_in_legend = legend_name not in added_cluster_names
            added_cluster_names.add(legend_name)

            fig.add_trace(go.Scatter(
                x=component1[mask],
                y=component2[mask],
                mode='text+markers',
                name=legend_name if show_in_legend else None,
                legendgroup=f'group{uc}',
                showlegend=show_in_legend,
                hovertext=str(uc) + ": " + response[mask],
                marker=dict(
                    size=12,
                    color=color_map[uc],
                    symbol=marker_symbols[ds],  # marker shape follows the data source
                    line_width=1,
                    opacity=1
                ),
                textfont=dict(
                    size=10,
                    color='black'
                )
            ))

    fig.update_layout(
        margin=dict(l=100, r=100, b=100, t=100),
        width=2000,
        height=1200,
        showlegend=True,
        title=title_str,
        paper_bgcolor='white',  # White background for the entire plot area
        plot_bgcolor='white',
        legend=dict(
            yanchor="top",
            y=1,
            xanchor="left",
            x=0.01,
            bgcolor='rgba(255,255,255,0)'
        )
    )

    fig.layout.template = 'ggplot2'
    fig.write_html(plot_location)

## Load in data

In [None]:
# Agent name per scenario: position i holds the name used for scenario i + 1
agent_list = [
    'Heinz', 'Josh', 'Brian', 'Liz', 'Mary', 'Brad',
    'Darya', 'Eunice', 'Eamon', 'Cameron', 'Erica', 'Carl',
    'Daniel', 'Andy', 'Ahmed', 'Eva', 'Jeff', 'Shania',
]

In [None]:
# load first-person decision study data
df_decision = pd.read_csv('../data/decision.csv')

# add a 1-based id column derived from the row position in the raw file
df_decision = df_decision.reset_index().rename(columns={'index': 'id'})
df_decision['id'] += 1

# select only participants who finished
df_decision = df_decision[df_decision['finished']]

# exclude ids of participants who gave nonsensical responses.
# The filtered frame has to be assigned back: boolean indexing returns a new
# frame and leaves df_decision itself unchanged.
exclude_ids = [21, 64, 72, 74, 84, 86, 89]
df_decision = df_decision[~df_decision['id'].isin(exclude_ids)].reset_index(drop=True)

# Select columns 'id', 'S1_1' to 'S18_1'
response_columns = [f'S{i}_1' for i in range(1, 19)]
df_decision = df_decision[['id'] + response_columns]

# Melt the DataFrame to long format: one row per (participant, scenario)
df_decision = pd.melt(df_decision, id_vars=['id'], var_name='context', value_name='decision')

# Extract the scenario number from the column name (e.g. 'S12_1' -> 12).
# The raw string keeps \d a regex escape; expand=False returns a Series.
df_decision['context'] = df_decision['context'].str.extract(r'(\d+)', expand=False).astype(int)

# drop empty responses
df_decision = df_decision.dropna(subset=['decision'])
df_decision = df_decision.rename(columns={'decision': 'response'})

# add source column indicating that this is from the decision study
df_decision['source'] = 'decision'

In [None]:
# load third-person possibility generation study data:
# keep the needed columns, rename 'text' to 'response' and tag every row with its source
df_pg = (
    pd.read_csv('../manualCoding/pg_coded_final.csv', index_col=0)
    [['context', 'id', 'answer', 'text', 'value']]
    .rename(columns={"text": "response"})
    .assign(source='pg')
)

In [None]:
# Merge decision study data and possibility generation study data into one data frame for clustering.
# No `on=` is given, so pandas joins on the columns both frames share (id, context,
# response, source). 'source' is 'decision' in one frame and 'pg' in the other, so no
# rows pair up and the outer join keeps every row of both frames; the columns that
# exist only in df_pg (answer, value) are NaN for the decision rows.
df = pd.merge(df_decision, df_pg, how='outer')

In [None]:
# Prefix every response with its full scenario text ('merged_text') so the
# language model sees the response in context
contexts_list = pd.read_csv('../materials/contextsTable.csv', index_col=0)['text']

# look up the scenario text through the 'context' number (index of contexts_list)
df_merge = df.merge(contexts_list, left_on='context', right_index=True)
df_merge['merged_text'] = df_merge['text'] + ' : ' + df_merge['response']
df_merge = df_merge.rename(columns={"response": "response_original", "text": "scenario_text"})

df = df_merge

In [None]:
# Bare expression: in a notebook this renders the merged DataFrame for inspection
df

## Create elbow plots to determine appropriate number of clusters per context

In [None]:
# text_column_name options: 'response_original' or 'merged_text'
# merged_text gives context by merging response with the scenario text and is preferred
text_column_name = 'merged_text'


# directory to place resulting plots and clusters (must end with '/')
dir = "numElbow/"

# make sure the elbow-plot folder exists; makedirs also creates `dir` itself
# (and any missing parents) and is a no-op when the folder is already there
os.makedirs(dir + 'elbow_plots/', exist_ok=True)

# Define UMAP and clustering parameters
umap_params = {'n_neighbors': 100, 'n_components': 10, 'metric': 'cosine', 'min_dist': 0.05, 'random_state': None}

# creates an elbow plot for each scenario; scenarios are numbered from 1 in
# the order of agent_list, so the agent name comes straight from the loop
for scenario_number, agent_name in enumerate(agent_list, start=1):
    print("Creating k-plot for scenario " + str(scenario_number))
    words_to_remove = [agent_name, 'should', 'would', 'could']

    text_embeddings, texts_df = compute_embeddings(df, text_column_name, scenario_number, words_to_remove)

    # elbow curve plots for determining appropriate number of clusters
    k_plot(text_embeddings, umap_params, f'{dir}elbow_plots/S{scenario_number}.png')

Based on visual examination of the plots produced by the code above, an optimal k value was selected for each scenario and stored in `elbow_results.json` (read from the output directory by the cells below).

## Run all scenarios in loop

In [None]:
### To run the analyses, edit the values in this cell, then run this cell and the cells below
# For the repo this was run twice: once with dir = "numManual/" and clusters_from = 'same_as_manual',
# and once with dir = "numElbow/" and clusters_from = 'elbow_results'.


# directory to send resulting plots and clusters (must end with '/')
dir = "numManual/"

## text_column_name options: 'response_original' or 'merged_text'
# merged_text gives context by merging response with the scenario text and generally gives better results
text_column_name = 'merged_text'

## clusters_from options: 'elbow_results', 'same_as_manual', 'fixed'
# 'elbow_results' uses the number of clusters for each context based on the results of the elbow plots (section above)
# 'same_as_manual' uses the same number of clusters for each context as was determined via manual coding, for a fair comparison between the two
# 'fixed' allows you to directly set the number of clusters
clusters_from = 'same_as_manual'

# Fail here on a typo, rather than letting the later cells run with an
# undefined (or stale, from an earlier run) numb_clusters.
if clusters_from not in ('elbow_results', 'same_as_manual', 'fixed'):
    raise ValueError(
        f"clusters_from must be 'elbow_results', 'same_as_manual' or 'fixed', got {clusters_from!r}"
    )

if clusters_from == 'fixed':
    # Modify this if fixing the number of clusters
    numb_clusters = 11

# Define UMAP and clustering parameters
umap_params = {'n_neighbors': 100, 'n_components': 10, 'metric': 'cosine', 'min_dist': 0.05, 'random_state': None}

In [None]:
# num of groups used for manual coding each scenario (position i -> scenario i + 1)
num_manual_groups_by_scenario = [14, 16, 15, 16, 14, 15, 13, 15, 16, 14, 15, 17, 17, 18, 17, 14, 14, 14]

# optimal number of clusters for each scenario as determined by elbow plot analysis above;
# the loaded dict is indexed by the scenario number as a string
if clusters_from == 'elbow_results':
    # os.path.join avoids the doubled separator when dir already ends with '/'
    with open(os.path.join(dir, 'elbow_results.json'), "r", encoding="utf-8") as json_file:
        elbow_results = json.load(json_file)

In [None]:
# check that path exists and make it if it doesn't
make_directories(dir)

# agent name per scenario (position i -> scenario i + 1); defined once here
# instead of being rebuilt on every loop iteration
agent_list = ['Heinz', 'Josh', 'Brian', 'Liz', 'Mary', 'Brad', 'Darya', 'Eunice', 'Eamon', 'Cameron', 'Erica', 'Carl', 'Daniel', 'Andy', 'Ahmed', 'Eva', 'Jeff', 'Shania']

# creates plots and cluster_tables for each scenario
for scenario_number, agent_name in enumerate(agent_list, start=1):
    print("Running analysis on scenario " + str(scenario_number))

    words_to_remove = [agent_name, 'should', 'would', 'could']

    # Strip each response of words_to_remove and compute SBERT embeddings
    text_embeddings, texts_df = compute_embeddings(df, text_column_name, scenario_number, words_to_remove)

    # pick the number of clusters for this scenario
    if clusters_from == 'elbow_results':
        numb_clusters = elbow_results[str(scenario_number)]
    elif clusters_from == 'same_as_manual':
        # + 1 because some texts were manually coded as 'other', giving one more category than the stated number
        numb_clusters = num_manual_groups_by_scenario[scenario_number - 1] + 1
    elif clusters_from != 'fixed':
        # an unknown mode would otherwise silently reuse whatever numb_clusters
        # was left over from an earlier run
        raise ValueError(f"Unknown clusters_from value: {clusters_from!r}")
    # for 'fixed', numb_clusters keeps the value set in the configuration cell

    # define clustering params based on numb_clusters defined above
    clustering_params = {'algorithm_name': 'KMeans', 'num_clusters': numb_clusters}

    # compute clusters on the UMAP-reduced embeddings
    umap_embeddings, cluster_assignment, clustering_model = compute_clusters(text_embeddings, clustering_params, umap_params)

    # print_clusters(texts_df, text_column_name, cluster_assignment, numb_clusters)

    # add clusters, centroids and umap embeddings to texts_df; returns representative text -> cluster number
    clusters = compute_centroid(texts_df, text_column_name, cluster_assignment, clustering_model, umap_embeddings)
    # reduce dimensionality to two for plotting (adds umap_dim2_* and cluster_name to texts_df)
    umap_2(texts_df, text_embeddings, clusters, umap_params)

    # save texts_df for this scenario to cluster_tables
    texts_df.to_csv(f'{dir}cluster_tables/S{scenario_number}.csv')

    # write the interactive cluster plot for this scenario
    plot_clusters(
        texts_df['umap_dim2_0'],
        texts_df['umap_dim2_1'],
        texts_df['cluster'],
        texts_df["cluster_name"],
        texts_df["response_original"],
        texts_df["source"],
        umap_params,
        clustering_params,
        f'{dir}plots/S{scenario_number}.html'
    )