# Leveraging Semantic Embeddings for Topic Analysis

In [1]:
import re
import warnings

import hdbscan
from langchain_dartmouth.llms import ChatDartmouthCloud
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.io as pio
from sentence_transformers import SentenceTransformer
from umap import UMAP

# Ignore all warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the survey export into the working DataFrame. Later cells read its
# "Response" and "Major" columns.
SURVEY_CSV_PATH = "./data/survey_responses.csv"
df = pd.read_csv(SURVEY_CSV_PATH)

In [3]:
# Instantiate the sentence-embedding model identified by this checkpoint name.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
sentence_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [4]:
# Embed every response and store each vector as a plain Python list in its row.
# `encode` is documented to take a string or a list of strings, so the column is
# converted to a list first instead of handing over the pandas Series; this keeps
# the call independent of how the library indexes into its input.
df["embeddings"] = sentence_model.encode(df["Response"].tolist()).tolist()

In [5]:
# Seed passed to UMAP's `random_state` so the reducer is configured the same way
# on every run.
UMAP_SEED = 5
umap_model = UMAP(random_state=UMAP_SEED)

In [6]:
# Gather the per-row embedding lists into one array, fit UMAP on it, and keep
# the transformed coordinates as the "x" and "y" columns.
embedding_matrix = np.array(df["embeddings"].tolist())
df[["x", "y"]] = umap_model.fit_transform(embedding_matrix)

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [7]:
# Scatter plot of the projected responses; hovering a point shows its "Response" text.
fig = px.scatter(
    data_frame=df,
    x="x",
    y="y",
    hover_data=["Response"],
)
fig.show()

In [8]:
# Same projection, with points colored by the "Major" column.
fig = px.scatter(
    data_frame=df,
    x="x",
    y="y",
    color="Major",
    hover_data=["Response"],
)
fig.show()

In [9]:
# Cluster the projected ("x", "y") coordinates with an HDBSCAN instance built
# from its default arguments. The labels are stored as strings; a later cell
# compares them against the string "-1".
clusterer = hdbscan.HDBSCAN()
cluster_ids = clusterer.fit_predict(df[["x", "y"]])
df["cluster"] = cluster_ids.astype("str")

In [10]:
# Same projection, with points colored by their cluster label.
fig = px.scatter(
    data_frame=df,
    x="x",
    y="y",
    color="cluster",
    hover_data=["Response"],
)
fig.show()

In [11]:
# Chat model that `find_cluster_label` invokes to name each cluster.
LLM_MODEL_NAME = "openai.gpt-4o-mini-2024-07-18"
llm = ChatDartmouthCloud(model_name=LLM_MODEL_NAME)


def find_cluster_label(responses):
    """Ask the LLM for one topic label that summarizes a group of responses.

    The responses are joined with a ``--`` separator line, embedded in a fixed
    prompt, and sent to the module-level ``llm``. The label is then read from
    the ``<topic_label>`` tags in the reply.

    Parameters
    ----------
    responses : iterable of str
        The survey responses to summarize (for example, one cluster's
        ``Response`` column).

    Returns
    -------
    str
        The text inside the first ``<topic_label>...</topic_label>`` pair of
        the reply, with leading and trailing whitespace removed.

    Raises
    ------
    ValueError
        If the reply contains no ``<topic_label>...</topic_label>`` pair.
    """
    joined_responses = "\n--\n".join(responses)
    prompt = (
        "The following are responses to the question: "
        "'What do you think was the biggest benefit of the Guarini Exchange Program "
        "for your personal or professional development?' "
        "All of these responses share a common theme or topic, similar to a headline. "
        "Take a few moments to analyze the responses, then identify the most salient topic. "
        "Finally, respond with the topic between the tags <topic_label></topic_label>. "
        "Here are the responses:\n\n"
        f"{joined_responses}"
    )
    response = llm.invoke(prompt)
    # Non-greedy so only the first tag pair is captured; DOTALL so a label that
    # the model places on its own line inside the tags still matches.
    match = re.search(
        r"<topic_label>(.*?)</topic_label>", response.content, flags=re.DOTALL
    )
    if match is None:
        # Fail with the offending reply instead of a bare IndexError.
        raise ValueError(
            "LLM reply contains no <topic_label>...</topic_label> tags: "
            f"{response.content!r}"
        )
    return match.group(1).strip()


# Every row starts without a topic; rows whose cluster is "-1" are skipped
# below and therefore keep None.
df["topic"] = None
for cluster_id in df["cluster"].unique():
    if cluster_id != "-1":
        members = df["cluster"] == cluster_id
        # One LLM call per cluster; its label is written to every member row.
        df.loc[members, "topic"] = find_cluster_label(df.loc[members, "Response"])
df

Unnamed: 0,Respondent_ID,Major,Response,embeddings,x,y,cluster,topic
0,R001,Economics,The biggest benefit of Guarini Exchange was de...,"[-0.02808222733438015, 0.008524204604327679, 0...",9.312801,3.329983,-1,
1,R002,Computer Science,Learning to code in a different cultural conte...,"[-0.023660454899072647, 0.011696777306497097, ...",6.883540,4.101231,1,Collaborative and Innovative Problem-Solving A...
2,R003,Environmental Studies,Studying at Williams-Mystic completely changed...,"[-0.013019781559705734, 0.04203595221042633, 0...",8.967394,1.310911,5,Hands-on Field Experience and Interdisciplinar...
3,R004,Asian Studies,My time at Waseda Uni in Tokyo improved my Jap...,"[-0.004582987632602453, -0.045444514602422714,...",10.744184,5.385358,2,Language and Cultural Immersion
4,R005,Government,The Guarini program gave me confidence I never...,"[0.0012159398756921291, -0.03586733713746071, ...",9.520711,3.627601,-1,
...,...,...,...,...,...,...,...,...
95,R096,Psychology,Cross-cultural perspectives on developmental p...,"[0.06583299487829208, 0.04757784679532051, -0....",11.198199,0.410375,3,Research Methodology and Cross-Cultural Perspe...
96,R097,Computer Science,AIT Budapest's creative approach to problem-so...,"[-0.06781381368637085, 0.07236985117197037, 0....",6.428737,4.433134,1,Collaborative and Innovative Problem-Solving A...
97,R098,Asian Studies,My time at Keio improved my Japanese dramatica...,"[-0.0008568129851482809, 0.08134118467569351, ...",11.306201,5.250633,2,Language and Cultural Immersion
98,R099,Environmental Studies,The biggest benefit was seeing environmental c...,"[0.01006466243416071, 0.08017655462026596, 0.0...",9.277708,1.327294,5,Hands-on Field Experience and Interdisciplinar...


In [12]:
# Same projection, with points colored by the LLM-generated topic label.
fig = px.scatter(
    data_frame=df,
    x="x",
    y="y",
    color="topic",
    hover_data=["Response"],
)
fig.show()

In [13]:
# Histogram of rows per topic, with bars colored by the "Major" column.
px.histogram(data_frame=df, x="topic", color="Major")