# An LLM Approach to Semantic Clustering and Topic Modeling of Academic Literature

[...]

In [2]:
%pip install --upgrade altair datasets hdbscan scikit-learn umap-learn --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m857.8/857.8 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m31.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.4/13.4 MB[0m [31m34.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━

## 1. Embeddings Transformation

In [3]:

from datasets import load_dataset
import tqdm as notebook_tqdm

# Load the training split of the arXiv cs.CL dataset; it ships with
# precomputed title/abstract embeddings alongside the paper metadata.
ds = load_dataset(
    path="dcarpintero/arxiv.cs.CL.embedv3.clustering.medium",
    split="train",
)


Downloading readme:   0%|          | 0.00/438 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/289M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [7]:
ds

Dataset({
    features: ['url', 'url_pdf', 'title', 'authors', 'primary_category', 'categories', 'abstract', 'updated', 'published', 'embeddings_title', 'embeddings_abstract'],
    num_rows: 10000
})

## 2. Projecting Embeddings for Dimensionality Reduction

In [8]:
import umap

# Project the abstract embeddings down to 5 dimensions; this reduced space is
# what the clustering step consumes.
# random_state pins UMAP's stochastic optimisation so that a fresh
# "Restart & Run All" yields the same projection (and therefore the same
# clusters and topic labels downstream). Trade-off: with a fixed seed UMAP
# gives up some parallelism, so the fit may be slower.
umap_reducer = umap.UMAP(
    n_neighbors=100,
    n_components=5,
    min_dist=0.1,
    metric='cosine',
    random_state=42,
)
umap_embedding = umap_reducer.fit_transform(ds['embeddings_abstract'])

## 3. Semantic Clustering

In [10]:
import hdbscan

# Density-based clustering on the UMAP-reduced embeddings. Points that HDBSCAN
# does not assign to any cluster receive the label -1 (filtered out later).
hdbscan_model = hdbscan.HDBSCAN(
    min_cluster_size=100,
    metric='euclidean',
    cluster_selection_method='eom',
)
clusters = hdbscan_model.fit_predict(umap_embedding)

In [11]:
import pandas as pd

# Separate 2-D projection of the same abstract embeddings, used only for the
# scatter plot (clustering itself ran on the 5-D projection).
# random_state makes the layout reproducible across kernel restarts.
reduced_embeddings = umap.UMAP(
    n_neighbors=100,
    n_components=2,
    min_dist=0.1,
    metric='cosine',
    random_state=42,
).fit_transform(ds['embeddings_abstract'])

# One row per paper: plot coordinates, cluster label and title.
df = pd.DataFrame(reduced_embeddings, columns=['x', 'y'])
df['cluster'] = clusters
df['title'] = ds['title']

# Remove outliers (label -1). The explicit .copy() detaches the result from the
# unfiltered frame so that later column assignments (e.g. adding the topic
# column) write to an independent DataFrame instead of a slice.
df = df[df['cluster'] != -1].copy()

In [12]:
df.head()

Unnamed: 0,x,y,cluster,title
1,5.100608,2.069562,7,"Modelling Users, Intentions, and Structure in ..."
2,10.522969,4.066667,10,A Lexicalized Tree Adjoining Grammar for English
4,10.632957,4.021417,10,Conditions on Consistency of Probabilistic Tre...
5,10.623777,4.099373,10,Separating Dependency from Constituency in a T...
6,10.741825,4.151511,10,Incremental Parser Generation for Tree Adjoini...


## 4. Topic Modeling with LLMs

In [14]:
%pip install langchain langchain_community openai --quiet

from langchain.chat_models.openai import ChatOpenAI
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate
from pydantic import BaseModel, Field
from typing import List

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.2/49.2 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h

### 4.1 Pydantic Model

In [15]:
class Topic(BaseModel):
    """Structured model for the LLM answer: the single topic identified for a cluster."""

    # Required field (no default) carrying the topic label.
    category: str = Field(
        ...,
        description="Identified topic",
    )

### 4.2 LangChain Prompt Template

In [16]:
# Prompt template used to label one cluster of paper titles.
# `{titles}` is the only placeholder and is filled in when the template is
# invoked; the doubled braces in the EXPECTED OUTPUT example are escapes that
# render as literal `{` / `}` once the template is formatted.
topic_prompt = """
    Your task is to analyze a set of research paper titles related to Natural Language Processing and determine the overarching topic of the cluster.
    Based on the titles provided, you should identify and label the most relevant topic. The response should be concise, clearly stating the single
    identified topic in JSON format. No additional information or follow-up questions are needed.

    TITLES:
    {titles}

    EXPECTED OUTPUT:
    {{"category": "Topic Name"}}
    """

### 4.3 Inference of Topic Identification

In [23]:
from google.colab import userdata

def TopicModeling(titles: List[str]) -> "Topic":
    """
    Infer the common topic of the given titles w/ LangChain, Pydantic, OpenAI

    Args:
        titles: Paper titles belonging to one cluster.

    Returns:
        The parsed ``Topic``; its ``category`` attribute holds the label
        produced by the model.

    Raises:
        ValueError: If ``titles`` is empty (there is nothing to label, so no
            request is sent).
    """
    if not titles:
        raise ValueError("titles must contain at least one title")

    openai_api_key = userdata.get('OPENAI_API_KEY')
    llm = ChatOpenAI(model='gpt-4o', temperature=0.1, max_tokens=100, openai_api_key=openai_api_key)
    prompt = PromptTemplate.from_template(topic_prompt)
    parser = PydanticOutputParser(pydantic_object=Topic)

    # Render one title per line. Passing the list object directly would embed
    # its Python repr (brackets, quotes, escape sequences) in the prompt.
    formatted_titles = "\n".join(f"- {title}" for title in titles)

    # prompt -> chat model -> parser that turns the JSON reply into a Topic.
    topic_chain = prompt | llm | parser
    return topic_chain.invoke({"titles": formatted_titles})

In [24]:
# Ask the LLM for one label per cluster, based on the first 25 titles of each
# group. Labels are collected in the order groupby yields the clusters.
topics = []
for i, members in df.groupby('cluster'):
    sample_titles = members['title'].iloc[:25].tolist()
    topic = TopicModeling(sample_titles)
    print(f"Cluster {i}: {topic.category}")
    topics.append(topic.category)

Cluster 0: Text Summarization
Cluster 1: Sentiment Analysis
Cluster 2: Question Answering Systems
Cluster 3: Named Entity Recognition
Cluster 4: Biomedical Natural Language Processing
Cluster 5: Relation Extraction
Cluster 6: Natural Language Generation
Cluster 7: Dialogue Systems and Conversational AI
Cluster 8: Machine Translation
Cluster 9: Speech Recognition
Cluster 10: Parsing and Grammar in Natural Language Processing
Cluster 11: Morphological Analysis in NLP
Cluster 12: Neural Network Models for Natural Language Processing
Cluster 13: Word Sense Disambiguation


In [25]:
# Key the mapping on the cluster labels actually present in df rather than on
# range(n_clusters): `topics` was collected by iterating df.groupby('cluster'),
# which yields groups in ascending label order, so sorting the unique labels
# reproduces that order without assuming the labels are exactly 0..n-1.
cluster_ids = sorted(df['cluster'].unique().tolist())
n_clusters = len(cluster_ids)

# Guard against a stale `topics` list (e.g. clustering re-run without
# re-running the labelling cell); zip() would otherwise truncate silently.
if len(topics) != n_clusters:
    raise ValueError(
        f"Expected one topic per cluster: got {len(topics)} topics for {n_clusters} clusters"
    )

topic_map = dict(zip(cluster_ids, topics))
df['topic'] = df['cluster'].map(topic_map)

## 5. Visualization

In [26]:
%pip install vegafusion[embed]>=1.5.0 --quiet

import altair as alt
alt.data_transformers.enable("vegafusion")

DataTransformerRegistry.enable('vegafusion')

In [27]:
# Interactive scatter plot of the 2-D projection: one small circle per paper,
# coloured by its topic label, with title and topic shown on hover.
chart = (
    alt.Chart(df)
    .mark_circle(size=5)
    .encode(x='x', y='y', color='topic:N', tooltip=['title', 'topic'])
    .interactive()
    .properties(
        title='10K arXiv Abstracts in NLP | Cohere Embedv3 | UMAP | HDBSCAN | OpenAI',
        width=600,
        height=400,
    )
)
chart.display()

### 5.1 Top 15 Topics

In [28]:
df['topic'].value_counts().head(15)

topic
Sentiment Analysis                                       1129
Machine Translation                                      1090
Dialogue Systems and Conversational AI                    610
Word Sense Disambiguation                                 575
Question Answering Systems                                541
Neural Network Models for Natural Language Processing     475
Parsing and Grammar in Natural Language Processing        408
Text Summarization                                        352
Speech Recognition                                        291
Morphological Analysis in NLP                             261
Biomedical Natural Language Processing                    225
Natural Language Generation                               215
Named Entity Recognition                                  214
Relation Extraction                                       147
Name: count, dtype: int64

----