<a href="https://colab.research.google.com/github/futugyou/pyproject/blob/master/transformers_demo/TextClusteringAndTopicModeling_01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required dependencies
%pip install --upgrade pip setuptools wheel
%pip install datasets
%pip install sentence_transformers
%pip install umap-learn
%pip install hdbscan
%pip install numpy
%pip install pandas
%pip install matplotlib
%pip install tqdm

In [None]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from hdbscan import HDBSCAN
from tqdm.notebook import tqdm
import umap
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the "maartengr/arxiv_nlp" dataset and pull the abstract and title
# columns out of its "train" split.
data = load_dataset("maartengr/arxiv_nlp")
dataset = data["train"]

# Materialise the columns as plain Python lists so the later cells (encoding,
# integer indexing, DataFrame assignment) always receive the same type,
# whatever object column access returns in the installed `datasets` version.
abstracts = list(dataset["Abstracts"])
titles = list(dataset["Titles"])

# Show the size and a short preview rather than printing every abstract.
print(f"{len(abstracts)} abstracts loaded")
for preview in abstracts[:3]:
    print(preview[:300] + "...\n")

In [None]:
# "thenlper/gte-small" "sentence-transformers/all-MiniLM-L6-v2"
# Load the sentence-embedding model and encode every abstract into a vector.
embedding_model = SentenceTransformer("thenlper/gte-small")
# `show_progress_bar=True` is the switch that enables the progress bar.
# `progress_bar` is not a documented `encode()` parameter, so it is not passed.
embeddings = embedding_model.encode(abstracts, show_progress_bar=True, batch_size=16)
# Report the array dimensions rather than the raw values.
print(embeddings.shape)

In [None]:
# Project the embeddings down to 5 components with UMAP before clustering.
# A fixed random_state is passed so repeated runs use the same seed.
umap_model = umap.UMAP(
    n_components=5,
    min_dist=0.0,
    metric="cosine",
    random_state=42,
    verbose=True,
)
umap_embeddings = umap_model.fit_transform(embeddings)
print(umap_embeddings)

In [None]:
# Cluster the reduced embeddings with HDBSCAN.
hdbscan_model = HDBSCAN(
    min_cluster_size=50,
    metric="euclidean",
    cluster_selection_method="eom",
).fit(umap_embeddings)
# Keep the original (misspelled) name as an alias so any cell that still
# refers to it keeps working.
hdbsacn_model = hdbscan_model

# One label per input row; later cells treat the label -1 as outliers.
clusters = hdbscan_model.labels_
print(clusters)

In [None]:
# Print the first 300 characters of up to three abstracts assigned to the
# chosen cluster.
cluster = 0
member_positions = np.flatnonzero(clusters == cluster)
for index in member_positions[:3]:
    snippet = abstracts[int(index)][:300]
    print(f"{snippet}.... \n")

In [None]:
# Fit a separate 2-component UMAP projection of the original embeddings so
# each document gets an (x, y) position.
projector_2d = umap.UMAP(
    n_components=2,
    min_dist=0.0,
    metric="cosine",
    random_state=42,
    verbose=True,
)
reduced_embeddings = projector_2d.fit_transform(embeddings)

# One row per document: coordinates, title, and cluster label stored as text.
df = pd.DataFrame(reduced_embeddings, columns=["x", "y"])
df["title"] = titles
df["cluster"] = list(map(str, clusters))

# Split rows labelled "-1" (outliers) from rows that belong to a cluster.
is_outlier = df["cluster"] == "-1"
clusters_df = df.loc[~is_outlier, :]
outliers_df = df.loc[is_outlier, :]

In [None]:
# Draw outliers first as faint grey points, then the clustered documents on
# top, coloured by cluster id. Both layers use the same marker size (s=2) so
# the outliers are not drawn with matplotlib's much larger default marker.
plt.scatter(outliers_df.x, outliers_df.y, alpha=0.05, s=2, c="grey")
plt.scatter(clusters_df.x, clusters_df.y, c=clusters_df.cluster.astype(int), alpha=0.6, s=2, cmap="tab20b")
plt.axis("off")
# Render explicitly so the cell shows the figure, not the return value of plt.axis().
plt.show()