<a href="https://colab.research.google.com/github/futugyou/pyproject/blob/master/google_colab/TextClusteringAndTopicModeling_02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required dependencies
%pip install --upgrade pip setuptools wheel
%pip install datasets
%pip install sentence_transformers
%pip install umap-learn
%pip install hdbscan
%pip install numpy
%pip install pandas
%pip install matplotlib
%pip install tqdm
%pip install bertopic

In [None]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from hdbscan import HDBSCAN
from tqdm.notebook import tqdm
import umap
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Download the dataset from the Hugging Face Hub and select its "train" split.
data = load_dataset("maartengr/arxiv_nlp")
dataset = data["train"]

# Materialise both text columns as plain Python lists. Depending on the
# installed `datasets` version, column access may hand back a lazy column
# object rather than a list; downstream cells expect ordinary lists of strings.
abstracts = list(dataset["Abstracts"])
titles = list(dataset["Titles"])

# Show a count and a short preview instead of dumping every abstract into the
# cell output.
print(f"Loaded {len(abstracts)} abstracts")
print(abstracts[:3])

In [None]:
# "thenlper/gte-small" "sentence-transformers/all-MiniLM-L6-v2"
# Load the sentence-embedding checkpoint used for the rest of the notebook.
embedding_model = SentenceTransformer("thenlper/gte-small")

# Encode every abstract into a vector.
# `encode` has no `progress_bar` parameter, so the previous `progress_bar=tqdm`
# argument has been dropped; `show_progress_bar=True` is what turns on the
# library's own progress display.
embeddings = embedding_model.encode(
    abstracts,
    show_progress_bar=True,
    batch_size=16,
)
print(embeddings)

In [None]:
# Project the sentence embeddings down to 5 components with UMAP.
umap_settings = dict(
    n_components=5,
    min_dist=0.0,
    metric="cosine",
    random_state=42,  # fixed seed
    verbose=True,
)
umap_model = umap.UMAP(**umap_settings)

# Fit the reducer and keep the reduced coordinates for the clustering step.
umap_embeddings = umap_model.fit_transform(embeddings)
print(umap_embeddings)

In [None]:
# Cluster the UMAP-reduced vectors with HDBSCAN.
# NOTE: the spelling `hdbsacn_model` is kept on purpose -- the BERTopic cell
# further down refers to the model under exactly this name.
hdbsacn_model = HDBSCAN(
    min_cluster_size=50,
    metric="euclidean",
    cluster_selection_method="eom",
)
hdbsacn_model.fit(umap_embeddings)

# Cluster assignments exposed by the fitted model.
clusters = hdbsacn_model.labels_
print(clusters)

In [None]:
# A second, separate UMAP run on the original embeddings, this time down to
# 2 components; the result is passed to the document plot later on.
projection_2d = umap.UMAP(
    n_components=2,
    min_dist=0.0,
    metric="cosine",
    random_state=42,
    verbose=True,
)
reduced_embeddings = projection_2d.fit_transform(embeddings)

In [None]:
from bertopic import BERTopic

# Assemble the topic model from the components built in the earlier cells.
bertopic_components = {
    "embedding_model": embedding_model,
    "umap_model": umap_model,  # optional
    "hdbscan_model": hdbsacn_model,  # optional
    "verbose": True,
}

# Fit on the abstracts, supplying the precomputed embeddings alongside them.
topic_model = BERTopic(**bertopic_components).fit(abstracts, embeddings)

In [None]:
# Summary of the topics found by the fitted model.
topic_info = topic_model.get_topic_info()
topic_info

In [None]:
# Ask the fitted model which topics match a free-text search term.
search_term = "topic modeling"
topic_matches = topic_model.find_topics(search_term)
topic_matches

In [None]:
# Inspect one topic by its numeric id.
topic_id = 23
topic_model.get_topic(topic_id)

In [None]:
# Make sure the titles are a plain list before passing them to the plot.
titles = list(titles)

# Document plot drawn on the precomputed 2-component projection, with the
# paper titles supplied as the documents.
fig = topic_model.visualize_documents(
    titles,
    reduced_embeddings=reduced_embeddings,
    width=1200,
    hide_annotations=True,
)

# Enlarge the font; the value returned by this call is what the cell displays.
layout_font = {"size": 16}
fig.update_layout(font=layout_font)

In [None]:
# Bar-chart view of the fitted topics.
barchart_fig = topic_model.visualize_barchart()
barchart_fig

In [None]:
# Heatmap view of the fitted topics, requesting 30 clusters.
heatmap_fig = topic_model.visualize_heatmap(n_clusters=30)
heatmap_fig

In [None]:
# Hierarchy view of the fitted topics.
hierarchy_fig = topic_model.visualize_hierarchy()
hierarchy_fig