<a href="https://colab.research.google.com/github/futugyou/pyproject/blob/master/google_colab/TextClusteringAndTopicModeling_04.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required dependencies
%pip install --upgrade pip setuptools wheel
%pip install datasets
%pip install sentence_transformers
%pip install umap-learn
%pip install hdbscan
%pip install numpy
%pip install pandas
%pip install matplotlib
%pip install tqdm
%pip install bertopic

In [None]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from hdbscan import HDBSCAN
from tqdm.notebook import tqdm
import umap
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the dataset and keep its "train" split.
data = load_dataset("maartengr/arxiv_nlp")
dataset = data["train"]

# Materialise the two text columns as plain Python lists. Depending on the
# installed `datasets` version, column access may hand back a lazy column
# object instead of a list (TODO confirm for the pinned version); the later
# cells pass these values to libraries that expect a list of strings.
abstracts = list(dataset["Abstracts"])
titles = list(dataset["Titles"])

# Show the corpus size and a short preview rather than printing every abstract.
print(f"{len(abstracts)} abstracts loaded")
print(abstracts[:3])

In [None]:
# "thenlper/gte-small" "sentence-transformers/all-MiniLM-L6-v2"
embedding_model = SentenceTransformer("thenlper/gte-small")

# Embed every abstract in batches of 16. `show_progress_bar=True` is the
# switch that enables the progress bar; `encode` has no `progress_bar`
# parameter, so the former `progress_bar=tqdm` argument was removed
# (depending on the installed sentence-transformers version an unknown keyword
# is either rejected or forwarded to the model).
embeddings = embedding_model.encode(
    abstracts, show_progress_bar=True, batch_size=16
)

# Report the shape of the result instead of printing the raw matrix
# (assumes the default array return type of `encode`).
print(embeddings.shape)

In [None]:
# Settings for the UMAP projection whose output is clustered in the next cell.
umap_settings = dict(
    n_components=5,
    min_dist=0.0,
    metric="cosine",
    random_state=42,
    verbose=True,
)
umap_model = umap.UMAP(**umap_settings)

# Fit the projection on the sentence embeddings and keep the reduced vectors.
umap_embeddings = umap_model.fit_transform(embeddings)
print(umap_embeddings)

In [None]:
# Cluster the UMAP-reduced embeddings. The variable name `hdbsacn_model`
# (sic) is kept unchanged because the BERTopic cell refers to it.
hdbsacn_model = HDBSCAN(
    min_cluster_size=50,
    metric="euclidean",
    cluster_selection_method="eom",
)
hdbsacn_model.fit(umap_embeddings)

# One cluster label per input row, as exposed by the fitted model.
clusters = hdbsacn_model.labels_
print(clusters)

In [None]:
# A second, separate UMAP projection of the original embeddings, this time
# down to two components (presumably for 2-D visualisation; it is not used in
# the cells shown here).
umap_2d_model = umap.UMAP(
    n_components=2,
    min_dist=0.0,
    metric="cosine",
    random_state=42,
    verbose=True,
)
reduced_embeddings = umap_2d_model.fit_transform(embeddings)

In [None]:
from bertopic import BERTopic

# Assemble the topic model from the components created in the earlier cells.
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,  # optional argument
    hdbscan_model=hdbsacn_model,  # optional argument
    verbose=True,
)

# Fit on the abstracts, handing over the embeddings computed earlier.
topic_model.fit(abstracts, embeddings)

In [None]:
from copy import deepcopy

# Keep an independent copy of the current topic representations; the last
# cell passes it to topic_differences() after update_topics() has been called,
# so the pre-update keywords can be shown next to the updated ones.
original_topics = deepcopy(topic_model.topic_representations_)

In [None]:
def _top_words(representation, n_words=5):
    """Join the leading words of one topic representation with " | ".

    ``representation`` is an iterable of entries whose first element is the
    word; any further elements of an entry (e.g. a score) are ignored.
    """
    words = [entry[0] for entry in representation]
    return " | ".join(words[:n_words])


def topic_differences(model, original_topics, nr_topics=5, n_words=5):
    """Tabulate the original and the current top words of the first topics.

    Parameters
    ----------
    model
        Object exposing ``get_topic(topic)``, which returns the current
        representation of a topic as a sequence of entries whose first
        element is the word.
    original_topics
        Mapping (or sequence) indexed by topic id that holds the
        representations captured before the model was updated.
    nr_topics : int, default 5
        Topic ids ``0 .. nr_topics - 1`` are inspected.
    n_words : int, default 5
        Maximum number of words shown per representation.

    Returns
    -------
    pandas.DataFrame
        Columns ``topic``, ``original`` and ``updated``; one row per topic
        that has a non-empty representation on both sides.

    Notes
    -----
    Topic ids that are absent from ``original_topics`` or for which
    ``model.get_topic`` returns a falsy value are skipped, so asking for more
    topics than exist no longer raises.
    """
    rows = []
    for topic in range(nr_topics):
        try:
            original_repr = original_topics[topic]
        except (KeyError, IndexError):
            # Topic id was not present when the snapshot was taken.
            original_repr = None
        updated_repr = model.get_topic(topic)

        # Nothing to compare if either side has no representation.
        if not original_repr or not updated_repr:
            continue

        rows.append(
            [
                topic,
                _top_words(original_repr, n_words),
                _top_words(updated_repr, n_words),
            ]
        )

    # Build the frame once from the collected rows instead of growing it
    # row by row.
    return pd.DataFrame(rows, columns=["topic", "original", "updated"])

In [None]:
# transformers supplies the `pipeline` helper imported in the following cell.
%pip install transformers

In [None]:
from bertopic.representation import TextGeneration
from transformers import pipeline

# Prompt template handed to the TextGeneration representation model. It
# contains the [DOCUMENTS] and [KEYWORDS] placeholders (presumably filled in
# by BERTopic per topic; confirm against the BERTopic documentation).
prompt = """i have a topic that contains the following documents:
[DOCUMENTS]

the topic is described by the following keywords: '[KEYWORDS]'.

base on the documents and keywords, what is this topic about?
"""

# Text-to-text pipeline backed by google/flan-t5-small. The variable name
# `gerenator` (sic) is kept unchanged.
gerenator = pipeline(task="text2text-generation", model="google/flan-t5-small")

# Wrap the pipeline as a BERTopic representation model.
representation_model = TextGeneration(
    gerenator,
    prompt=prompt,
    doc_length=50,
    tokenizer="whitespace",
)

# Update the fitted topic model with the new representation model, then show
# the stored original representations next to the current ones.
topic_model.update_topics(abstracts, representation_model=representation_model)
topic_differences(topic_model, original_topics)