In [95]:
import pandas as pd
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic import BERTopic
import matplotlib.pyplot as plt
import string

In [117]:
# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("firqaaa/indo-sentence-bert-base")

# Step 2 - Reduce dimensionality
# UMAP is stochastic: without a fixed random_state every run can yield a
# different embedding and therefore different topics, so pin the seed to keep
# the notebook reproducible under Restart & Run All.
# NOTE(review): n_neighbors=15 is larger than the 9-sentence demo corpus used
# in this notebook -- confirm how the installed UMAP version handles that.
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)

# Step 3 - Cluster reduced embeddings
# prediction_data=True is kept so the fitted clusterer retains the extra data
# it was configured to produce in the original cell.
hdbscan_model = HDBSCAN(
    min_cluster_size=2,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

# Step 4 - Tokenize topics
# The stopword file has no header row; its single column is addressed as 0.
# The raw table gets its own name so `stopwords` is only ever a list of str.
stopword_table = pd.read_csv("assets/stopwordbahasa.csv", header=None)
more_stopword = ["ga","iya","dg",'dengan', 'ia','bahwa','oleh',"sy","kl","gak","ah","apa","kok","mau","yg","pak","bapak","ibu","krn","nya","ya"]
stopwords = (
    # dropna/astype(str) guard against cells pandas parsed as NaN or numbers,
    # which would otherwise end up as non-string entries in the stop list.
    stopword_table[0].dropna().astype(str).tolist()
    + more_stopword
    + list(string.punctuation)
)
vectorizer_model = CountVectorizer(stop_words=stopwords)

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer()

# All steps together
# NOTE(review): `diversity` is accepted by the BERTopic release this notebook
# was written against; newer releases appear to have replaced it with a
# representation model (MaximalMarginalRelevance) -- confirm against the
# installed version before upgrading.
topic_model = BERTopic(
  embedding_model=embedding_model,    # Step 1 - Extract embeddings
  umap_model=umap_model,              # Step 2 - Reduce dimensionality
  hdbscan_model=hdbscan_model,        # Step 3 - Cluster reduced embeddings
  vectorizer_model=vectorizer_model,  # Step 4 - Tokenize topics
  ctfidf_model=ctfidf_model,          # Step 5 - Extract topic words
  diversity=0.5                       # Step 6 - Diversify topic words
)

In [115]:
# Demo corpus: nine short Indonesian sentences, one document per entry.
# NOTE(review): "bange" and "palikasi" look like typos of "banget" and
# "aplikasi"; they are kept verbatim because they are part of the input data.
sentences = [
    "saya sedang memakan buah apel",
    "saya sedang memakan buah jeruk",
    "andi sedang bermain bola",
    "budi suka makan mie ayam",
    "sering banget makan bakso di tempat ini",
    "aplikasinya gampang rusak",
    "susah bange make aplikasi ini",
    "aplikasi ini sering error",
    "kadang bisa dipake kadang error kalau make palikasi ini",
]

# Single-column frame; the column name "text" is what the fit cell reads.
df = pd.DataFrame({"text": sentences})

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,text
0,saya sedang memakan buah apel
1,saya sedang memakan buah jeruk
2,andi sedang bermain bola
3,budi suka makan mie ayam
4,sering banget makan bakso di tempat ini


In [116]:
# Fit the topic model on the demo corpus and get one topic id (and probability)
# per document. BERTopic documents `fit_transform` as taking a list of strings,
# so hand it a plain list instead of a pandas Series.
# NOTE(review): this cell's saved execution count (116) is lower than the
# configuration cell's (117), so the saved error output may not reflect the
# current configuration -- re-run the notebook top to bottom to confirm.
topics, probs = topic_model.fit_transform(df["text"].tolist())

ValueError: k must be less than or equal to the number of training points

In [113]:
# Display the per-topic summary table returned by the model (the saved output
# shows Topic, Count and Name columns).
# NOTE(review): this cell's saved execution count (113) is lower than the fit
# cell's (116), so the saved table comes from an earlier run -- re-execute
# after a successful fit.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,0,5,0_memakan_makan_buah_suka
1,1,4,1_kadang_error_aplikasi_susah
