In [67]:
import pandas as pd
import numpy as np
import plotly.express as px
from sentence_transformers import SentenceTransformer
#umap 
import umap
import hdbscan
import plotly.graph_objects as go
from bertopic import BERTopic
import string
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
#read the data
df = pd.read_csv("data.csv")
df.head()

Unnamed: 0,content,sentiment
0,Min please cek DM,netral
1,cek dm cepet,positif
2,Woww.. telat 1 hari denda 45 hari .. luar bias...,netral
3,hai mimin boleh tolong check dm yaa :,netral
4,utk case bp Indra Bekti yg membutuhkan layanan...,netral


In [3]:
embedding_model = SentenceTransformer("sentence_bert")

In [145]:
def plot_text(df,kelas,embedding_model):
    df = df[df.sentiment == kelas]
    data = embedding_model.encode(df.values.tolist())
    umap_model = umap.UMAP(n_neighbors=min(df.shape[0],5),random_state = 42) 
    umap_data = umap_model.fit_transform(data)
    clusterer = hdbscan.HDBSCAN(min_cluster_size=2)
    clusterer.fit(umap_data)

    labels = ['cluster ' + str(i) for i in clusterer.labels_]
    text = df["content"].str.wrap(50).apply(lambda x: x.replace('\n', '<br>'))
    
    fig = px.scatter(x=umap_data[:,0], y=umap_data[:,1],color = clusterer.labels_)
    # remove legend
    fig = px.scatter(x=umap_data[:,0], y=umap_data[:,1],color = labels,text = text)
    #set text color 
    fig.update_traces(textfont_color='rgba(0,0,0,0)',marker_size = 8)
    # set background color
    fig.update_layout(plot_bgcolor='rgba(0,0,0,0)')
    # set margin 
    fig.update_layout(margin=dict(l=40, r=5, t=45, b=40))
    # set axis color to grey
    fig.update_xaxes(showgrid=False, zeroline=False, linecolor='rgb(200,200,200)')
    fig.update_yaxes( zeroline=False, linecolor='rgb(200,200,200)')
    # set font sans-serif
    fig.update_layout(font_family="sans-serif")
    # remove legend
    fig.update_layout(showlegend=False)

    # set legend title to cluster
    return df['content'],data,fig

In [146]:
text,data,fig = plot_text(df,"negatif",embedding_model)

In [63]:
# filter only word with length > 3
df_neg["content"] =  df_neg['content'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

# remove stopwords
stopwords = pd.read_csv("assets/stopwordbahasa.csv", header=None)
stopwords = stopwords[0].tolist()
more_stopword = ["ga","iya","dg",'dengan', 'ia','bahwa','oleh',"sy","kl","gak","ah","apa","kok","mau","yg","pak","bapak","ibu","krn","nya","ya"]
stopwords = stopwords + more_stopword + list(string.punctuation)


In [75]:
topic_model = BERTopic(
    calculate_probabilities=True,
    verbose=True,
    vectorizer_model=CountVectorizer(stop_words=stopwords),
    nr_topics=10,
)
topics, probs = topic_model.fit_transform(df_neg["content"],data)

2023-01-01 20:48:50,604 - BERTopic - Reduced dimensionality
2023-01-01 20:48:50,654 - BERTopic - Clustered reduced embeddings
2023-01-01 20:48:50,709 - BERTopic - Reduced number of topics from 3 to 3


In [76]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,2,-1_terhubung_server_error_aplikasi
1,0,11,0_chika_whatsapp_pandawa_respon
2,1,137,1_bpjs_faskes_bayar_min


### function to wrap topic modelling

In [106]:
def topic_modelling(df,embed_df,stopwords,nr_topics = 10):
    data = df.apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
    # remove empty data 
    topic_model = BERTopic(
        calculate_probabilities=True,
        vectorizer_model=CountVectorizer(stop_words=stopwords),
        language="indonesian",
        nr_topics=nr_topics,
    )
    topics, probs = topic_model.fit_transform(data,embed_df)
    topic_labels = topic_model.generate_topic_labels(
        topic_prefix = False,
        separator = ", ",
    )
    topic_model.set_topic_labels(topic_labels)
    fig = topic_model.visualize_barchart()
    # set title to Kata Kunci tiap Topic 
    fig.update_layout(title_text="Topic yang sering muncul")
    return fig,topic_model

In [119]:
text,data,umap_data,fig = plot_text(df,"netral",embedding_model)

In [152]:
topic_model,topics,probs = topic_modelling(text,data,stopwords)

2023-01-01 21:47:38,780 - BERTopic - Reduced dimensionality
2023-01-01 21:47:39,135 - BERTopic - Clustered reduced embeddings
2023-01-01 21:47:39,415 - BERTopic - Reduced number of topics from 2 to 2


In [161]:
import matplotlib.pyplot as plt

In [163]:
fig = topic_model.visualize_barchart()
fig