In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import LdaModel, LsiModel
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel

In [2]:
# Read the raw table from 'data_pemilu.csv' (path is relative to the
# current working directory).
data = pd.read_csv("data_pemilu.csv")

# No extra preprocessing happens here: the 'content_clean' column is used
# as-is and only tokenized. Calling `.str.split()` with no separator splits
# each entry on runs of whitespace, giving one list of tokens per row.
documents = data.loc[:, "content_clean"].str.split()

In [3]:
# TF-IDF features over unigrams and bigrams, L2-normalised, with smoothed
# IDF weights. The vectorizer is fitted on the raw 'content_clean' strings
# (scikit-learn applies its own tokenization to them).
tfidf_vectorizer = TfidfVectorizer(
    min_df=1,
    norm='l2',
    smooth_idf=True,
    use_idf=True,
    ngram_range=(1, 2),
)
tfidf_matrix = tfidf_vectorizer.fit_transform(data['content_clean'])

# gensim vocabulary: maps every token seen in the tokenized documents to an
# integer id.
dictionary = Dictionary(documents)

# Bag-of-words corpus for the gensim models: one `doc2bow` encoding per
# tokenized document. NOTE: this corpus is built from the dictionary and the
# token lists only; `tfidf_matrix` above is not used to construct it.
corpus = list(map(dictionary.doc2bow, documents))

In [4]:
# Topic count handed to both the LDA and the LSI model below; tune as needed.
num_topics: int = 5


In [5]:
# Build LDA model on the bag-of-words corpus.
# LDA training is randomly initialised, so without a fixed seed the topics
# (and the coherence score computed from them) change on every run.
# `random_state` pins the seed so a Restart & Run All reproduces the result.
lda_model = LdaModel(corpus=corpus,
                     id2word=dictionary,
                     num_topics=num_topics,
                     random_state=42)

In [6]:
# Fit an LSI model on the same corpus and dictionary, requesting
# `num_topics` topics.
lsi_model = LsiModel(
    corpus=corpus,
    num_topics=num_topics,
    id2word=dictionary,
)

In [7]:
# Score the LDA topics with the 'c_v' coherence measure, evaluated against
# the tokenized documents and the shared dictionary.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=documents,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()

In [8]:
# Score the LSI (LSA) topics with the 'c_v' coherence measure, evaluated
# against the tokenized documents and the shared dictionary.
coherence_model_lsa = CoherenceModel(
    model=lsi_model,
    texts=documents,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lsa = coherence_model_lsa.get_coherence()

In [9]:
# Report the two c_v coherence values one after the other for comparison.
print(
    "Coherence Score (LDA):",
    coherence_lda,
)
print(
    "Coherence Score (LSA):",
    coherence_lsa,
)

Coherence Score (LDA): 0.515629502759116
Coherence Score (LSA): 0.5068883900718715
