In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import LdaModel, LsiModel
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel

In [2]:
# Load the data
data = pd.read_csv('data_pemilu.csv')

# Preprocess the data (if necessary)

# Tokenize the content_clean column
documents = data['content_clean'].str.split()

In [3]:
# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(min_df=1,
                                 norm='l2',
                                 smooth_idf=True,
                                 use_idf=True,
                                 ngram_range=(1,2))

# Fit and transform the data
tfidf_matrix = tfidf_vectorizer.fit_transform(data['content_clean'])

# Create a dictionary from the documents
dictionary = Dictionary(documents)

# Create a corpus from the TF-IDF matrix
corpus = [dictionary.doc2bow(doc) for doc in documents]

In [4]:
# Define the range of topics and top words
topics_range = range(1, 11)  # Range of number of topics from 1 to 10
top_words_range = [5]  # Top words per topic: 5 and 10

# Initialize variables to store best coherence scores and corresponding parameters
best_coherence_lda = -1
best_coherence_lsa = -1
best_lda_params = None
best_lsa_params = None

In [5]:
# Iterate through different combinations of topics and top words
for num_topics in topics_range:
    for top_words in top_words_range:
        # Build LDA model
        lda_model = LdaModel(corpus=corpus,
                             id2word=dictionary,
                             num_topics=num_topics)

        # Build LSI model
        lsi_model = LsiModel(corpus=corpus,
                             id2word=dictionary,
                             num_topics=num_topics)

        # Compute coherence scores for LDA
        coherence_model_lda = CoherenceModel(model=lda_model, texts=documents, dictionary=dictionary, coherence='c_v')
        coherence_lda = coherence_model_lda.get_coherence()

        # Compute coherence scores for LSA
        coherence_model_lsa = CoherenceModel(model=lsi_model, texts=documents, dictionary=dictionary, coherence='c_v')
        coherence_lsa = coherence_model_lsa.get_coherence()

        # Print coherence scores for each combination
        print("Number of Topics: {}, Top Words per Topic: {}".format(num_topics, top_words))
        print("Coherence Score (LDA):", coherence_lda)
        print("Coherence Score (LSA):", coherence_lsa)
        print()

        # Update best coherence scores and corresponding parameters if better scores found
        if coherence_lda > best_coherence_lda:
            best_coherence_lda = coherence_lda
            best_lda_params = (num_topics, top_words)

        if coherence_lsa > best_coherence_lsa:
            best_coherence_lsa = coherence_lsa
            best_lsa_params = (num_topics, top_words)

Number of Topics: 1, Top Words per Topic: 5
Coherence Score (LDA): 0.5242330788484322
Coherence Score (LSA): 0.564889012509562

Number of Topics: 2, Top Words per Topic: 5
Coherence Score (LDA): 0.47541796270838244
Coherence Score (LSA): 0.6193245118511104

Number of Topics: 3, Top Words per Topic: 5
Coherence Score (LDA): 0.43489999702307486
Coherence Score (LSA): 0.5064408632843422

Number of Topics: 4, Top Words per Topic: 5
Coherence Score (LDA): 0.5028694666536682
Coherence Score (LSA): 0.5426465358061904

Number of Topics: 5, Top Words per Topic: 5
Coherence Score (LDA): 0.5420010522735208
Coherence Score (LSA): 0.5068883900718715

Number of Topics: 6, Top Words per Topic: 5
Coherence Score (LDA): 0.5669447162419471
Coherence Score (LSA): 0.5318916164119106

Number of Topics: 7, Top Words per Topic: 5
Coherence Score (LDA): 0.4686398200703224
Coherence Score (LSA): 0.42573237838993394

Number of Topics: 8, Top Words per Topic: 5
Coherence Score (LDA): 0.4976981456625654
Coherence

In [6]:
# Print the best coherence scores and corresponding parameters for LDA and LSA
print("Best Coherence Score (LDA):", best_coherence_lda)
print("Best Parameters (LDA): Number of Topics - {}, Top Words per Topic - {}".format(*best_lda_params))
print()
print("Best Coherence Score (LSA):", best_coherence_lsa)
print("Best Parameters (LSA): Number of Topics - {}, Top Words per Topic - {}".format(*best_lsa_params))

Best Coherence Score (LDA): 0.5669447162419471
Best Parameters (LDA): Number of Topics - 6, Top Words per Topic - 5

Best Coherence Score (LSA): 0.6193245118511104
Best Parameters (LSA): Number of Topics - 2, Top Words per Topic - 5
