In [9]:
import pandas as pd

# Membaca file CSV
data = pd.read_csv('tweets.csv')
teks = data['tweet']  # Ganti 'kolom_teks' dengan nama kolom yang sesuai

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

nltk.download('stopwords')
stop_words = set(stopwords.words('indonesian'))

def preprocessing(teks):
    teks = re.sub(r'\W', ' ', teks)  # Menghapus karakter non-kata
    teks = teks.lower()  # Mengubah ke huruf kecil
    teks = word_tokenize(teks)  # Tokenisasi
    teks = [word for word in teks if word not in stop_words]  # Menghapus stopwords
    return ' '.join(teks)

cleaned_texts = teks.apply(preprocessing)

In [11]:
from gensim import corpora
from gensim.models import LdaModel

# Membuat dictionary dan corpus
dictionary = corpora.Dictionary(cleaned_texts.str.split())
corpus = [dictionary.doc2bow(text.split()) for text in cleaned_texts]

# Menerapkan LDA
lda_model = LdaModel(corpus, num_topics=5, id2word=dictionary, passes=15)

In [None]:
for idx, topic in lda_model.print_topics(-1):
    print(f'Topik {idx}: {topic}')

In [13]:
topic_distribution = [lda_model.get_document_topics(bow) for bow in corpus]
# Analisis lebih lanjut untuk menemukan topik yang paling banyak dibahas

In [32]:
from collections import Counter

# Menghitung frekuensi kemunculan setiap topik
topic_counts = Counter()
for doc_topics in topic_distribution:
    for topic, _ in doc_topics:
        topic_counts[topic] += 1

# Menampilkan 5 topik teratas
top_5_topics = topic_counts.most_common(5)
print("5 Topik Paling Banyak Dibahas:")
for topic, count in top_5_topics:
    print(f'Topik {topic}: {count} dokumen')

# Menampilkan kata kunci untuk setiap topik
print("\nKata Kunci untuk Setiap Topik:")
for idx in range(5):  # Hanya untuk 5 topik teratas
    print(f'Topik {idx}: {lda_model.print_topics(num_words=3)[idx][1]}')

# Menampilkan dokumen yang paling relevan untuk setiap topik teratas
print("\nDokumen Relevan untuk Topik Teratas:")
for topic, _ in top_5_topics:
    print(f'\nDokumen untuk Topik {topic}:')
    relevant_docs = [teks[i] for i, doc_topics in enumerate(topic_distribution) if any(t == topic for t, _ in doc_topics)]
    for doc in relevant_docs[:2]:  # Menampilkan 3 dokumen relevan
        print(f'- {doc}')

5 Topik Paling Banyak Dibahas:
Topik 1: 83 dokumen
Topik 0: 81 dokumen
Topik 2: 78 dokumen
Topik 3: 78 dokumen
Topik 4: 75 dokumen

Kata Kunci untuk Setiap Topik:
Topik 0: 0.047*"timnas" + 0.031*"rt" + 0.023*"idextratime"
Topik 1: 0.041*"timnas" + 0.037*"https" + 0.035*"co"
Topik 2: 0.029*"u" + 0.024*"indonesia" + 0.022*"20"
Topik 3: 0.025*"timnas" + 0.023*"cb" + 0.023*"meshaal"
Topik 4: 0.050*"timnas" + 0.029*"rt" + 0.026*"indonesia"

Dokumen Relevan untuk Topik Teratas:

Dokumen untuk Topik 1:
- RT @idextratime: 🚨 BREAKING: Mathew Baker (LB/15/🇮🇩) mengumumkan lewat Story akun Instagramnya bahwa ia tidak akan lanjut di timnas Indones…
- RT @idextratime: Abis ke minimarket beli minuman, terus nemu ini guys,  Indomilk Steril dengan gambar pemain Timnas dan ada coach Shin Tae-…

Dokumen untuk Topik 0:
- RT @idextratime: 🚨 BREAKING: Mathew Baker (LB/15/🇮🇩) mengumumkan lewat Story akun Instagramnya bahwa ia tidak akan lanjut di timnas Indones…
- RT @idextratime: Abis ke minimarket beli min

In [22]:
import pyLDAvis.gensim_models
import pyLDAvis

# Visualisasi model LDA
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(vis)

In [23]:
for idx, topic in lda_model.print_topics(num_words=10):
    print(f'Topik {idx}: {topic}')

Topik 0: 0.047*"timnas" + 0.031*"rt" + 0.023*"idextratime" + 0.014*"pemain" + 0.013*"guys" + 0.011*"coach" + 0.011*"tae" + 0.011*"shin" + 0.011*"minimarket" + 0.011*"gambar"
Topik 1: 0.041*"timnas" + 0.037*"https" + 0.035*"co" + 0.035*"t" + 0.013*"rt" + 0.011*"pelatih" + 0.010*"gak" + 0.010*"shin" + 0.009*"indonesia" + 0.008*"pemain"
Topik 2: 0.029*"u" + 0.024*"indonesia" + 0.022*"20" + 0.020*"timnas" + 0.013*"pelatih" + 0.011*"matthew" + 0.010*"rt" + 0.008*"piala" + 0.008*"80" + 0.008*"4xl"
Topik 3: 0.025*"timnas" + 0.023*"cb" + 0.023*"meshaal" + 0.016*"2" + 0.014*"nya" + 0.014*"gwijangge" + 0.014*"iqbal" + 0.012*"https" + 0.012*"co" + 0.012*"u"
Topik 4: 0.050*"timnas" + 0.029*"rt" + 0.026*"indonesia" + 0.017*"masuk" + 0.016*"pemain" + 0.014*"morisaki" + 0.014*"ctsubasa_indo" + 0.012*"kluivert" + 0.012*"menunda" + 0.012*"yuzo"
