In [None]:
# Install libraries yang diperlukan
!pip install pandas numpy scikit-learn nltk gensim wordcloud matplotlib seaborn pyLDAvis

In [None]:
# Standard library
import pickle
import re

# Data handling and persistence
import joblib
import numpy as np
import pandas as pd

# NLP libraries
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Topic Modeling
from gensim import corpora
from gensim.models import LdaModel
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Visualization
import matplotlib.pyplot as plt
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import seaborn as sns
from wordcloud import WordCloud

# Download the NLTK data packages needed by the cells below
for nltk_resource in ('punkt', 'stopwords', 'wordnet', 'punkt_tab'):
    nltk.download(nltk_resource)

## 1. Load Data
Unggah file CSV Anda ke Colab melalui panel file, atau baca langsung dari Google Drive.

In [None]:
# Read the CSV file that has been uploaded to Colab.
# Upload the CSV through the Colab file panel (folder icon on the left > upload),
# then adjust the file name below.

# Replace the file name with the name of your own CSV file
df = pd.read_csv('itb_news.csv')  # adjust the file name

# OR, if the file lives in a specific folder:
# df = pd.read_csv('folder/nama_file.csv')

# OR, if the file is on Google Drive:
# from google.colab import drive
# drive.mount('/content/drive')
# df = pd.read_csv('/content/drive/MyDrive/path/to/your/file.csv')

dataset_shape = df.shape
column_names = df.columns.tolist()
print(f"\nDataset shape: {dataset_shape}")
print(f"Column names: {column_names}")

# Make sure there is a column holding the news text; the following cells read
# it from a column named 'text'. If your column has a different name
# (e.g. 'content' or 'artikel'), rename it here:
# df = df.rename(columns={'content': 'text'})

# Or, to combine several columns into one:
# df['text'] = df['judul'].fillna('') + ' ' + df['konten'].fillna('')

print(f"\nTotal documents: {len(df)}")
df.head()

## 2. Text Preprocessing

In [None]:
class TextPreprocessor:
    """Clean, tokenize and lemmatize raw text before topic modeling.

    Parameters
    ----------
    language : str, default 'english'
        Name of the NLTK stop-word list to load.
    """

    def __init__(self, language='english'):
        self.stop_words = set(stopwords.words(language))
        self.lemmatizer = WordNetLemmatizer()

    def clean_text(self, text):
        """Lower-case ``text`` and keep only ASCII letters and single spaces.

        Missing values (``None`` or a float NaN, e.g. an empty CSV cell read by
        pandas) yield an empty string. Without this guard ``str()`` would turn
        them into the words "none"/"nan", which pass the length and stop-word
        filters and end up in the vocabulary.

        Returns
        -------
        str
            The cleaned text, stripped of leading/trailing whitespace.
        """
        # NaN is the only float value that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text).lower()
        # Drop everything that is neither a letter nor whitespace.
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        # Collapse any run of whitespace into a single space.
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    def tokenize_and_lemmatize(self, text):
        """Tokenize ``text`` and lemmatize the tokens that survive filtering.

        Tokens that are stop words or have two characters or fewer are dropped
        before lemmatization.

        Returns
        -------
        list of str
            The lemmatized tokens, in their original order.
        """
        tokens = word_tokenize(text)
        tokens = [self.lemmatizer.lemmatize(word) for word in tokens
                  if word not in self.stop_words and len(word) > 2]
        return tokens

    def preprocess(self, text):
        """Run the full pipeline: clean, tokenize, lemmatize, re-join with spaces.

        Returns
        -------
        str
            Space-separated lemmatized tokens.
        """
        cleaned = self.clean_text(text)
        tokens = self.tokenize_and_lemmatize(cleaned)
        return ' '.join(tokens)

# Run the preprocessing pipeline over every document
preprocessor = TextPreprocessor()
df['cleaned_text'] = df['text'].apply(preprocessor.preprocess)
# cleaned_text is a space-joined token string, so splitting restores the token list
df['tokens'] = df['cleaned_text'].apply(str.split)

# Show the first row by position: .iloc[0] works whatever the index labels are,
# whereas df['text'][0] raises KeyError when no row is labelled 0.
print("\nContoh hasil preprocessing:")
print(f"Original: {df['text'].iloc[0]}")
print(f"Cleaned: {df['cleaned_text'].iloc[0]}")

## 3. Topic Modeling dengan Scikit-learn LDA

In [None]:
# Bag-of-words vectorization of the cleaned documents
n_topics = 3  # adjust as needed
vectorizer = CountVectorizer(max_df=0.8, min_df=2, max_features=1000)
doc_term_matrix = vectorizer.fit_transform(df['cleaned_text'])

print(f"Document-Term Matrix shape: {doc_term_matrix.shape}")

# Fit the LDA model and keep the per-document topic weights
lda_params = dict(
    n_components=n_topics,
    random_state=42,
    max_iter=20,
    learning_method='online',
)
lda_model = LatentDirichletAllocation(**lda_params)
lda_output = lda_model.fit_transform(doc_term_matrix)

print(f"\nModel trained with {n_topics} topics")
log_likelihood = lda_model.score(doc_term_matrix)
print(f"Log Likelihood: {log_likelihood}")
perplexity = lda_model.perplexity(doc_term_matrix)
print(f"Perplexity: {perplexity}")

In [None]:
# Display top words per topic
def display_topics(model, feature_names, n_top_words=10):
    """Print and return the highest-weighted words of every topic.

    Parameters
    ----------
    model : object exposing ``components_``
        ``components_`` is iterated row by row; each row holds the word
        weights of one topic.
    feature_names : sequence of str
        Vocabulary, indexed by the column positions of ``components_``.
    n_top_words : int, default 10
        Number of words to keep per topic. Values <= 0 give empty lists.

    Returns
    -------
    dict
        Maps ``"Topic <k>"`` (1-based) to a list of plain ``str`` words,
        strongest first.
    """
    n_top_words = max(n_top_words, 0)
    topics = {}
    for topic_idx, topic in enumerate(model.components_):
        # Reverse first, then truncate. Slicing with [-n:] before reversing
        # would return the whole vocabulary for n == 0, since [-0:] is a full slice.
        top_indices = topic.argsort()[::-1][:n_top_words]
        # Convert to built-in str so the returned dict (which is pickled later)
        # does not carry NumPy scalar types.
        top_words = [str(feature_names[i]) for i in top_indices]
        topics[f"Topic {topic_idx + 1}"] = top_words
        print(f"\nTopic {topic_idx + 1}:")
        print(", ".join(top_words))
    return topics

# Vocabulary of the fitted vectorizer; also reused by the word-cloud cell
feature_names = vectorizer.get_feature_names_out()
topics_dict = display_topics(
    model=lda_model,
    feature_names=feature_names,
    n_top_words=10,
)

## 4. Topic Modeling dengan Gensim (Alternatif)

In [None]:
# Build the Gensim dictionary and the bag-of-words corpus
dictionary = corpora.Dictionary(df['tokens'])
corpus = list(map(dictionary.doc2bow, df['tokens']))

# Train the Gensim LDA model
gensim_params = dict(
    num_topics=n_topics,
    random_state=42,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)
gensim_lda = LdaModel(corpus=corpus, id2word=dictionary, **gensim_params)

# Print every topic with a 1-based label
print("\nGensim LDA Topics:")
for topic_id, topic_terms in gensim_lda.print_topics(-1):
    print(f"Topic {topic_id + 1}: {topic_terms}")

## 5. Visualisasi

In [None]:
# Document-Topic Distribution
# Dominant topic of each document, as a 0-based column index of lda_output
df['dominant_topic'] = lda_output.argmax(axis=1)

# Count documents per topic. Reindexing over range(n_topics) gives topics that
# are dominant for no document a zero-height bar; without it there would be
# fewer bars than n_topics and the "Topic k" tick labels set below would be
# attached to the wrong bars.
topic_counts = (
    df['dominant_topic']
    .value_counts()
    .reindex(range(n_topics), fill_value=0)
)

plt.figure(figsize=(10, 6))
sns.barplot(x=topic_counts.index, y=topic_counts.values)
plt.title('Distribution of Documents Across Topics')
plt.xlabel('Topic')
plt.ylabel('Number of Documents')
plt.xticks(range(n_topics), [f'Topic {i+1}' for i in range(n_topics)])
plt.tight_layout()
plt.savefig('topic_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# One word cloud per topic, drawn side by side
fig, axes = plt.subplots(1, n_topics, figsize=(15, 5), squeeze=False)
axes = axes[0]  # squeeze=False always returns a 2-D grid; take its single row

for topic_idx, (ax, weights) in enumerate(zip(axes, lda_model.components_)):
    # The 20 highest-weighted words of this topic, strongest first
    strongest = weights.argsort()[-20:][::-1]
    frequencies = {feature_names[j]: weights[j] for j in strongest}

    cloud = WordCloud(width=400, height=300, background_color='white')
    cloud.generate_from_frequencies(frequencies)

    ax.imshow(cloud, interpolation='bilinear')
    ax.set_title(f'Topic {topic_idx + 1}', fontsize=14)
    ax.axis('off')

plt.tight_layout()
plt.savefig('wordclouds.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# pyLDAvis - interactive visualization of the Gensim model
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(
    topic_model=gensim_lda,
    corpus=corpus,
    dictionary=dictionary,
    sort_topics=False,
)
pyLDAvis.display(vis)

## 6. Save Model dan Artifacts

In [None]:
# Save all models and the preprocessor.
# joblib is imported in the import cell at the top of the notebook.

# scikit-learn side: file name -> object, written with joblib
# NOTE(review): preprocessor is an instance of a class defined in this notebook;
# loading its pickle elsewhere presumably needs the same class definition to be
# available - confirm in the Streamlit app.
sklearn_artifacts = {
    'lda_model.pkl': lda_model,
    'vectorizer.pkl': vectorizer,
    'preprocessor.pkl': preprocessor,
}
for filename, artifact in sklearn_artifacts.items():
    joblib.dump(artifact, filename)

# Gensim side: model and dictionary are written with their own save() methods
gensim_lda.save('gensim_lda_model')
dictionary.save('dictionary.pkl')

# Topics dictionary (topic label -> top words)
with open('topics.pkl', 'wb') as f:
    pickle.dump(topics_dict, f)

# Processed data; the list-valued 'tokens' column is written out as text
df.to_csv('processed_data.csv', index=False)

# The report is built from the same names used above, so it cannot drift from
# what was actually saved.
saved_files = [
    *sklearn_artifacts,
    'gensim_lda_model',
    'dictionary.pkl',
    'topics.pkl',
    'processed_data.csv',
]

print("\nModel dan artifacts berhasil disimpan!")
print("Files:")
for filename in saved_files:
    print(f"- {filename}")

In [None]:
# Download the files needed to deploy on Streamlit
from google.colab import files

streamlit_files = (
    'lda_model.pkl',
    'vectorizer.pkl',
    'preprocessor.pkl',
    'topics.pkl',
    'processed_data.csv',
    'topic_distribution.png',
    'wordclouds.png',
)
for filename in streamlit_files:
    files.download(filename)

## 7. Test Prediction pada Teks Baru

In [None]:
def predict_topic(text, preprocessor, vectorizer, model, n_words=5):
    """Predict the dominant topic of a new piece of text.

    Parameters
    ----------
    text : str
        Raw text to classify.
    preprocessor : object with ``preprocess(text)``
        Turns the raw text into the cleaned string given to the vectorizer.
    vectorizer : object with ``transform`` and ``get_feature_names_out``
        Converts the cleaned text into the model's input features.
    model : object with ``transform`` and ``components_``
        ``transform`` yields one row of topic weights per document;
        ``components_`` holds one row of word weights per topic.
    n_words : int, default 5
        Number of top words to report for the dominant topic. Values <= 0
        give an empty list.

    Returns
    -------
    dict
        ``dominant_topic`` (1-based int), ``topic_distribution`` (list of
        float), ``top_words`` (list of str, strongest first) and
        ``confidence`` (float weight of the dominant topic). All values are
        built-in Python types.
    """
    # Preprocess
    cleaned = preprocessor.preprocess(text)

    # Transform the single document into the model's feature space
    vectorized = vectorizer.transform([cleaned])

    # Predict: first (and only) row of the topic weights
    topic_dist = model.transform(vectorized)[0]
    dominant_topic = topic_dist.argmax()

    # Top words of the dominant topic. Reverse first, then truncate: slicing
    # with [-n:] before reversing would return every word for n == 0.
    n_words = max(n_words, 0)
    feature_names = vectorizer.get_feature_names_out()
    topic = model.components_[dominant_topic]
    top_indices = topic.argsort()[::-1][:n_words]
    # Built-in str, matching the int/float/list conversions of the other fields
    top_words = [str(feature_names[i]) for i in top_indices]

    return {
        'dominant_topic': int(dominant_topic + 1),
        'topic_distribution': topic_dist.tolist(),
        'top_words': top_words,
        'confidence': float(topic_dist[dominant_topic])
    }

# Try the predictor on a sample sentence
test_text = "Python is great for building machine learning models and data analysis."
result = predict_topic(
    text=test_text,
    preprocessor=preprocessor,
    vectorizer=vectorizer,
    model=lda_model,
)

predicted_topic = result['dominant_topic']
confidence = result['confidence']
joined_top_words = ', '.join(result['top_words'])

print(f"\nTest Text: {test_text}")
print(f"\nPredicted Topic: Topic {predicted_topic}")
print(f"Confidence: {confidence:.2%}")
print(f"Top Words: {joined_top_words}")
print(f"\nTopic Distribution:")
for position, share in enumerate(result['topic_distribution']):
    print(f"  Topic {position + 1}: {share:.2%}")