In [4]:
# --- Install required libraries (Uncomment if running in Colab) ---
!pip install rake-nltk gensim nltk

import nltk
nltk.download('stopwords')

from rake_nltk import Rake
from nltk.corpus import stopwords
import string
import gensim
from gensim import corpora
from gensim.models import LdaModel
import pandas as pd

Collecting rake-nltk
  Downloading rake_nltk-1.0.6-py3-none-any.whl.metadata (6.4 kB)
Downloading rake_nltk-1.0.6-py3-none-any.whl (9.1 kB)
Installing collected packages: rake-nltk
Successfully installed rake-nltk-1.0.6


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/farisrizky/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [8]:
# Download NLTK's 'punkt_tab' tokenizer data.
# NOTE(review): presumably required by rake-nltk's tokenization in the RAKE cell further down — confirm.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/farisrizky/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [16]:


# -----------------------
# Sample Survey Responses
# -----------------------
# Six free-text survey answers (in Indonesian) used as the toy corpus for the
# keyword-extraction and topic-modelling steps that follow.
responses = [
    "Pelayanan puskesmas sudah cukup baik, namun antrian terlalu lama.",
    "Saya berharap ada lebih banyak dokter dan jam buka diperpanjang.",
    "Obat sering tidak tersedia, terutama untuk pasien penyakit kronis.",
    "Petugas ramah dan sigap, tapi ruang tunggu sempit dan panas.",
    "Sebaiknya ada sistem antrean online agar lebih efisien.",
    "Petugasnya lucu, walaupun antrian membludak dan lama sekali."
]

# -----------------------
# RAKE Keyword Extraction
# -----------------------
# The heading marker is restored to the intended character; it had been
# mis-decoded into mojibake.
print("🔹 RAKE Keyword Extraction:\n")

# A single extractor is reused for every response; the phrases printed inside
# the loop are the ones extracted from the response just processed.
r = Rake(language='indonesian')  # Uses NLTK stopwords
for number, response in enumerate(responses, start=1):
    r.extract_keywords_from_text(response)
    print(f"Response {number}:")
    # Each entry is a (score, phrase) pair.
    for score, phrase in r.get_ranked_phrases_with_scores():
        print(f"  - {phrase} ({score})")
    print()

# -----------------------
# Preprocessing for LDA
# -----------------------
# The heading marker is restored to the intended character; it had been
# mis-decoded into mojibake.
print("\n🔹 Preprocessing for LDA...\n")

# A set gives constant-time membership checks when filtering tokens.
stop_words = set(stopwords.words('indonesian'))

def preprocess(doc):
    """Tokenize one response and drop Indonesian stop words.

    Args:
        doc: Raw response text.

    Returns:
        List of tokens produced by ``gensim.utils.simple_preprocess`` (with
        ``deacc=True``) that are not in the module-level ``stop_words`` set.
    """
    tokens = gensim.utils.simple_preprocess(doc, deacc=True)
    # No punctuation filter is needed: simple_preprocess only yields word
    # tokens. The former `t not in string.punctuation` check was a substring
    # test against a str and could never match such a token.
    return [t for t in tokens if t not in stop_words]

# Token list for every response, in the same order as `responses`.
processed_docs = list(map(preprocess, responses))

# -----------------------
# LDA Topic Modeling
# -----------------------
# The heading marker is restored to the intended character; it had been
# mis-decoded into mojibake.
print("🔹 LDA Topic Modeling:\n")

# Build the token dictionary, then encode every document as a bag-of-words
# via doc2bow.
dictionary = corpora.Dictionary(processed_docs)
corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

# Fit a 2-topic LDA model with 10 passes over the corpus; random_state is
# fixed at 42.
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2, passes=10, random_state=42)

# Show the top 5 words of each topic; topic ids are printed 1-based.
topics = lda_model.print_topics(num_words=5)
for idx, topic in topics:
    print(f"Topic {idx + 1}: {topic}")

# -----------------------
# Document Topic Assignment
# -----------------------
# The heading marker is restored to the intended character; it had been
# mis-decoded into mojibake.
print("\n🔹 Document Topic Distribution:\n")

# Each item yielded by lda_model[corpus] is a list of (topic_id, score) pairs
# for one document.
for doc_number, topic_scores in enumerate(lda_model[corpus], start=1):
    # max() selects the highest-scoring pair directly, without sorting the
    # list or rebinding the loop variable.
    topic_id, score = max(topic_scores, key=lambda pair: pair[1])
    print(f"Response {doc_number}: Dominant Topic {topic_id + 1} (Score: {score:.2f})")


üîπ RAKE Keyword Extraction:

Response 1:
  - pelayanan puskesmas (4.0)
  - antrian (1.0)

Response 2:
  - jam buka diperpanjang (9.0)
  - dokter (1.0)
  - berharap (1.0)

Response 3:
  - pasien penyakit kronis (9.0)
  - tersedia (1.0)
  - obat (1.0)

Response 4:
  - ruang tunggu sempit (9.0)
  - petugas ramah (4.0)
  - sigap (1.0)
  - panas (1.0)

Response 5:
  - sistem antrean online (9.0)
  - efisien (1.0)

Response 6:
  - petugasnya lucu (4.0)
  - antrian membludak (4.0)


üîπ Preprocessing for LDA...

üîπ LDA Topic Modeling:

Topic 1: 0.082*"antrian" + 0.049*"buka" + 0.049*"diperpanjang" + 0.049*"jam" + 0.049*"berharap"
Topic 2: 0.061*"petugas" + 0.061*"sigap" + 0.061*"tunggu" + 0.061*"panas" + 0.061*"sempit"

üîπ Document Topic Distribution:

Response 1: Dominant Topic 1 (Score: 0.87)
Response 2: Dominant Topic 1 (Score: 0.91)
Response 3: Dominant Topic 1 (Score: 0.91)
Response 4: Dominant Topic 2 (Score: 0.94)
Response 5: Dominant Topic 2 (Score: 0.90)
Response 6: Dominant T

In [18]:
# --- Install required libraries if needed ---
!pip install sastrawi pyLDAvis

from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

import gensim
from gensim import corpora
from gensim.models import LdaModel

import string

# Optional visualization
import pyLDAvis.gensim_models
import pyLDAvis

# For display in Jupyter
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import matplotlib.pyplot as plt

# --------------------------
# Sastrawi Preprocessing
# --------------------------

# Factories first ...
stop_factory = StopWordRemoverFactory()
stem_factory = StemmerFactory()

# ... then the stop-word remover and stemmer they create.
stop_remover = stop_factory.create_stop_word_remover()
stemmer = stem_factory.create_stemmer()

def preprocess_sastrawi(doc):
    """Clean one response with Sastrawi and return its tokens.

    Steps: lower-case, replace punctuation with spaces, remove stop words,
    stem, then tokenize with ``gensim.utils.simple_preprocess``.

    Args:
        doc: Raw response text.

    Returns:
        List of tokens for the stemmed, stop-word-filtered text.
    """
    # Sastrawi's stop-word remover compares space-separated words, so a stop
    # word followed by punctuation (e.g. "tapi,") would slip through.
    # Punctuation is therefore turned into spaces first. Hyphens are kept so
    # the stemmer still receives hyphenated forms unchanged.
    punct_to_space = {ord(ch): " " for ch in string.punctuation if ch != "-"}
    normalized = " ".join(doc.lower().translate(punct_to_space).split())

    cleaned = stop_remover.remove(normalized)
    stemmed = stemmer.stem(cleaned)
    return gensim.utils.simple_preprocess(stemmed, deacc=True)

# Apply preprocessing to every response, keeping the order of `responses`.
processed_docs = [preprocess_sastrawi(doc) for doc in responses]

# The heading marker is restored to the intended character; it had been
# mis-decoded into mojibake.
print("🔹 Sample Processed Output:")
for number, tokens in enumerate(processed_docs, start=1):
    print(f"Response {number}: {tokens}")

# --------------------------
# LDA Topic Modeling
# --------------------------

# Build the token dictionary from the Sastrawi-processed documents, then
# encode every document as a bag-of-words via doc2bow.
dictionary = corpora.Dictionary(processed_docs)
corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

# Fit a 2-topic LDA model with 20 passes over the corpus; random_state is
# fixed at 42.
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2, passes=20, random_state=42)

# Show the top 5 words of each topic; topic ids are printed 1-based.
# The heading marker is restored to the intended character; it had been
# mis-decoded into mojibake.
print("\n🔹 LDA Topics (Top 5 words):")
topics = lda_model.print_topics(num_words=5)
for idx, topic in topics:
    print(f"Topic {idx + 1}: {topic}")

# Document topic distribution.
# The heading marker is restored to the intended character; it had been
# mis-decoded into mojibake.
print("\n🔹 Dominant Topic Per Document:")
# Each item yielded by lda_model[corpus] is a list of (topic_id, score) pairs
# for one document.
for doc_number, topic_scores in enumerate(lda_model[corpus], start=1):
    # max() selects the highest-scoring pair directly, without sorting the
    # list or rebinding the loop variable.
    topic_id, score = max(topic_scores, key=lambda pair: pair[1])
    print(f"Response {doc_number}: Dominant Topic {topic_id + 1} (Score: {score:.2f})")

# --------------------------
# pyLDAvis Visualization
# --------------------------

# The heading marker is restored to the intended character; it had been
# mis-decoded into mojibake.
print("\n🔹 Launching pyLDAvis...")

# Switch pyLDAvis to notebook display, then prepare the visualization data
# from the fitted model, its corpus and its dictionary.
pyLDAvis.enable_notebook()
vis_data = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
vis_data  # last expression of the cell, so the notebook displays it


Collecting sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Collecting pyLDAvis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl.metadata (4.2 kB)
Collecting funcy (from pyLDAvis)
  Downloading funcy-2.0-py2.py3-none-any.whl.metadata (5.9 kB)
Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
Downloading pyLDAvis-3.4.1-py3-none-any.whl (2.6 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m2.6/2.6 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading funcy-2.0-py2.py3-none-any.whl (30 kB)
Installing collected packages: sastrawi, funcy, pyLDAvis
Successfully installed funcy-2.0 pyLDAvis-3.4.1 sastrawi-1.0.1


Matplotlib is building the font cache; this may take a moment.


üîπ Sample Processed Output:
Response 1: ['layan', 'puskesmas', 'cukup', 'baik', 'antri', 'terlalu', 'lama']
Response 2: ['harap', 'lebih', 'banyak', 'dokter', 'jam', 'buka', 'panjang']
Response 3: ['obat', 'sering', 'sedia', 'utama', 'pasien', 'sakit', 'kronis']
Response 4: ['tugas', 'ramah', 'sigap', 'ruang', 'tunggu', 'sempit', 'panas']
Response 5: ['baik', 'sistem', 'antre', 'online', 'lebih', 'efisien']
Response 6: ['tugas', 'lucu', 'walaupun', 'antri', 'membludak', 'lama', 'sekali']

üîπ LDA Topics (Top 5 words):
Topic 1: 0.066*"antri" + 0.066*"lama" + 0.066*"baik" + 0.040*"tugas" + 0.039*"lebih"
Topic 2: 0.039*"lebih" + 0.038*"tugas" + 0.038*"sering" + 0.038*"pasien" + 0.038*"sakit"

üîπ Dominant Topic Per Document:
Response 1: Dominant Topic 1 (Score: 0.94)
Response 2: Dominant Topic 2 (Score: 0.93)
Response 3: Dominant Topic 2 (Score: 0.93)
Response 4: Dominant Topic 2 (Score: 0.93)
Response 5: Dominant Topic 1 (Score: 0.92)
Response 6: Dominant Topic 1 (Score: 0.93)

üîπ 