# Analisis Sentimen Ulasan DANA
## NLP Pipeline — Data Acquisition sampai Evaluation

Notebook ini mengimplementasikan NLP Pipeline lengkap menggunakan ulasan aplikasi
**DANA - Dompet Digital Indonesia** dari Google Play Store.

**Tahapan:**
1. Data Acquisition
2. Text Cleaning & Pre-processing
3. Feature Engineering (TF-IDF)
4. Modeling (SVM / LinearSVC)
5. Evaluation

---
## Install Dependencies

In [None]:
!pip install google-play-scraper pandas numpy scikit-learn matplotlib seaborn wordcloud PySastrawi nltk joblib -q

In [None]:
import pandas as pd
import numpy as np
import re
import os
import json
import time
import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

import nltk
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)
nltk.download("stopwords", quiet=True)

from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib

print("Semua library berhasil dimuat!")

---
## Tahap 1 — Data Acquisition

Mengambil ulasan aplikasi DANA dari Google Play Store menggunakan library `google-play-scraper`.
Label sentimen ditentukan berdasarkan rating bintang:
- Bintang 4–5 → **Positif**
- Bintang 3 → **Netral**
- Bintang 1–2 → **Negatif**

In [None]:
# Collect up to TARGET of the newest Indonesian-language reviews for APP_ID,
# paging through the results with the continuation token.
from google_play_scraper import reviews, Sort

# App identifier passed to reviews() and the maximum number of reviews to keep.
APP_ID = "id.dana"
TARGET = 2000

semua_ulasan = []
# Continuation token returned by the previous call; None for the first page.
token = None

try:
    while len(semua_ulasan) < TARGET:
        # Ask for at most 200 reviews per call and never more than is still missing.
        sisa = TARGET - len(semua_ulasan)
        batch = min(200, sisa)

        hasil, token = reviews(
            APP_ID,
            lang="id",
            country="id",
            sort=Sort.NEWEST,
            count=batch,
            continuation_token=token
        )

        # An empty page means there is nothing left to fetch.
        if not hasil:
            break

        semua_ulasan.extend(hasil)
        print(f"Mengambil ulasan: {len(semua_ulasan)}/{TARGET}")

        # No token means there is no further page to request.
        if token is None:
            break

        # Pause one second between requests.
        time.sleep(1)

# Best-effort: any failure is only reported, and the reviews gathered so far are kept.
except Exception as e:
    print(f"Error: {e}")

print(f"\nTotal ulasan berhasil diambil: {len(semua_ulasan)}")

In [None]:
# Create the output folder if it does not exist yet.
os.makedirs("data", exist_ok=True)

# Flatten each raw review dict into one row with Indonesian column names.
# A missing key falls back to "" for text-like fields and 0 for counters.
daftar = [
    {
        "id_ulasan": ulasan.get("reviewId", ""),
        "nama_pengguna": ulasan.get("userName", ""),
        "isi_ulasan": ulasan.get("content", ""),
        "bintang": ulasan.get("score", 0),
        "tanggal_ulasan": ulasan.get("at", ""),
        "jumlah_like": ulasan.get("thumbsUpCount", 0),
    }
    for ulasan in semua_ulasan
]

df = pd.DataFrame(daftar)

def tentukan_sentimen(bintang):
    """Map a star rating to its sentiment label.

    Args:
        bintang: Numeric star rating of a review.

    Returns:
        "Positif" for ratings of 4 or more, "Netral" for exactly 3,
        and "Negatif" for every other value.
    """
    if bintang == 3:
        return "Netral"
    return "Positif" if bintang >= 4 else "Negatif"

# Derive the sentiment label of every review from its star rating.
df["sentimen"] = df["bintang"].apply(tentukan_sentimen)
# Discard reviews whose text is empty or whitespace-only, then those with no text at all.
df = df[df["isi_ulasan"].str.strip() != ""]
df = df.dropna(subset=["isi_ulasan"])

# Persist the raw labelled data (the utf-8-sig codec prepends a BOM) and
# print a quick summary of the class balance.
df.to_csv("data/ulasan_dana_mentah.csv", index=False, encoding="utf-8-sig")
print(f"Data disimpan: {len(df)} ulasan")
print("\nDistribusi Sentimen:")
print(df["sentimen"].value_counts())
# Last expression of the cell: the notebook renders the first five rows.
df.head(5)

---
## Tahap 2 — Text Cleaning & Pre-processing

Pipeline pembersihan teks:
1. Lowercase
2. Hapus URL, mention, hashtag
3. Hapus emoji dan karakter non-ASCII
4. Hapus angka
5. Hapus tanda baca
6. Normalisasi kata slang/gaul
7. Stopword removal (PySastrawi)
8. Stemming (PySastrawi)

In [None]:
# Build the Sastrawi stopword remover and stemmer once; bersihkan_teks()
# reuses these two objects for every review.
print("Memuat Stemmer & StopWord Bahasa Indonesia...")
hapus_sw = StopWordRemoverFactory().create_stop_word_remover()
stemmer = StemmerFactory().create_stemmer()

# Slang spellings grouped by the standard word they stand for.
_KELOMPOK_SLANG = {
    "tidak": ("gk", "ga", "gak", "g", "ngga", "nggak", "kagak", "tdk"),
    "yang": ("yg", "yng"),
    "daripada": ("drpd",),
    "dengan": ("dgn", "dg"),
    "sama": ("sm",),
    "saya": ("sy", "gue", "gw"),
    "kamu": ("lo", "lu", "km"),
    "sudah": ("sdh", "udh", "udah"),
    "belum": ("blm", "blom", "blum"),
    "dapat": ("dpt",),
    "bisa": ("bsa", "bs"),
    "harus": ("hrs",),
    "karena": ("krn", "karna"),
    "tapi": ("tp", "tpi"),
    "tentang": ("ttg",),
    "dulu": ("dl", "dlu"),
    "banyak": ("bnyk", "byk"),
    "masih": ("msh", "masi"),
    "sangat": ("bgt", "banget"),
    "saja": ("aja", "aj"),
    "juga": ("jg",),
    "seperti": ("spt",),
    "lagi": ("lg",),
    "kalau": ("klo", "kalo"),
    "dari": ("dr",),
    "untuk": ("utk", "u"),
    "pada": ("pd",),
    "habis": ("abis",),
    "lebih": ("lbh", "lbih"),
    "orang": ("org",),
    "memang": ("emg", "emang"),
    "kenapa": ("knp",),
    "tahu": ("tau",),
    "jangan": ("jgn",),
    "aplikasi": ("aplikasinya", "appnya"),
}

# Flat lookup table: slang token -> standard word.
kamus_slang = {
    slang: baku
    for baku, daftar_slang in _KELOMPOK_SLANG.items()
    for slang in daftar_slang
}


def normalisasi_slang(teks):
    """Replace every slang token of *teks* with its standard word.

    The text is split on whitespace; tokens present in ``kamus_slang`` are
    swapped for their mapped value, all other tokens are kept unchanged,
    and the result is joined back with single spaces. Matching is exact,
    so tokens are only recognised in the lowercase form stored in the table.
    """
    kata_baku = (kamus_slang.get(kata, kata) for kata in teks.split())
    return " ".join(kata_baku)

def bersihkan_teks(teks):
    """Clean one raw review into a normalised, stemmed string.

    Steps, in order: lowercase; strip URLs, @mentions and #hashtags;
    replace non-ASCII characters with a space; drop digits; replace
    punctuation (underscore included) with a space; collapse whitespace;
    normalise slang; remove stopwords; stem.

    Args:
        teks: Raw review text. May be None/NaN or a non-string value.

    Returns:
        The cleaned text, or "" when the input is missing or blank.
    """
    if pd.isna(teks) or str(teks).strip() == "":
        return ""
    teks = str(teks).lower()
    teks = re.sub(r"http\S+|www\.\S+", "", teks)
    teks = re.sub(r"@\w+|#\w+", "", teks)
    # Swap non-ASCII runs (emoji etc.) for a space instead of deleting them,
    # so two words separated only by an emoji are not glued into one token.
    teks = re.sub(r"[^\x00-\x7f]+", " ", teks)
    teks = re.sub(r"\d+", "", teks)
    # "\w" also matches "_", so the underscore must be listed explicitly to
    # be treated as punctuation.
    teks = re.sub(r"[^\w\s]|_", " ", teks)
    teks = re.sub(r"\s+", " ", teks).strip()
    teks = normalisasi_slang(teks)
    # NOTE(review): the default Sastrawi stopword list appears to contain
    # negations such as "tidak" -- confirm that dropping them is acceptable
    # for sentiment classification.
    teks = hapus_sw.remove(teks)
    teks = stemmer.stem(teks)
    return teks.strip()

print("Fungsi pre-processing siap.")

In [None]:
# Clean every review, reporting progress every 200 rows, then keep only the
# rows that still contain text and save the result.
print("Memproses teks... (mungkin beberapa menit)")
total = len(df)
hasil_bersih = []

# Count positions with enumerate(): the DataFrame index labels are not a
# reliable row counter because rows may have been filtered out earlier,
# which leaves gaps in the index.
for nomor, isi in enumerate(df["isi_ulasan"], start=1):
    hasil_bersih.append(bersihkan_teks(isi))
    if nomor % 200 == 0:
        print(f"  {nomor}/{total} selesai...")

df["teks_bersih"] = hasil_bersih
# Reviews that became empty after cleaning carry no features; drop them.
df = df[df["teks_bersih"].str.strip() != ""]
df = df.dropna(subset=["teks_bersih"])

df.to_csv("data/ulasan_dana_bersih.csv", index=False, encoding="utf-8-sig")
print(f"\nSelesai. {len(df)} baris disimpan.")

In [None]:
# Show the first five reviews before and after cleaning (each truncated to
# 100 characters) together with their sentiment label.
print("Contoh hasil pre-processing:\n")
contoh = df.head(5)
for mentah, olahan, sentimen in zip(
    contoh["isi_ulasan"], contoh["teks_bersih"], contoh["sentimen"]
):
    print(f"SEBELUM  : {str(mentah)[:100]}")
    print(f"SESUDAH  : {str(olahan)[:100]}")
    print(f"SENTIMEN : {sentimen}")
    print()

---
## Tahap 3–5 — Feature Engineering, Modeling & Evaluation

- **Feature Engineering:** TF-IDF Vectorizer (maksimal 6.000 fitur, unigram + bigram)
- **Modeling:** SVM (LinearSVC) dengan CalibratedClassifierCV
- **Class Weight:** Balanced (menangani ketidakseimbangan kelas)
- **Split data:** 80% latih, 20% uji (stratified)
- **Evaluasi:** Accuracy, Precision, Recall, F1-Score, Confusion Matrix

In [None]:
# Create the output folders for the saved model and the figures.
os.makedirs("model", exist_ok=True)
os.makedirs("gambar", exist_ok=True)

# Reload the cleaned data from disk and drop rows lacking text or label
# (values may come back as NaN after the CSV round trip, hence the re-check).
df = pd.read_csv("data/ulasan_dana_bersih.csv")
df = df.dropna(subset=["teks_bersih", "sentimen"])
df = df[df["teks_bersih"].str.strip() != ""]

# Per-class review counts.
distribusi = df["sentimen"].value_counts()
print(f"Data: {len(df)} baris\n")
print("Distribusi Sentimen:")
print(distribusi)

In [None]:
# Features are the cleaned texts, targets are the sentiment labels.
X = df["teks_bersih"]
y = df["sentimen"]

# 80/20 split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# TF-IDF over unigrams and bigrams: at most 6000 terms, ignoring terms seen
# in fewer than 2 documents or in more than 95% of them; sublinear_tf
# applies logarithmic term-frequency scaling.
vektorizer = TfidfVectorizer(
    max_features=6000,
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.95,
    sublinear_tf=True,
)

# The vocabulary is fitted on the training split only; the test split is
# merely transformed with it.
X_train_tfidf = vektorizer.fit_transform(X_train)
X_test_tfidf = vektorizer.transform(X_test)

print(f"Data latih : {len(X_train)} sampel")
print(f"Data uji   : {len(X_test)} sampel")
print(f"Jumlah fitur TF-IDF: {X_train_tfidf.shape[1]}")

In [None]:
# Train a class-weighted linear SVM wrapped in CalibratedClassifierCV
# (3-fold) on the TF-IDF training matrix.
print("Melatih model SVM...")

svm = LinearSVC(C=1.0, max_iter=5000, random_state=42, class_weight="balanced")
# NOTE(review): the calibration wrapper is presumably there to expose class
# probabilities, which LinearSVC alone does not provide -- confirm.
model = CalibratedClassifierCV(svm, cv=3)
model.fit(X_train_tfidf, y_train)
print("Model selesai dilatih.")

### Evaluasi Model

In [None]:
# Score the model on the held-out split.
prediksi = model.predict(X_test_tfidf)
akurasi = accuracy_score(y_test, prediksi)
label_unik = sorted(y.unique())

print(f"Akurasi: {akurasi * 100:.2f}%")
print("\nLaporan Klasifikasi:")
# `labels` is passed together with `target_names` so the report rows are
# tied to the full class list; otherwise scikit-learn infers the classes
# from y_test/prediksi and fails when their number differs from
# len(target_names). This also matches the confusion matrix, which already
# uses labels=label_unik.
print(classification_report(y_test, prediksi, labels=label_unik, target_names=label_unik))

# Same report as a dict (one entry per class plus the averages), saved as CSV.
laporan = classification_report(
    y_test, prediksi, labels=label_unik, target_names=label_unik, output_dict=True
)
pd.DataFrame(laporan).transpose().to_csv("data/laporan_evaluasi.csv", encoding="utf-8-sig")

### Visualisasi — Confusion Matrix

In [None]:
# Confusion matrix over the fixed class order in label_unik, drawn as an
# annotated heatmap and saved to gambar/confusion_matrix.png.
cm = confusion_matrix(y_test, prediksi, labels=label_unik)
fig, ax = plt.subplots(figsize=(8, 6))
# fmt="d" prints the cell counts as integers.
sns.heatmap(cm, annot=True, fmt="d", cmap="RdYlGn",
            xticklabels=label_unik, yticklabels=label_unik,
            linewidths=0.5, ax=ax)
ax.set_title("Confusion Matrix - Sentimen Ulasan DANA", fontsize=13, fontweight="bold", pad=15)
# Columns are predicted labels, rows are actual labels.
ax.set_xlabel("Prediksi", fontsize=11)
ax.set_ylabel("Aktual", fontsize=11)
plt.tight_layout()
# Save before plt.show().
plt.savefig("gambar/confusion_matrix.png", dpi=150, bbox_inches="tight")
plt.show()

### Visualisasi — Distribusi Sentimen

In [None]:
# Bar chart (absolute counts) and pie chart (percentages) of the sentiment
# distribution, side by side, saved to gambar/distribusi_sentimen.png.
palet = {"Positif": "#10b981", "Netral": "#f59e0b", "Negatif": "#ef4444"}
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
fig.suptitle("Distribusi Sentimen Ulasan DANA", fontsize=14, fontweight="bold")

# One colour per label in the order of `distribusi`; labels missing from the
# palette fall back to grey.
warna = [palet.get(k, "#6b7280") for k in distribusi.index]
batang = ax1.bar(distribusi.index, distribusi.values, color=warna, edgecolor="white", linewidth=1.5)
ax1.set_title("Jumlah Ulasan per Sentimen")
ax1.set_xlabel("Sentimen")
ax1.set_ylabel("Jumlah")
# Write each bar's height, with thousands separators, centred above the bar.
for p in batang:
    ax1.annotate(f"{p.get_height():,}",
                 (p.get_x() + p.get_width() / 2., p.get_height()),
                 ha="center", va="bottom", fontweight="bold")

# Pie chart of the same counts, using the same colours.
ax2.pie(distribusi.values, labels=distribusi.index, colors=warna,
        autopct="%1.1f%%", startangle=90,
        wedgeprops=dict(edgecolor="white", linewidth=2))
ax2.set_title("Persentase Sentimen")
plt.tight_layout()
plt.savefig("gambar/distribusi_sentimen.png", dpi=150, bbox_inches="tight")
plt.show()

### Visualisasi — Word Cloud per Sentimen

In [None]:
# One word cloud per sentiment class, drawn on three side-by-side axes and
# saved to gambar/wordcloud_sentimen.png.
fig, sumbu = plt.subplots(1, 3, figsize=(18, 5))
fig.suptitle("Word Cloud Ulasan DANA per Sentimen", fontsize=14, fontweight="bold")

# Colormap and target axis for each label.
wc_cfg = {
    "Positif": {"cmap": "Greens", "ax": sumbu[0]},
    "Netral":  {"cmap": "Oranges", "ax": sumbu[1]},
    "Negatif": {"cmap": "Reds",    "ax": sumbu[2]},
}

for label, cfg in wc_cfg.items():
    # Concatenate all cleaned reviews of this class into a single string.
    teks = " ".join(df[df["sentimen"] == label]["teks_bersih"].tolist())
    # Only build a cloud when the class has text; the axis still gets its
    # title and has its frame hidden either way.
    if teks.strip():
        wc = WordCloud(width=500, height=300, background_color="white",
                       colormap=cfg["cmap"], max_words=80,
                       collocations=False).generate(teks)
        cfg["ax"].imshow(wc, interpolation="bilinear")
    cfg["ax"].set_title(f"Sentimen: {label}", fontsize=12, fontweight="bold")
    cfg["ax"].axis("off")

plt.tight_layout()
plt.savefig("gambar/wordcloud_sentimen.png", dpi=150, bbox_inches="tight")
plt.show()

### Simpan Model & Metadata

In [None]:
# Persist the fitted classifier and the fitted vectorizer.
for objek, jalur in (
    (model, "model/model_sentimen.pkl"),
    (vektorizer, "model/vektorizer_tfidf.pkl"),
):
    joblib.dump(objek, jalur)

# Summary of the run that is written next to the model files.
metadata = {
    "akurasi": float(akurasi),
    "jumlah_data": len(df),
    "jumlah_latih": len(X_train),
    "jumlah_uji": len(X_test),
    "jumlah_fitur": int(X_train_tfidf.shape[1]),
    "kelas": label_unik,
    "distribusi": distribusi.to_dict(),
    "algoritma": "SVM (LinearSVC + CalibratedClassifierCV)",
}

# Add precision / recall / F1 for every class present in the report, under
# keys of the form "<metric>_<lowercased label>"; a missing metric becomes 0.
nama_metrik = (("presisi", "precision"), ("recall", "recall"), ("f1", "f1-score"))
for kelas in label_unik:
    if kelas not in laporan:
        continue
    skor = laporan[kelas]
    for awalan, kunci in nama_metrik:
        metadata[f"{awalan}_{kelas.lower()}"] = skor.get(kunci, 0)

with open("model/metadata_model.json", "w", encoding="utf-8") as berkas:
    json.dump(metadata, berkas, ensure_ascii=False, indent=2)

for baris in (
    "Model  -> model/model_sentimen.pkl",
    "Vektor -> model/vektorizer_tfidf.pkl",
    "Meta   -> model/metadata_model.json",
):
    print(baris)
print(f"\nAkurasi akhir: {akurasi * 100:.2f}%")

---
## Ringkasan

| Aspek | Detail |
|-------|--------|
| **Sumber Data** | Ulasan DANA dari Google Play Store |
| **Jumlah Data** | ~2.000 ulasan |
| **Pelabelan** | Berdasarkan rating bintang (4-5: Positif, 3: Netral, 1-2: Negatif) |
| **Pre-processing** | Lowercase, Hapus URL/Mention/Hashtag/Emoji, Hapus Angka & Tanda Baca, Normalisasi Slang, Stopword Removal, Stemming |
| **Feature Engineering** | TF-IDF Vectorizer (maksimal 6.000 fitur, unigram+bigram) |
| **Model** | SVM (LinearSVC + CalibratedClassifierCV) |
| **Deployment** | Streamlit Web App |