In [3]:
import pandas as pd
import spacy
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

# 1. Load data
df = pd.read_csv("POC_Merged_deduplicated_keyword filtered_cleaned_output_single word real.csv")

# 2. Lemmatization + Lowercasing
nlp = spacy.load("id_core_news_sm", disable=["parser","ner"])
def normalize_lemma(text):
    doc = nlp(text)
    return " ".join(tok.lemma_.lower() for tok in doc)

df["target_lemma"] = df["target"].astype(str).apply(normalize_lemma)

# 3. Mapping dengan kamus kecil (bisa diisi manual dari top-aspek)
mapping = {
    "busway": "transjakarta",
    "tj": "transjakarta",
    "transjakarta": "transjakarta",
    "jalur": "rute",
    "jalurnya": "rute",
    "nunggu": "headway",
    "nungguin": "headway",
    "transum": "transjakarta",
    "bus": "armada",
    "jaklingko": "jaklingko",
    "jalurnya": "rute",
    "tarif": "tarif",
    "ac": "ac",
    "angkot": "jaklingko",
    "armada": "armada",
    "rutenya": "rute",
    "transportasi": "transjakarta",
    "macetnya": "lalu lintas",
    "antrian": "antrian",
    "kartu": "pembayaran",
    "petugas": "pramusapa",
    "akses": "aksesibilitas",
    "bis": "armada",
    "saldo": "pembayaran",
    "layanan": "layanan",
    "nungguin": "headway",
    "nunggunya": "headway",
    "transumnya": "transjakarta",
    "supir": "pramudi",
    "aplikasi": "aplikasi",
    "datengnya": "headway",
    "lift": "lift",
    "pt_transjakarta": "transjakarta",
    "ngantri": "antrian",
    "supirnya": "pramudi",
    "tjnya": "armada",
    # tambahkan jika perlu...
}

df["target_mapped"] = df["target_lemma"].map(mapping).fillna(df["target_lemma"])

# 4. (Opsional) Clustering embeddings untuk grouping otomatis
#    Jalankan hanya sekali jika daftar unique target masih besar
unique_targets = df["target_mapped"].unique().tolist()
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
embeds = model.encode(unique_targets)

# Misal kita ingin ~15 cluster:
n_clusters = 15
km = KMeans(n_clusters=n_clusters, random_state=42)
labels = km.fit_predict(embeds)

# Buat DataFrame cluster untuk review manual
cluster_df = pd.DataFrame({
    "target": unique_targets,
    "cluster": labels
})

# Simpan untuk di–inspect: beri label general per cluster secara manual
cluster_df.to_csv("clusters_to_label.csv", index=False)

# Setelah Anda meninjau clusters_to_label.csv dan membuat dict `cluster_label = {cluster_id: "nama_general"...}`
# Anda bisa mapping:
# cluster_label = {0: "halte", 1: "saldo", ...}
# df["target_final"] = df["target_mapped"].map(
#     lambda t: cluster_label[km.predict([model.encode(t)])[0]]
# )

# 5. Simpan hasil
# df.to_csv("predictions_normalized.csv", index=False)


  from .autonotebook import tqdm as notebook_tqdm


OSError: [E050] Can't find model 'id_core_news_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

In [None]:
import pandas as pd
from sklearn.cluster import KMeans
import numpy as np
from gensim.models import KeyedVectors
from collections import defaultdict

# Contoh data (ganti dengan CSV Anda)
data = 'POC_Merged_deduplicated_keyword filtered_cleaned_output_single word real.csv'
df = pd.DataFrame(data)

# 1. LEXICON-BASED NORMALIZATION
normalization_dict = {
    "busway": "transjakarta",
    "tj": "transjakarta",
    "transjakarta": "transjakarta",
    "jalur": "rute",
    "jalurnya": "rute",
    "nunggu": "headway",
    "nungguin": "headway",
    "transum": "transjakarta",
    "bus": "armada",
    "jaklingko": "jaklingko",
    "jalurnya": "rute",
    "tarif": "tarif",
    "ac": "ac",
    "angkot": "jaklingko",
    "armada": "armada",
    "rutenya": "rute",
    "transportasi": "transjakarta",
    "macetnya": "lalu lintas",
    "antrian": "antrian",
    "kartu": "pembayaran",
    "petugas": "pramusapa",
    "akses": "aksesibilitas",
    "bis": "armada",
    "saldo": "pembayaran",
    "layanan": "layanan",
    "nungguin": "headway",
    "nunggunya": "headway",
    "transumnya": "transjakarta",
    "supir": "pramudi",
    "aplikasi": "aplikasi",
    "datengnya": "headway",
    "lift": "lift",
    "pt_transjakarta": "transjakarta",
    "ngantri": "antrian",
    "supirnya": "pramudi",
    "tjnya": "armada",

}

# Fungsi normalisasi lexicon
def lexicon_normalize(term):
    term_lower = term.lower()
    return normalization_dict.get(term_lower, term)

# 2. CLUSTERING UNKNOWN TERMS (Untuk istilah yang belum dinormalisasi)
# Load pretrained Indonesian word embeddings (download dulu dari https://fasttext.cc/docs/en/crawl-vectors.html)
# model = KeyedVectors.load_word2vec_format('cc.id.300.vec', limit=100000)

# Simulasi embedding (gunakan model nyata untuk implementasi)
def get_cluster(term):
    # Contoh: clustering manual berdasarkan contoh data
    if term.lower() in ['tasnya', 'madu']:
        return 'Fasilitas'
    return 'Lainnya'

# Gabungkan kedua metode
def normalize_aspect(term):
    # Coba normalisasi lexicon dulu
    normalized = lexicon_normalize(term)
    if normalized != term:  # Jika sudah dinormalisasi
        return normalized
    # Jika tidak, gunakan clustering
    return get_cluster(term)

# Terapkan ke dataframe
df['normalized_target'] = df['target'].apply(normalize_aspect)

print(df[['target', 'normalized_target']].drop_duplicates())