# Test input auto segmentasi multi aspek


In [None]:
# ==== IMPORTS ====
import os, re, json, pickle
import numpy as np
import pandas as pd

from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel

# ==== PATHS ====
# Root folder holding the trained LDA model and its artefacts.
# NOTE(review): presumably a mounted Google Drive path (Colab) — confirm before running elsewhere.
ROOT        = "/content/drive/MyDrive/SKRIPSI/COBA_LDA/MODEL_LDA_5ASPEK_NEW"
MODEL_DIR   = os.path.join(ROOT, "Model_LDA")
ARTEFAK_DIR = os.path.join(ROOT, "artefak")

# ==== LOAD LDA MODEL & ARTEFACTS ====
# Both objects are used as module-level globals by the helper functions below.
dictionary = Dictionary.load(os.path.join(MODEL_DIR, "dictionary.gensim"))
lda        = LdaModel.load(os.path.join(MODEL_DIR, "lda_model.gensim"))

# The bigram phraser is optional: `bigram` stays None when the pickle file is absent.
bigram = None
bg_path = os.path.join(MODEL_DIR, "bigram_phraser.pkl")
if os.path.exists(bg_path):
    # SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file;
    # only load phraser files that you produced yourself.
    with open(bg_path, "rb") as f:
        bigram = pickle.load(f)
    print("✅ Bigram phraser dimuat.")
else:
    print("⚠️ Bigram phraser TIDAK ditemukan, jalan tanpa bigram.")

# topic → aspect mapping, read from the "topic" and "assigned_aspect" columns
map_xlsx = os.path.join(MODEL_DIR, "mapping_aspek_auto.xlsx")
df_map = pd.read_excel(map_xlsx, engine="openpyxl")
topic2aspect = dict(zip(df_map["topic"], df_map["assigned_aspect"]))

# list of aspects (order matters: it is the tie-break order used by the lookups below)
ASPEK = ["Kemasan","Aroma","Tekstur","Harga","Efek"]

# seed words, one list per aspect, read from seeds.json
seeds_json = os.path.join(ARTEFAK_DIR, "seeds.json")
with open(seeds_json, "r", encoding="utf-8") as f:
    sj = json.load(f)

# Aspects missing from the JSON file get an empty seed set.
SEED_DICT = {aspek: set(sj.get(aspek, [])) for aspek in ASPEK}


def _root_id(token: str) -> str:
    """Return a lightly normalised root form of *token*.

    Defined in this cell because SEED_ROOTS below needs it, while the utility
    cell that also defines `_root_id` only runs later; without this definition a
    fresh top-to-bottom run fails with NameError. Keep both definitions identical.
    """
    t = str(token).lower().strip()
    t = re.sub(r'(ku|mu|nya)$', '', t)  # drop common suffixes
    t = re.sub(r'^([a-z0-9]+)_\1$', r'\1', t)  # merah_merah -> merah
    return t


# Root forms of every seed: each underscore-separated part plus the whole seed.
SEED_ROOTS = {
    aspek: {
        _root_id(part)
        for w in SEED_DICT[aspek]
        for part in (w.split('_') + [w])
    }
    for aspek in ASPEK
}

print("✅ Model, dictionary, mapping, dan seeds dimuat.")

✅ Bigram phraser dimuat.
✅ Model, dictionary, mapping, dan seeds dimuat.


In [None]:
# ==== UTIL PREPROC ====

def _simple_clean(text: str) -> str:
    t = str(text).lower()
    t = t.replace("enggak","gak").replace("nggak","gak")
    return re.sub(r"[^a-z0-9_ ]+", " ", t)

def tokenize_from_val(val):
    """Turn a raw value into a list of tokens, applying the bigram phraser if loaded.

    Args:
        val: A string (cleaned with `_simple_clean` and split on whitespace), a
            list/tuple of tokens (falsy items dropped, the rest converted to
            str), or a missing value (None / float NaN).

    Returns:
        list[str]: The tokens; an empty list for missing values.
    """
    import warnings  # local import: keeps this block self-contained

    # Missing values would otherwise become the literal tokens "none" / "nan".
    if val is None or (isinstance(val, float) and val != val):
        return []

    if isinstance(val, (list, tuple)):
        toks = [str(t) for t in val if t]
    else:
        toks = _simple_clean(val).split()

    if bigram is not None:
        try:
            toks = list(bigram[toks])
        except Exception as exc:
            # Best effort: keep the unigram tokens, but make the failure visible.
            warnings.warn(
                f"Bigram phraser failed ({exc!r}); continuing with unigram tokens.",
                RuntimeWarning,
                stacklevel=2,
            )
    return toks

def bow_of(tokens):
    """Return the bag-of-words of *tokens*, restricted to tokens known to `dictionary`."""
    vocab = dictionary.token2id
    known_tokens = [tok for tok in tokens if tok in vocab]
    return dictionary.doc2bow(known_tokens)

def _root_id(token: str) -> str:
    t = str(token).lower().strip()      # <<< PERBAIKAN PENTING
    t = re.sub(r'(ku|mu|nya)$', '', t)  # hilangkan akhiran umum
    t = re.sub(r'^([a-z0-9]+)_\1$', r'\1', t)  # merah_merah -> merah
    return t

def _expand_for_seed(tokens):
    """Return the set of root forms used for seed matching.

    Tokens containing an underscore contribute each of their parts as well as
    the whole token; every piece is passed through `_root_id`.
    """
    pieces = [
        piece
        for tok in tokens
        for piece in ((*tok.split('_'), tok) if '_' in tok else (tok,))
    ]
    return set(map(_root_id, pieces))

In [None]:
# simple root word for each aspect
# NOTE(review): BASE_ROOT is assigned again further down in this file with
# "kemas" instead of "kemasan" for "Kemasan"; whichever cell ran last wins,
# so results of code reading BASE_ROOT depend on execution order.
BASE_ROOT = {
    "Kemasan": "kemasan",
    "Aroma":   "aroma",
    "Tekstur": "tekstur",
    "Harga":   "harga",
    "Efek":    "efek",
}

def build_aspect_anchors_from_seeds():
    """Derive the anchor words of each aspect from SEED_DICT.

    Every seed is split on underscores (e.g. 'kemasannya_lucu') and each part
    is reduced with `_root_id`. A root becomes an anchor when it contains
    BASE_ROOT[aspect] as a substring, so the result is the small subset of
    seed words that explicitly carry the aspect label.

    Returns:
        dict: aspect name -> set of anchor roots (empty set when nothing matches).
    """
    anchors = {}
    for aspek in ASPEK:
        base = BASE_ROOT[aspek]
        roots = (
            _root_id(piece)
            for seed in SEED_DICT[aspek]
            for piece in seed.split('_')
        )
        anchors[aspek] = {root for root in roots if base in root}
    return anchors


ASPECT_ANCHORS = build_aspect_anchors_from_seeds()
print("ASPECT_ANCHORS:", ASPECT_ANCHORS)

ASPECT_ANCHORS: {'Kemasan': {'kemasan'}, 'Aroma': {'aroma', 'aromatik'}, 'Tekstur': {'teksture', 'tekstur', 'bertekstur', 'teksturny'}, 'Harga': {'harga'}, 'Efek': {'ngefek', 'efektif', 'efek'}}


In [None]:
# ==== BOOSTING & PREDIKSI ====

def predict_aspect_boosted(
    tokens,
    lambda_boost=0.9,   # how strongly seed hits influence the score
    gamma=2.0,          # non-linear gain
    seed_bonus=0.03,    # small additive bonus per seed hit
    dampen_price_if_no_seed=True,
    price_delta=0.7,    # scale "Harga" down when it has no seed but another aspect does
    prefer_seed_for_top1=True
):
    """Score the aspects of one token list using LDA topics plus seed-word boosting.

    Args:
        tokens: Token list of the text/segment to classify.
        lambda_boost: Weight of the seed-hit count in the multiplicative boost.
        gamma: Exponent applied to the boost factor.
        seed_bonus: Additive bonus per seed hit.
        dampen_price_if_no_seed: If true, multiply the "Harga" score by
            `price_delta` when "Harga" has no seed hit but some other aspect has.
        price_delta: Damping factor used by the rule above.
        prefer_seed_for_top1: If true and any aspect has a seed hit, the final
            label is chosen only among aspects with seed hits.

    Returns:
        tuple: (p_aspek, seed_hits, p_boost, aspect_final, aspect_top1_plain)
            - p_aspek: raw topic probability aggregated per aspect. Topics
              without a mapping are kept under the key "T<k>" (and labels
              outside ASPEK under their own key) instead of raising KeyError.
            - seed_hits: number of distinct seed roots found, per aspect.
            - p_boost: boosted and normalised score per aspect in ASPEK.
            - aspect_final: chosen label (seed-preferring when enabled).
            - aspect_top1_plain: argmax of p_boost without the seed preference.
    """
    # 1) LDA distribution per topic -> [(k, p)]
    bow = bow_of(tokens)
    dist_pairs = lda.get_document_topics(bow, minimum_probability=0.0)

    # 2) Aggregate topics -> aspects (RAW)
    p_aspek = {a: 0.0 for a in ASPEK}
    for k, p in dist_pairs:
        a = topic2aspect.get(k, f"T{k}")
        # .get() so that an unmapped topic ("T<k>") or a label outside ASPEK
        # is accumulated under its own key rather than crashing.
        p_aspek[a] = p_aspek.get(a, 0.0) + p

    # 3) Seed hits: match on the tokens as given and on their re-cleaned form
    toks_for_seed = _expand_for_seed(tokens) | _expand_for_seed(_simple_clean(" ".join(tokens)).split())
    seed_hits = {
        a: len({_root_id(w) for w in SEED_DICT[a]} & toks_for_seed)
        for a in ASPEK
    }

    # 4) Boost every aspect by its seed hits (non-linear) + small additive bonus
    p_boost = {
        a: p_aspek[a] * (1.0 + lambda_boost * seed_hits[a])**gamma
        for a in ASPEK
    }
    for a, h in seed_hits.items():
        if h >= 1:
            p_boost[a] += seed_bonus * h

    # 5) Special gating for "Harga"
    if dampen_price_if_no_seed and seed_hits["Harga"] == 0 and max(seed_hits.values()) > 0:
        p_boost["Harga"] *= price_delta

    # 6) Normalise (fall back to 1.0 so an all-zero score vector does not divide by zero)
    Z = sum(p_boost.values()) or 1.0
    p_boost = {a: v / Z for a, v in p_boost.items()}

    # (optional) plain top-1 without the seed preference
    aspect_top1_plain = max(p_boost, key=p_boost.get)

    # 7) Final label (prefer aspects that got a seed hit, if any)
    seeded_aspects = [a for a, h in seed_hits.items() if h > 0]
    if prefer_seed_for_top1 and seeded_aspects:
        aspect_final = max(seeded_aspects, key=lambda a: p_boost[a])
    else:
        aspect_final = aspect_top1_plain

    return p_aspek, seed_hits, p_boost, aspect_final, aspect_top1_plain

def select_multi_smart(p_boost, seed_hits, thr=0.35, max_k=2):
    """Pick multiple aspects: score >= thr or at least one seed hit; at most max_k.

    Returns a list of (aspect, score) pairs ordered by descending score, with
    each score rounded to four decimals.
    """
    seeded = {aspect for aspect, hits in seed_hits.items() if hits >= 1}
    ranked = sorted(p_boost.items(), key=lambda pair: pair[1], reverse=True)

    chosen = []
    for aspect, score in ranked:
        if score >= thr or aspect in seeded:
            chosen.append((aspect, score))
    chosen = chosen[:max_k]

    return [(aspect, float(f"{score:.4f}")) for aspect, score in chosen]

In [None]:
# words that must NOT trigger an aspect split
# NOTE: detect_aspect_from_token cleans tokens with _simple_clean first, which
# rewrites "nggak"/"enggak" to "gak", so those two entries can never match there.
SEGMENT_STOPWORDS = {
    "tidak", "gak", "nggak", "enggak", "ga",
    "banget", "aja", "sih", "dong", "kok",
    "dan", "atau", "yang", "itu", "ini",
    # add common words that frequently occur across aspects:
    "enak", "dipake", "pake", "nyaman"
}

def detect_aspect_from_token(tok: str):
    """Return the aspect a single token points to, or None (used for segmentation).

    The token is cleaned and reduced to its root, then looked up in SEED_ROOTS
    in ASPEK order. Empty roots and common/negation words listed in
    SEGMENT_STOPWORDS never map to an aspect.
    """
    root = _root_id(_simple_clean(tok)).strip()
    if root and root not in SEGMENT_STOPWORDS:
        # first aspect (in ASPEK order) whose seed roots contain this root
        return next((a for a in ASPEK if root in SEED_ROOTS[a]), None)
    return None

In [None]:
# simple root word for each aspect
# NOTE(review): this reassigns the BASE_ROOT defined earlier in the file; the
# only difference is "Kemasan" -> "kemas" (earlier: "kemasan"). ASPECT_ANCHORS
# was already built from the earlier value when this cell runs in file order.
BASE_ROOT = {
    "Kemasan": "kemas",
    "Aroma":   "aroma",
    "Tekstur": "tekstur",
    "Harga":   "harga",
    "Efek":    "efek",
}

def split_into_sentences(text: str):
    """Split *text* into sentences on '.', '!' and '?'.

    Compared with a plain split on every terminator character:
    - a run of terminators ("!!!", "...", "?!") ends one sentence instead of
      producing punctuation-only sentences;
    - a single '.' between two digits ("25.000", "3.5") is treated as part of
      the number, not as a sentence end;
    - punctuation-only fragments are attached to a neighbouring sentence, so
      no text is dropped.

    Args:
        text: Input text; non-string values are converted with str().

    Returns:
        list[str]: Stripped, non-empty sentences with their terminators kept.
    """
    if not isinstance(text, str):
        text = str(text)

    def _has_word(chunk):
        return any(ch.isalnum() for ch in chunk)

    sents = []
    start = 0
    for m in re.finditer(r'[.!?]+', text):
        is_number_separator = (
            m.group() == "."
            and 0 < m.start()
            and m.end() < len(text)
            and text[m.start() - 1].isdigit()
            and text[m.end()].isdigit()
        )
        if is_number_separator:
            continue

        chunk = text[start:m.end()].strip()
        if not _has_word(chunk):
            # Only punctuation so far: let it ride along with the next sentence.
            continue
        sents.append(chunk)
        start = m.end()

    tail = text[start:].strip()
    if tail:
        if sents and not _has_word(tail):
            # Trailing punctuation belongs to the last real sentence.
            sents[-1] = f"{sents[-1]} {tail}"
        else:
            sents.append(tail)

    return sents


def segment_text_for_aspect(text):
    """Segment *text* around aspect ANCHOR tokens.

    - The text is first split into sentences; each sentence is handled alone.
    - A token is an anchor when its cleaned root contains one of the BASE_ROOT
      values ('kemas', 'aroma', 'tekstur', 'harga', 'efek'); the first matching
      aspect in ASPEK order wins.
    - Consecutive anchors of the same aspect count as one.
    - Each anchor opens a new segment that runs up to the next anchor; whatever
      precedes the first anchor is part of the first segment.
    - A sentence without any anchor becomes a single segment.
    """
    segments = []

    for sent in split_into_sentences(text):
        tokens = sent.split()
        if not tokens:
            continue

        # Collect (position, aspect) anchors, collapsing same-aspect neighbours on the fly.
        anchors = []
        for idx, tok in enumerate(tokens):
            root = _root_id(_simple_clean(tok))
            aspect = next((a for a in ASPEK if BASE_ROOT[a] in root), None)
            if aspect is None:
                continue
            if anchors and anchors[-1][1] == aspect:
                continue
            anchors.append((idx, aspect))

        if not anchors:
            # no aspect anchor in this sentence → keep it whole
            segments.append(sent.strip())
            continue

        # Segment boundaries: the first segment starts at token 0, the others
        # at their anchor; each ends where the next anchor begins.
        later_positions = [pos for pos, _ in anchors[1:]]
        starts = [0] + later_positions
        ends = later_positions + [len(tokens)]

        for start, end in zip(starts, ends):
            seg_text = " ".join(tokens[start:end]).strip(" ,")
            if seg_text:
                segments.append(seg_text)

    return segments

In [None]:
def _classify_segment(seg_text, predict_kwargs):
    """Tokenize one segment, run predict_aspect_boosted on it, and return its record."""
    tokens = tokenize_from_val(seg_text)
    _, hits, p_boost, aspect_final, _ = predict_aspect_boosted(tokens, **predict_kwargs)
    return {
        "seg_text": seg_text,
        "tokens": tokens,
        "p_boost": p_boost,
        "seed_hits": hits,
        "aspect_final": aspect_final,
        "aspect_prob_final": p_boost[aspect_final],
    }


def _count_head_tokens(tokens, prev_aspect, max_head_tokens):
    """Count leading tokens that still fit *prev_aspect*; also report whether one names it."""
    head_len = 0
    seen_prev_aspect = False
    for tok in tokens:
        asp_tok = detect_aspect_from_token(tok)
        if asp_tok is not None and asp_tok != prev_aspect:
            break  # first token of a different aspect ends the head
        head_len += 1
        if asp_tok == prev_aspect:
            seen_prev_aspect = True
        if head_len >= max_head_tokens:
            break
    return head_len, seen_prev_aspect


def test_segmented_text(text,
                        lambda_boost=0.9,
                        gamma=2.0,
                        seed_bonus=0.03,
                        dampen_price_if_no_seed=True,
                        price_delta=0.7,
                        prefer_seed_for_top1=True,
                        max_head_tokens=4):
    """Segment *text*, classify each segment's aspect, print and return the result.

    Steps:
    1) Initial segmentation with segment_text_for_aspect(text).
    2) Aspect classification of every segment.
    3) Head repair: if the first tokens of segment i still belong to the final
       aspect of segment i-1 (each token maps to no aspect or to that aspect,
       and at least one maps to that aspect), that head — at most
       `max_head_tokens` tokens — is moved to the end of segment i-1.
    4) Consecutive segments with the same aspect are merged.

    Args:
        text: Text to analyse.
        lambda_boost, gamma, seed_bonus, dampen_price_if_no_seed, price_delta,
        prefer_seed_for_top1: Passed unchanged to predict_aspect_boosted.
        max_head_tokens: Upper bound on the number of tokens moved in step 3.

    Returns:
        list[dict]: One dict per final segment with the keys "seg_index"
        (1-based), "seg_text", "p_boost", "seed_hits", "aspect_final" and
        "aspect_prob_final".
    """
    predict_kwargs = dict(
        lambda_boost=lambda_boost,
        gamma=gamma,
        seed_bonus=seed_bonus,
        dampen_price_if_no_seed=dampen_price_if_no_seed,
        price_delta=price_delta,
        prefer_seed_for_top1=prefer_seed_for_top1,
    )

    # ===== 0-1) Initial segmentation + classification =====
    raw = [_classify_segment(seg, predict_kwargs) for seg in segment_text_for_aspect(text)]

    # ===== 2) Repair segment heads (head tokens) =====
    adjusted = []
    for item in raw:
        if not adjusted:
            # the first segment has no predecessor to hand tokens to
            adjusted.append(item)
            continue

        toks = item["tokens"]
        if not toks:
            # a later segment that yields no tokens is dropped from the result
            continue

        prev = adjusted[-1]
        head_len, seen_prev_aspect = _count_head_tokens(
            toks, prev["aspect_final"], max_head_tokens
        )

        # Nothing to move, or the head would swallow the whole segment.
        if not seen_prev_aspect or head_len >= len(toks):
            adjusted.append(item)
            continue

        # NOTE: the moved head and the remaining tail are rebuilt from the
        # processed tokens (output of tokenize_from_val), not from the
        # original wording of the segment.
        moved_text = " ".join(toks[:head_len])
        tail_text = " ".join(toks[head_len:]).strip()

        # previous segment absorbs the head and is re-classified
        new_prev_text = prev["seg_text"].rstrip(" ,") + " " + moved_text
        adjusted[-1] = _classify_segment(new_prev_text, predict_kwargs)

        # current segment = tail (if any)
        if tail_text:
            adjusted.append(_classify_segment(tail_text, predict_kwargs))

    # ===== 3) MERGE consecutive segments with the same aspect =====
    merged = []
    for item in adjusted:
        if merged and item["aspect_final"] == merged[-1]["aspect_final"]:
            combined_text = (
                merged[-1]["seg_text"].rstrip(" ,") + " " + item["seg_text"].lstrip(" ,")
            )
            merged[-1] = _classify_segment(combined_text, predict_kwargs)
        else:
            merged.append(item)

    # ===== 4) PRINT RESULTS =====
    print(f'TEKS UTUH: "{text}"\n')

    print("SEGMENTASI TEKS:")
    for i, r in enumerate(merged, start=1):
        print(f"  Seg {i}: {r['seg_text']}")
    print()

    print("PROBABILITAS ASPEK FINAL PER SEGMEN:")
    for i, r in enumerate(merged, start=1):
        print(f"  Seg {i}: {r['aspect_final']} ({r['aspect_prob_final']:.4f})")

    print("\nASPEK FINAL PER SEGMEN:")
    for i, r in enumerate(merged, start=1):
        print(f"  Seg {i}: {r['aspect_final']}")

    return [
        {
            "seg_index": i,
            "seg_text": r["seg_text"],
            "p_boost": r["p_boost"],
            "seed_hits": r["seed_hits"],
            "aspect_final": r["aspect_final"],
            "aspect_prob_final": r["aspect_prob_final"],
        }
        for i, r in enumerate(merged, start=1)
    ]

In [None]:
# ==== EXAMPLE TEST CALL ====
# Runs the full pipeline on one review that mentions all five aspects; the
# returned per-segment records are kept in `hasil`.
# NOTE(review): the literal appears to begin with invisible zero-width
# characters and ends with a trailing space — presumably copy-paste residue; confirm.
hasil = test_segmented_text(
    "​​Kemasannya tube kecil dan tutupnya rapet jadi aman kalo masuk tas, aromanya wangi segar kayak citrus tapi ga berlebihan, teksturnya creamy tapi pas dioles berasa ringan dan ga lengket, harganya sih di tengah-tengah ga murah banget tapi ga mahal juga, dan efeknya di aku bikin kulit lebih cerah merata dan berasa lebih halus. "
)

TEKS UTUH: "​​Kemasannya tube kecil dan tutupnya rapet jadi aman kalo masuk tas, aromanya wangi segar kayak citrus tapi ga berlebihan, teksturnya creamy tapi pas dioles berasa ringan dan ga lengket, harganya sih di tengah-tengah ga murah banget tapi ga mahal juga, dan efeknya di aku bikin kulit lebih cerah merata dan berasa lebih halus. "

SEGMENTASI TEKS:
  Seg 1: ​​Kemasannya tube kecil dan tutupnya rapet jadi aman kalo masuk tas
  Seg 2: aromanya wangi segar kayak citrus tapi ga berlebihan
  Seg 3: teksturnya creamy tapi pas dioles berasa ringan dan ga lengket
  Seg 4: harganya sih di tengah-tengah ga murah banget tapi ga mahal juga, dan
  Seg 5: efeknya di aku bikin kulit lebih cerah merata dan berasa lebih halus.

PROBABILITAS ASPEK FINAL PER SEGMEN:
  Seg 1: Kemasan (0.8961)
  Seg 2: Aroma (0.9613)
  Seg 3: Tekstur (0.7995)
  Seg 4: Harga (0.9584)
  Seg 5: Efek (0.9832)

ASPEK FINAL PER SEGMEN:
  Seg 1: Kemasan
  Seg 2: Aroma
  Seg 3: Tekstur
  Seg 4: Harga
  Seg 5: Efek
