In [None]:
import ast
# Import json untuk parsing output LLM
import json
import os
from random import randint
from time import sleep

import pandas as pd
from langchain_ollama.llms import OllamaLLM
from neo4j import GraphDatabase

In [None]:
# 0. Configuration
# -----------------------------------------------------------------------------
# Define Host & Port
# NOTE(review): Ollama's usual default port is 11434 — confirm 11435 is intentional.
OLLAMA_HOST = "127.0.0.1"
OLLAMA_PORT = 11435

# CONFIG IMPORTANT PARAMETERS
# Model tag passed to Ollama; it is also echoed in the status messages below.
MODEL_LLAMA = "deepseek-r1:8b"

# Sampling temperature handed to the model; a non-zero value means the
# generalization output can differ between runs.
TEMPERATURE = 0.75

# Define Model
# The client is created and probed inside one try block: any failure is only
# printed, not re-raised. If the constructor itself raises, `llm` stays undefined
# and the later call to llm.invoke() in check_overclaim will fail.
try:
    llm = OllamaLLM(
        model=MODEL_LLAMA,
        base_url=f"http://{OLLAMA_HOST}:{OLLAMA_PORT}",
        verbose=False,
        temperature=TEMPERATURE
    )
    # Test connection
    # A throwaway prompt whose reply is discarded; it only proves the server and model respond.
    llm.invoke("Hello")
    print(f"--- Koneksi Ollama ke model '{MODEL_LLAMA}' berhasil ---")
except Exception as e:
    print(f"--- Gagal terhubung ke Ollama ---")
    print(f"Error: {e}")
    print(f"Pastikan Ollama berjalan di {OLLAMA_HOST}:{OLLAMA_PORT} dan model '{MODEL_LLAMA}' tersedia.")

--- Koneksi Ollama ke model 'deepseek-r1:8b' berhasil ---


In [5]:
# 1. Read the CSV and turn the stringified 'menu' / 'embedding' cells into lists
# -----------------------------------------------------------------------------
# The leftover 'Unnamed: 0' column (a saved index) is dropped right after reading.
df = pd.read_csv('wave_emb_dbmcv2.csv').drop(columns=['Unnamed: 0'])

# Report the cell types before parsing (both columns arrive as plain strings).
print(f"Tipe data nilai pada kolom 'menu' sebelum konversi: {type(df.loc[0, 'menu'])}\nTipe data nilai pada kolom 'embedding' sebelum konversi: {type(df.loc[0, 'embedding'])}")

# ast.literal_eval safely evaluates each cell's Python-literal text.
df = df.assign(
    menu=df['menu'].apply(ast.literal_eval),
    embedding=df['embedding'].apply(ast.literal_eval),
)

# Report the cell types again to confirm the conversion.
print(f"Tipe data nilai pada kolom 'menu' setelah konversi: {type(df.loc[0, 'menu'])}\nTipe data nilai pada kolom 'embedding' setelah konversi: {type(df.loc[0, 'embedding'])}")
df

Tipe data nilai pada kolom 'menu' sebelum konversi: <class 'str'>
Tipe data nilai pada kolom 'embedding' sebelum konversi: <class 'str'>
Tipe data nilai pada kolom 'menu' setelah konversi: <class 'list'>
Tipe data nilai pada kolom 'embedding' setelah konversi: <class 'list'>


Unnamed: 0,restaurant,menu,city,alt_resto,embedding
0,BP Kerupuk Sayur,"[Kerupuk Labu BP Kerupuk Sayur, Kerupuk Wortel...",Surabaya,['Dapur Fizzul'],"[[-0.02187000773847103, 0.01127985492348671, -..."
1,Bumbu Rawon,"[Bumbu Rawon UKG (Usaha Kerta Gemilang), Bumbu...",Kabupaten Gresik,['Rawon Balungan H. Mufid'],"[[0.01963287591934204, 0.056399986147880554, 0..."
2,JAMU BU SOLIKAH,"[JAMU KUNIR SURUH, JAMU KUNIR, JAMU BERAS KENC...",Malang,['Sambat Luwe'],"[[0.009273191913962364, 0.03742887079715729, -..."
3,RANIS KDS,[KRUPUK BAWANG RANIS KDS ],Malang,['KDS Cantonese Restaurant'],"[[-0.08321938663721085, 0.05083480849862099, 0..."
4,Winarti Snack,[Kacang Goreng Winarti Snack],Surabaya,['Nathania Snack House'],"[[-0.09416132420301437, 0.10255325585603714, 0..."
...,...,...,...,...,...
2827,virinda food,"[Rolade, Sempol]",Malang,['Javanine Resto'],"[[-0.024130970239639282, 0.0011671549873426557..."
2828,wouwcake,"[risoles wouwcake, kue tar wouwcake, kroket ke...",Surabaya,['Warung wong lue (WWL)'],"[[-0.03364633023738861, 0.07875539362430573, -..."
2829,yuk tri,"[kue nastar, kue kastengel, kue putri salju]",Malang,['Trimurti Resto'],"[[-0.010088230483233929, -0.05614767223596573,..."
2830,zara,[stik keju zara],Malang,['Lafayette Coffee & Eatery'],"[[0.004909013397991657, -0.049723122268915176,..."


In [None]:
# 2. Neo4j connection & similarity helpers
# -----------------------------------------------------------------------------
# Connection settings are read from the environment so real credentials do not
# have to be committed with the notebook. The fallbacks reproduce the previous
# hard-coded local development setup, so existing runs keep working unchanged.
drv = GraphDatabase.driver(
    os.environ.get("NEO4J_URI", "bolt://localhost:7687"),
    auth=(
        os.environ.get("NEO4J_USER", "neo4j"),
        os.environ.get("NEO4J_PASSWORD", "password"),
    ),
)

def find_similar_restaurants_by_embedding(alts_embs: list[list[float]], threshold=0.6, topk=2):
    """Look up restaurant ids whose stored embedding is close to any given embedding.

    For every embedding in ``alts_embs`` the 'restaurant_emb_dbmcv2' vector index is
    queried for the ``topk`` nearest nodes. Nodes scoring at least ``threshold`` are
    kept, together with the ids of restaurants linked to them through a
    SIMILAR_TO_DBMCV2 relationship whose score is also at least ``threshold``.

    Args:
        alts_embs: Embedding vectors to search with, one query per vector.
        threshold: Minimum score for both the vector hit and the linked relationship.
        topk: Number of nearest neighbours requested from the index per vector.

    Returns:
        The unique ``r_id`` values in first-seen order. The order is stable across
        runs for the same query results, so callers that break ties by position
        behave reproducibly (a plain ``set`` would yield an arbitrary order).
    """
    # Nothing to search for: avoid opening a session at all.
    if not alts_embs:
        return []

    # A dict is used as an insertion-ordered set.
    seen_ids: dict = {}
    with drv.session() as ses:
        for emb in alts_embs:
            # The query returns each matching node's 'r_id' plus the ids linked to it.
            rows = ses.run("""
                CALL db.index.vector.queryNodes('restaurant_emb_dbmcv2', $topk, $emb)
                YIELD node, score
                WHERE score >= $threshold
                OPTIONAL MATCH (other:Restaurant)-[s:SIMILAR_TO_DBMCV2]->(node)
                    WHERE s.score >= $threshold
                RETURN DISTINCT node.r_id AS id, collect(other.r_id) AS linked_ids
                """, emb=emb, threshold=threshold, topk=topk)
            for r in rows:
                if r["id"] is not None:
                    seen_ids.setdefault(r["id"], None)
                for linked_id in r["linked_ids"]:
                    seen_ids.setdefault(linked_id, None)
    return list(seen_ids)

# jaccard works on two lists of strings
def jaccard(list1: list[str], list2: list[str]) -> float:
    """Return the Jaccard similarity of two string lists.

    Items are compared after lower-casing and stripping surrounding whitespace,
    and duplicates inside a list count once. Two empty inputs give 0.0.
    """
    def _normalize(items: list[str]) -> set[str]:
        return set(map(lambda text: text.lower().strip(), items))

    left = _normalize(list1)
    right = _normalize(list2)

    combined = left | right
    if len(combined) == 0:
        return 0.0

    shared = left & right
    return len(shared) / len(combined)

In [None]:
# 3. Overclaim checker
# -----------------------------------------------------------------------------
def _build_generalization_prompt(prompt_list1: list[str], prompt_list2: list[str]) -> str:
    """Return the few-shot prompt asking the LLM to generalize both menu lists to core food categories."""
    return f"""
    Anda adalah asisten cerdas untuk generalisasi menu. Saya memiliki dua list menu:

    List 1: {prompt_list1}

    List 2: {prompt_list2}

    Tugas Anda:
    1. Anggap setiap list sebagai set (abaikan duplikat internal).
    2. Generalisasikan setiap item dalam set ke kategori makanan intinya menggunakan pengetahuan makanan Anda.
    3. Kembalikan hasil dalam Bahasa Indonesia dan format JSON yang ketat dengan kunci "gen_menu" (untuk List 1) dan "gen_comp_menu" (untuk List 2).

    Gunakan contoh-contoh berikut sebagai panduan ketat untuk generalisasi:
    
    Contoh Input A:
    Daftar 1: ["Bolu Pisang", "Brownies Kukus Besar", "Brownies Chocochip", "Gurame"]
    Daftar 2: ["Bolu Gulung Durian", "Patin", "Es Teh Manis", "Lele Goreng"]
    
    Contoh Output JSON A:
    {{
      "gen_menu": ["bolu", "brownies", "ikan"],
      "gen_comp_menu": ["bolu", "ikan", "teh"]
    }}

    Contoh Input B:
    Daftar 1: ["Ayam Goreng", "Ayam Bakar", "Brownies", "Kopi Hitam Hangat", "Pisang Goreng"]
    Daftar 2: ["Fried Chicken", "Fried Duck", "Bika Ambon", "Caramel Latte", "Cappuccino", "Gurame"]
    
    Contoh Output JSON B:
    {{
      "gen_menu": ["ayam", "kue", "kopi", "pisang"],
      "gen_comp_menu": ["ayam", "bebek", "kue", "kopi", "ikan"]
    }}

    Contoh Input C:
    Daftar 1: ["Fish Tofu", "Ramen"]
    Daftar 2: ["Tahu Kuning", "Tempe", "Indomie Telor"]
    
    Contoh Output JSON C:
    {{
      "gen_menu": ["tahu", "mie"],
      "gen_comp_menu": ["tahu", "tempe", "mie"]
    }}

    Contoh Input D:
    Daftar 1: ["Nasi Uduk", "Cumi-cumi Asam Manis", "Udang", "Telor Balado"]
    Daftar 2: ["Seafood Fried Rice", "Cumi Goreng Tepung", "Lobster", "Omelette"]
    
    Contoh Output JSON D:
    {{
      "gen_menu": ["nasi", "cumi", "udang", "telur"],
      "gen_comp_menu": ["nasi", "cumi", "lobster", "telur"]
    }}

    Contoh Input E:
    Daftar 1: ["Nasi Putih", "Nasi Uduk", "Lalapan", "Sambel Merah", "Sambel Ijo"]
    Daftar 2: ["Nasi Kuning", "Ayam Goreng", "Bebek Goreng", "Rendang Sapi", "Rawon Sapi", "Tengkleng", "Sate Kambing", "Sambal"]
    
    Contoh Output JSON E:
    {{
      "gen_menu": ["nasi", "lalapan", "sambal"],
      "gen_comp_menu": ["nasi", "ayam", "bebek", "sapi", "kambing", "sambal"]
    }}
    
    Output JSON Anda:
    """


def _parse_generalization_response(response: str) -> dict:
    """Extract the first JSON object from a raw LLM reply.

    Tolerates a reasoning preamble closed by ``</think>``, Markdown code fences
    (with or without a language tag) and any text after the object.

    Raises:
        ValueError: If the reply contains no ``{`` or the object is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    text = response.strip()
    # Reasoning models may emit their chain of thought first; keep only what follows it.
    if "</think>" in text:
        text = text.rsplit("</think>", 1)[1]
    start = text.find("{")
    if start == -1:
        raise ValueError("no JSON object found in LLM response")
    # raw_decode parses one JSON value and ignores whatever comes after it
    # (closing fences, explanations), unlike json.loads.
    data, _ = json.JSONDecoder().raw_decode(text[start:])
    return data


def _require_string_list(value: object, key: str) -> list[str]:
    """Return ``value`` unchanged if it is a list of strings, otherwise raise ValueError."""
    if not isinstance(value, list) or not all(isinstance(item, str) for item in value):
        raise ValueError(f"'{key}' is not a list of strings")
    return value


def check_overclaim(
    menu_names: list[str],
    alts_embs: list[list[float]],
    overclaim_threshold: float = 0.3,
    max_attempts: int = 10,
):
    """Compare a restaurant's claimed menu against its most similar known restaurant.

    Steps:
      1. Find candidate restaurant ids by embedding similarity, retrying while the
         lookup comes back empty.
      2. Fetch each candidate's name and actual menu from Neo4j.
      3. Pick the candidate whose raw menu has the highest Jaccard overlap with
         ``menu_names`` (candidates with zero overlap are never selected).
      4. Ask the LLM once to generalize both menus to core food categories; on any
         failure the raw menus are used instead.
      5. Compute the final Jaccard score on the (generalized) menus.

    Args:
        menu_names: Menu item names claimed by the restaurant under evaluation.
        alts_embs: Embedding vectors used to look up similar restaurants.
        overclaim_threshold: Final scores strictly below this are reported as issues.
        max_attempts: Maximum number of similarity lookups before giving up.

    Returns:
        A 6-tuple ``(issues, final_score, candidate_name, candidate_menu,
        gen_menu_names, gen_actual_menus)``. When no usable candidate exists the
        tuple is ``([], 0.0, None, [], [], [])``.
    """
    # 1) Look up candidate ids; an empty result is retried after a short random pause.
    rids = []
    for attempt in range(1, max_attempts + 1):
        rids = find_similar_restaurants_by_embedding(alts_embs)
        print(f"Number of attempts to find similar restaurants by emb: {attempt}")
        if rids:
            break
        # No point sleeping once the last attempt has already failed.
        if attempt < max_attempts:
            sleep(randint(1, 2))

    # No candidates found at all: leave early.
    if not rids:
        return [], 0.0, None, [], [], []

    # 2) Collect every candidate and its menu from Neo4j first.
    candidates = []
    with drv.session() as ses:
        for rid in rids:
            result = ses.run(
                """
                MATCH (r:Restaurant {r_id:$rid})
                OPTIONAL MATCH (r)-[:HAS_PRODUCT]->(m:Menu)
                RETURN r.restaurant AS restaurant_name, collect(m.menu) AS actual_menus
                """, rid=rid
            )
            rec = result.single()
            if rec and rec["restaurant_name"]:
                candidates.append({
                    "id": rid,
                    "name": rec["restaurant_name"],
                    "menu": rec["actual_menus"]
                })

    # The ids resolved to no valid restaurant: leave early.
    if not candidates:
        return [], 0.0, None, [], [], []

    # 3) Choose the best candidate with the cheap raw-menu Jaccard score.
    #    Strict ">" means the earliest candidate wins ties and zero overlap never wins.
    best_candidate = None
    max_jaccard_score = 0.0
    for cand in candidates:
        score = jaccard(cand["menu"], menu_names)
        if score > max_jaccard_score:
            max_jaccard_score = score
            best_candidate = cand

    # No candidate shares a single menu item.
    if not best_candidate:
        return [], 0.0, None, [], [], []

    # 4) One LLM call to generalize both menus; the raw menus are the fallback.
    gen_menu_names = menu_names
    gen_actual_menus = best_candidate["menu"]

    # dict.fromkeys de-duplicates while keeping order, so the prompt is identical
    # from run to run for the same inputs.
    prompt_template_generalize = _build_generalization_prompt(
        list(dict.fromkeys(menu_names)),
        list(dict.fromkeys(best_candidate["menu"])),
    )

    print("\nMeminta LLM (Ollama) untuk menggeneralisasi menu secara semantik (hanya sekali)...")
    try:
        # OllamaLLM (a base LLM) returns a plain string.
        data = _parse_generalization_response(llm.invoke(prompt_template_generalize))

        # Validate both lists before committing either, so a half-valid reply
        # cannot leave one menu generalized and the other raw.
        parsed_menu = _require_string_list(data.get("gen_menu", menu_names), "gen_menu")
        parsed_comp_menu = _require_string_list(
            data.get("gen_comp_menu", best_candidate["menu"]), "gen_comp_menu"
        )
        gen_menu_names, gen_actual_menus = parsed_menu, parsed_comp_menu
        print("‚úÖ Generalisasi LLM (Ollama) berhasil.")

    except Exception as e:
        # Deliberate best effort: any LLM or parsing failure falls back to the raw menus.
        print(f"‚ö†Ô∏è Gagal menggeneralisasi menu dari LLM (Ollama): {e}. Menggunakan menu asli untuk Jaccard.")

    # 5) Final Jaccard score on the generalized (or fallback) menus.
    final_score = jaccard(gen_actual_menus, gen_menu_names)

    issues = []
    if final_score < overclaim_threshold:
        issues.append({
            "restaurant_id": best_candidate["id"],
            "restaurant_name": best_candidate["name"],
            "jaccard_score": final_score
        })

    return issues, final_score, best_candidate["name"], best_candidate["menu"], gen_menu_names, gen_actual_menus

# evaluate_row returns a tuple that includes the generalized menus
def evaluate_row(menus: list[str], emb: list[list[float]]) -> tuple[str, float, str | None, list[str], list[str], list[str]]:
    """
    Evaluate one data row and label it from the final Jaccard score.

    Delegates to check_overclaim and returns
    (status, score, comparison name, comparison menu, generalized menu,
    generalized comparison menu); status is "overclaim" for scores below 0.3,
    otherwise "aman".
    """
    # The issues list from check_overclaim is not needed here.
    _, score, comp_name, comp_menu, gen_menu, gen_comp_menu = check_overclaim(menus, emb)

    if score < 0.3:
        status = "overclaim"
    else:
        status = "aman"

    return status, score, comp_name, comp_menu, gen_menu, gen_comp_menu

In [None]:
# 4. Evaluate every row of the "df" DataFrame
# -----------------------------------------------------------------------------
hasil = []
# Each iteration yields the score, the comparison data and the generalized menus.
for idx, row in df.iterrows():
    status, score, comp_restaurant, comp_menu, gen_menu, gen_comp_menu = evaluate_row(
        row["menu"], row["embedding"]
    )

    # Rows without both a comparison restaurant and a comparison menu are skipped.
    if not (comp_restaurant and comp_menu):
        print(f"Data perbandingan kosong, baris {idx+1} dilewati ‚ùå Total hasil sekarang: {len(hasil)}.")
        continue

    hasil.append(dict(
        index=idx,
        restaurant=row["restaurant"],
        menu=row["menu"],
        city=row["city"],
        comp_restaurant=comp_restaurant,
        comp_menu=comp_menu,
        # Generalized menu columns
        gen_menu=gen_menu,
        gen_comp_menu=gen_comp_menu,
        score=score,
        status=status,
    ))
    print(f"Data valid ditemukan dan ditambahkan pada baris {idx+1} ‚úÖ Total hasil sekarang: {len(hasil)}.")

df_hasil = pd.DataFrame(hasil)
df_hasil

Number of attempts to find similar restaurants by emb: 1
Data perbandingan kosong, baris 1 dilewati ‚ùå Total hasil sekarang: 0.
Number of attempts to find similar restaurants by emb: 1
Data perbandingan kosong, baris 2 dilewati ‚ùå Total hasil sekarang: 0.
Number of attempts to find similar restaurants by emb: 1
Data perbandingan kosong, baris 3 dilewati ‚ùå Total hasil sekarang: 0.
Number of attempts to find similar restaurants by emb: 1
Data perbandingan kosong, baris 4 dilewati ‚ùå Total hasil sekarang: 0.
Number of attempts to find similar restaurants by emb: 1
Data perbandingan kosong, baris 5 dilewati ‚ùå Total hasil sekarang: 0.
Number of attempts to find similar restaurants by emb: 1
Data perbandingan kosong, baris 6 dilewati ‚ùå Total hasil sekarang: 0.
Number of attempts to find similar restaurants by emb: 1
Data perbandingan kosong, baris 7 dilewati ‚ùå Total hasil sekarang: 0.
Number of attempts to find similar restaurants by emb: 1
Data perbandingan kosong, baris 8 dilewa

Unnamed: 0,index,restaurant,menu,city,comp_restaurant,comp_menu,gen_menu,gen_comp_menu,score,status
0,18,A & F Cake and Cookies,"[Kue Kastengel, Kue Kacang, Kue Stik Keju, Donat]",Malang,"Raihan Bakery, Denai","[Tawar (Ceres/Keju), Bolu Gulung Chocomaltine,...","[Kue Kastengel, Kue Kacang, Kue Stik Keju, Donat]","[Tawar (Ceres/Keju), Bolu Gulung Chocomaltine,...",0.007463,overclaim
1,38,AICIRO,"[\tCHOCO CARAMEL, \tJASMINE TEA, \tMANGO BOBA ...",KOTA MALANG,Jajan Toast,"[Choco Banana, Dimsum Campur ( 4 variant Dimsu...","[\tCHOCO CARAMEL, \tJASMINE TEA, \tMANGO BOBA ...","[Choco Banana, Dimsum Campur ( 4 variant Dimsu...",0.035088,overclaim
2,41,AIRLANGGA,"[Burger, Burger + Keju, Kebab, Kentang Goreng,...",Kota¬†Surabaya,Assalam Resto,"[Sop Ikan Gurame/Nila, Gurame/Nila Saos Padang...","[Burger, Burger + Keju, Kebab, Kentang Goreng,...","[Sop Ikan Gurame/Nila, Gurame/Nila Saos Padang...",0.023256,overclaim
3,65,ANEKA MINUMAN (DHARMAWANITA ‚Äì SUMIYATI),"[Air Mineral, Es Cincau, Es Degan, Es Gooday (...",Kota¬†Surabaya,Kedai Es Juragan,"[Jus Melon, Jahe Merah, Es Nutrisari, Jus Sirs...","[es, minuman, jeruk, nata]","[es, minuman, jeruk, nata, cokelat, tahu, sapi...",0.400000,aman
4,68,ANGKRINGAN RINGIN KURUNG,"[DUMPLING BAKAR, BAKSO BAKAR, TELUR PUYUH BAKA...",Malang,Angkringan Abas Krian,"[Koin Kornet, Kepala Ayam, Sate Usus Bakar, Es...","[DUMPLING BAKAR, BAKSO BAKAR, TELUR PUYUH BAKA...","[Koin Kornet, Kepala Ayam, Sate Usus Bakar, Es...",0.019608,overclaim
...,...,...,...,...,...,...,...,...,...,...
156,2804,mie kuah,"[mie kuah, gimbal tempe, ote-ote, mie goreng, ...",Kabupaten Gresik,"Sari Rasa, Mojosari","[Angsio Hibien, Nasi Cap Jay, Kwetiau Goreng, ...","[mie kuah, gimbal tempe, ote-ote, mie goreng, ...","[Angsio Hibien, Nasi Cap Jay, Kwetiau Goreng, ...",0.058824,overclaim
157,2806,minuman segar jaya guna,"[temulawak, sinom, beras kencur, kunyit asam]",Kabupaten Gresik,Warung Sego BHS,"[Beras Kencur, Krupuk Puli, Teh Hangat, Juice ...","[temulawak, sinom, beras kencur, kunyit asam]","[Beras Kencur, Krupuk Puli, Teh Hangat, Juice ...",0.153846,overclaim
158,2815,roti bakar 51,"[roti bakar, ROTI BAKAR BANDUNG, ROTI BAKAR BA...",Malang,Bakso Bakar Hot,"[Telur Gulung Nuget, Tahu Walek Bakso Crispi, ...",[roti],"[roti, sosis, bakso, burger, pisang, tahu, uda...",0.083333,overclaim
159,2818,sri rejeki,"[ote- ote sri rejeki, pisang goreng, tempe isi...",Malang,Aneka Gorengan Siaga,"[bakwan Sayur, tempe Crispi, tahu Jeletot, tah...","[pisang, ote, tempe]","[pisang, singkong, tempe, tahu, ceker, bakso, ...",0.125000,overclaim


In [20]:
pip install openpyxl

Collecting openpyxl
  Using cached openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Using cached et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Using cached openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Using cached et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl

   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [o


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [11]:
# Export the evaluation results to an Excel workbook; with pandas' defaults the
# DataFrame index is written as the first column.
# NOTE(review): the file name hard-codes a row count, model tag and temperature —
# confirm they still match the current run before sharing the file.
df_hasil.to_excel("df_hasil_semantik_161_baris_deepseek-r1-8b_temp0-75_512dim.xlsx")