In [None]:
import ast
import os
from random import randint
from time import sleep

import pandas as pd
from neo4j import GraphDatabase

In [2]:
# 1. Load CSV file and convert 'menu' and 'embedding' to list
# -----------------------------------------------------------------------------
# Read the CSV and discard the leftover 'Unnamed: 0' column in one chained step.
df = pd.read_csv('wave_emb_dbmcv2.csv').drop(columns=['Unnamed: 0'])

# Report the value types before parsing (the CSV stores both columns as text).
print(f"Tipe data nilai pada kolom 'menu' sebelum konversi: {type(df['menu'][0])}\nTipe data nilai pada kolom 'embedding' sebelum konversi: {type(df['embedding'][0])}")

# Parse the textual list literals back into real Python lists.
df = df.assign(
    menu=df['menu'].apply(ast.literal_eval),
    embedding=df['embedding'].apply(ast.literal_eval),
)

# Report the value types again to confirm the conversion.
print(f"Tipe data nilai pada kolom 'menu' setelah konversi: {type(df['menu'][0])}\nTipe data nilai pada kolom 'embedding' setelah konversi: {type(df['embedding'][0])}")
df

Tipe data nilai pada kolom 'menu' sebelum konversi: <class 'str'>
Tipe data nilai pada kolom 'embedding' sebelum konversi: <class 'str'>
Tipe data nilai pada kolom 'menu' setelah konversi: <class 'list'>
Tipe data nilai pada kolom 'embedding' setelah konversi: <class 'list'>


Unnamed: 0,restaurant,menu,city,alt_resto,embedding
0,BP Kerupuk Sayur,"[Kerupuk Labu BP Kerupuk Sayur, Kerupuk Wortel...",Surabaya,['Dapur Fizzul'],"[[-0.02187000773847103, 0.01127985492348671, -..."
1,Bumbu Rawon,"[Bumbu Rawon UKG (Usaha Kerta Gemilang), Bumbu...",Kabupaten Gresik,['Rawon Balungan H. Mufid'],"[[0.01963287591934204, 0.056399986147880554, 0..."
2,JAMU BU SOLIKAH,"[JAMU KUNIR SURUH, JAMU KUNIR, JAMU BERAS KENC...",Malang,['Sambat Luwe'],"[[0.009273191913962364, 0.03742887079715729, -..."
3,RANIS KDS,[KRUPUK BAWANG RANIS KDS ],Malang,['KDS Cantonese Restaurant'],"[[-0.08321938663721085, 0.05083480849862099, 0..."
4,Winarti Snack,[Kacang Goreng Winarti Snack],Surabaya,['Nathania Snack House'],"[[-0.09416132420301437, 0.10255325585603714, 0..."
...,...,...,...,...,...
2827,virinda food,"[Rolade, Sempol]",Malang,['Javanine Resto'],"[[-0.024130970239639282, 0.0011671549873426557..."
2828,wouwcake,"[risoles wouwcake, kue tar wouwcake, kroket ke...",Surabaya,['Warung wong lue (WWL)'],"[[-0.03364633023738861, 0.07875539362430573, -..."
2829,yuk tri,"[kue nastar, kue kastengel, kue putri salju]",Malang,['Trimurti Resto'],"[[-0.010088230483233929, -0.05614767223596573,..."
2830,zara,[stik keju zara],Malang,['Lafayette Coffee & Eatery'],"[[0.004909013397991657, -0.049723122268915176,..."


In [None]:
# 2. Neo4j connection & similarity helpers
# -----------------------------------------------------------------------------
# Connection settings are read from the environment so real credentials do not
# have to be stored in the notebook. The fallbacks are the previous hard-coded
# local values, so an unconfigured environment behaves exactly as before.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USERNAME = os.environ.get("NEO4J_USERNAME", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "password")

# Shared driver used by every helper below via `drv.session()`.
drv = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))

def find_similar_restaurants_by_embedding(alts_embs: list[list[float]], threshold=0.6, topk=2):
    """Collect the ids of restaurants related to any of the given embeddings.

    For each vector in ``alts_embs`` the ``restaurant_emb_dbmcv2`` vector index
    is queried for ``topk`` nodes. Every hit whose score is at least
    ``threshold`` contributes its own ``r_id`` and the ``r_id`` of each
    ``Restaurant`` linked to it by a ``SIMILAR_TO_DBMCV2`` relationship whose
    score is also at least ``threshold``.

    Args:
        alts_embs: Embedding vectors to look up, one index query per vector.
        threshold: Minimum score for both index hits and similarity links.
        topk: Number of nodes requested from the vector index per query.

    Returns:
        A list of unique ids in first-seen order. Keeping a stable order
        (instead of ``list(set(...))``) makes downstream tie-breaking
        reproducible between runs.
    """
    # A dict is used as an insertion-ordered set.
    seen = {}
    with drv.session() as ses:
        for emb in alts_embs:
            # This query returns the node's 'r_id' property
            rows = ses.run("""
                CALL db.index.vector.queryNodes('restaurant_emb_dbmcv2', $topk, $emb)
                YIELD node, score
                WHERE score >= $threshold
                OPTIONAL MATCH (other:Restaurant)-[s:SIMILAR_TO_DBMCV2]->(node)
                    WHERE s.score >= $threshold
                RETURN DISTINCT node.r_id AS id, collect(other.r_id) AS linked_ids
                """, emb=emb, threshold=threshold, topk=topk)
            for r in rows:
                if r["id"] is not None:
                    seen[r["id"]] = None
                for linked_id in r["linked_ids"]:
                    # Treat missing linked ids the same way as a missing node id.
                    if linked_id is not None:
                        seen[linked_id] = None
    return list(seen)

# jaccard takes two lists of strings
def jaccard(list1: list[str], list2: list[str]) -> float:
    """Return the Jaccard similarity of two lists of strings.

    Each item is lower-cased and stripped of surrounding whitespace, the lists
    are reduced to sets, and the result is ``|A ∩ B| / |A ∪ B|``. When both
    sets are empty the similarity is defined as ``0.0``.
    """
    left = set()
    for entry in list1:
        left.add(entry.lower().strip())

    right = set()
    for entry in list2:
        right.add(entry.lower().strip())

    combined = left | right
    if len(combined) == 0:
        return 0.0
    return len(left & right) / len(combined)

In [None]:
# 3. Overclaim checker
# -----------------------------------------------------------------------------
def check_overclaim(menu_names: list[str], alts_embs: list[list[float]],
                    threshold: float = 0.3, max_attempts: int = 10):
    """Compare a claimed menu with the menus of similar restaurants in Neo4j.

    Similar restaurant ids are looked up from ``alts_embs`` (retrying while the
    lookup is empty). For each id the restaurant's menu items are fetched, each
    item is reduced to its first word, and the Jaccard similarity against
    ``menu_names`` is computed.

    Args:
        menu_names: Menu tokens of the restaurant being checked.
        alts_embs: Embedding vectors passed to the similarity lookup.
        threshold: Candidates scoring below this value are reported as issues.
        max_attempts: Maximum number of lookup attempts while the result is empty.

    Returns:
        ``(issues, max_jaccard_score, best_comp_name, best_comp_menu)`` where
        ``issues`` lists the candidates scoring below ``threshold`` and the
        other three describe the highest-scoring candidate (``0.0``, ``None``
        and ``[]`` when no candidate scored above zero).
    """
    # 1) look up their node IDs in Neo4j, retrying while the lookup is empty
    rids = []
    for attempt in range(1, max_attempts + 1):
        rids = find_similar_restaurants_by_embedding(alts_embs)
        print(f"Number of attempts to find similar restaurants by emb: {attempt}")
        if rids:
            break
        # Only back off when another attempt will follow; sleeping after the
        # final attempt would just delay the (empty) result.
        if attempt < max_attempts:
            sleep(randint(1, 2))

    # 2) compare each menu_item list against your query list
    issues = []
    max_jaccard_score = 0.0
    # Data of the best comparison found so far
    best_comp_name = None
    best_comp_menu = []

    # Nothing to compare against: skip opening a session.
    if not rids:
        return issues, max_jaccard_score, best_comp_name, best_comp_menu

    with drv.session() as ses:
        for rid in rids:
            result = ses.run(
                """
                MATCH (r:Restaurant {r_id:$rid})
                OPTIONAL MATCH (r)-[:HAS_PRODUCT]->(m:Menu)
                RETURN r.restaurant AS restaurant_name, collect(m.menu) AS actual_menus
                """, rid=rid
            )
            rec = result.single()
            # Skip ids with no matching record or no restaurant name.
            if not (rec and rec["restaurant_name"]):
                continue

            db_resto_name = rec["restaurant_name"]
            # Full menu names; kept for reporting the best comparison
            actual_menus = rec["actual_menus"]

            # Compare on the first word of each non-blank menu item.
            split_menus = [item.split()[0] for item in actual_menus if item and item.strip()]

            score = jaccard(split_menus, menu_names)

            # NOTE(review): because the running maximum starts at 0.0 and the
            # comparison is strict, a candidate scoring exactly 0.0 is never
            # recorded as "best"; callers then see no comparison data even
            # though candidates existed. Behaviour kept as-is - confirm intent.
            if score > max_jaccard_score:
                max_jaccard_score = score
                best_comp_name = db_resto_name
                # Store the original menu (not the first-word version)
                best_comp_menu = actual_menus

            if score < threshold:
                issues.append({
                    "restaurant_id": rid,
                    "restaurant_name": db_resto_name,
                    "jaccard_score": score
                })

    # Return the issues, the highest score, and the best comparison's name and menu
    return issues, max_jaccard_score, best_comp_name, best_comp_menu

# Fungsi evaluate_row mengembalikan tuple (status, score, comp_name, comp_menu)
def evaluate_row(menus: list[str], emb: list[list[float]]) -> tuple[str, float, str | None, list[str]]:
    """
    Mengevaluasi satu baris data, menentukan status berdasarkan Jaccard score akhir.
    """
    # Tangkap semua nilai yang dikembalikan oleh check_overclaim
    issues, score, comp_name, comp_menu = check_overclaim(menus, emb)
    
    status = "overclaim" if score < 0.3 else "aman"
    
    # Kembalikan semua data yang relevan
    return status, score, comp_name, comp_menu

In [None]:
# 4. Run on the "df" DataFrame
# -----------------------------------------------------------------------------
hasil = []
# Evaluate every row; keep only rows for which a comparison restaurant was found.
for idx, row in df.iterrows():
    # Reduce each non-blank menu name to its first word.
    query_menus = [name.split()[0] for name in row["menu"] if name and name.strip()]

    status, score, comp_restaurant, comp_menu = evaluate_row(query_menus, row["embedding"])

    # Skip the row when either the comparison restaurant or its menu is empty.
    if not (comp_restaurant and comp_menu):
        print(f"Data perbandingan kosong, baris {idx+1} dilewati ❌ Total hasil sekarang: {len(hasil)}.")
        continue

    hasil.append({
        "index": idx,
        "restaurant": row["restaurant"],
        "menu": row["menu"],
        "city": row["city"],
        "comp_restaurant": comp_restaurant,
        "comp_menu": comp_menu,
        "score": score,
        "status": status
    })
    print(f"Data valid ditemukan dan ditambahkan pada baris {idx+1} ✅ Total hasil sekarang: {len(hasil)}.")

# Build the result frame once from the collected records.
df_hasil = pd.DataFrame(hasil)
df_hasil

Number of attempts to find similar restaurants by emb: 1
Data perbandingan kosong, baris 1 dilewati ❌ Total hasil sekarang: 0.
Number of attempts to find similar restaurants by emb: 1
Data perbandingan kosong, baris 2 dilewati ❌ Total hasil sekarang: 0.
Number of attempts to find similar restaurants by emb: 1
Data perbandingan kosong, baris 3 dilewati ❌ Total hasil sekarang: 0.
Number of attempts to find similar restaurants by emb: 1
Data perbandingan kosong, baris 4 dilewati ❌ Total hasil sekarang: 0.
Number of attempts to find similar restaurants by emb: 1
Data perbandingan kosong, baris 5 dilewati ❌ Total hasil sekarang: 0.
Number of attempts to find similar restaurants by emb: 1
Data perbandingan kosong, baris 6 dilewati ❌ Total hasil sekarang: 0.
Number of attempts to find similar restaurants by emb: 1
Data valid ditemukan dan ditambahkan pada baris 7 ✅ Total hasil sekarang: 1.
Number of attempts to find similar restaurants by emb: 1
Data valid ditemukan dan ditambahkan pada baris

Unnamed: 0,index,restaurant,menu,city,comp_restaurant,comp_menu,score,status
0,6,'BUKAN SEMBARANG CIRENG',[CIRENG BEKU],Malang,Aneka Gorengan Siaga,"[bakwan Sayur, tempe Crispi, tahu Jeletot, tah...",0.055556,overclaim
1,7,-,"[Buntut Sapi, Daging Sapi, Jerohan Sapi, Kepal...",KAB. PROBOLINGGO,Bebek NdoWer,"[Jaer Nila, Es Jeruk Manis, Nasi Bebek Mumer, ...",0.043478,overclaim
2,10,23,[BIHUN TELUR GULUNG 23],Malang,Copa Coffee,"[Milo, Nanas, Nasi Goreng Ikan Asin, Iced Tea,...",0.016393,overclaim
3,12,3M Food,"[Dimsum Udang, Mie Pangsit, Tahu Bakso Ayam]",Malang,Mie Jeder,"[Mie Pedas Jeder Rendang Level 1, Mie Pedas Je...",0.333333,aman
4,15,99,"[Kulit Pangsit 99, MIE BASAH 99, Kulit pangsit...",Malang,SEBLAK BANDUNG 86,"[Seblak Baso Ikan, Es Milo, Seblak Spesial Sup...",0.066667,overclaim
...,...,...,...,...,...,...,...,...
1382,2820,tahu susu aulia,[tahu susu aulia],Malang,Pancong Pijay,"[Pancong Ovaltine Keju Susu, Ropang Pisang Kej...",0.030303,overclaim
1383,2821,tela tela mpu purwa,"[tela tela, cimol ]",Malang,Kedai ReffTwo,"[Nasi Ayam Pop Cron, Mie Seblak ReffTwo, Boba ...",0.025000,overclaim
1384,2823,telur gulung mak ti,"[telur gulung, nasi goreng, pisang keju]",Kabupaten Gresik,Warung Teguh,"[Bakso Bakar Dan Tahu Bakar, Tela Tela, Tahu W...",0.222222,overclaim
1385,2825,teras kuliner,"[asam manis ikan mas, es timun, ayam bakar, ja...",Kabupaten Gresik,Subsidi,"[Telur Dadar, Es Teh Manis, Mie Goreng, Nasi G...",0.142857,overclaim


In [49]:
# openpyxl is needed for the .xlsx export below. Use the %pip magic so the
# install targets this kernel's environment, pinned to the version this
# notebook was run with.
%pip install openpyxl==3.1.5

Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Downloading et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5
Note: you may need to restart the kernel to use updated packages.


In [9]:
# Save the evaluation results to an Excel workbook in the working directory.
# NOTE(review): `index` is left at its default, so pandas presumably also writes
# the row index as a leading column - pass index=False if that is unwanted.
df_hasil.to_excel("df_hasil_1kata_1387_rows_512dim.xlsx")