In [2]:
import os
import json
import re

# --- CONFIGURATION ---
# In Jupyter, we use os.getcwd() to get the current folder path
try:
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
except NameError:
    BASE_DIR = os.getcwd()

RAW_INPUT = os.path.join(BASE_DIR, "raw_mustika_rasa_full.json")
FINAL_OUTPUT = os.path.join(BASE_DIR, "mustika_rasa_full_cleaned.json")

print(f"Working Directory: {BASE_DIR}")
print(f"Looking for: {RAW_INPUT}")

Working Directory: /Users/prasetyoekosulaksono/Documents/bhineka_tunggal_rasa/1. Dataset Development
Looking for: /Users/prasetyoekosulaksono/Documents/bhineka_tunggal_rasa/1. Dataset Development/raw_mustika_rasa_full.json


In [3]:
import pandas as pd
import json
import os

def is_continuation(prev, curr):
    if not prev or not curr: 
        return False, None

    curr_title_orig = (curr.get('title_original') or "").lower()
    curr_title_norm = (curr.get('title_normalized') or "").lower()
    
    page_diff = curr.get('_source_page', 999) - prev.get('_source_page', 0)
    is_adjacent = (0 <= page_diff <= 1)
    
    if not is_adjacent:
        return False, None

    # Trigger 1: Keywords
    fragment_keywords = ["continu", "lanjut", "sambung", "untitled", "inferr", "previous"]
    is_explicit_fragment = any(kw in curr_title_orig or kw in curr_title_norm for kw in fragment_keywords)

    # Trigger 2: Previous incomplete
    prev_instr_list = prev.get('instructions') or []
    prev_instr_text = " ".join(prev_instr_list).lower()
    is_prev_incomplete = (
        len(prev_instr_list) == 0 or 
        "incomplete" in prev_instr_text or 
        "missing" in prev_instr_text
    )

    if is_explicit_fragment and is_prev_incomplete:
        return True, "Keyword + Empty Instructions"
    if is_explicit_fragment:
        return True, "Explicit Keyword"

    #not using below criteria
    #if is_prev_incomplete and curr.get('recipe_id', '').endswith('_01'):
    #    return True, "First item on page after incomplete"
        
    return False, None

In [4]:
def analyze_stitching(raw_input_path):
    with open(raw_input_path, 'r', encoding='utf-8') as f:
        raw_list = json.load(f)

    if not raw_list:
        print("No data found.")
        return

    stitch_targets = []
    buffer = raw_list[0]

    for i in range(1, len(raw_list)):
        next_item = raw_list[i]
        
        should_merge, reason = is_continuation(buffer, next_item)
        
        if should_merge:
            stitch_targets.append({
                "Head_ID": buffer['recipe_id'],
                "Head_Title": buffer.get('title_normalized'),
                "Tail_ID": next_item['recipe_id'],
                "Tail_Title": next_item.get('title_normalized'),
                "Reason": reason,
                "Page_Gap": next_item['_source_page'] - buffer['_source_page']
            })
            # Note: We don't update the buffer here because we want to see 
            # if multiple tails belong to one head
        else:
            buffer = next_item

    # Convert to DataFrame for easy viewing in Jupyter
    df = pd.DataFrame(stitch_targets)
    return df

# Execute
df_report = analyze_stitching(RAW_INPUT)
df_report

Unnamed: 0,Head_ID,Head_Title,Tail_ID,Tail_Title,Reason,Page_Gap
0,MR_196_02,Nasi Biryani,MR_197_01,[Resep Lanjutan],Keyword + Empty Instructions,1
1,MR_203_03,Nasi Kuning,MR_204_01,Nasi Kuning (Continuation),Keyword + Empty Instructions,1
2,MR_205_02,Nasi Tim,MR_206_01,Nasi X (Lanjutan dari Halaman 197),Keyword + Empty Instructions,1
3,MR_220_02,Jangan Singang,MR_221_01,Resep Ikan,Explicit Keyword,1
4,MR_228_02,Gangan Waluh,MR_229_01,Gulai Labu (Continuation),Keyword + Empty Instructions,1
...,...,...,...,...,...,...
142,MR_1118_02,Tahu Bumbu Kecap,MR_1119_01,Tahu Tauge Goreng,Keyword + Empty Instructions,1
143,MR_1131_02,Tiara Gesing,MR_1132_01,Kue Pisang Kukus (Inferred),Keyword + Empty Instructions,1
144,MR_1133_02,Cemplon,MR_1134_01,Misro or Srudut,Keyword + Empty Instructions,1
145,MR_1152_03,Es Buah-buahan,MR_1153_01,Es Sirsak,Keyword + Empty Instructions,1


In [32]:
with open(RAW_INPUT, 'r', encoding='utf-8') as f:
        raw_list = json.load(f)
prev = [r for r in raw_list if r['recipe_id'] == 'MR_220_02'][0]
curr = [r for r in raw_list if r['recipe_id'] == 'MR_221_01'][0]
is_continuation(prev,curr)



(True, 'Explicit Keyword')

In [5]:
df_report['Reason'].value_counts()

Reason
Keyword + Empty Instructions    143
Explicit Keyword                  4
Name: count, dtype: int64

In [6]:
#go with 
# Keyword + Empty Instructions           127
# Explicit Keyword                         4

df_report = df_report[df_report['Reason']!='First item on page after incomplete']
df_report

Unnamed: 0,Head_ID,Head_Title,Tail_ID,Tail_Title,Reason,Page_Gap
0,MR_196_02,Nasi Biryani,MR_197_01,[Resep Lanjutan],Keyword + Empty Instructions,1
1,MR_203_03,Nasi Kuning,MR_204_01,Nasi Kuning (Continuation),Keyword + Empty Instructions,1
2,MR_205_02,Nasi Tim,MR_206_01,Nasi X (Lanjutan dari Halaman 197),Keyword + Empty Instructions,1
3,MR_220_02,Jangan Singang,MR_221_01,Resep Ikan,Explicit Keyword,1
4,MR_228_02,Gangan Waluh,MR_229_01,Gulai Labu (Continuation),Keyword + Empty Instructions,1
...,...,...,...,...,...,...
142,MR_1118_02,Tahu Bumbu Kecap,MR_1119_01,Tahu Tauge Goreng,Keyword + Empty Instructions,1
143,MR_1131_02,Tiara Gesing,MR_1132_01,Kue Pisang Kukus (Inferred),Keyword + Empty Instructions,1
144,MR_1133_02,Cemplon,MR_1134_01,Misro or Srudut,Keyword + Empty Instructions,1
145,MR_1152_03,Es Buah-buahan,MR_1153_01,Es Sirsak,Keyword + Empty Instructions,1


In [7]:
df_report[df_report['Reason']=='Explicit Keyword']

Unnamed: 0,Head_ID,Head_Title,Tail_ID,Tail_Title,Reason,Page_Gap
3,MR_220_02,Jangan Singang,MR_221_01,Resep Ikan,Explicit Keyword,1
114,MR_855_02,Cake Nanas,MR_856_01,Kue Lapis Susun Nanas (Lanjutan),Explicit Keyword,1
115,MR_857_03,Kue Chateau De Sumatera,MR_858_01,Rompudding (Continuation),Explicit Keyword,1
120,MR_883_02,Pempek Model (Palembang),MR_884_01,Continuation of previous recipe (Pempek Tahu),Explicit Keyword,1


In [8]:
def merge_recipes(head, tail):
    """
    Surgically merges 'tail' into 'head' based on:
    1. Inferred data exclusion.
    2. Intelligent ingredient group merging (bumbu/utama).
    3. Instruction replacement.
    """
    print(f"   ðŸ§µ Stitching: {head.get('recipe_id')} + {tail.get('recipe_id')}")

    # --- 1. MERGE INGREDIENTS ---
    head_groups = head.get('ingredient_groups', []) or []
    tail_groups = tail.get('ingredient_groups', []) or []

    for t_group in tail_groups:
        g_name = (t_group.get('group_name') or "").lower()
        
        # RULE: Skip if group_name contains 'inferred'
        if "inferred" in g_name:
            continue
            
        # RULE: Skip if any individual ingredient text contains 'inferred'
        t_ingredients = [
            ing for ing in t_group.get('ingredients', [])
            if "inferred" not in (ing.get('original_text') or "").lower()
        ]
        
        if not t_ingredients:
            continue

        # Check if we should merge into an existing 'utama' or 'bumbu' group
        target_group = None
        if any(name in g_name for name in ["utama", "bumbu"]):
            for h_group in head_groups:
                if h_group.get('group_name', '').lower() == g_name:
                    target_group = h_group
                    break
        
        if target_group:
            # Merge into existing group
            target_group['ingredients'].extend(t_ingredients)
        else:
            # Concat as a new group (preserving filtered ingredients)
            new_group = t_group.copy()
            new_group['ingredients'] = t_ingredients
            head_groups.append(new_group)

    head['ingredient_groups'] = head_groups

    # --- 2. MERGE INSTRUCTIONS ---
    t_instructions = tail.get('instructions', []) or []
    h_instructions = head.get('instructions', []) or []
    
    # RULE: If tail has instructions, it becomes the main instruction set
    # We filter out placeholder text like "(Instructions continue...)"
    if t_instructions:
        # Check if head instruction was just a placeholder
        is_placeholder = any("continue" in str(line).lower() for line in h_instructions)
        
        if not h_instructions or is_placeholder:
            head['instructions'] = t_instructions
        else:
            # If head actually had real steps, we append the tail steps
            head['instructions'].extend(t_instructions)

    return head

In [9]:
import pandas as pd
import json

def analyze_full_stitch(raw_input_path):
    with open(raw_input_path, 'r', encoding='utf-8') as f:
        raw_list = json.load(f)

    stitch_reports = []
    buffer = raw_list[0]

    for i in range(1, len(raw_list)):
        next_item = raw_list[i]
        should_merge, reason = is_continuation(buffer, next_item) # Using your logic from before
        
        if should_merge:
            # 1. Capture State Before
            head_id = buffer['recipe_id']
            pre_groups = [g['group_name'] for g in (buffer.get('ingredient_groups') or [])]
            pre_instr_count = len(buffer.get('instructions') or [])
            
            # 2. Perform the Merge (using the surgical logic)
            # Use .copy() to avoid mutating the actual raw_list during analysis
            head_copy = json.loads(json.dumps(buffer)) 
            merged = merge_recipes(head_copy, next_item)
            
            # 3. Capture State After
            post_groups = [g['group_name'] for g in merged['ingredient_groups']]
            post_instr_count = len(merged['instructions'])
            
            stitch_reports.append({
                "Recipe_Target": head_id,
                "Title": merged.get('title_normalized'),
                "Stitched_With": next_item['recipe_id'],
                "Reason": reason,
                "Groups_Before": pre_groups,
                "Groups_After": post_groups,
                "Instr_Count_Before": pre_instr_count,
                "Instr_Count_After": post_instr_count,
                "First_Instr_Line": merged['instructions'][0][:50] + "..." if merged['instructions'] else "NONE"
            })
            # Important: Update buffer to the merged version so we can catch 
            # triple-stitches (Page 1 + Page 2 + Page 3)
            buffer = merged
        else:
            buffer = next_item

    return pd.DataFrame(stitch_reports)

# Run and Display
df_stitch_results = analyze_full_stitch(RAW_INPUT)

   ðŸ§µ Stitching: MR_196_02 + MR_197_01
   ðŸ§µ Stitching: MR_203_03 + MR_204_01
   ðŸ§µ Stitching: MR_205_02 + MR_206_01
   ðŸ§µ Stitching: MR_220_02 + MR_221_01
   ðŸ§µ Stitching: MR_228_02 + MR_229_01
   ðŸ§µ Stitching: MR_234_02 + MR_235_01
   ðŸ§µ Stitching: MR_235_03 + MR_236_01
   ðŸ§µ Stitching: MR_240_02 + MR_241_01
   ðŸ§µ Stitching: MR_248_02 + MR_249_01
   ðŸ§µ Stitching: MR_250_02 + MR_251_01
   ðŸ§µ Stitching: MR_255_02 + MR_256_01
   ðŸ§µ Stitching: MR_262_02 + MR_263_01
   ðŸ§µ Stitching: MR_263_03 + MR_264_01
   ðŸ§µ Stitching: MR_264_03 + MR_265_01
   ðŸ§µ Stitching: MR_267_02 + MR_268_01
   ðŸ§µ Stitching: MR_277_02 + MR_278_01
   ðŸ§µ Stitching: MR_282_02 + MR_283_01
   ðŸ§µ Stitching: MR_286_02 + MR_287_01
   ðŸ§µ Stitching: MR_295_03 + MR_296_01
   ðŸ§µ Stitching: MR_296_03 + MR_297_01
   ðŸ§µ Stitching: MR_302_02 + MR_303_01
   ðŸ§µ Stitching: MR_303_03 + MR_304_01
   ðŸ§µ Stitching: MR_304_03 + MR_305_01
   ðŸ§µ Stitching: MR_307_02 + MR_308_01
   ðŸ§µ Stitchin

In [10]:
df_stitch_results

Unnamed: 0,Recipe_Target,Title,Stitched_With,Reason,Groups_Before,Groups_After,Instr_Count_Before,Instr_Count_After,First_Instr_Line
0,MR_196_02,Nasi Biryani,MR_197_01,Keyword + Empty Instructions,[utama],"[utama, bumbu]",0,4,Beras ditjutji sampai bersih....
1,MR_203_03,Nasi Kuning,MR_204_01,Keyword + Empty Instructions,"[Bahan, Bumbu]","[Bahan, Bumbu, Bumbu Lanjut]",0,7,"Bawang merah, bawang putih diiris, digoreng...."
2,MR_205_02,Nasi Tim,MR_206_01,Keyword + Empty Instructions,[utama],"[utama, bumbu]",0,3,"Beras ditjuti bersih, lalu dikaru...."
3,MR_220_02,Jangan Singang,MR_221_01,Explicit Keyword,[utama],"[utama, bumbu]",1,5,Ikan dibersihkan....
4,MR_228_02,Gangan Waluh,MR_229_01,Keyword + Empty Instructions,"[utama, bumbu]","[utama, bumbu]",0,5,Labu dikupas dan dipotong-potong....
...,...,...,...,...,...,...,...,...,...
142,MR_1118_02,Tahu Bumbu Kecap,MR_1119_01,Keyword + Empty Instructions,"[utama, bumbu]","[utama, bumbu, bahan tambahan]",0,7,Cara membuatnya:...
143,MR_1131_02,Tiara Gesing,MR_1132_01,Keyword + Empty Instructions,[utama],"[utama, bumbu]",0,5,"Pisang dikupas, di-irisÂ² bundar...."
144,MR_1133_02,Cemplon,MR_1134_01,Keyword + Empty Instructions,[utama],[utama],0,4,"Singkong dikupas, diparut...."
145,MR_1152_03,Es Buah-buahan,MR_1153_01,Keyword + Empty Instructions,[utama],[utama],0,4,Tepung maizena diaduk dengan kuning telur ajam...


In [11]:
# Check Sample 2 (Bundu Ayam) specifically
sample_2_check = df_stitch_results[df_stitch_results['Recipe_Target'] == "MR_431_02"]
print(sample_2_check[['Groups_Before', 'Groups_After', 'Instr_Count_After']].to_string())

   Groups_Before    Groups_After  Instr_Count_After
48       [utama]  [utama, bumbu]                  5


In [12]:
print(f"Total Stitches Performed: {len(df_stitch_results)}")
print(f"Average Instructions before Stitch: {df_stitch_results['Instr_Count_After'].mean():.2f}")
print(f"Average Instructions after Stitch: {df_stitch_results['Instr_Count_After'].mean():.2f}")

# Count how many were triggered by 'inferred' or 'placeholder' logic
inferred_cleanup = df_stitch_results[~df_stitch_results['Groups_After'].str.contains('inferred', na=False)]
print(f"Cleaned up inferred groups in {len(inferred_cleanup)} recipes.")

Total Stitches Performed: 147
Average Instructions before Stitch: 5.44
Average Instructions after Stitch: 5.44
Cleaned up inferred groups in 147 recipes.


In [None]:
[df_stitch_resultsdf_stitch_results['Recipe_Target']=='MR_230_01']

Unnamed: 0,Recipe_Target,Title,Stitched_With,Reason,Groups_Before,Groups_After,Instr_Count_Before,Instr_Count_After,First_Instr_Line


In [33]:
df_stitch_results[df_stitch_results['Recipe_Target']=='MR_220_02']

Unnamed: 0,Recipe_Target,Title,Stitched_With,Reason,Groups_Before,Groups_After,Instr_Count_Before,Instr_Count_After,First_Instr_Line
3,MR_220_02,Jangan Singang,MR_221_01,Explicit Keyword,[utama],"[utama, bumbu]",1,5,Ikan dibersihkan....
