In [9]:
import os
import json
import re

# --- CONFIGURATION ---
# Anchor every path on the folder that holds this code. A plain script defines
# __file__; a Jupyter kernel does not (NameError), so fall back to the kernel's
# current working directory.
try:
    BASE_DIR = os.path.split(os.path.abspath(__file__))[0]
except NameError:
    BASE_DIR = os.getcwd()

# The two dataset file paths, both resolved inside BASE_DIR.
RAW_INPUT, FINAL_OUTPUT = (
    os.path.join(BASE_DIR, file_name)
    for file_name in ("raw_mustika_rasa_full.json", "mustika_rasa_full_cleaned.json")
)

print(f"Working Directory: {BASE_DIR}")
print(f"Looking for: {RAW_INPUT}")

Working Directory: /Users/prasetyoekosulaksono/Documents/bhineka_tunggal_rasa/1. Dataset Development
Looking for: /Users/prasetyoekosulaksono/Documents/bhineka_tunggal_rasa/1. Dataset Development/raw_mustika_rasa_full.json


In [None]:
import pandas as pd
import json
import os

def is_continuation(prev, curr):
    """Decide whether ``curr`` looks like the continuation of recipe ``prev``.

    Both arguments are recipe dicts. The keys read here are ``title_original``,
    ``title_normalized``, ``_source_page``, ``instructions`` and ``recipe_id``.
    A key that is missing or holds ``None`` is treated as absent, so JSON
    ``null`` values do not raise.

    Args:
        prev: The earlier recipe entry (the candidate head of the pair).
        curr: The entry that follows it.

    Returns:
        A ``(flag, reason)`` tuple. When ``curr`` is judged a continuation,
        ``flag`` is True and ``reason`` is a short label naming the trigger;
        otherwise the result is ``(False, None)``. The tuple itself is always
        truthy, so callers must unpack it rather than test it directly.
    """
    if not prev or not curr:
        return False, None

    curr_title_orig = (curr.get('title_original') or "").lower()
    curr_title_norm = (curr.get('title_normalized') or "").lower()

    # Missing or null page numbers fall back to sentinels (999 for curr, 0 for
    # prev). Explicit None checks are used instead of `or` so that a real page
    # number of 0 is kept as-is.
    curr_page = curr.get('_source_page')
    if curr_page is None:
        curr_page = 999
    prev_page = prev.get('_source_page')
    if prev_page is None:
        prev_page = 0

    # Only entries on the same page or the immediately following page qualify.
    page_diff = curr_page - prev_page
    is_adjacent = (0 <= page_diff <= 1)

    if not is_adjacent:
        return False, None

    # Trigger 1: the current title contains a continuation/placeholder keyword
    # (substring match on the lower-cased titles).
    fragment_keywords = ["continu", "lanjut", "sambung", "untitled"]
    is_explicit_fragment = any(
        kw in curr_title_orig or kw in curr_title_norm for kw in fragment_keywords
    )

    # Trigger 2: the previous entry has no instructions, or its instructions
    # mention "incomplete" / "missing".
    # assumes instructions is a list of strings — TODO confirm against the dataset
    prev_instr_list = prev.get('instructions') or []
    prev_instr_text = " ".join(prev_instr_list).lower()
    is_prev_incomplete = (
        len(prev_instr_list) == 0 or
        "incomplete" in prev_instr_text or
        "missing" in prev_instr_text
    )

    # Null-safe like the title fields: a null recipe_id simply never matches.
    curr_recipe_id = curr.get('recipe_id') or ""

    if is_explicit_fragment and is_prev_incomplete:
        return True, "Keyword + Empty Instructions"
    if is_explicit_fragment:
        return True, "Explicit Keyword"
    if is_prev_incomplete and curr_recipe_id.endswith('_01'):
        return True, "First item on page after incomplete"

    return False, None

In [13]:
def main():
    """Scan the raw recipe list and print each pair flagged as a continuation.

    Loads the JSON list at ``RAW_INPUT`` and walks it in order, comparing every
    entry against ``buffer`` (the current head entry) with ``is_continuation``.
    Flagged pairs are printed; nothing is merged or written. Returns ``None``,
    immediately so when the loaded list is empty.
    """
    with open(RAW_INPUT, 'r', encoding='utf-8') as f:
        raw_list = json.load(f)

    if not raw_list:
        return

    buffer = raw_list[0]

    for next_item in raw_list[1:]:
        # is_continuation returns a (flag, reason) tuple. A 2-tuple is always
        # truthy, so testing the call result directly would flag every item;
        # unpack it and test the flag instead.
        is_fragment, _reason = is_continuation(buffer, next_item)

        if is_fragment:
            # buffer is left unchanged here, so the following entries are
            # still compared against the same head entry.
            print(f"ðŸ§µ Target: {buffer['recipe_id']} + {next_item['recipe_id']}")
        else:
            buffer = next_item

In [14]:
# Run the scan: main() loads RAW_INPUT and prints a "Target" line for each pair it flags.
main()

ðŸ§µ Target: MR_196_02 + MR_197_01
ðŸ§µ Target: MR_201_02 + MR_202_01
ðŸ§µ Target: MR_203_03 + MR_204_01
ðŸ§µ Target: MR_205_02 + MR_206_01
ðŸ§µ Target: MR_209_02 + MR_210_01
ðŸ§µ Target: MR_228_02 + MR_229_01
ðŸ§µ Target: MR_234_02 + MR_235_01
ðŸ§µ Target: MR_240_02 + MR_241_01
ðŸ§µ Target: MR_248_02 + MR_249_01
ðŸ§µ Target: MR_250_02 + MR_251_01
ðŸ§µ Target: MR_255_02 + MR_256_01
ðŸ§µ Target: MR_259_01 + MR_259_02
ðŸ§µ Target: MR_262_02 + MR_263_01
ðŸ§µ Target: MR_263_03 + MR_264_01
ðŸ§µ Target: MR_267_02 + MR_268_01
ðŸ§µ Target: MR_277_02 + MR_278_01
ðŸ§µ Target: MR_282_02 + MR_283_01
ðŸ§µ Target: MR_286_02 + MR_287_01
ðŸ§µ Target: MR_294_02 + MR_295_01
ðŸ§µ Target: MR_295_03 + MR_296_01
ðŸ§µ Target: MR_296_03 + MR_297_01
ðŸ§µ Target: MR_302_02 + MR_303_01
ðŸ§µ Target: MR_303_03 + MR_304_01
ðŸ§µ Target: MR_304_03 + MR_305_01
ðŸ§µ Target: MR_307_02 + MR_308_01
ðŸ§µ Target: MR_309_02 + MR_310_01
ðŸ§µ Target: MR_313_02 + MR_314_01
ðŸ§µ Target: MR_314_03 + MR_315_01
ðŸ§µ Target: MR_325_