In [2]:
# Mount Google Drive at /content/drive so later cells can read and write the
# project files stored there. `google.colab` is only importable inside Colab.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [3]:
import os
import json

# === Folder paths ===
base_path = "/content/drive/My Drive/CBR_Project"
input_folder = os.path.join(base_path, "data/raw")
json_output_path = os.path.join(input_folder, "cases_input.json")

# === Read every .txt file and collect it as a case record ===
# Keep only .txt files BEFORE numbering them. The JSON produced below is written
# into this same folder, so numbering raw directory entries would shift the
# case_id of every file that sorts after it as soon as the cell is re-run
# (and likewise for any other non-.txt entry that appears in the folder).
txt_filenames = sorted(
    name for name in os.listdir(input_folder) if name.endswith(".txt")
)

cases = []
for case_id, filename in enumerate(txt_filenames, start=1):
    file_path = os.path.join(input_folder, filename)
    # errors="ignore" drops undecodable bytes instead of aborting the whole run.
    with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
        text = f.read().strip()
    if not text:
        # An empty file still consumes its number, so the ids of the remaining
        # files stay tied to their position among the .txt files.
        print(f"⚠️ File kosong dilewati: {filename}")
        continue
    cases.append({
        "case_id": case_id,
        "text_full": text
    })

# === Save to JSON ===
with open(json_output_path, "w", encoding="utf-8") as f:
    json.dump(cases, f, ensure_ascii=False, indent=2)

print(f"✅ File JSON berhasil dibuat: {json_output_path}")

✅ File JSON berhasil dibuat: /content/drive/My Drive/CBR_Project/data/raw/cases_input.json


In [4]:
import os
import re
import csv
import json
from datetime import datetime

# === Folder paths ===
# Raw inputs are read from data/raw; results are written to data/processed.
base_path = "/content/drive/My Drive/CBR_Project"
input_folder = os.path.join(base_path, "data/raw")
output_folder = os.path.join(base_path, "data/processed")
# Create the output folder if it is missing; exist_ok avoids an error on re-runs.
os.makedirs(output_folder, exist_ok=True)

# === Fungsi Ekstraksi Data ===
def extract_penggugat(text):
    """Return the prosecutor name that follows a prosecutor label in ``text``.

    The labels are tried in a fixed order ("penuntut umum:", "jaksa penuntut
    umum:", "jaksa:"), case-insensitively. For each label the remainder of the
    line after the first occurrence is taken as the candidate name; it is
    accepted only when its stripped length is strictly between 3 and 100.

    Returns:
        The first accepted candidate, or the literal "Penuntut Umum" when no
        label yields one.
    """
    label_patterns = (
        r"penuntut\s+umum\s*:\s*(.*)",
        r"jaksa\s+penuntut\s+umum\s*:\s*(.*)",
        r"jaksa\s*:\s*(.*)",
    )
    # Lazily search label by label so evaluation stops at the first usable name.
    hits = (re.search(label, text, re.IGNORECASE) for label in label_patterns)
    candidates = (hit.group(1).strip() for hit in hits if hit)
    return next(
        (candidate for candidate in candidates if 3 < len(candidate) < 100),
        "Penuntut Umum",
    )

def extract_terdakwa(text):
    """Return the defendant name found in ``text``, or None.

    First choice is the remainder of the line after the first "terdakwa"
    (optionally followed by a number and a ':' or '-' separator), accepted when
    its stripped length is strictly between 3 and 100. Otherwise the letters
    following "nama lengkap" are used, title-cased. Matching is
    case-insensitive.
    """
    labelled = re.search(
        r"terdakwa(?:\s+\d+)?\s*[:\-]?\s*(.*)", text, re.IGNORECASE
    )
    if labelled is not None:
        candidate = labelled.group(1).strip()
        if 3 < len(candidate) < 100:
            return candidate

    # Fallback: the identity section, e.g. "nama lengkap <name>".
    identity = re.search(
        r"nama\s+lengkap\s+([a-z\s\.\']{5,100})", text, re.IGNORECASE
    )
    if identity is None:
        return None
    return identity.group(1).strip().title()

def extract_pasal_list(text):
    """Return the distinct article ("pasal") citations found in ``text``.

    A citation is "pasal <number>", optionally followed by "ayat (<number>)"
    and/or "uu [ri] no <number> tahun <yyyy>". Matching is case-insensitive.
    Each hit is lower-cased and its whitespace runs are collapsed to a single
    space before deduplication.

    Returns:
        A list of normalized citations in order of first appearance, or None
        when the text contains no citation.
    """
    pattern = r"(pasal\s+\d+(?:\s+ayat\s+\(?\d+\)?)?(?:\s+uu\s*(?:ri)?\s*no\s*\d+\s*tahun\s*\d{4})?)"
    matches = re.findall(pattern, text, re.IGNORECASE)
    if not matches:
        return None
    normalized = (re.sub(r"\s+", " ", m.strip().lower()) for m in matches)
    # dict keys keep insertion order, so this dedupes deterministically; a set
    # would make the order depend on string hashing and vary between sessions.
    return list(dict.fromkeys(normalized))

def extract_metadata(text):
    """Extract case-level metadata from the text of a court decision.

    Returns:
        A dict with the keys:
        - 'no_perkara': first case number in the compact form
          "<digits>pidb<yyyy>pn <court code>", or None.
        - 'tanggal': first "<day> <Indonesian month name> <yyyy>" date in ISO
          format (YYYY-MM-DD), or None when no date is found or the first one
          is not a real calendar date.
        - 'pasal': result of extract_pasal_list(text).
        - 'penggugat': result of extract_penggugat(text).
        - 'tergugat': result of extract_terdakwa(text).
    """
    meta = {}

    match = re.search(r"\b(\d{1,5}pidb\d{4}pn\s?[a-z]{3,5})\b", text, re.IGNORECASE)
    meta['no_perkara'] = match.group(1) if match else None

    # Indonesian month names, keyed in lower case to match the lower-cased hit.
    bulan = {'januari':1,'februari':2,'maret':3,'april':4,'mei':5,'juni':6,
             'juli':7,'agustus':8,'september':9,'oktober':10,'november':11,'desember':12}
    tanggal = re.search(r"\b(\d{1,2})\s+(Januari|Februari|Maret|April|Mei|Juni|Juli|Agustus|September|Oktober|November|Desember)\s+(\d{4})\b", text, re.IGNORECASE)

    meta['tanggal'] = None
    if tanggal:
        day, month_str, year = int(tanggal[1]), tanggal[2].lower(), int(tanggal[3])
        try:
            meta['tanggal'] = datetime(year, bulan[month_str], day).date().isoformat()
        except ValueError:
            # Impossible calendar date (e.g. "31 Februari 2020"): keep None.
            # Only ValueError is caught so interrupts and real bugs still surface.
            pass

    meta['pasal'] = extract_pasal_list(text)
    meta['penggugat'] = extract_penggugat(text)
    meta['tergugat'] = extract_terdakwa(text)
    return meta

def extract_ringkasan_fakta(text):
    """Return a short summary-of-facts excerpt from ``text``.

    Two patterns are tried in order, case-insensitively and across newlines:
    a passage starting at "Terdakwa" that contains "dakwaan", then a passage
    starting at "menimbang bahwa terdakwa". Either passage ends at the next
    blank line or at the end of the text, and is returned stripped and capped
    at 1000 characters. If neither matches, the first blank-line-separated
    paragraph is returned, capped at 300 characters.
    """
    flags = re.IGNORECASE | re.DOTALL
    passage_patterns = (
        r"(Terdakwa.+?dakwaan.*?)(?=\n\n|$)",
        r"(menimbang\s+bahwa\s+terdakwa.*?)(?=\n\n|$)",
    )
    searches = (re.search(p, text, flags) for p in passage_patterns)
    found = next((hit for hit in searches if hit), None)
    if found is None:
        first_paragraph = text.split("\n\n")[0]
        return first_paragraph[:300]
    return found.group(1).strip()[:1000]

def extract_argumen_hukum(text):
    """Return the legal-reasoning passage of ``text``, or None if absent.

    The passage starts at "pertimbangan hukum", must mention "pasal", and ends
    at the next blank line or at the end of the text. Matching is
    case-insensitive and spans newlines.

    Returns:
        The passage with surrounding whitespace removed, or None when the text
        has no such passage.
    """
    match = re.search(r"(pertimbangan\s*hukum.*?pasal.*?)(\n\n|$)", text, re.IGNORECASE | re.DOTALL)
    if match is None:
        return None
    # group(1) is the passage itself; group(0) would also include the blank-line
    # delimiter captured by the second group.
    return match.group(1).strip()

def feature_engineering(text):
    """Return the number of whitespace-separated tokens in ``text``."""
    tokens = text.split()
    word_count = len(tokens)
    return word_count

# === Build the processed records from the JSON input ===
# Reads cases_input.json from the raw-data folder and turns each entry into one
# flat record holding the extracted fields plus the full text.
json_input_path = os.path.join(input_folder, "cases_input.json")

with open(json_input_path, "r", encoding="utf-8") as f:
    input_cases = json.load(f)

cases_data = []
for case in input_cases:
    # A missing "text_full" is treated as empty text; "case_id" (below) is required.
    text = case.get("text_full", "")
    meta = extract_metadata(text)
    ringkasan = extract_ringkasan_fakta(text)
    argumen = extract_argumen_hukum(text)
    panjang_kata = feature_engineering(text)

    # "pihak" is only filled in when both party names are available;
    # otherwise it stays None.
    pihak = None
    if meta.get("penggugat") and meta.get("tergugat"):
        pihak = f"{meta['penggugat']} vs {meta['tergugat']}"

    cases_data.append({
        "case_id": case["case_id"],
        "no_perkara": meta.get("no_perkara"),
        "tanggal": meta.get("tanggal"),
        "ringkasan_fakta": ringkasan,
        "pasal": meta.get("pasal"),
        "pihak": pihak,
        "text_full": text.strip(),
        "argumen_hukum": argumen,
        "length_word": panjang_kata
    })

# === Save to CSV ===
# Every field is quoted (QUOTE_ALL); newline="" lets the csv module control
# line endings itself.
# NOTE(review): "pasal" comes from meta.get("pasal") and may be a list, in which
# case the CSV cell will hold its string form — confirm downstream readers
# expect that.
csv_output_path = os.path.join(output_folder, "cases_all.csv")
with open(csv_output_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=[
        "case_id", "no_perkara", "tanggal", "ringkasan_fakta",
        "pasal", "pihak", "text_full", "argumen_hukum", "length_word"
    ], quoting=csv.QUOTE_ALL)
    writer.writeheader()
    writer.writerows(cases_data)

# === Save to JSON ===
# ensure_ascii=False keeps non-ASCII characters readable in the output file.
json_output_path = os.path.join(output_folder, "cases_all.json")
with open(json_output_path, "w", encoding="utf-8") as f:
    json.dump(cases_data, f, ensure_ascii=False, indent=2)

print(f"✅ Berhasil membuat file:\n- CSV: {csv_output_path}\n- JSON: {json_output_path}")

✅ Berhasil membuat file:
- CSV: /content/drive/My Drive/CBR_Project/data/processed/cases_all.csv
- JSON: /content/drive/My Drive/CBR_Project/data/processed/cases_all.json
