In [2]:
!pip install pdfminer.six

Collecting pdfminer.six
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Downloading pdfminer_six-20250506-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m41.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pdfminer.six
Successfully installed pdfminer.six-20250506


In [3]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import os
import re
from pdfminer.high_level import extract_text

# Project root (here: a folder on the mounted Google Drive).
# Every other path below is derived from it.
base_path = "/content/drive/My Drive/CBR_Project"

# Folder holding the source PDF files.
pdf_folder = os.path.join(base_path, "pdfs")

# Folder receiving the cleaned .txt files; created if it does not exist yet.
output_folder = os.path.join(base_path, "data/raw")
os.makedirs(output_folder, exist_ok=True)

# Log file for the cleaning run; its parent directory is created if missing.
log_file = os.path.join(base_path, "logs/cleaning.log")
os.makedirs(os.path.dirname(log_file), exist_ok=True)

def clean_putusan_text(text):
    """Strip Mahkamah Agung boilerplate from a decision's text and normalise it.

    The removal steps run in a fixed order because two patterns overlap: the
    watermark pattern also matches the plainly spaced phrase
    "Mahkamah Agung Republik Indonesia", so the longer
    "Direktori Putusan Mahkamah Agung Republik Indonesia" banner has to be
    removed first. Otherwise the banner pattern can never match and a stray
    "direktori putusan" is left in the output.

    Args:
        text: Raw text of one decision.

    Returns:
        The text lowercased, with the boilerplate removed, every character
        that is neither a word character nor whitespace dropped, whitespace
        runs collapsed to a single space, and leading/trailing whitespace
        stripped.
    """
    # Remove the Mahkamah Agung disclaimer block. DOTALL lets ".*?" cross line
    # breaks so the whole block up to the phone extension is dropped.
    text = re.sub(
        r"Disclaimer\s+Kepaniteraan Mahkamah Agung Republik Indonesia.*?021-384 3348 \(ext\.318\)",
        "", text, flags=re.DOTALL)

    # Remove page header/footer lines ("Halaman X dari Y Putusan Nomor ...").
    # No DOTALL here, so ".*" stops at the end of that line.
    text = re.sub(r"Halaman\s+\d+\s+dari\s+\d+\s+Putusan Nomor.*", "", text)

    # Remove the directory banner BEFORE the watermark (see docstring).
    text = re.sub(r"Direktori Putusan Mahkamah Agung Republik Indonesia", "", text, flags=re.IGNORECASE)

    # Remove the Mahkamah Agung watermark, which may be split by whitespace
    # between character groups.
    text = re.sub(r"Mah\s*ka\s*m\s*ah\s*A\s*gung\s*R\s*ep\s*ublik\s*In\s*dones\s*ia", "", text, flags=re.IGNORECASE)

    # Remove the site URL.
    text = re.sub(r"putusan\.mahkamahagung\.go\.id", "", text, flags=re.IGNORECASE)

    # Remove the page-top header such as "p u t u s a n nomor ... pn mjl".
    # NOTE: this pattern only matches headers that end in "pn mjl".
    text = re.sub(r"\bp\s*u\s*t\s*u\s*s\s*a\s*n\s*nomor.*?pn\s*mjl", "", text, flags=re.IGNORECASE)

    # Normalise: lowercase, drop punctuation, collapse whitespace.
    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)  # drop non-word, non-space characters
    text = re.sub(r"\s+", " ", text)     # collapse whitespace runs
    return text.strip()

# One "<pdf name> | <status> | <detail>" line is collected per processed file.
log_entries = []

# Only the first 30 PDFs, in alphabetical order, are processed.
pdf_files = sorted(filter(lambda name: name.endswith(".pdf"), os.listdir(pdf_folder)))[:30]

for idx, filename in enumerate(pdf_files):
    file_path = os.path.join(pdf_folder, filename)
    try:
        raw_text = extract_text(file_path)

        if len(raw_text.strip()) >= 100:
            cleaned_text = clean_putusan_text(raw_text)

            # The output number follows the file's position in the sorted list,
            # so skipped or failed files leave gaps in the case_NNN sequence.
            txt_filename = f"case_{idx+1:03}.txt"
            with open(os.path.join(output_folder, txt_filename), "w", encoding="utf-8") as f:
                f.write(cleaned_text)

            log_entries.append(f"{filename} | OK | {len(cleaned_text)} chars")
        else:
            # Fewer than 100 non-blank characters: logged as skipped, no output file.
            log_entries.append(f"{filename} | SKIPPED | kosong atau tidak bisa dibaca")

    except Exception as e:
        # Best effort: record the failure and carry on with the next file.
        log_entries.append(f"{filename} | ERROR | {str(e)}")

# Write the log in one go, overwriting any log from a previous run.
with open(log_file, "w", encoding="utf-8") as f:
    f.write("\n".join(log_entries))

print("✔️ Proses selesai. File bersih dan log disimpan.")