# Tahap 1 – Membangun Case Base

### Seleksi & Unduh

In [None]:
# Mount Google Drive at /content/drive; the cells below read and write
# files under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install pandas requests beautifulsoup4 pdfminer.six lxml > /dev/null 2>&1

In [None]:
import argparse
import io
import os
import re
import threading
import time
import urllib
from concurrent.futures import ThreadPoolExecutor, wait
from datetime import date

import pandas as pd
import requests
from bs4 import BeautifulSoup
from pdfminer.high_level import extract_text

In [None]:
def create_path(folder_name):
    """Ensure ``folder_name`` exists under the current working directory.

    Args:
        folder_name: Folder name (or relative path) to create.

    Returns:
        The absolute path of the folder, built from ``os.getcwd()``.
    """
    path = os.path.join(os.getcwd(), folder_name)
    # exist_ok=True avoids the check-then-create race of a separate
    # os.path.exists() test.
    os.makedirs(path, exist_ok=True)
    return path

def open_page(link):
    """Fetch ``link`` and parse the response body with the ``lxml`` parser.

    The request is attempted up to three times, pausing five seconds between
    failed attempts.

    Args:
        link: URL to fetch.

    Returns:
        The parsed ``BeautifulSoup`` document, or ``None`` when every attempt
        failed. Callers must handle the ``None`` case.
    """
    max_attempts = 3
    for attempt in range(1, max_attempts + 1):
        try:
            # A timeout keeps a stalled connection from blocking a worker
            # thread forever.
            response = requests.get(link, timeout=30)
        except requests.RequestException as exc:
            print(f"Request failed ({attempt}/{max_attempts}) for {link}: {exc}")
            # No point sleeping after the final attempt.
            if attempt < max_attempts:
                time.sleep(5)
        else:
            return BeautifulSoup(response.text, "lxml")
    return None


def get_detail(soup, keyword):
    """Return the stripped text of the element following a labelled ``td``.

    The first ``td`` whose text contains ``keyword`` is located, and the text
    of the next element in the document is returned.

    NOTE(review): matching is by substring, so a short label such as "Amar"
    also matches cells like "Amar Lainnya"; the first match in document order
    wins. Confirm this is acceptable for the page layout.

    Args:
        soup: Parsed document or tag to search; ``None`` is tolerated.
        keyword: Label text to look for inside a ``td`` cell.

    Returns:
        The stripped text of the following element, or ``""`` when the label
        cell or its successor is missing.
    """
    if soup is None:
        return ""
    label_cell = soup.find(lambda tag: tag.name == "td" and keyword in tag.text)
    if label_cell is None:
        return ""
    value_node = label_cell.find_next()
    if value_node is None:
        return ""
    return value_node.get_text().strip()


def get_pdf(url, path_pdf):
    """Download ``url`` and save it into ``path_pdf``.

    The file is saved under the last path component of ``url``.

    Args:
        url: Address of the file to download.
        path_pdf: Existing directory in which the downloaded file is stored.

    Returns:
        A ``(buffer, file_name)`` tuple where ``buffer`` is an ``io.BytesIO``
        holding the downloaded bytes, or ``(None, None)`` when the download
        or the write to disk failed.
    """
    # Imported locally because a bare ``import urllib`` does not guarantee
    # that the ``urllib.request`` submodule has been loaded.
    import http.client
    import urllib.request

    try:
        # The context manager closes the connection even if read() fails.
        with urllib.request.urlopen(url, timeout=60) as response:
            file_content = response.read()
        file_name = os.path.basename(url)
        with open(os.path.join(path_pdf, file_name), "wb") as out_file:
            out_file.write(file_content)
    except (OSError, ValueError, http.client.HTTPException) as exc:
        # URLError/HTTPError and file-system errors are OSError subclasses;
        # ValueError covers malformed URLs.
        print(f"Failed to download {url}: {exc}")
        return None, None
    return io.BytesIO(file_content), file_name


def clean_text(text):
    """Remove the fixed watermark and disclaimer lines from extracted PDF text.

    Each boilerplate line is removed only where it appears verbatim,
    including its trailing newline; all other text is left untouched.

    Args:
        text: Text extracted from a PDF.

    Returns:
        ``text`` with every occurrence of the boilerplate lines removed.
    """
    boilerplate_lines = (
        "M a h ka m a h A g u n g R e p u blik In d o n esia\n",
        "Disclaimer\n",
        "Kepaniteraan Mahkamah Agung Republik Indonesia berusaha untuk selalu mencantumkan informasi paling kini dan akurat sebagai bentuk komitmen Mahkamah Agung untuk pelayanan publik, transparansi dan akuntabilitas\n",
        "pelaksanaan fungsi peradilan. Namun dalam hal-hal tertentu masih dimungkinkan terjadi permasalahan teknis terkait dengan akurasi dan keterkinian informasi yang kami sajikan, hal mana akan terus kami perbaiki dari waktu kewaktu.\n",
        "Dalam hal Anda menemukan inakurasi informasi yang termuat pada situs ini atau informasi yang seharusnya ada, namun belum tersedia, maka harap segera hubungi Kepaniteraan Mahkamah Agung RI melalui :\n",
        "Email : kepaniteraan@mahkamahagung.go.id    Telp : 021-384 3348 (ext.318)\n",
    )
    # Applied in the listed order, one after another.
    for fragment in boilerplate_lines:
        text = text.replace(fragment, "")
    return text


# extract_data runs on several worker threads that append to the same CSV;
# this lock keeps the "does the file exist?" check and the write atomic.
_CSV_WRITE_LOCK = threading.Lock()

# (CSV column, label searched for in the detail table), in output order.
_DETAIL_FIELDS = (
    ("nomor", "Nomor"),
    ("tingkat_proses", "Tingkat Proses"),
    ("klasifikasi", "Klasifikasi"),
    ("kata_kunci", "Kata Kunci"),
    ("tahun", "Tahun"),
    ("tanggal_register", "Tanggal Register"),
    ("lembaga_peradilan", "Lembaga Peradilan"),
    ("jenis_lembaga_peradilan", "Jenis Lembaga Peradilan"),
    ("hakim_ketua", "Hakim Ketua"),
    ("hakim_anggota", "Hakim Anggota"),
    ("panitera", "Panitera"),
    ("amar", "Amar"),
    ("amar_lainnya", "Amar Lainnya"),
    ("catatan_amar", "Catatan Amar"),
    ("tanggal_musyawarah", "Tanggal Musyawarah"),
    ("tanggal_dibacakan", "Tanggal Dibacakan"),
    ("kaidah", "Kaidah"),
    ("status", "Status"),
    ("abstrak", "Abstrak"),
)


def extract_data(link, keyword_url, path_output, path_pdf, today):
    """Scrape one decision page and append it as a row to the output CSV.

    The metadata table is read with ``get_detail``. The first link whose
    ``href`` contains ``/pdf/`` is downloaded with ``get_pdf``, and its text
    is extracted and passed through ``clean_text``. The row is appended to
    ``{path_output}/putusan_ma_{keyword_url}_{today}.csv``; the header is
    written only when the file does not exist yet.

    Unlike a page that cannot be scraped at all, a failed PDF download or
    text extraction still produces a row: ``link_pdf`` is kept whenever a
    link was found, and ``file_name_pdf`` whenever the file was saved.

    Args:
        link: URL of the decision detail page.
        keyword_url: Search keyword or full search URL; used in the CSV file
            name ("/" replaced by a space, blanked when it is an https URL).
        path_output: Directory that receives the CSV file.
        path_pdf: Directory that receives downloaded PDF files.
        today: Date string used in the CSV file name.

    Returns:
        None. Nothing is written when the page or its table is unavailable.
    """
    soup = open_page(link)
    if soup is None:
        print(f"Skipping {link}: page could not be loaded")
        return
    table = soup.find("table", {"class": "table"})
    if table is None:
        print(f"Skipping {link}: detail table not found")
        return

    heading = table.find("h2")
    row = {"judul": heading.text if heading is not None else ""}
    for column, label in _DETAIL_FIELDS:
        row[column] = get_detail(table, label)

    link_pdf = ""
    file_name_pdf = ""
    text_pdf = ""
    pdf_anchor = soup.find("a", href=re.compile(r"/pdf/"))
    if pdf_anchor is not None:
        link_pdf = pdf_anchor["href"]
        file_pdf, downloaded_name = get_pdf(link_pdf, path_pdf)
        if file_pdf is not None:
            file_name_pdf = downloaded_name
            try:
                text_pdf = clean_text(extract_text(file_pdf))
            except Exception as exc:
                # A PDF that cannot be parsed should not discard the
                # metadata row; report it and store empty text.
                print(f"Could not extract text from {link_pdf}: {exc}")

    row["link"] = link
    row["link_pdf"] = link_pdf
    row["file_name_pdf"] = file_name_pdf
    row["text_pdf"] = text_pdf
    # Dict insertion order defines the column order.
    result = pd.DataFrame([row], columns=list(row))

    keyword_url = keyword_url.replace("/", " ")
    if keyword_url.startswith("https"):
        keyword_url = ""
    destination = f"{path_output}/putusan_ma_{keyword_url}_{today}.csv"
    with _CSV_WRITE_LOCK:
        write_header = not os.path.isfile(destination)
        # Append mode creates the file when it is missing.
        result.to_csv(destination, mode="a", header=write_header, index=False)


def run_process(keyword_url, page, sort_date, path_output, path_pdf, today):
    """Scrape every decision linked from one page of search results.

    Args:
        keyword_url: Search keyword, or a full search URL starting with
            "https" to which the page number is appended.
        page: 1-based result page number.
        sort_date: When truthy, append ``&obf=TANGGAL_PUTUS&obm=desc`` to the
            search URL.
        path_output: Directory that receives the CSV file.
        path_pdf: Directory that receives downloaded PDF files.
        today: Date string passed through to ``extract_data``.

    Returns:
        None. A page that cannot be loaded is skipped, and a failure on one
        decision does not stop the remaining ones.
    """
    if keyword_url.startswith("https"):
        search_url = f"{keyword_url}&page={page}"
    else:
        search_url = f"https://putusan3.mahkamahagung.go.id/search.html?q={keyword_url}&page={page}"
    if sort_date:
        search_url = f"{search_url}&obf=TANGGAL_PUTUS&obm=desc"

    soup = open_page(search_url)
    if soup is None:
        print(f"Skipping result page {page}: {search_url} could not be loaded")
        return

    anchors = soup.find_all("a", {"href": re.compile("/direktori/putusan")})
    for anchor in anchors:
        detail_url = anchor["href"]
        try:
            extract_data(detail_url, keyword_url, path_output, path_pdf, today)
        except Exception as exc:
            # Worker boundary: report and carry on so one bad record does not
            # drop the rest of the page.
            print(f"Failed to process {detail_url}: {exc!r}")


def run_scraper(keyword=None, url=None, sort_date=True, download_pdf=True):
    """Scrape all result pages for a keyword or a prepared search URL.

    The first result page is loaded to find the number of pages, then every
    page is handed to ``run_process`` on a pool of four worker threads.

    Args:
        keyword: Search keyword; used when ``url`` is not given.
        url: Full search URL; takes precedence over ``keyword``.
        sort_date: Forwarded to ``run_process``.
        download_pdf: Accepted for compatibility. NOTE(review): this flag is
            not consulted anywhere in this function, so PDFs are always
            downloaded.

    Returns:
        None.
    """
    if not keyword and not url:
        print("Please provide a keyword or URL")
        return

    path_output = '/content/drive/MyDrive/Penalaran Komputer/CSV'
    path_pdf = '/content/drive/MyDrive/Penalaran Komputer/PDF'
    # Without these folders every download and CSV write would fail.
    os.makedirs(path_output, exist_ok=True)
    os.makedirs(path_pdf, exist_ok=True)
    today = date.today().strftime("%Y-%m-%d")

    link = url if url else f"https://putusan3.mahkamahagung.go.id/search.html?q={keyword}&page=1"

    soup = open_page(link)
    if soup is None:
        print(f"Could not load the first result page: {link}")
        return

    # Use the highest page number advertised by the pagination links; a
    # result set without pagination links is treated as a single page.
    page_numbers = [
        int(value)
        for value in (
            anchor.get("data-ci-pagination-page")
            for anchor in soup.find_all("a", {"class": "page-link"})
        )
        if value and str(value).isdigit()
    ]
    last_page = max(page_numbers, default=1)

    if url:
        print(f"Scraping with url: {url} - {20 * last_page} data - {last_page} page")
    else:
        print(f"Scraping with keyword: {keyword} - {20 * last_page} data - {last_page} page")

    keyword_url = url if url else keyword

    # Leaving the ``with`` block waits for every submitted task to finish.
    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = [
            executor.submit(run_process, keyword_url, page, sort_date, path_output, path_pdf, today)
            for page in range(1, last_page + 1)
        ]

    # Report worker failures instead of discarding them.
    for future in futures:
        error = future.exception()
        if error is not None:
            print(f"A page worker failed: {error!r}")


In [None]:
# Download decisions in the TUN classification from the PTUN Bandung court
# (the search URL also filters on t_put=2021).
# NOTE(review): the original comment labelled these "Pidana" (criminal) while
# also naming the TUN classification — confirm which category is intended.
run_scraper(url="https://putusan3.mahkamahagung.go.id/search.html?q=&jenis_doc=putusan&cat=6ccf9ad6c4d6e9ab8fbc1f8b252cec81&jd=&tp=0&court=531823PTUN414+++++++++++++++++++&t_put=2021&t_reg=&t_upl=&t_pr=")

Scraping with url: https://putusan3.mahkamahagung.go.id/search.html?q=&jenis_doc=putusan&cat=6ccf9ad6c4d6e9ab8fbc1f8b252cec81&jd=&tp=0&court=531823PTUN414+++++++++++++++++++&t_put=2021&t_reg=&t_upl=&t_pr= - 140 data - 7 page


### Konversi & Ekstraksi Teks

In [None]:
!pip install pdfminer.six



In [None]:
import os

# Folder holding the downloaded PDFs, and folder that receives the raw .txt
# conversions.
pdf_folder = '/content/drive/MyDrive/Penalaran Komputer/PDF'
output_folder = '/content/drive/MyDrive/Penalaran Komputer/data/raw'

# Create the output folder if it does not exist yet.
os.makedirs(output_folder, exist_ok=True)

In [None]:
from pdfminer.high_level import extract_text

def convert_pdf_to_txt(pdf_path, txt_path):
    """Extract the text of one PDF and write it to a UTF-8 text file.

    Success and failure are both reported with ``print``; exceptions are not
    propagated to the caller.

    Args:
        pdf_path: Path of the PDF to read.
        txt_path: Path of the text file to write.
    """
    try:
        extracted = extract_text(pdf_path)
        with open(txt_path, mode='w', encoding='utf-8') as sink:
            sink.write(extracted)
        print(f'✅ Berhasil konversi: {txt_path}')
    except Exception as err:
        print(f'❌ Gagal konversi {pdf_path}: {err}')

In [None]:
# Number the cases over PDF files only, so that any non-PDF entry in the
# folder cannot leave gaps in the case_NNN sequence.
pdf_filenames = [name for name in sorted(os.listdir(pdf_folder)) if name.endswith('.pdf')]
for case_number, filename in enumerate(pdf_filenames, start=1):
    pdf_path = os.path.join(pdf_folder, filename)
    txt_filename = f'case_{case_number:03}.txt'
    txt_path = os.path.join(output_folder, txt_filename)
    convert_pdf_to_txt(pdf_path, txt_path)

✅ Berhasil konversi: /content/drive/MyDrive/Penalaran Komputer/data/raw/case_001.txt
✅ Berhasil konversi: /content/drive/MyDrive/Penalaran Komputer/data/raw/case_002.txt
✅ Berhasil konversi: /content/drive/MyDrive/Penalaran Komputer/data/raw/case_003.txt
✅ Berhasil konversi: /content/drive/MyDrive/Penalaran Komputer/data/raw/case_004.txt
✅ Berhasil konversi: /content/drive/MyDrive/Penalaran Komputer/data/raw/case_005.txt
✅ Berhasil konversi: /content/drive/MyDrive/Penalaran Komputer/data/raw/case_006.txt
✅ Berhasil konversi: /content/drive/MyDrive/Penalaran Komputer/data/raw/case_007.txt
✅ Berhasil konversi: /content/drive/MyDrive/Penalaran Komputer/data/raw/case_008.txt
✅ Berhasil konversi: /content/drive/MyDrive/Penalaran Komputer/data/raw/case_009.txt
✅ Berhasil konversi: /content/drive/MyDrive/Penalaran Komputer/data/raw/case_010.txt
✅ Berhasil konversi: /content/drive/MyDrive/Penalaran Komputer/data/raw/case_011.txt
✅ Berhasil konversi: /content/drive/MyDrive/Penalaran Komputer/da

### Pembersihan

In [None]:
import os
import re

# Text-cleaning function
def bersihkan_teks(teks):
    """Strip site boilerplate and page markers, then normalise the text.

    Removes the court site address, the disclaimer passage, the registrar
    e-mail address, phone fragments and page markers, collapses every run of
    whitespace to a single space, and lower-cases the result.

    Args:
        teks: Raw text of one document.

    Returns:
        The cleaned, lower-cased, whitespace-normalised text.
    """
    teks = re.sub(r'putusan\.?mahkamahagung\.?go\.?id', '', teks, flags=re.IGNORECASE)
    teks = re.sub(r'disclaimer.*?kami sajikan.*?(?=\s|$)', '', teks, flags=re.IGNORECASE | re.DOTALL)
    teks = re.sub(r'kepaniteraan@mahkamahagung\.?go\.?id', '', teks, flags=re.IGNORECASE)
    teks = re.sub(r'telp.*?\d{3}.*?(?=\s|$)', '', teks, flags=re.IGNORECASE)
    # "N dari M halaman" must be removed before the generic "halaman"
    # patterns: once those have run, no "halaman" is left for it to match and
    # "dari M" would remain in the text.
    teks = re.sub(r'\d+\s*dari\s*\d+\s*halaman', '', teks, flags=re.IGNORECASE)
    teks = re.sub(r'\bhalaman\s*\d+\b', '', teks, flags=re.IGNORECASE)
    teks = re.sub(r'\bhalaman\b', '', teks, flags=re.IGNORECASE)
    teks = re.sub(r'\s+', ' ', teks)
    return teks.lower().strip()

# Input and output folders
input_folder = '/content/drive/MyDrive/Penalaran Komputer/data/raw'
output_folder = '/content/drive/MyDrive/Penalaran Komputer/data_clean/raw'
os.makedirs(output_folder, exist_ok=True)

# Clean every .txt file in the input folder and save it under the same name
# in the output folder.
for filename in sorted(os.listdir(input_folder)):
    if not filename.endswith('.txt'):
        continue

    input_path = os.path.join(input_folder, filename)
    output_path = os.path.join(output_folder, filename)

    with open(input_path, 'r', encoding='utf-8') as f:
        teks = f.read()

    teks_bersih = bersihkan_teks(teks)

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(teks_bersih)

    print(f"✅ Dibersihkan dan disimpan: {filename}")

✅ Dibersihkan dan disimpan: case_001.txt
✅ Dibersihkan dan disimpan: case_002.txt
✅ Dibersihkan dan disimpan: case_003.txt
✅ Dibersihkan dan disimpan: case_004.txt
✅ Dibersihkan dan disimpan: case_005.txt
✅ Dibersihkan dan disimpan: case_006.txt
✅ Dibersihkan dan disimpan: case_007.txt
✅ Dibersihkan dan disimpan: case_008.txt
✅ Dibersihkan dan disimpan: case_009.txt
✅ Dibersihkan dan disimpan: case_010.txt
✅ Dibersihkan dan disimpan: case_011.txt
✅ Dibersihkan dan disimpan: case_012.txt
✅ Dibersihkan dan disimpan: case_013.txt
✅ Dibersihkan dan disimpan: case_014.txt
✅ Dibersihkan dan disimpan: case_015.txt
✅ Dibersihkan dan disimpan: case_016.txt
✅ Dibersihkan dan disimpan: case_017.txt
✅ Dibersihkan dan disimpan: case_018.txt
✅ Dibersihkan dan disimpan: case_019.txt
✅ Dibersihkan dan disimpan: case_020.txt
✅ Dibersihkan dan disimpan: case_021.txt
✅ Dibersihkan dan disimpan: case_022.txt
✅ Dibersihkan dan disimpan: case_023.txt
✅ Dibersihkan dan disimpan: case_024.txt
✅ Dibersihkan da

### Validasi

In [None]:
import os
import re
from datetime import datetime

# Folder and log paths
raw_folder = '/content/drive/MyDrive/Penalaran Komputer/data/raw'
clean_folder = '/content/drive/MyDrive/Penalaran Komputer/data_clean/raw'
log_path = '/content/drive/MyDrive/Penalaran Komputer/logs/cleaning.log'

# Create the log folder if it does not exist yet
os.makedirs(os.path.dirname(log_path), exist_ok=True)

# Write one log line per raw file, comparing its length with the cleaned file.
with open(log_path, 'w', encoding='utf-8') as log_file:
    log_file.write(f"Log Validasi Pembersihan Teks\nWaktu: {datetime.now()}\n\n")
    log_file.write(f"{'File':<15} {'Original':<10} {'Cleaned':<10} {'Persen':<10} {'Status'}\n")
    log_file.write('-'*60 + '\n')

    for filename in sorted(os.listdir(raw_folder)):
        if filename.endswith('.txt'):
            raw_path = os.path.join(raw_folder, filename)
            clean_path = os.path.join(clean_folder, filename)

            try:
                with open(raw_path, 'r', encoding='utf-8') as f:
                    raw_text = f.read()
                # Named cleaned_text rather than clean_text so this
                # module-level variable does not overwrite the notebook's
                # clean_text() function used by the scraper.
                with open(clean_path, 'r', encoding='utf-8') as f:
                    cleaned_text = f.read()

                raw_len = len(raw_text)
                clean_len = len(cleaned_text)
                # Share of the raw length that survived cleaning; 0 for an
                # empty raw file.
                percent = (clean_len / raw_len) * 100 if raw_len > 0 else 0

                # Files that kept less than 80% of their length are flagged.
                status = "✅ OK" if percent >= 80 else "⚠️ Kurang"

                log_file.write(f"{filename:<15} {raw_len:<10} {clean_len:<10} {percent:<9.2f}% {status}\n")

            except Exception as e:
                # A missing or unreadable file is recorded in the log rather
                # than stopping the validation run.
                log_file.write(f"{filename:<15} ERROR saat membaca: {e}\n")