# Tahap 2 – Case Representation

### Ekstraksi Metadata

In [None]:
import os
import re
import pandas as pd

# Input folder (cleaned raw .txt decisions) and output CSV path
input_folder = '/content/drive/MyDrive/Penalaran Komputer/data_clean/raw'
output_csv = '/content/drive/MyDrive/Penalaran Komputer/data/processed/metadata_only.csv'
# Create the output directory up front; exist_ok avoids an error on re-runs
os.makedirs(os.path.dirname(output_csv), exist_ok=True)

# Metadata extraction function
def extract_metadata(text):
    """Extract basic metadata from the full text of a court decision.

    Args:
        text: Full decision text as a single string.

    Returns:
        A tuple ``(no_perkara, tanggal, jenis, pasal, pihak)`` of strings.
        Each element is ``''`` when the corresponding field is not found.
        ``pasal`` is a comma-separated list of the distinct article numbers
        in numeric order; ``pihak`` has the form ``"<A> vs. <B>"``.
    """
    # Case number: match "nomor"/"no" only as a standalone token so that the
    # letters "no" inside other words (e.g. "ekonomi") are not mistaken for
    # the label, and swallow the dot of the abbreviation "No.".
    match_no = re.search(
        r'\b(?:nomor|no)(?![a-z])\.?[\s:]*([^\n;,]*)', text, re.IGNORECASE
    )
    no_perkara = match_no.group(1).strip() if match_no else ''

    # Date (format: 12 Januari 2023)
    match_tgl = re.search(r'(\d{1,2} [a-zA-Z]+ \d{4})', text)
    tanggal = match_tgl.group(1) if match_tgl else ''

    # Case type: keyword lookup, case-insensitive like the other patterns so
    # "Pidana" / "PIDANA" are recognised as well. "pidana" takes priority.
    lowered = text.lower()
    if 'pidana' in lowered:
        jenis = 'Pidana'
    elif 'perdata' in lowered:
        jenis = 'Perdata'
    else:
        jenis = ''

    # Articles mentioned: de-duplicate, then order by the numeric part so that
    # "2" comes before "10" (plain string sorting would put "10" first).
    pasal_found = re.findall(r'pasal[\s:]+(\d+[a-z]*)', text, re.IGNORECASE)
    pasal = ', '.join(
        sorted(set(pasal_found), key=lambda p: (int(re.match(r'\d+', p).group()), p))
    )

    # Parties: "antara <A> melawan <B>" terminated by '.', ',' or a newline
    pihak = ''
    match_pihak = re.search(r'antara\s+(.*?)\s+melawan\s+(.*?)[\.,\n]', text, re.IGNORECASE)
    if match_pihak:
        pihak = f"{match_pihak.group(1).strip()} vs. {match_pihak.group(2).strip()}"

    return no_perkara, tanggal, jenis, pasal, pihak

# Process every .txt file in the input folder.
# Filter before numbering so that case_id counts only the files actually
# processed; numbering all directory entries would leave gaps whenever the
# folder contains non-.txt files.
txt_files = sorted(
    name for name in os.listdir(input_folder) if name.endswith('.txt')
)

# Explicit column order: guarantees the CSV always has a header row, even when
# no .txt file is found (a column-less CSV cannot be parsed by pd.read_csv).
columns = ['case_id', 'no_perkara', 'tanggal', 'jenis_perkara', 'pasal', 'pihak', 'text_full']
rows = []

for case_id, filename in enumerate(txt_files, start=1):
    file_path = os.path.join(input_folder, filename)
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()

    no_perkara, tanggal, jenis, pasal, pihak = extract_metadata(text)

    rows.append({
        'case_id': case_id,
        'no_perkara': no_perkara,
        'tanggal': tanggal,
        'jenis_perkara': jenis,
        'pasal': pasal,
        'pihak': pihak,
        'text_full': text,
    })

# Save to CSV
df = pd.DataFrame(rows, columns=columns)
df.to_csv(output_csv, index=False)
print(f'✅ Metadata disimpan ke: {output_csv}')

✅ Metadata disimpan ke: /content/drive/MyDrive/Penalaran Komputer/data/processed/metadata_only.csv


### Ekstraksi Konten Kunci

In [None]:
import pandas as pd
import re
import os

# Input/output paths: read the metadata CSV, write the enriched cases CSV
input_csv = '/content/drive/MyDrive/Penalaran Komputer/data/processed/metadata_only.csv'
output_csv = '/content/drive/MyDrive/Penalaran Komputer/data/processed/cases.csv'
# Create the output directory up front; exist_ok avoids an error on re-runs
os.makedirs(os.path.dirname(output_csv), exist_ok=True)

# Load the data
df = pd.read_csv(input_csv)

# Fact-summary extraction
def extract_ringkasan_fakta(text):
    """Return a short fact summary taken from the decision text.

    The patterns are tried in order of preference; the first one that matches
    wins. Each captures 50-1000 characters (without newlines) up to the
    nearest following period, case-insensitively. Returns ``''`` when nothing
    matches.
    """
    candidate_patterns = (
        r'menimbang\s+bahwa\s+(.{50,1000}?)\.',  # preferred: "menimbang bahwa ..."
        r'bahwa\s+(.{50,1000}?)\.',              # fallback: any "bahwa ..."
    )
    for pattern in candidate_patterns:
        found = re.search(pattern, text, re.IGNORECASE)
        if found is not None:
            return found.group(1).strip()
    return ''

# Main legal-argument extraction
def extract_argumen_hukum(text):
    """Return the main legal argument found in the decision text.

    The patterns are tried in priority order and the first match is returned
    (stripped). Matching is case-insensitive and each capture ends at the
    nearest following period on the same line. Returns ``''`` when no pattern
    matches.
    """
    prioritized_patterns = (
        r'memutuskan\s+(.{50,1000}?)\.',      # 1. the ruling itself
        r'dasarkan\s+pasal\s+(.{20,500}?)\.',  # 2. "...dasarkan pasal ..." reference
        r'menyatakan\s+(.{50,1000}?)\.',      # 3. a declaration
    )
    for pattern in prioritized_patterns:
        found = re.search(pattern, text, re.IGNORECASE)
        if found is not None:
            return found.group(1).strip()
    return ''

# Apply both extractors to every case text (missing text is treated as '')
case_texts = df['text_full'].fillna('')
for column_name, extractor in (
    ('ringkasan_fakta', extract_ringkasan_fakta),
    ('argumen_hukum', extract_argumen_hukum),
):
    df[column_name] = case_texts.apply(extractor)

# Write the final output file
df.to_csv(output_csv, index=False)
print(f'✅ Ekstraksi konten kunci selesai. Disimpan ke:\n{output_csv}')

✅ Ekstraksi konten kunci selesai. Disimpan ke:
/content/drive/MyDrive/Penalaran Komputer/data/processed/cases.csv


### Feature Engineering

In [None]:
import pandas as pd
import re
import os

# Input/output paths: read the cases CSV, write the CSV with added features
input_csv = '/content/drive/MyDrive/Penalaran Komputer/data/processed/cases.csv'
output_csv = '/content/drive/MyDrive/Penalaran Komputer/data/processed/cases_features.csv'
# Create the output directory up front; exist_ok avoids an error on re-runs
os.makedirs(os.path.dirname(output_csv), exist_ok=True)

# Load the data
df = pd.read_csv(input_csv)

# Feature 1: text length as a whitespace-separated word count
# (missing text is replaced by '' first, which yields a count of 0)
df['length'] = df['text_full'].fillna('').apply(lambda x: len(x.split()))

# Feature 2: bag-of-words keywords (manually curated keyword list)
keywords = ['wanprestasi', 'gugatan', 'penggugat', 'tergugat', 'putusan', 'perjanjian', 'pidana', 'narkotika', 'cerai']


def extract_keywords(text):
    """Return the tracked keywords occurring in ``text`` as a ', '-joined string.

    Matching is a case-insensitive substring test, and the result keeps the
    order of the ``keywords`` list. Returns ``''`` when none occur.
    """
    lowered = text.lower()
    hits = []
    for keyword in keywords:
        if keyword in lowered:
            hits.append(keyword)
    return ', '.join(hits)
# Store the matched keywords per case (missing text is treated as '')
df['bow_keywords'] = df['text_full'].fillna('').apply(extract_keywords)

# Feature 3: estimated number of QA pairs
def count_qa_pairs(text):
    """Count whole-word occurrences of question/reasoning cue words in ``text``.

    The text is lower-cased before matching, so the count is
    case-insensitive. The cue words are those listed in the pattern below.
    """
    cue_regex = r'\b(apa|siapa|mengapa|bagaimana|karena|maka|oleh karena)\b'
    return sum(1 for _ in re.finditer(cue_regex, text.lower()))
# Store the cue-word count per case (missing text is treated as '')
df['qa_pairs_count'] = df['text_full'].fillna('').apply(count_qa_pairs)

# Save to a new file
df.to_csv(output_csv, index=False)
print(f'✅ Feature Engineering selesai.\nDisimpan ke: {output_csv}')

✅ Feature Engineering selesai.
Disimpan ke: /content/drive/MyDrive/Penalaran Komputer/data/processed/cases_features.csv
