In [None]:
import os
import csv
import re
import json

In [None]:
# Load keywords from a file
def load_keywords_from_file(file_path):
    """Parse the keyword definitions stored as JSON in *file_path*.

    The file is opened as UTF-8 text and decoded with ``json.load``; the
    decoded object is returned unchanged.
    """
    with open(file_path, encoding='utf-8') as handle:
        return json.load(handle)

# Extract "No Putusan"
def extract_no_putusan(content):
    """Return the first decision-number string found in *content*, or None.

    The match starts at the word "Nomor", allows an optional ':' or '-',
    then expects ``digits/letters-or-dots/digits/`` followed by one to seven
    letters, whitespace characters or hyphens.
    """
    found = re.search(
        r"(Nomor\s*[:\-]?\s*\d+/[A-Za-z.]+/\d+/[A-Za-z\s\-]{1,7})",
        content,
    )
    if found is None:
        return None
    return found.group(1)

# Extract "Lembaga Peradilan"
def extract_lembaga_peradilan(content):
    """Return the first court name found in *content*, or None.

    A court name is "Pengadilan" followed by "Negeri", "Agama" or "Tinggi",
    an optional "di", and one further word.
    """
    court_pattern = r"(Pengadilan\s*(Negeri|Agama|Tinggi)(\s*di)?\s*\w+)(?!\w)"
    found = re.search(court_pattern, content)
    if found is None:
        return None
    # Group 1 is the whole court name; the inner groups are only structural.
    return found.group(1)

# Extract data from content using keywords
def extract_data_from_content(content, keywords):
    """Collect the paragraph that follows the first hit of each keyword.

    For every keyword, the first case-insensitive literal occurrence in
    *content* is located and the text from that position up to the next
    blank line (two consecutive newlines) is taken. While the collected text
    holds fewer than five periods, the following paragraph is appended
    (stripped, with no separator) until either five periods are reached or
    the end of *content* is hit.

    A keyword is skipped when its start, or the end of its first paragraph,
    falls inside a range already collected for an earlier keyword.

    Args:
        content: Full text to search.
        keywords: Iterable of literal keyword strings.

    Returns:
        The collected paragraphs joined by a blank line, or None when
        nothing was collected.
    """
    extracted_texts = []
    extracted_ranges = []
    content_length = len(content)

    for keyword in keywords:
        match = re.search(re.escape(keyword), content, re.IGNORECASE)
        if match is None:
            continue

        start_index = match.start()
        end_index = content.find('\n\n', start_index)
        if end_index == -1:
            end_index = content_length

        overlaps = any(
            start <= start_index <= end or start <= end_index <= end
            for start, end in extracted_ranges
        )
        if overlaps:
            continue

        extracted_paragraph = content[start_index:end_index].strip()

        # Stop extending once the end of the content is reached: without this
        # bound a short tail (fewer than five periods) would append empty
        # slices forever and never terminate.
        while extracted_paragraph.count('.') < 5 and end_index < content_length:
            next_end_index = content.find('\n\n', end_index + 2)
            if next_end_index == -1:
                next_end_index = content_length
            extracted_paragraph += content[end_index:next_end_index].strip()
            end_index = next_end_index

        extracted_texts.append(extracted_paragraph)
        extracted_ranges.append((start_index, end_index))

    consolidated_text = "\n\n".join(extracted_texts)
    return consolidated_text or None

# Extract paragraph related to 'wanprestasi'
def extract_wanprestasi_paragraph(content):
    """Return the paragraph stating that the defendant committed wanprestasi.

    The match starts at "Tergugat telah melakukan [perbuatan] wanprestasi"
    and runs up to, but not including, the next blank line or the end of
    *content*. Returns None when the phrase is not present.
    """
    # Whitespace between "melakukan" and the following word is optional, so
    # both the normally spaced phrase and the glued form are accepted.
    # DOTALL lets the paragraph body span single line breaks before the
    # blank-line / end-of-text terminator.
    pattern = (
        r"(Tergugat\s+telah\s+melakukan\s*(?:perbuatan\s+)?wanprestasi.*?)"
        r"(?:\n\n|\Z)"
    )
    match = re.search(pattern, content, re.DOTALL)
    return match.group(1) if match else None

# Extract data from a file
def extract_data_from_file(file_path):
    """Read one UTF-8 text file and extract every configured category.

    Returns a dict with one entry per key of the module-level ``KEYWORDS``
    mapping plus the four dedicated fields below. Fields that could not be
    extracted hold None.
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        content = source.read()

    # Seed every keyword category first so its position in the result is
    # fixed by the order of KEYWORDS.
    extracted_data = dict.fromkeys(KEYWORDS.keys())

    # These four fields have dedicated extraction and are not driven by the
    # generic per-category loop.
    extracted_data["No Putusan"] = extract_no_putusan(content)
    extracted_data["Lembaga Peradilan"] = extract_lembaga_peradilan(content)
    extracted_data["Perihal Gugatan"] = extract_wanprestasi_paragraph(content)
    extracted_data["Identitas Para Pihak"] = extract_data_from_content(
        content, KEYWORDS["Identitas Para Pihak"]
    )

    dedicated_fields = [
        "No Putusan",
        "Lembaga Peradilan",
        "Perihal Gugatan",
        "Identitas Para Pihak",
    ]
    for category, keywords in KEYWORDS.items():
        if category in dedicated_fields:
            continue
        extracted_data[category] = extract_data_from_content(content, keywords)

    return extracted_data

# Clean illegal characters for Excel
def clean_illegal_characters(data):
    """Return a copy of *data* with control characters replaced by spaces.

    Every truthy value has each character in the ranges 0x00-0x1F and 0x7F
    replaced by a single space; falsy values (None, empty string) are
    copied through untouched.
    """
    control_chars = re.compile(r'[\x00-\x1F\x7F]')
    cleaned_data = {}
    for key, value in data.items():
        if not value:
            cleaned_data[key] = value
        else:
            cleaned_data[key] = control_chars.sub(' ', value)
    return cleaned_data

# Save data to an Excel file
def save_to_excel(data, output_file):
    """Write the extracted data to an Excel workbook, one row per file.

    Args:
        data: Mapping of file name to that file's dict of extracted fields.
        output_file: Destination passed to ``DataFrame.to_excel``.

    Control characters are replaced by spaces in every field before writing,
    and missing values are written as the string "False".
    """
    # pandas is imported here because the file's import section does not
    # bring `pd` into scope; without this the function raises NameError.
    import pandas as pd

    cleaned_data = {
        filename: clean_illegal_characters(file_data)
        for filename, file_data in data.items()
    }
    # Outer keys become columns, so transpose to get one row per file.
    df = pd.DataFrame(cleaned_data).transpose()
    df = df.fillna("False")
    df.to_excel(output_file, index_label="File Name")

# Read files from a folder and extract data to Excel
def read_folder_and_extract_data(folder_path, output_file_path):
    """Extract data from every ``.txt`` file in a folder and save it to Excel.

    Files are taken in the order ``os.listdir`` returns them; entries whose
    name does not end in ``.txt`` are ignored. The combined result, keyed by
    file name, is handed to ``save_to_excel``.
    """
    text_file_names = [
        entry for entry in os.listdir(folder_path) if entry.endswith('.txt')
    ]

    results_by_file = {}
    for entry in text_file_names:
        full_path = os.path.join(folder_path, entry)
        results_by_file[entry] = extract_data_from_file(full_path)

    save_to_excel(results_by_file, output_file_path)

# Run the extraction pipeline at module level.
# KEYWORDS is read as a global by extract_data_from_file, so it must be
# assigned before read_folder_and_extract_data is called.
# The keyword file is parsed with json.load despite its .txt extension.
KEYWORDS = load_keywords_from_file('keywords.txt')
# Folder scanned for .txt input files.
source_folder = 'putusan/'
# Excel workbook that receives one row per processed file.
destination_file = 'Data.xlsx'
read_folder_and_extract_data(source_folder, destination_file)