In [7]:
import re
import csv
from PyPDF2 import PdfReader

def extract_arret_details(text):
    """Pull the ruling number, date and case reference out of a block of text.

    Each field is located with its own case-insensitive regular expression;
    only the first occurrence of each pattern in ``text`` is used.

    Args:
        text: The text to scan (e.g. the extracted text of one PDF page).

    Returns:
        A dict with the keys "Arrest Number", "Date", "Reference Number" and
        "Keywords". A field whose pattern does not match is ``None``;
        "Keywords" is always ``None`` because keyword extraction is skipped.
    """
    # Output field -> pattern whose first capture group holds the value.
    field_patterns = (
        ("Arrest Number", r'Arrêt\s*N[°º]\s*(\d+|NEANT)'),
        ("Date", r'du\s+([0-9]{1,2}\s+[a-zéû]+(?:\s+[0-9]{4})?)'),
        ("Reference Number", r'Dossier\s*N[°º]\s*([^\s]+(?:-\S+)?)'),
    )

    details = {}
    for field, pattern in field_patterns:
        found = re.search(pattern, text, flags=re.IGNORECASE)
        if found is None:
            details[field] = None
        else:
            details[field] = found.group(1)

    # Keywords are intentionally not extracted; the column is kept as a placeholder.
    details["Keywords"] = None
    return details

def process_pdf(pdf_path, output_csv):
    """Extract ruling details from every matching PDF page and write them to a CSV.

    Each page whose extracted text contains the exact-case substring "Arrêt"
    is passed to ``extract_arret_details``; one CSV row is written per such
    page, in page order. The output file is created or overwritten and is
    written as UTF-8 with a header row.

    Args:
        pdf_path: Path of the PDF file to read.
        output_csv: Path of the CSV file to write.

    Returns:
        None. A confirmation message is printed once the CSV has been written.
    """
    reader = PdfReader(pdf_path)
    extracted_data = []

    # Iterate the pages directly instead of indexing by page number.
    for page in reader.pages:
        # Defensive: treat a missing/empty extraction result as an empty
        # string so the membership test below cannot fail on a non-string.
        text = page.extract_text() or ""

        # Only pages that mention "Arrêt" are expected to carry a ruling header.
        if "Arrêt" in text:
            extracted_data.append(extract_arret_details(text))

    fieldnames = ["Arrest Number", "Date", "Reference Number", "Keywords"]
    # newline='' lets the csv module control line endings itself.
    with open(output_csv, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(extracted_data)

    print(f"Extracted data saved to {output_csv}")

# Input PDF to scan and the CSV file the extracted rows are written to.
# Both paths are hard-coded; edit them to point at your own files.
pdf_path = 'D:/all-arrest-2009.docx.pdf'
output_csv = 'metadata.csv'

# Runs at module level: executing this file/cell processes the PDF immediately.
process_pdf(pdf_path, output_csv)

Extracted data saved to metadata.csv
