## LLM Knowledge Extraction

### **Main Objective**
To leverage Generative AI (Google Gemini) to "read" the unstructured legal opinions and extract structured entities: **Offense**, **Punishment**, and **Appellate Decision**.

### **Technical Logic**
This script acts as the "intelligence layer" of the pipeline, converting raw text into queryable data:

* **Batch Processing & Context Management:** It processes cases in groups (batches of 10) to maximize API throughput and reduce latency. Crucially, it implements **text truncation** (`head_matter` to 4000 chars, `opinions` to 12000 chars) to ensure the input fits within the LLM's context window (token limit) while preserving the most critical initial sections of the judgment.
* **Fault Tolerance & Rate Limiting:** It features a robust network layer with a **retry mechanism** using exponential backoff. This automatically handles API quotas (HTTP 429 Rate Limits) and temporary server instability (HTTP 503) without crashing the pipeline. It also includes "resume capability," skipping IDs that have already been processed.
* **Hybrid Output Parsing:** Recognizing that LLMs can be unpredictable (sometimes adding conversational text around the requested JSON), the script uses a **dual-parsing strategy**: it first attempts standard JSON decoding, and if that fails, it falls back to **Regular Expressions (Regex)** to surgically extract valid JSON arrays or objects from the raw text response.

In [None]:
import pandas as pd
import json
import requests
from pprint import pprint
import os
import time
import random
import re

# --- JSON extraction helpers ---
def extract_json_array(text):
    """Try to extract a full JSON array from text using regex."""
    match = re.search(r'\[.*\]', text, re.DOTALL)
    if match:
        try:
            return json.loads(match.group(0))
        except json.JSONDecodeError:
            return None
    return None

def extract_json_objects(text):
    """Extract individual JSON objects from text using regex."""
    matches = re.findall(r'\{.*?\}', text, re.DOTALL)
    objs = []
    for m in matches:
        try:
            objs.append(json.loads(m))
        except json.JSONDecodeError:
            continue
    return objs

# --- Load API key ---
api_key = os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY")

if not api_key:
    key_file = "key.txt"
    try:
        with open(key_file, "r", encoding="utf-8") as f:
            api_key = f.read().strip()
            print(f" API Key loaded successfully from {key_file}.")
    except FileNotFoundError:
        print(f" ERROR: API Key not found. Please set GEMINI_API_KEY or create '{key_file}'.")
        api_key = ""
else:
    print(" API Key loaded successfully from environment variables.")

# --- Gemini API Config ---
GEMINI_MODEL = "gemini-2.5-flash-preview-09-2025" 
API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/{GEMINI_MODEL}:generateContent?key={api_key}"

HEADERS = {
    'Content-Type': 'application/json',
}

# --- Configuration (TEXAS) ---
csv_file = "Test/case_law_csv/case_texts.csv" 
output_file = "case_details_tx_with_llm.json"
prompt_template_file = "prompt_example.txt"

batch_size = 1
save_interval = 5 

# --- Load CSV ---
print(f"Loading cases from {csv_file}...")
if not os.path.exists(csv_file):
    print(f"ERROR: File {csv_file} not found. Did you run json_to_csv_parser_tx.py?")
    exit()

df = pd.read_csv(csv_file)

# --- Load prompt template ---
if not os.path.exists(prompt_template_file):
    print(f"ERROR: {prompt_template_file} not found.")
    exit()

with open(prompt_template_file, "r", encoding="utf-8") as f:
    prompt_template = f.read()

# --- Load existing results (Resume logic) ---
if os.path.exists(output_file):
    try:
        with open(output_file, "r", encoding="utf-8") as f:
            results = json.load(f)
        print(f" Loaded {len(results)} existing results from {output_file}")
    except json.JSONDecodeError:
        print(f" Could not load existing JSON from {output_file}. Starting fresh.")
        results = []
else:
    results = []

# ============================================================
# MODIFICA PER RIPARAZIONE: Rimuoviamo i casi falliti (Skipped)
# ============================================================
initial_count = len(results)
# Teniamo solo i risultati che hanno dati validi O che hanno un raw_output (quindi non sono stati skippati)
results = [r for r in results if r.get("offense") is not None or r.get("raw_output") is not None]
removed_count = initial_count - len(results)

if removed_count > 0:
    print(f"     REPAIR MODE: Removed {removed_count} failed cases (None/Skipped) from memory.")
    print(f"     They will be re-processed in this run.")
# ============================================================

processed_ids = {int(r.get("id")) for r in results if r.get("id") is not None}

# --- Prepare rows to process ---
if 'id' not in df.columns:
    raise ValueError("CSV file must contain an 'id' column.")

# Convert ID column to numeric for comparison
df['id_numeric'] = pd.to_numeric(df['id'], errors='coerce').fillna(-1).astype(int)

# Filtra le righe già processate
rows_to_process = [row for _, row in df.iterrows() if row["id_numeric"] not in processed_ids and row["id_numeric"] != -1]

# CALCOLO TOTAL BATCHES
total_batches = (len(rows_to_process) + batch_size - 1) // batch_size

print(f"Found {len(rows_to_process)} cases to process in {total_batches} batches.")

# --- Loop over batches ---
for batch_idx in range(total_batches):
    batch_rows = rows_to_process[batch_idx * batch_size : (batch_idx + 1) * batch_size]
    print(f"\n=== Processing batch {batch_idx + 1}/{total_batches} ({len(batch_rows)} cases) ===")

    # Build a single prompt for the batch
    batch_prompts = []
    for row in batch_rows:
        # OTTIMIZZAZIONE: Troncamento del testo
        raw_head = str(row["head_matter"]) if pd.notna(row["head_matter"]) else ""
        raw_opinions = str(row["opinions"]) if pd.notna(row["opinions"]) else ""

        head_matter_full = raw_head[:4000] 
        opinions_full = raw_opinions[:12000]

        case_prompt = prompt_template.format(
            id=row["id"],
            head_matter=head_matter_full,
            opinions=opinions_full
        )
        batch_prompts.append(case_prompt)

    system_instruction = (
        "You are a legal assistant. Extract offense, punishment, and decision from court cases. "
        "Your output MUST be ONLY a valid JSON array of objects with keys: id, offense, punishment, decision. "
        "No commentary, no extra text."
    )

    user_query = "Process the following cases and return the structured JSON:\n\n" + "\n\n".join(batch_prompts)
    
    # Debug della lunghezza del prompt
    query_length = len(user_query)
    print(f"DEBUG: Total prompt length for this batch: {query_length} characters.")

    # Construct the Gemini payload
    # ... dentro il ciclo for ...

    # Construct the Gemini payload CON SAFETY SETTINGS
    payload = {
        "contents": [{"parts": [{"text": user_query}]}],
        "systemInstruction": {"parts": [{"text": system_instruction}]},
        "safetySettings": [
            {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"},
            {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE"},
            {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE"},
            {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE"},
            {"category": "HARM_CATEGORY_CIVIC_INTEGRITY", "threshold": "BLOCK_NONE"}
        ],
        "generationConfig": {
            "maxOutputTokens": 2048,
            "responseMimeType": "application/json"
        }
    
    }

    # --- Retry LLM call ---
    max_retries = 5
    output_text = None
    for attempt in range(1, max_retries + 1):
        try:
            response = requests.post(API_URL, headers=HEADERS, json=payload)
            
            if response.status_code == 429:
                # Se colpito il limite, fai una pausa LUNGA per resettare
                wait_time = 30 + (10 * attempt)
                print(f" Rate Limit (429). Pausing for {wait_time}s...", end="\r")
                time.sleep(wait_time)
                continue
            
            if response.status_code == 503:
                time.sleep(5 * attempt)
                continue

            response.raise_for_status()
            
            result = response.json()
            candidate = result.get('candidates', [{}])[0]
            parts = candidate.get('content', {}).get('parts', [{}])
            output_text = parts[0].get('text', '').strip()

            if not output_text:
                # DEBUG: Stampiamo perché è vuoto (es. SAFETY)
                print(f" DEBUG FAILURE - Full Response: {result}")
                finish_reason = candidate.get('finishReason')
                if finish_reason == "SAFETY":
                    print(" BLOCKED BY SAFETY FILTERS (Fix applied?)")
                
                raise ValueError(f"Empty text. Finish Reason: {finish_reason}")

            print(f" Attempt {attempt} successful.")
            break
            
        except Exception as e:
            print(f" Attempt {attempt} failed: {e}")
            output_text = None
            if attempt < max_retries:
                time.sleep(2 ** attempt + random.uniform(0, 1))

    # --- Parse JSON safely ---
    if not output_text:
        print(" Skipping batch (No output).")
        batch_results = [{"id": int(row["id_numeric"]), "offense": None, "punishment": None, "decision": None, "raw_output": None} for row in batch_rows]
    else:
        try:
            batch_results = json.loads(output_text)
            if isinstance(batch_results, dict): batch_results = [batch_results]
        except json.JSONDecodeError:
            batch_results = extract_json_array(output_text) or extract_json_objects(output_text)
            
            if not batch_results:
                print(" No valid JSON found, storing raw output.")
                batch_results = [{"id": int(row["id_numeric"]), "offense": None, "punishment": None, "decision": None, "raw_output": output_text} for row in batch_rows]

    # Normalize IDs
    normalized_results = []
    # Create a map of ID -> Row for quick lookup
    batch_map = {int(r["id_numeric"]): r for r in batch_rows}
    
    # Process returned results
    if batch_results:
        for res in batch_results:
            res_id = res.get("id")
            if res_id is not None:
                try:
                    res_id = int(res_id)
                    if res_id in batch_map:
                        normalized_results.append(res)
                        del batch_map[res_id] # Mark as found
                except ValueError:
                    pass
    
    # Add missing cases as nulls
    for missing_id in batch_map:
        normalized_results.append({"id": missing_id, "offense": None, "punishment": None, "decision": None, "error": "LLM skipped this case"})

    # Pretty print snippet
    pprint(normalized_results[:2])
    
    # Store results
    results.extend(normalized_results)

    # --- Save intermediate results ---
    if (batch_idx + 1) % save_interval == 0 or batch_idx == total_batches - 1:
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2, ensure_ascii=False)
        print(f" Saved intermediate results ({len(results)} total)")
    
    # Pausa dinamica per evitare blocchi
    print(" Cooling down API for 5 seconds...")
    time.sleep(5) 

print(f"\n LLM outputs saved to {output_file}")

 API Key loaded successfully from key.txt.
Loading cases from Test/case_law_csv/case_texts.csv...
 Loaded 7732 existing results from case_details_tx_with_llm.json
     REPAIR MODE: Removed 25 failed cases (None/Skipped) from memory.
     They will be re-processed in this run.
Found 20143 cases to process in 20143 batches.

=== Processing batch 1/20143 (1 cases) ===
DEBUG: Total prompt length for this batch: 10963 characters.
 Attempt 1 successful.
[{'decision': 'Reversed, and remanded',
  'id': '5149801',
  'offense': 'Burglary (in the daytime) concerning failure to define '
             "'breaking'",
  'punishment': 'Confinement in the penitentiary for two years'}]
 Cooling down API for 5 seconds...

=== Processing batch 2/20143 (1 cases) ===
DEBUG: Total prompt length for this batch: 11612 characters.
 Attempt 2 successful.
[{'decision': None,
  'error': 'LLM skipped this case',
  'id': 5149615,
  'offense': None,
  'punishment': None}]
 Cooling down API for 5 seconds...

=== Process

Cleaning of case_details_tx_with_llm.json and extraction of conviction into convictions_with_llm

## Data Cleaning & Quality Control

### **Main Objective**
To enforce the "Garbage In, Garbage Out" principle by strictly filtering the LLM outputs, ensuring that only cases with complete and valid legal metadata are admitted into the final dataset.

### **Technical Logic**
This script acts as a quality gatekeeper between the raw AI generation and the database ingestion:

* **Type Normalization:** It sanitizes the unique identifiers (`id`), converting them from potentially messy strings (with whitespace or quotes) into clean integers. This ensures the JSON data will join correctly with the CSV files created earlier.
* **Strict Validation Logic:** It applies a boolean filter: a case is preserved **only if** both the `offense` and `decision` fields are present and non-empty. This prevents the Graph Database from being populated with "ghost nodes" that have no legal value.
* **Graceful Degradation:** While `offense` and `decision` are mandatory, the script handles the `punishment` field more flexibly. If the LLM failed to extract a punishment (or if none was stated), it defaults the value to `"None stated"` rather than discarding the entire case, maximizing data retention without compromising integrity.

In [1]:
import json
import os
import pandas as pd

# Configurazione
INPUT_FILE = "case_details_tx_with_llm.json"
OUTPUT_FILE = "case_details_tx_CLEANED.json"

def clean_data():
    print(f"--- AVVIO PULIZIA DATI ---")
    
    if not os.path.exists(INPUT_FILE):
        print(f" Errore: Il file {INPUT_FILE} non esiste.")
        return

    # 1. Caricamento
    print(f"Caricamento di {INPUT_FILE}...")
    try:
        with open(INPUT_FILE, "r", encoding="utf-8") as f:
            raw_data = json.load(f)
    except json.JSONDecodeError:
        print(" Errore: Il file JSON è corrotto.")
        return

    print(f"Totale record grezzi: {len(raw_data)}")

    cleaned_data = []
    skipped_count = 0
    
    # 2. Iterazione e Pulizia
    for item in raw_data:
        # A. Normalizzazione ID
        raw_id = item.get("id")
        try:
            # Converte in stringa, rimuove spazi/virgolette, converte in int
            clean_id = int(str(raw_id).strip().replace('"', '').replace("'", ""))
        except (ValueError, TypeError):
            # Se l'ID non è valido (es. null), scarta il caso
            skipped_count += 1
            continue

        # B. Controllo Campi Obbligatori
        offense = item.get("offense")
        decision = item.get("decision")
        punishment = item.get("punishment") # Opzionale ma utile tenerlo pulito

        # Funzione helper per verificare se un campo è valido (non None, non stringa vuota)
        def is_valid(text):
            return text is not None and str(text).strip() != "" and str(text).lower() != "null"

        # Teniamo il caso SOLO SE ha un reato E una decisione
        if is_valid(offense) and is_valid(decision):
            cleaned_item = {
                "id": clean_id, # Ora è sicuramente un intero (int)
                "offense": str(offense).strip(),
                "punishment": str(punishment).strip() if is_valid(punishment) else "None stated",
                "decision": str(decision).strip()
            }
            cleaned_data.append(cleaned_item)
        else:
            skipped_count += 1

    # 3. Salvataggio
    print(f"\n--- RISULTATI ---")
    print(f" Casi validi mantenuti: {len(cleaned_data)}")
    print(f" Casi scartati (incompleti/errori): {skipped_count}")

    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(cleaned_data, f, indent=2, ensure_ascii=False)
    
    print(f"\nFile pulito salvato come: {OUTPUT_FILE}")

if __name__ == "__main__":
    clean_data()

--- AVVIO PULIZIA DATI ---
Caricamento di case_details_tx_with_llm.json...
Totale record grezzi: 7947

--- RISULTATI ---
 Casi validi mantenuti: 7791
 Casi scartati (incompleti/errori): 156

File pulito salvato come: case_details_tx_CLEANED.json


## Conviction Status Classifier

### **Main Objective**
To perform a specialized, second-pass analysis on the cleaned data to determine a critical boolean data point: **Did the case result in a final conviction?** (True/False).

### **Technical Logic**
While the previous script extracted general text fields, this script performs **binary classification** to enable filtering by "Winning" or "Losing" precedents:

* **Targeted Inference:** It inputs the `Offense`, `Punishment`, and `Decision` text generated in the previous step and asks the LLM to interpret the legal outcome. For example, it understands that *"Judgment Affirmed"* equates to **True** (Convicted), while *"Reversed and Remanded"* equates to **False** (Not Convicted).
* **Semantic Normalization:** The script maps various linguistic variations returned by the LLM (e.g., "guilty", "yes", "1", "affirmed") into a strict Python **Boolean** (`True`/`False`). This is essential for the Neo4j database, allowing for precise queries like *"Show me cases where the defense won."*
* **Batch Efficiency & Resume Capability:** Like the previous extractor, it processes cases in batches (20 at a time) to optimize API costs. It also checks the output file on startup to identify which IDs have already been processed, allowing the script to pick up exactly where it left off in case of interruption.
* **Data Validation Layer:** It includes a specific fix to ensure `ID`s are treated as integers (`id_int`), preventing type mismatch errors that could occur when merging data from different sources (JSON vs CSV).

In [1]:
import pandas as pd
import json
import requests
from pprint import pprint
import os
import time
import re

# ---------- JSON extraction helpers ----------
def extract_json_array(text):
    match = re.search(r'\[.*\]', text, re.DOTALL)
    if match:
        try:
            return json.loads(match.group(0))
        except json.JSONDecodeError:
            return None
    return None

def extract_json_objects(text):
    matches = re.findall(r'\{.*?\}', text, re.DOTALL)
    objs = []
    for m in matches:
        try:
            objs.append(json.loads(m))
        except json.JSONDecodeError:
            continue
    return objs if objs else None

# ---------- Load API key ----------
api_key = os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY")

if not api_key:
    # Fallback manuale se le env vars non sono settate
    key_file = "key.txt"
    try:
        with open(key_file, "r", encoding="utf-8") as f:
            api_key = f.read().strip()
            print(f" API Key loaded successfully from {key_file}.")
    except FileNotFoundError:
        print(f" ERROR: API Key not found.")
        api_key = ""

# ---------- Gemini API Config ----------
GEMINI_MODEL = "gemini-2.5-flash" 
API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/{GEMINI_MODEL}:generateContent?key={api_key}"
HEADERS = {'Content-Type': 'application/json'}

# ---------- Parameters ----------
json_input_file = "case_details_tx_CLEANED.json" 
prompt_template_path = "prompt_conviction.txt" 
output_file = "convictions_with_llm.json"

# Configurazione Batch
batch_size = 5
save_every_n_batches = 5

# ---------- Load Resources ----------
if not os.path.exists(prompt_template_path):
    print(f" ERROR: {prompt_template_path} not found. Create it first.")
    exit()
with open(prompt_template_path, "r", encoding="utf-8") as f:
    prompt_template = f.read()

print(f" Loading extracted details from {json_input_file}...")
if not os.path.exists(json_input_file):
    print(f" ERROR: Input file {json_input_file} not found.")
    exit()

try:
    with open(json_input_file, "r", encoding="utf-8") as f:
        data = json.load(f)
    df = pd.DataFrame(data) 
    print(f" Loaded {len(df)} raw records from JSON.")
except Exception as e:
    print(f" Error reading JSON: {e}")
    exit()

# --- ### FIX: PREPARAZIONE DATAFRAME ### ---
# 1. Assicuriamoci che l'ID sia un intero per fare confronti corretti
# (Usa 'coerce' per trasformare errori in NaN, poi riempi con 0 e converti a int)
df['id_int'] = pd.to_numeric(df['id'], errors='coerce').fillna(0).astype(int)

# 2. Creiamo df_valid rimuovendo ID nulli o zero (sicurezza)
df_valid = df[df['id_int'] > 0].copy()
print(f" Validated dataframe size: {len(df_valid)} records.")
# -------------------------------------------

# Caricamento risultati esistenti (Resume capability)
if os.path.exists(output_file):
    try:
        with open(output_file, "r", encoding="utf-8") as f:
            results = json.load(f)
        print(f" Loaded {len(results)} existing results from {output_file}")
    except json.JSONDecodeError:
        results = []
else:
    results = []

# Crea un set di ID già fatti
done_ids = set()
for r in results:
    if r.get("id") is not None:
        try:
            done_ids.add(int(r["id"]))
        except:
            pass

# Filtra le righe da processare usando df_valid
rows_to_process = [row for _, row in df_valid.iterrows() if row["id_int"] not in done_ids]
total_batches = (len(rows_to_process) + batch_size - 1) // batch_size

print(f" Found {len(rows_to_process)} NEW valid cases to process.")

def build_cases_block(rows):
    lines = []
    for row in rows:
        # Gestione sicura dei campi (se sono None mette stringa vuota)
        offense = row.get("offense") or "Unknown"
        punishment = row.get("punishment") or "Unknown"
        decision = row.get("decision") or "Unknown"
        
        lines.append(
            f'Case:\n'
            f'Id: {row["id_int"]}\n'
            f'Offense: "{offense}"\n'
            f'Punishment: "{punishment}"\n'
            f'Decision: "{decision}"\n'
        )
    return "\n".join(lines)

# ==========================================
#       MAIN LOOP
# ==========================================

if total_batches == 0:
    print("Nothing to do! All cases processed.")
    exit()

for batch_idx in range(total_batches):
    batch_rows = rows_to_process[batch_idx*batch_size : (batch_idx+1)*batch_size]
    print(f"\n=== Processing batch {batch_idx+1}/{total_batches} ({len(batch_rows)} cases) ===")

    cases_block = build_cases_block(batch_rows)
    prompt_filled = prompt_template.replace("{cases_block}", cases_block)

    system_instruction = "You are a precise legal assistant. Follow the instructions exactly and output ONLY valid JSON."
    
    payload = {
        "contents": [{"parts": [{"text": prompt_filled}]}],
        "systemInstruction": {"parts": [{"text": system_instruction}]},
        # --- SAFETY BLOCK REMOVED  ---
        "safetySettings": [
            {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"},
            {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE"},
            {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE"},
            {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE"},
            {"category": "HARM_CATEGORY_CIVIC_INTEGRITY", "threshold": "BLOCK_NONE"}
        ],
        # ------------------------------
        "generationConfig": {
            "maxOutputTokens": 2048, 
            "responseMimeType": "application/json"
        }
    }

    # --- Retry Loop ---
    max_retries = 5
    output_text = None
    
    for attempt in range(1, max_retries + 1):
        try:
            response = requests.post(API_URL, headers=HEADERS, json=payload)
            
            if response.status_code == 429:
                wait = 20 + (5 * attempt)
                print(f"Rate Limit (429). Pausing {wait}s...", end="\r")
                time.sleep(wait)
                continue

            response.raise_for_status()
            result = response.json()
            candidate = result.get('candidates', [{}])[0]
            parts = candidate.get('content', {}).get('parts', [{}])
            output_text = parts[0].get('text', '').strip()
            print(f" Attempt {attempt} successful.")
            break

        except Exception as e:
            print(f" Attempt {attempt} failed: {e}")
            time.sleep(2)

    # --- Parsing ---
    if not output_text:
        print(" Skipping batch (No output).")
        continue
    else:
        parsed = None
        try:
            parsed = json.loads(output_text)
            if isinstance(parsed, dict): parsed = [parsed]
        except json.JSONDecodeError:
            parsed = extract_json_array(output_text) or extract_json_objects(output_text)

        if not parsed:
            print(" Invalid JSON response.")
            continue
            
        # --- Normalizzazione Risultati Batch ---
        normalized = []
        
        # Mappa ID -> Riga originale per recuperare i dati se l'LLM sbaglia
        batch_map = {r["id_int"]: r for r in batch_rows}
        
        for obj in parsed:
            # Recupera ID
            try:
                obj_id = int(obj.get("id"))
            except (ValueError, TypeError):
                continue 
            
            if obj_id in batch_map:
                convicted_val = obj.get("convicted")
                
                # Normalizza Booleano
                if isinstance(convicted_val, str):
                    low = convicted_val.strip().lower()
                    if low in ["true", "yes", "1", "affirmed", "guilty"]: convicted_val = True
                    elif low in ["false", "no", "0", "reversed", "acquitted"]: convicted_val = False
                    else: convicted_val = None
                elif not isinstance(convicted_val, bool):
                    convicted_val = None
                
                normalized.append({"id": obj_id, "convicted": convicted_val})
                # Rimuoviamo dalla mappa per sapere chi manca
                if obj_id in batch_map:
                    del batch_map[obj_id] 

        # Aggiungiamo i casi che l'LLM ha dimenticato nel batch come None
        for missing_id in batch_map:
             normalized.append({"id": missing_id, "convicted": None})

        if len(normalized) > 0:
            pprint(normalized[:1]) # Mostra solo il primo per pulizia console
        results.extend(normalized)

    # --- Save ---
    if (batch_idx + 1) % save_every_n_batches == 0 or batch_idx == total_batches - 1:
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2, ensure_ascii=False)
        print(f" Saved intermediate results ({len(results)} total)")

    # Pausa standard per evitare 429 aggressivi
    time.sleep(4)

print(f"\n Process complete. Conviction outputs saved to {output_file}")

 API Key loaded successfully from key.txt.
 Loading extracted details from case_details_tx_CLEANED.json...
 Loaded 7791 raw records from JSON.
 Validated dataframe size: 7791 records.
 Loaded 7093 existing results from convictions_with_llm.json
 Found 698 NEW valid cases to process.

=== Processing batch 1/140 (5 cases) ===
 Attempt 1 successful.
 Skipping batch (No output).

=== Processing batch 2/140 (5 cases) ===
 Attempt 1 successful.
[{'convicted': False, 'id': 5277648}]

=== Processing batch 3/140 (5 cases) ===
 Attempt 1 successful.
[{'convicted': True, 'id': 5277350}]

=== Processing batch 4/140 (5 cases) ===
 Attempt 1 successful.
 Skipping batch (No output).

=== Processing batch 5/140 (5 cases) ===
 Attempt 1 successful.
[{'convicted': False, 'id': 5797503}]
 Saved intermediate results (7108 total)

=== Processing batch 6/140 (5 cases) ===
 Attempt 1 successful.
 Skipping batch (No output).

=== Processing batch 7/140 (5 cases) ===
 Attempt 1 successful.
 Skipping batch (No 