## Data Parser & Normalizer

### **Main Objective**
To transform the raw, unstructured, and nested JSON data acquired by the crawler into structured, relational CSV files suitable for database ingestion (specifically Neo4j) and analysis.

### **Technical Logic**
This script acts as an ETL (Extract, Transform, Load) pipeline, converting the crawler's file-system-based storage into tabular form:

* **Hierarchical Traversal:** The script walks the directory structure created by the crawler level by level with nested loops (`State` → `Volumes` → `Cases`), so that the metadata files (`ReporterMetadata`, `VolumesMetadata`) are read from the same state folder as the cases they describe.
* **Schema Flattening:** It parses the complex, nested JSON structure of the Case.law schema. It extracts specific fields (e.g., jurisdiction, court, decision date) and flattens nested lists (like judges, parties, or attorneys) into pipe-separated strings (`|`) to fit into a flat CSV format.
* **Data Normalization & Separation:** To optimize performance and database design, the script splits the data into four distinct logical entities:
    1.  `state_reports.csv`: General metadata about the reporter.
    2.  `volumes_metadata.csv`: Details about specific book volumes.
    3.  `cases.csv`: Lightweight metadata for each case (titles, dates, parties, citations), allowing for fast indexing.
    4.  `case_texts.csv`: The "heavy" payload containing the full text of opinions and headnotes, separated to avoid bloating the main node properties in the graph.
* **Memory Optimization:** Data is collected in standard Python lists of dictionaries and converted to **Pandas DataFrames** only at the end for efficient batch writing to CSV.

In [1]:
# Input/output locations and output file names used by the parser.
# The same names are assigned again, with identical values, in the next cell.
ROOT_DIR = "Test/case_law_json"  # Root folder containing one sub-folder per state/reporter
OUTPUT_DIR = "Test/case_law_csv"  # Folder that receives the generated CSV files
STATE_REPORTS_CSV = "state_reports.csv"
VOLUMES_METADATA_CSV = "volumes_metadata.csv"
CASES_CSV = "cases.csv"
CASE_TEXTS_CSV = "case_texts.csv"

In [2]:
import os
import json
import pandas as pd
from tqdm import tqdm 

# --- PATH CONFIGURATION ---
# ROOT_DIR must point to the folder that CONTAINS the state folders (e.g. 'tex-crim')
ROOT_DIR = "Test/case_law_json"

# OUTPUT_DIR is where the CSV files are saved. It is created automatically.
OUTPUT_DIR = "Test/case_law_csv"
# --- END CONFIGURATION ---

# Names of the output files
STATE_REPORTS_CSV = "state_reports.csv"
VOLUMES_METADATA_CSV = "volumes_metadata.csv"
CASES_CSV = "cases.csv"
CASE_TEXTS_CSV = "case_texts.csv"

# Module-level lists that accumulate one dict per output row.
# process_state_folder() appends to them; they are turned into DataFrames
# only when the CSV files are written.
state_reports_data = []
volumes_metadata_data = []
cases_data = []
case_texts_data = []

def _first_mapping(value):
    """Return the first element of ``value`` if it is a dict, else ``{}``.

    Guards against a missing key, an explicit ``null`` and an empty list,
    all of which would make a plain ``value[0]`` raise.
    """
    if isinstance(value, (list, tuple)) and value and isinstance(value[0], dict):
        return value[0]
    return {}


def _pipe_join(values):
    """Flatten a list-like field into one pipe-separated string for the CSV.

    ``None``/empty input yields ``""``; a bare string is returned unchanged
    (joining it would split it into characters); ``None`` items are dropped
    and other items are converted with ``str``.
    """
    if not values:
        return ""
    if isinstance(values, str):
        return values
    return "|".join(str(item) for item in values if item is not None)


def process_state_folder(state_folder_path):
    """Parse one state/reporter folder (e.g. 'tex-crim') into the global row lists.

    Reads ``ReporterMetadata.json`` and ``VolumesMetadata.json`` from
    ``state_folder_path`` when they exist, then every ``*.json`` file found in
    ``<state_folder_path>/<volume>/cases/``. One dict per row is appended to
    the module-level lists ``state_reports_data``, ``volumes_metadata_data``,
    ``cases_data`` and ``case_texts_data``.

    Args:
        state_folder_path: Path of the folder to process. Its base name is
            used as the slug in log messages and in ``state_volume_file``.

    Returns:
        None. Results are communicated through the module-level lists.

    A problem with an individual file is printed and that file is skipped,
    so a single bad file does not stop the batch.
    """
    slug = os.path.basename(state_folder_path)
    print(f"Processing state folder: {slug}")

    # Bail out before touching the folder; isdir() also covers the case where
    # the path exists but is a regular file (os.listdir would raise on it).
    if not os.path.isdir(state_folder_path):
        print(f"Warning: Folder {state_folder_path} does not exist.")
        return

    # --- Process ReporterMetadata.json ---
    reporter_file = os.path.join(state_folder_path, "ReporterMetadata.json")
    if os.path.exists(reporter_file):
        try:
            with open(reporter_file, "r", encoding="utf-8") as f:
                reporter = json.load(f)
            jurisdiction = _first_mapping(reporter.get("jurisdictions"))
            state_reports_data.append({
                "id": reporter.get("id"),
                "full_name": reporter.get("full_name"),
                "short_name": reporter.get("short_name"),
                "start_year": reporter.get("start_year"),
                "end_year": reporter.get("end_year"),
                "jurisdictions_id": jurisdiction.get("id"),
                "jurisdictions_name": jurisdiction.get("name"),
                "jurisdictions_name_long": jurisdiction.get("name_long"),
                "slug": reporter.get("slug")
            })
        except Exception as e:
            # Deliberately broad: best-effort batch job, report and move on.
            print(f"Error processing {reporter_file}: {e}")

    # --- Process VolumesMetadata.json ---
    volumes_file = os.path.join(state_folder_path, "VolumesMetadata.json")
    if os.path.exists(volumes_file):
        try:
            with open(volumes_file, "r", encoding="utf-8") as f:
                volumes = json.load(f)
            for v in volumes:
                if not isinstance(v, dict):
                    continue
                # An entry without jurisdictions must not abort the remaining
                # volumes, so fall back to an empty mapping.
                jurisdiction = _first_mapping(v.get("jurisdictions"))
                nominative = v.get("nominative_reporter") or {}
                volumes_metadata_data.append({
                    "volume_number": v.get("volume_number"),
                    "title": v.get("title"),
                    "publisher": v.get("publisher"),
                    "publication_year": v.get("publication_year"),
                    "start_year": v.get("start_year"),
                    "end_year": v.get("end_year"),
                    "series_volume_number": v.get("series_volume_number"),
                    "jurisdictions_id": jurisdiction.get("id"),
                    "jurisdictions_name": jurisdiction.get("name"),
                    "jurisdictions_name_long": jurisdiction.get("name_long"),
                    "id": v.get("id"),
                    "harvard_hollis_id": v.get("harvard_hollis_id"),
                    "spine_start_year": v.get("spine_start_year"),
                    "spine_end_year": v.get("spine_end_year"),
                    "publication_city": v.get("publication_city"),
                    "redacted": v.get("redacted"),
                    "nominative_volume_number": nominative.get("volume_number"),
                    "nominative_name": nominative.get("nominative_name"),
                    "volume_folder": v.get("volume_folder"),
                    "reporter_slug": v.get("reporter_slug")
                })
        except Exception as e:
            print(f"Error processing {volumes_file}: {e}")

    # --- Process cases inside volume folders ---
    # Only sub-directories are volume folders (e.g. '1', '2', '3'...);
    # loose files such as the metadata JSONs are ignored.
    volume_folders = [
        f for f in os.listdir(state_folder_path)
        if os.path.isdir(os.path.join(state_folder_path, f))
    ]

    for volume_folder in tqdm(volume_folders, desc=f"Parsing {slug} volumes"):
        # Cases live in a 'cases' sub-folder of each volume.
        cases_json_dir = os.path.join(state_folder_path, volume_folder, "cases")
        if not os.path.isdir(cases_json_dir):
            continue

        for filename in os.listdir(cases_json_dir):
            if not filename.endswith(".json"):
                continue

            case_file_path = os.path.join(cases_json_dir, filename)
            # Strip only the extension; str.replace would also remove a
            # '.json' occurring elsewhere in the file name.
            file_stem = os.path.splitext(filename)[0]

            try:
                with open(case_file_path, "r", encoding="utf-8") as f:
                    case_data = json.load(f)

                # A file may hold a single case object or a list of them.
                if isinstance(case_data, list):
                    case_list = case_data
                elif isinstance(case_data, dict):
                    case_list = [case_data]
                else:
                    continue

                for case in case_list:
                    if not isinstance(case, dict):
                        continue

                    court = case.get("court") or {}
                    jurisdiction = case.get("jurisdiction") or {}
                    casebody = case.get("casebody") or {}
                    opinions = casebody.get("opinions") or []

                    # Build both rows before appending either one, so that a
                    # failure cannot leave a cases.csv row without its
                    # case_texts.csv counterpart.
                    case_row = {
                        "id": case.get("id"),
                        "name": case.get("name"),
                        "name_abbreviation": case.get("name_abbreviation"),
                        "decision_date": case.get("decision_date"),
                        "docket_number": case.get("docket_number"),
                        "first_page": case.get("first_page"),
                        "last_page": case.get("last_page"),
                        "court_id": court.get("id"),
                        "court_name": court.get("name"),
                        "court_name_abbreviation": court.get("name_abbreviation"),
                        "jurisdiction_id": jurisdiction.get("id"),
                        "jurisdiction_name": jurisdiction.get("name"),
                        "jurisdiction_name_long": jurisdiction.get("name_long"),
                        "judges": _pipe_join(casebody.get("judges")),
                        "parties": _pipe_join(casebody.get("parties")),
                        "attorneys": _pipe_join(casebody.get("attorneys")),
                        "state_volume_file": f"/{slug}/{volume_folder}/cases/{file_stem}"
                    }
                    text_row = {
                        "id": case.get("id"),
                        "title": case.get("name"),
                        "head_matter": casebody.get("head_matter", ""),
                        "corrections": casebody.get("corrections", ""),
                        # A null opinion text becomes an empty segment.
                        "opinions": "|".join(
                            str(op.get("text") or "")
                            for op in opinions
                            if isinstance(op, dict) and op
                        )
                    }

                    cases_data.append(case_row)
                    case_texts_data.append(text_row)
            except Exception as e:
                # Deliberately broad: report the file and continue the batch.
                print(f"Error reading case {case_file_path}: {e}")

# --- Main execution block ---
if __name__ == "__main__":
    # 1. Create the output folder up front so it never has to be made by hand.
    print(f"Ensuring output directory exists: {OUTPUT_DIR}")
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    print(f"Starting parsing from root: {ROOT_DIR}")

    # 2. Parse every sub-folder of ROOT_DIR (e.g. 'tex-crim'); loose files are skipped.
    if not os.path.exists(ROOT_DIR):
        print(f"ERROR: Root directory '{ROOT_DIR}' not found!")
    else:
        for state_folder in os.listdir(ROOT_DIR):
            state_folder_path = os.path.join(ROOT_DIR, state_folder)
            if not os.path.isdir(state_folder_path):
                continue
            process_state_folder(state_folder_path)

    # 3. Write the four CSV files, but only when at least one case was parsed.
    if not cases_data:
        print("\nNo cases found to save. Check your paths.")
    else:
        print(f"\nSaving CSV files to {OUTPUT_DIR}...")
        # (rows, file name) pairs, written in this order.
        _tables = (
            (state_reports_data, STATE_REPORTS_CSV),
            (volumes_metadata_data, VOLUMES_METADATA_CSV),
            (cases_data, CASES_CSV),
            (case_texts_data, CASE_TEXTS_CSV),
        )
        for _rows, _csv_name in _tables:
            _target = os.path.join(OUTPUT_DIR, _csv_name)
            pd.DataFrame(_rows).to_csv(_target, index=False)

        print("\n--- Parsing complete ---")
        print(f"Total cases processed: {len(cases_data)}")

Ensuring output directory exists: Test/case_law_csv
Starting parsing from root: Test/case_law_json
Processing state folder: tex-crim


Parsing tex-crim volumes: 100%|██████████| 142/142 [08:22<00:00,  3.54s/it]



Saving CSV files to Test/case_law_csv...

--- Parsing complete ---
Total cases processed: 27712


## Citation Network Extractor

### **Main Objective**
To build the "Who Cites Whom" graph by parsing the citation metadata embedded within the downloaded JSON case files and resolving references between cases.

### **Technical Logic**
This script is crucial for creating the edges (relationships) in the graph database. Its logic focuses on resolving entities:

* **Source Identification:** It iterates through all available JSON files again. For each case, it extracts its unique ID, which serves as the source node (`id1`) of the relationship.
* **Target Resolution (The Matching Problem):** The script analyzes the `cites_to` field in the JSON, which contains outgoing citations. It uses a multi-tiered strategy to link these citations to other cases in the database:
    1.  **Explicit ID Match:** It first checks whether the citation carries direct numerical IDs (`case_ids`). Every such ID is recorded as an edge; the edge is flagged `matched=True` only if the ID exists in the set of previously downloaded cases (`id_set`), i.e. it is a verified link.
    2.  **Path Match:** If no ID is available, it attempts to resolve the citation using the file path (`case_paths`), matching it against a pre-computed dictionary mapping file paths to IDs.
    3.  **Unresolved Citations:** If a citation points to a case outside the dataset (e.g., a Civil case or a non-Texas case), it is still recorded but flagged as `matched=False`. This preserves the citation text for display without creating a dangling edge in the graph.
* **Data Integrity:** Before processing, it loads the `cases.csv` file created by the previous script to build a whitelist of valid IDs. This ensures referential integrity: the script only creates relationships where the source node is guaranteed to exist in our dataset.

In [3]:
import os
import json
import pandas as pd
from tqdm import tqdm

# --- CONFIGURATION ---
# Must match the location of the JSON tree and of the cases.csv written by
# the parser step.
ROOT_DIR = "Test/case_law_json"
CASES_CSV = "Test/case_law_csv/cases.csv"
CITATIONS_CSV = "Test/case_law_csv/citations.csv"


def _citation_row(id1, id2, cite, matched):
    """Build one citations.csv row.

    Args:
        id1: ID (string) of the citing case.
        id2: ID (string) of the cited case, or None when unresolved.
        cite: The ``cites_to`` entry (dict) the row is derived from.
        matched: True when ``id2`` is a case present in cases.csv.

    The key order below fixes the column order of the CSV.
    """
    return {
        "id1": id1,
        "id2": id2,
        "citation_reference": cite.get("cite"),
        "category": cite.get("category"),
        "reporter": cite.get("reporter"),
        "opinion_index": cite.get("opinion_index"),
        "matched": matched,
    }


print(f"Loading cases from {CASES_CSV}...")
# Read IDs as strings so they compare equal to str(case["id"]) below
# regardless of how pandas would otherwise infer the column type.
cases_df = pd.read_csv(CASES_CSV, dtype={"id": str})

# Lookup from the file path stored in the CSV to the case ID.
# The stored path looks like "/tex-crim/1/cases/0001".
file_to_id = dict(zip(cases_df["state_volume_file"], cases_df["id"]))
id_set = set(cases_df["id"])

print(f"Loaded {len(id_set)} case IDs.")

citations_data = []

# --- PROCESSING ---
if not os.path.isdir(ROOT_DIR):
    print(f"ERROR: Root directory {ROOT_DIR} not found.")
    # exit() is the interactive-interpreter helper and would report success;
    # raise SystemExit with a non-zero status instead.
    raise SystemExit(1)

# Iterate over the state folders (e.g. 'tex-crim').
state_folders = [f for f in os.listdir(ROOT_DIR) if os.path.isdir(os.path.join(ROOT_DIR, f))]

for state_folder in tqdm(state_folders, desc="Processing states"):
    state_folder_path = os.path.join(ROOT_DIR, state_folder)

    # Iterate over the volume folders.
    volume_folders = [f for f in os.listdir(state_folder_path) if os.path.isdir(os.path.join(state_folder_path, f))]

    for volume_folder in tqdm(volume_folders, desc=f"Volumes in {state_folder}", leave=False):
        volume_folder_path = os.path.join(state_folder_path, volume_folder)

        # Case files live in the 'cases' sub-folder of each volume.
        cases_dir = os.path.join(volume_folder_path, "cases")
        if not os.path.isdir(cases_dir):
            continue

        for filename in os.listdir(cases_dir):
            if not filename.endswith(".json"):
                continue

            file_path = os.path.join(cases_dir, filename)

            try:
                with open(file_path, "r", encoding="utf-8") as f:
                    case_json = json.load(f)
            except (ValueError, OSError) as e:
                # ValueError covers both json.JSONDecodeError and
                # UnicodeDecodeError (file not valid UTF-8).
                print(f"Error reading {file_path}: {e}")
                continue

            # Normalize to a list (a file holds either one object or a list).
            if isinstance(case_json, dict):
                case_list = [case_json]
            elif isinstance(case_json, list):
                case_list = case_json
            else:
                continue

            for case in case_list:
                if not isinstance(case, dict):
                    continue

                # ID of the CITING case (source of the edge).
                id1 = str(case.get("id"))

                # Skip cases that did not make it into cases.csv (e.g. because
                # of an earlier parsing error): the source node must exist.
                if id1 not in id_set:
                    continue

                # Outgoing citations; 'or []' also covers an explicit null.
                for cite in case.get("cites_to") or []:
                    if not isinstance(cite, dict):
                        continue

                    match_found = False

                    # --- Strategy 1: explicit link through case_ids ---
                    for id2 in cite.get("case_ids") or []:
                        id2_str = str(id2)
                        # 'matched' tells whether the cited case is part of
                        # our own dataset; the row is recorded either way.
                        citations_data.append(
                            _citation_row(id1, id2_str, cite, id2_str in id_set)
                        )
                        match_found = True

                    # --- Strategy 2: implicit link through case_paths ---
                    # Only tried when no ID was available, to avoid duplicates.
                    # NOTE(review): the keys of file_to_id contain a '/cases/'
                    # segment; if the case_paths values in the source data do
                    # not, this best-effort lookup never matches - confirm
                    # against real data.
                    if not match_found:
                        for path in cite.get("case_paths") or []:
                            id2_str = file_to_id.get(path)
                            # Require a real string: a missing id in the CSV
                            # is read back as a non-string missing value.
                            if isinstance(id2_str, str) and id2_str:
                                citations_data.append(
                                    _citation_row(id1, id2_str, cite, True)
                                )
                                match_found = True

                    # --- Strategy 3: unresolved / external citation ---
                    # Neither an ID nor a known path: keep the citation text
                    # with id2=None and matched=False (useful for statistics).
                    if not match_found:
                        citations_data.append(
                            _citation_row(id1, None, cite, False)
                        )

# Save citations table
if citations_data:
    # Drop exact duplicate rows first so the reported count is the number of
    # rows actually written.
    df_citations = pd.DataFrame(citations_data).drop_duplicates()
    print(f"Saving {len(df_citations)} citations to {CITATIONS_CSV}...")
    df_citations.to_csv(CITATIONS_CSV, index=False)
    print("Done.")
else:
    print("No citations found.")

Loading cases from Test/case_law_csv/cases.csv...
Loaded 27712 case IDs.


Processing states: 100%|██████████| 1/1 [00:04<00:00,  4.60s/it]


Saving 191572 citations to Test/case_law_csv/citations.csv...
Done.
