## Data Acquisition Crawler

### **Main Objective**
To automate the large-scale, organized download of thousands of legal judgments (focusing on *Texas Criminal Law*) from the public static database *Case.law*.

### **Technical Logic**
The script is designed to handle the acquisition of large volumes of data through a hierarchical and resilient structure:

* **Hierarchical Navigation:** The crawler replicates the original database structure by navigating through three levels: *State* → *Volumes* → *Individual Cases*. It uses **BeautifulSoup** to parse the HTML of index pages and dynamically extract links to JSON files.
* **Parallel Execution (Multithreading):** To speed up the download of thousands of small JSON files, the script uses `ThreadPoolExecutor` with 8 worker threads. Volumes are visited one after another, but within each volume up to 8 cases are downloaded concurrently instead of one at a time. Because the workload is I/O-bound, this significantly reduces the time spent waiting on the network.
* **Resilience and Idempotency:**
    * **Retry Logic:** It includes a `fetch_url` function that makes up to 4 attempts per URL, pausing 2 seconds after a failed attempt, so that a momentary network error does not block the entire process.
    * **Existence Check:** Before downloading, it checks if the file already exists locally (`if os.path.exists`). This allows the process to be interrupted and resumed (resume capability) without duplicating work.

In [2]:
import os
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [3]:
# Identifiers of the collections to crawl. Each entry is used both as the path
# segment under BASE_URL and as the local sub-folder name.
STATES = ["tex-crim"]

In [4]:
# Root of the site being crawled; state, volume and case URLs are built from it.
BASE_URL = "https://static.case.law"
# Local root folder under which every downloaded file is stored.
DOWNLOAD_FOLDER = "Test/case_law_json"
# Number of attempts fetch_url makes for a URL before giving up.
RETRIES = 4
SLEEP_BETWEEN_RETRIES = 2  # seconds to pause after a failed attempt
MAX_WORKERS = 8  # Number of threads for parallel downloads

In [5]:
def fetch_url(url):
    """Fetch a URL, retrying when the request fails.

    Makes up to ``RETRIES`` GET requests (10-second timeout each) and pauses
    ``SLEEP_BETWEEN_RETRIES`` seconds between consecutive attempts.

    Args:
        url: Address to request.

    Returns:
        The ``requests.Response`` of the first successful attempt, or ``None``
        when every attempt raised a ``requests.RequestException``.
    """
    for attempt in range(1, RETRIES + 1):
        try:
            response = requests.get(url, timeout=10)
            # Raises requests.HTTPError for 4xx and 5xx status codes.
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Error fetching {url}: {e}. Retry {attempt}/{RETRIES}")
            # Pause only when another attempt follows; sleeping after the
            # final failure would just delay the caller for nothing.
            if attempt < RETRIES:
                time.sleep(SLEEP_BETWEEN_RETRIES)
        else:
            return response
    print(f" Failed after {RETRIES} retries: {url}")
    return None


def save_json(url, path):
    """Download ``url`` to ``path`` unless the file is already present.

    The payload is first written to ``<path>.part`` and then renamed onto
    ``path``. A run interrupted mid-write therefore never leaves a truncated
    file under the final name, which the existence check below would
    otherwise treat as a finished download on the next run.

    Args:
        url: Address of the file to download.
        path: Local destination file path.

    Returns:
        ``True`` if the file already existed or was saved, ``False`` if the
        download failed.
    """
    if os.path.exists(path):
        return True  # Already downloaded on a previous run.

    response = fetch_url(url)
    if response is None:
        print(f" Failed to download {url}")
        return False

    # A bare filename has an empty dirname, and os.makedirs("") raises.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    tmp_path = f"{path}.part"
    with open(tmp_path, "wb") as f:
        f.write(response.content)
    # os.replace overwrites atomically, so `path` only ever appears complete.
    os.replace(tmp_path, path)
    return True


def get_volumes(state):
    """List the volumes found on a state's index page.

    Scans every table row of ``{BASE_URL}/{state}/`` and keeps the rows whose
    first cell contains a link with purely numeric text (the volume number).

    Args:
        state: State identifier used as the URL path segment.

    Returns:
        A list of ``(volume_number, volume_url)`` string tuples, where
        ``volume_url`` always ends with ``/``. Empty if the index page could
        not be fetched.
    """
    state_url = f"{BASE_URL}/{state}/"
    print(f"Fetching volumes from: {state_url}")
    response = fetch_url(state_url)
    if response is None:
        return []

    soup = BeautifulSoup(response.text, "html.parser")

    volumes = []
    for row in soup.find_all("tr"):
        first_td = row.find("td")
        if first_td is None:
            continue
        a = first_td.find("a")
        if a is None:
            continue
        # Only links whose text is a number denote a volume.
        vol_num = a.text.strip()
        if not vol_num.isdigit():
            continue

        # An anchor without an href is treated like a relative link instead
        # of raising KeyError.
        vol_url = a.get("href") or ""
        if not vol_url.startswith("http"):
            # Relative link: point at the volume folder, not at a file.
            vol_url = f"{BASE_URL}/{state}/{vol_num}/"
        elif not vol_url.endswith("/"):
            # Callers append file names directly to this URL, so it must end
            # with a slash.
            vol_url += "/"

        volumes.append((vol_num, vol_url))
    print(f"Found {len(volumes)} volumes for state {state}.")
    return volumes


def get_cases(volume_url):
    """List the case JSON URLs published under a volume's ``cases/`` page.

    Scans every table row of ``{volume_url}cases/`` and keeps the link in the
    first cell when its target ends with ``.json``.

    Args:
        volume_url: Volume folder URL; expected to end with ``/``.

    Returns:
        A list of absolute case URLs. Empty if the page could not be fetched.
    """
    cases_url = f"{volume_url}cases/"
    response = fetch_url(cases_url)
    if response is None:
        return []

    soup = BeautifulSoup(response.text, "html.parser")

    case_urls = []
    for row in soup.find_all("tr"):
        first_td = row.find("td")
        if first_td is None:
            continue
        a = first_td.find("a")
        # .get() avoids a KeyError on anchors that carry no href attribute.
        href = a.get("href") if a is not None else None
        if href and href.endswith(".json"):
            # urljoin leaves absolute links untouched and resolves both
            # page-relative and root-relative links against the cases page.
            case_urls.append(urljoin(cases_url, href))
    return case_urls


def crawl_state(state):
    """Download all metadata files and case JSON files for one state.

    The state-level metadata is fetched first. Volumes are then visited one
    after another; for each volume its metadata is saved and its case files
    are downloaded concurrently on ``MAX_WORKERS`` threads. Files land under
    ``DOWNLOAD_FOLDER/<state>/<volume>/``. Downloads that fail are counted and
    summarised at the end rather than silently discarded.

    Args:
        state: State identifier used as URL path segment and folder name.
    """
    print(f"=== Processing state: {state} ===")
    failed = 0

    # State-level metadata.
    for meta in ("ReporterMetadata.json", "VolumesMetadata.json"):
        url = f"{BASE_URL}/{state}/{meta}"
        path = os.path.join(DOWNLOAD_FOLDER, state, meta)
        if not save_json(url, path):
            failed += 1

    volumes = get_volumes(state)

    # Volumes are handled sequentially; only the case downloads inside a
    # volume run in parallel. A single pool is shared by all volumes so the
    # worker threads are not re-created for each one.
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        for vol, vol_url in tqdm(volumes, desc=f"Processing {state} volumes"):

            # Volume-level metadata.
            for meta in ("VolumeMetadata.json", "CasesMetadata.json"):
                url = f"{vol_url}{meta}"
                path = os.path.join(DOWNLOAD_FOLDER, state, vol, meta)
                if not save_json(url, path):
                    failed += 1

            case_urls = get_cases(vol_url)
            if not case_urls:
                continue

            # Create the "cases" folder once per volume, not once per file.
            cases_dir = os.path.join(DOWNLOAD_FOLDER, state, vol, "cases")
            os.makedirs(cases_dir, exist_ok=True)

            futures = [
                executor.submit(
                    save_json,
                    case_url,
                    os.path.join(cases_dir, case_url.split("/")[-1]),
                )
                for case_url in case_urls
            ]

            # Progress bar for this volume's case downloads.
            for f in tqdm(as_completed(futures), total=len(futures), desc=f"  Volume {vol} cases", leave=False):
                # save_json returns False when the download failed.
                if not f.result():
                    failed += 1

    if failed:
        print(f" {failed} file(s) could not be downloaded for state {state}")
    print(f"=== Finished state: {state} ===")

In [6]:
# Entry point: crawl every state listed in STATES, one after another.
if __name__ == "__main__":
    for state in STATES:
        crawl_state(state)

=== Processing state: tex-crim ===
Fetching volumes from: https://static.case.law/tex-crim/
Found 142 volumes for state tex-crim.


Processing tex-crim volumes: 100%|██████████| 142/142 [02:04<00:00,  1.14it/s]

=== Finished state: tex-crim ===



