# **Studium Parisiense Data Collection**

In [5]:
import requests
from bs4 import BeautifulSoup
import json
import time
import string
import re
import os

# --- CONFIGURATION ---
# Root of the Studium Parisiense site; the endpoint templates below are
# derived from it so the host name is written only once.
BASE_URL = "http://studium-parisiense.univ-paris1.fr"

# Index page listing the individuals for one letter ({} <- the letter).
INDEX_URL = BASE_URL + "/?action=index&letter={}"

# JSON endpoint for a single prosopography record ({} <- the numeric ID).
API_URL = BASE_URL + "/api/prosopography/{}"

# JSON Lines file the downloaded records are appended to.
OUTPUT_FILE = "studium_parisiense_dataset.jsonl"

# Identify the client as a research script rather than an anonymous bot.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (compatible; ResearchScript/1.0; "
        "+http://studium-parisiense.univ-paris1.fr)"
    ),
}

def get_ids_from_letter(letter, timeout=30):
    """Scrape the index page for one letter and return the profile IDs found.

    Args:
        letter: Value substituted into ``INDEX_URL`` to select the index page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the whole run forever.

    Returns:
        A list of ID strings (the digits captured from hrefs that look like
        ``individus/123-name``), de-duplicated and kept in the order they
        first appear on the page. An empty list is returned when the request
        fails or the server answers with an HTTP error status.
    """
    url = INDEX_URL.format(letter)
    print(f"[*] Scanning Index for letter: {letter}...")

    # Only the network call can fail in an expected way; keep the try narrow
    # so programming errors in the parsing below are not silently hidden.
    try:
        response = requests.get(url, headers=HEADERS, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"[!] Error scanning letter {letter}: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    # Captures the digits between 'individus/' and the first hyphen,
    # e.g. "/individus/1-a.fidelis" -> "1".
    id_pattern = re.compile(r'individus/(\d+)-')
    matches = (
        id_pattern.search(link['href'])
        for link in soup.find_all('a', href=True)
    )

    # dict.fromkeys drops duplicates while preserving first-seen order, so
    # repeated runs process the profiles in a reproducible sequence.
    return list(dict.fromkeys(m.group(1) for m in matches if m))

def download_profile(profile_id, timeout=30):
    """Fetch the JSON record for one profile ID.

    Args:
        profile_id: ID substituted into ``API_URL``.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the download loop.

    Returns:
        The decoded JSON body on an HTTP 200 response. ``None`` when the
        request fails, the server returns any other status code, or the body
        is not valid JSON; a message describing the failure is printed.
    """
    try:
        response = requests.get(
            API_URL.format(profile_id), headers=HEADERS, timeout=timeout
        )
    except requests.RequestException as e:
        print(f"    [!] Connection error for ID {profile_id}: {e}")
        return None

    if response.status_code == 404:
        print(f"    [!] ID {profile_id} not found (404).")
        return None
    if response.status_code != 200:
        print(f"    [!] Error {response.status_code} for ID {profile_id}")
        return None

    # A 200 response with a malformed body is a data problem, not a
    # connection problem, so it gets its own message.
    try:
        return response.json()
    except ValueError as e:
        print(f"    [!] Invalid JSON for ID {profile_id}: {e}")
        return None

def _load_downloaded_ids(path):
    """Return the set of 'reference' values (as strings) stored in a JSONL file."""
    downloaded = set()
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                # Blank or truncated lines (e.g. left by an interrupted run)
                # carry no usable record; skip them.
                continue
            if isinstance(record, dict) and 'reference' in record:
                # IDs harvested from the index are strings; normalise so a
                # numeric 'reference' still matches during the resume check.
                downloaded.add(str(record['reference']))
    return downloaded


def main():
    """Harvest every profile ID from the A-Z index and download each record.

    Phase 1 scans the index page of each uppercase ASCII letter and collects
    the profile IDs. Phase 2 downloads the JSON record of every ID that is
    not already present in ``OUTPUT_FILE`` and appends it to that file as one
    JSON document per line, so an interrupted run can be resumed.
    """
    # 1. Gather all IDs first
    print("--- PHASE 1: Harvesting IDs from Index ---")
    all_ids = []
    for letter in string.ascii_uppercase:
        ids = get_ids_from_letter(letter)
        print(f"    Found {len(ids)} profiles for '{letter}'")
        all_ids.extend(ids)
        time.sleep(1)  # Be polite to the server

    # Each letter's list is de-duplicated on its own, but the same ID can
    # still show up under several letters. Drop repeats (keeping first-seen
    # order) so the count below is truly unique and nothing is fetched twice.
    all_ids = list(dict.fromkeys(all_ids))

    print(f"\nTotal unique profiles found: {len(all_ids)}")
    print("--- PHASE 2: Downloading JSON Data ---")

    # Resume support: collect the IDs already present in the output file.
    downloaded_ids = set()
    if os.path.exists(OUTPUT_FILE):
        downloaded_ids = _load_downloaded_ids(OUTPUT_FILE)
        print(f"Resuming... {len(downloaded_ids)} profiles already downloaded.")

    # 2. Download Data
    total = len(all_ids)
    with open(OUTPUT_FILE, 'a', encoding='utf-8') as f:
        for position, profile_id in enumerate(all_ids, start=1):
            if profile_id in downloaded_ids:
                continue

            print(f"[{position}/{total}] Downloading ID: {profile_id}", end='\r')

            data = download_profile(profile_id)
            if data:
                # Serialise first and write the whole line in one call so a
                # failure during encoding cannot leave a partial record.
                f.write(json.dumps(data, ensure_ascii=False) + '\n')

            # CRITICAL: Rate limiting
            # Do not set this to 0. It prevents the server from banning your IP.
            time.sleep(0.5)

    print("\n\n[SUCCESS] Download complete.")

# Start the scrape only when this code is executed directly (script or
# notebook cell), not when the module is imported by other code.
if __name__ == "__main__":
    main()

--- PHASE 1: Harvesting IDs from Index ---
[*] Scanning Index for letter: A...
    Found 2323 profiles for 'A'
[*] Scanning Index for letter: B...
    Found 951 profiles for 'B'
[*] Scanning Index for letter: C...
    Found 617 profiles for 'C'
[*] Scanning Index for letter: D...
    Found 455 profiles for 'D'
[*] Scanning Index for letter: E...
    Found 314 profiles for 'E'
[*] Scanning Index for letter: F...
    Found 452 profiles for 'F'
[*] Scanning Index for letter: G...
    Found 4504 profiles for 'G'
[*] Scanning Index for letter: H...
    Found 1343 profiles for 'H'
[*] Scanning Index for letter: I...
    Found 240 profiles for 'I'
[*] Scanning Index for letter: J...
    Found 0 profiles for 'J'
[*] Scanning Index for letter: K...
    Found 1 profiles for 'K'
[*] Scanning Index for letter: L...
    Found 481 profiles for 'L'
[*] Scanning Index for letter: M...
    Found 957 profiles for 'M'
[*] Scanning Index for letter: N...
    Found 1102 profiles for 'N'
[*] Scanning Index 

KeyboardInterrupt: 