# Extract Population Data from Infofactures PDFs

This notebook extracts the population sentence from each infofacture PDF and creates a CSV with:

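- `code_udi`: the UDI code, taken from the PDF file name
- `sentence`: the full sentence about the population served
- `population`: the population number extracted from that sentence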

## Installation

First, install pdfplumber if it is not already installed (the notebook also imports tqdm and pandas, which must be available as well):

```bash
uv pip install pdfplumber
```


In [None]:
import pdfplumber
import re
import csv
from pathlib import Path
from tqdm import tqdm
import pandas as pd

In [None]:
# Patterns are compiled once here instead of on every call.
# Matches both the "Votre réseau alimente" and "Le réseau alimente" wordings,
# lazily up to the first "personne(s)" and an optional trailing period.
_SENTENCE_RE = re.compile(
    r"(Votre\s+réseau|Le\s+réseau)\s+alimente.*?personnes?\.?",
    re.DOTALL | re.IGNORECASE,
)
# Digit groups optionally separated by whitespace (e.g. "100 000").
_POPULATION_RE = re.compile(r"(\d+(?:\s+\d+)*)\s*personnes?", re.IGNORECASE)
# Fraction of the page width kept when cropping to the first column.
_LEFT_COLUMN_FRACTION = 0.33


def _parse_population(text):
    """Return ``(sentence, population)`` found in *text*, or ``(None, None)``."""
    match = _SENTENCE_RE.search(text)
    if match is None:
        return None, None

    # Collapse newlines and repeated whitespace into single spaces.
    sentence = " ".join(match.group(0).split())
    # Ensure the sentence ends with a period.
    if not sentence.endswith("."):
        sentence += "."

    pop_match = _POPULATION_RE.search(sentence)
    if pop_match is None:
        return None, None

    # Whitespace was normalised above, so removing plain spaces is enough
    # to turn "100 000" into "100000".
    return sentence, int(pop_match.group(1).replace(" ", ""))


def extract_population_info(pdf_path):
    """
    Extract the population sentence from a PDF.

    Only the left third of every page (the first column) is read. Page texts
    are joined with a newline so that the last word of one page cannot fuse
    with the first word of the next one.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        tuple: (full_sentence, population_number, full_text)
               Returns (None, None, full_text) if extraction fails; in that
               case full_text holds whatever text was read before the failure.
    """
    page_texts = []
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                # Crop to the left column before extracting the text.
                left_column = page.crop(
                    (0, 0, page.width * _LEFT_COLUMN_FRACTION, page.height)
                )
                text = left_column.extract_text(x_tolerance=1)
                if text:
                    page_texts.append(text)

        full_text = "\n".join(page_texts)
        sentence, population = _parse_population(full_text)
        return sentence, population, full_text

    except Exception as e:
        # Best-effort: report the problem and let the caller carry on with
        # the remaining files.
        print(f"Error processing {pdf_path}: {e}")
        return None, None, "\n".join(page_texts)

In [None]:
# Smoke-test the extractor against one known PDF
sample_pdf = Path("../../database/infos_factures/045000559.pdf")
sentence, population, full_text = extract_population_info(sample_pdf)

print(f"Sentence: {sentence}")
print(f"Population: {population}")
# Only the beginning of the extracted text is shown
print(f"\nFull text preview (first 200 chars):\n{full_text[:200]}...")

In [None]:
# Try the extraction on a small sample of files before the full run
pdf_dir = Path("../../database/infos_factures")
# Path.glob yields entries in arbitrary order; sort first so the sample
# contains the same 5 files on every run.
test_files = sorted(pdf_dir.glob("*.pdf"))[:5]

print("Testing extraction on sample files:\n")
for pdf_file in test_files:
    # The file name without its extension is used as the UDI code
    code_udi = pdf_file.stem
    sentence, population, full_text = extract_population_info(pdf_file)
    print(f"Code UDI: {code_udi}")
    print(f"Population: {population}")
    print(f"Sentence: {sentence}")
    if sentence is None:
        # Show the start of the extracted text to help debug the failure
        print(f"Full text preview: {full_text[:500]}...")
    print("-" * 80)

In [None]:
# Process all PDFs
pdf_dir = Path("../../database/infos_factures")
# Sorted so that the processing order (and therefore the order of `errors`)
# is the same on every run; Path.glob itself gives no ordering guarantee.
pdf_files = sorted(pdf_dir.glob("*.pdf"))

print(f"Found {len(pdf_files)} PDF files to process")

results = []  # one dict per successfully parsed PDF
errors = []  # one dict per PDF where the sentence or the number was not found

for pdf_file in tqdm(pdf_files, desc="Extracting population data"):
    # The file name without its extension is used as the UDI code
    code_udi = pdf_file.stem
    sentence, population, full_text = extract_population_info(pdf_file)

    if sentence is not None and population is not None:
        results.append(
            {"code_udi": code_udi, "sentence": sentence, "population": population}
        )
        continue

    errors.append(
        {
            "code_udi": code_udi,
            "full_text": full_text[:600],  # First 600 chars for debugging
        }
    )
    # tqdm.write prints the message without breaking the progress bar,
    # which a plain print() inside the loop would do.
    tqdm.write(
        f"Extraction failed for Code UDI: {code_udi}\n Full text preview: {full_text[:600]}...\n{'-' * 80}"
    )

print(f"\nSuccessfully extracted: {len(results)}")
print(f"Errors: {len(errors)}")


In [None]:
# Preview the first few failed extractions
first_errors = errors[:5]
for error in first_errors:
    print(f"\nCode UDI: {error['code_udi']}")
    print(f"Text preview: {error['full_text'][:600]}...")
    print("-" * 80)

In [None]:
# Create DataFrame and save to CSV.
# The columns are named explicitly so the frame has the expected schema even
# when `results` is empty; otherwise sorting by "code_udi" raises KeyError.
df = pd.DataFrame(results, columns=["code_udi", "sentence", "population"])

# Sort by code_udi
df = df.sort_values("code_udi")

# Display first few rows
print("First 10 rows:")
print(df.head(10))

# Save to CSV, creating the cache directory first if it does not exist yet
output_path = "../../database/cache/udi_population_from_infofactures.csv"
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)

print(f"\nSaved to: {output_path}")
print(f"Total rows: {len(df)}")

In [None]:
# Summarise the extracted population figures
print("Population statistics:")
population_summary = df["population"].describe()
print(population_summary)
# Grand total, printed with thousands separators
print(f"\nTotal population across all UDIs: {df['population'].sum():,}")