# RESEARCH: Polling districts geolocation

**Author**: gwiazdan  
**Created**: 07-02-2026  
**Version:** 1.0  

## Overview

This notebook geocodes Polish polling district addresses using the GUGiK (Główny Urząd Geodezji i Kartografii) API. The goal is to obtain geographic coordinates for all polling stations to enable spatial aggregations.

## Methodology

### Data Source
- Dataset: Polish polling districts (`obwody_glosowania_utf8.csv`) from the Polish National Electoral Commission (PKW)
- Contains ~32,000 polling stations across Poland

In [None]:
import pandas as pd
from tqdm import tqdm
import requests
from shapely import wkt
import geopandas as gpd

In [None]:
# Directory holding the raw geocoder inputs, relative to this notebook.
data_path = "../scripts/geocoder/data/raw"

# The polling-district export is semicolon-delimited.
csv_file = f"{data_path}/obwody_glosowania_utf8.csv"
df = pd.read_csv(csv_file, sep=";")

In [None]:
# Quick sanity check of the loaded table: column names and (rows, columns).
print(df.columns)
print(df.shape)

In [None]:
def normalize_teryt(teryt):
    """Normalize a TERYT code to a zero-padded string of at least 6 digits.

    The value is parsed with ``float()`` first, so integers, floats such as
    ``20101.0`` and numeric strings are all accepted; the fractional part is
    discarded and the result is left-padded with zeros to 6 characters.

    Args:
        teryt: Raw TERYT value (number or numeric string).

    Returns:
        The normalized code as ``str``, or ``None`` when the value cannot be
        interpreted as a finite number.
    """
    try:
        return str(int(float(teryt))).zfill(6)
    except (ValueError, TypeError, OverflowError):
        # ValueError: non-numeric text or NaN; TypeError: None / non-scalar;
        # OverflowError: +/-inf, which float() accepts but int() rejects.
        return None

In [None]:
# Rows without a municipality code cannot be matched to boundary geometry.
df = df[df['TERYT gminy'].notna()]

# Take an explicit copy: df0 is modified in place below, and assigning into a
# column selection of df would raise pandas' SettingWithCopyWarning (and the
# write is not guaranteed to land in df0).
df0 = df[['TERYT gminy', 'Mieszkańcy', 'Gmina', 'Wyborcy', 'Powiat', 'Województwo', 'Miejscowość', 'Siedziba', 'Ulica', 'Numer posesji', 'Kod pocztowy']].copy()
df0['TERYT gminy'] = df0['TERYT gminy'].apply(normalize_teryt)

# Collapse codes 146502-146519 onto 146501.
# NOTE(review): presumably these are the Warsaw district codes being folded
# into the single city-level code used by the boundary data - confirm.
mask = df0["TERYT gminy"].astype(str).str.fullmatch(r"1465(0[2-9]|1[0-9])")
df0.loc[mask, "TERYT gminy"] = "146501"
print(df0.head())

In [None]:
# Endpoint of the geocoding service queried by find_address().
URL = "https://services.gugik.gov.pl/uug"

# Strip leading street-type abbreviations so only the bare street name is left.
for prefix in ('ul. ', 'pl. ', 'al. ', 'os. '):
    df0['Ulica'] = df0['Ulica'].str.removeprefix(prefix)

# Municipality boundaries, used later to validate geocoded points.
print("Loading geodata file...")
geodata_path = "../scripts/geocoder/data/raw/00_jednostki_administracyjne.zip"
try:
    gdf_gminy = gpd.read_file(f"zip://{geodata_path}!A03_Granice_gmin.shp")
    # The last character of JPT_KOD_JE is dropped before normalizing.
    # NOTE(review): presumably the trailing municipality-type digit - confirm.
    gdf_gminy['TERYT_NORM'] = gdf_gminy['JPT_KOD_JE'].str[:-1].apply(normalize_teryt)
    # Lookup table keyed by normalized TERYT, reprojected to EPSG:2180.
    gdf_gminy_indexed = gdf_gminy.set_index('TERYT_NORM', drop=True).to_crs("EPSG:2180")
    print(f"Loaded {len(gdf_gminy)} municipalities")
    print(f"Geodata CRS: {gdf_gminy.crs}")
except Exception as exc:
    # Best effort: the notebook keeps running without spatial validation.
    print(f"Warning: Could not load geodata - {exc}")
    gdf_gminy = None
    gdf_gminy_indexed = None

In [None]:
def _result_payload(result, address):
    """Turn one raw API result entry into the success dictionary."""
    return {
        "success": True,
        "geometry": wkt.loads(result["geometry_wkt"]),
        "x": float(result.get("x")),
        "y": float(result.get("y")),
        "matched_address": address,
    }


def find_address(address, expected_teryt=None):
    """Query the GUGiK API for address coordinates, preferring a TERYT match.

    Args:
        address: Free-text address sent as the ``address`` query parameter.
        expected_teryt: Optional TERYT code. When given (and parseable), the
            first result whose normalized ``teryt`` equals it is returned.

    Returns:
        On success, a dict with ``success=True``, ``geometry`` (parsed from
        the result's WKT), ``x``, ``y`` and ``matched_address`` (the query
        string that was sent). If no result matches the expected TERYT, the
        result stored under key ``"1"`` is used instead.
        On failure, ``{"success": False}``; when the failure was caused by an
        exception (network, HTTP status, malformed payload) the dict also
        carries an ``error`` string describing it.
    """
    params = {"request": "GetAddress", "address": address}
    try:
        response = requests.get(URL, params=params, timeout=10)
        response.raise_for_status()
        data = response.json()

        results = data.get("results")
        if data.get("found objects", 0) > 0 and results:
            if expected_teryt:
                expected_teryt_norm = normalize_teryt(expected_teryt)
                # Skip matching when the expected code is unparseable;
                # otherwise None == None would count as a TERYT match.
                if expected_teryt_norm is not None:
                    for result in results.values():
                        if normalize_teryt(result.get("teryt", "")) == expected_teryt_norm:
                            return _result_payload(result, address)
            # Fallback - take first result
            return _result_payload(results["1"], address)
    except Exception as exc:
        # Deliberately broad: geocoding is best-effort and the caller simply
        # moves on to the next address variant. The reason is kept in the
        # result instead of being discarded, so failures can be inspected.
        return {"success": False, "error": f"{type(exc).__name__}: {exc}"}

    return {"success": False}


def normalize_point_to_gdf(point):
    """Reproject a lon/lat point into the CRS of the validation geometries.

    The target CRS is taken from ``gdf_gminy_indexed`` because that is the
    frame ``validate_point`` compares against; it has been reprojected and
    may therefore differ from the CRS of the freshly loaded ``gdf_gminy``.

    A point is treated as lon/lat (EPSG:4326) when ``|x| <= 180`` and
    ``|y| <= 90``. Points outside that range are assumed to already be in
    the target CRS and are returned untouched.

    Args:
        point: Geometry exposing ``.x`` / ``.y``, or ``None``.

    Returns:
        The reprojected point, or the input unchanged when geodata is not
        loaded, the target CRS is unknown, no reprojection is needed, or the
        reprojection fails.
    """
    if gdf_gminy_indexed is None or point is None:
        return point
    target_crs = gdf_gminy_indexed.crs
    if target_crs is None:
        return point

    x, y = point.x, point.y
    is_latlon = abs(x) <= 180 and abs(y) <= 90
    if is_latlon and str(target_crs).upper() not in ("EPSG:4326", "CRS:84"):
        try:
            return gpd.GeoSeries([point], crs="EPSG:4326").to_crs(target_crs).iloc[0]
        except Exception:
            # Best effort: fall back to the untransformed point.
            return point
    return point


def validate_point(teryt, point):
    """Check whether a point lies within the municipality identified by TERYT.

    Args:
        teryt: Normalized TERYT code used as key into ``gdf_gminy_indexed``.
        point: Geocoded point geometry, or ``None``.

    Returns:
        ``True`` if the municipality geometry covers the point, ``False`` if
        it does not (or the check itself raised), and ``None`` when
        validation is not possible: geodata is not loaded, there is no point,
        or the TERYT code is absent from the geodata index.
    """
    if gdf_gminy_indexed is None or point is None:
        return None

    # Unknown municipality: nothing to validate against, which is different
    # from "validated and found to be outside".
    if teryt not in gdf_gminy_indexed.index:
        return None

    try:
        municipality_geom = gdf_gminy_indexed.loc[teryt].geometry
        point_norm = normalize_point_to_gdf(point)
        # bool() guarantees a plain True/False; a non-scalar result (e.g. a
        # duplicated index label yielding several geometries) raises here and
        # is reported as False by the handler below.
        return bool(municipality_geom.covers(point_norm))
    except Exception:
        return False


def get_centroid(teryt):
    """Return the centroid of the municipality with the given TERYT code.

    Used as a coarse substitute location when a geocoded point cannot be
    trusted.

    Returns:
        The centroid geometry, or ``None`` when the geodata is not loaded,
        the code is not present in its index, or the lookup raises.
    """
    is_known = gdf_gminy_indexed is not None and teryt in gdf_gminy_indexed.index
    if not is_known:
        return None

    try:
        municipality = gdf_gminy_indexed.loc[teryt]
        return municipality.geometry.centroid
    except Exception:
        return None

In [None]:
# Check TERYT coverage between polling districts data and gminy geodata
print("=== TERYT Coverage Analysis ===\n")

# Unique normalized TERYT codes from both sources. Codes that are missing or
# fail to normalize are dropped so the sets contain only strings and stay
# sortable.
teryt_data = {
    code
    for code in (normalize_teryt(t) for t in df0['TERYT gminy'].unique() if pd.notna(t))
    if code is not None
}
teryt_gminy = (
    {code for code in gdf_gminy_indexed.index if pd.notna(code)}
    if gdf_gminy_indexed is not None
    else set()
)

print("Polling districts data (df0):")
print(f"  - Total unique TERYT codes: {len(teryt_data)}")
print(f"  - Sample: {sorted(teryt_data)[:5]}")

print("\nGminy geodata (gminy.zip):")
print(f"  - Total unique TERYT codes: {len(teryt_gminy)}")
print(f"  - Sample: {sorted(teryt_gminy)[:5]}")

# Calculate coverage
overlap = teryt_data & teryt_gminy
only_in_data = teryt_data - teryt_gminy
only_in_gminy = teryt_gminy - teryt_data

# Guard against an empty polling dataset, which would divide by zero.
coverage_pct = 100 * len(overlap) / len(teryt_data) if teryt_data else 0.0

print("\n=== Coverage Statistics ===")
print(f"Matching TERYT codes: {len(overlap)} ({coverage_pct:.1f}%)")
print(f"Only in polling data: {len(only_in_data)}")
print(f"Only in gminy geodata: {len(only_in_gminy)}")

if only_in_data:
    print(f"\nMissing in gminy.zip: {sorted(only_in_data)}")

if only_in_gminy:
    print(f"\nExtra in gminy.zip: {sorted(only_in_gminy)}")

### Geocoding Strategy

The geocoding process employs a fallback strategy with multiple address formats to maximize success rate:

1. **Full address** (locality + street + building_number)
2. **Full address with an 'a' suffix on the number** - If the exact building number is not found, try appending 'a' to it to handle cases where the source data may be incomplete
3. **Locality + Street**
4. **Locality only** (for rural areas)

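### Spatial Validation with Municipality Boundaries

Each geocoded point is validated against municipality boundaries loaded from the `A03_Granice_gmin.shp` shapefile inside `00_jednostki_administracyjne.zip` (labelled "gminy.zip" in the printed output):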

- **Valid**: Point falls within the municipality geometry (TERYT match)
- **Centroid fallback**: If point is outside boundaries, use municipality centroid as fallback
- **Invalid**: Point cannot be validated and no fallback available

In [None]:
# Work on a copy so the cleaned source frame (df0) is left untouched.
df_sample = df0.copy()

df_sample['Ulica'] = df_sample['Ulica'].fillna('')
df_sample["Numer posesji"] = df_sample["Numer posesji"].fillna("")

# Deduplicate on locality + street + number so every distinct address is
# geocoded only once; the helper key column is dropped right afterwards.
df_sample["full_address_temp"] = (
    df_sample["Miejscowość"].astype(str)
    + "_"
    + df_sample["Ulica"].astype(str)
    + "_"
    + df_sample["Numer posesji"].astype(str)
)
df_sample = (
    df_sample.drop_duplicates(subset=["full_address_temp"], keep="first")
    .drop(columns=["full_address_temp"])
    .reset_index(drop=True)
)

# Fixed seed keeps the sample reproducible. The size is capped at the number
# of available rows because sample() raises when n exceeds it.
sample_size = min(250, len(df_sample))
df_sample = df_sample.sample(n=sample_size, random_state=42).reset_index(drop=True)

results = []
validation_stats = {
    "total": 0,
    "geocoded": 0,
    "validated": 0,
    "centroid_fallback": 0,
    "unknown_teryt": 0,
    "failed": 0
}

for _, row in tqdm(
    df_sample.iterrows(), total=len(df_sample), desc="Searching for coordinates..."
):
    locality = row["Miejscowość"]
    street = row.get("Ulica", "").strip()
    teryt = row["TERYT gminy"]

    # For compound numbers such as "12/3" keep only the part before the slash.
    number = str(row.get("Numer posesji", "")).split("/")[0].strip()

    validation_status = None
    validation_stats["total"] += 1

    # Strategy: try address variants from most to least specific.
    address_variants = []
    if street and number:
        # 1. Full address (locality + street + number)
        address_variants.append(f"{locality}, {street} {number}")
        # 2. Full address with 'a' suffix (common variant)
        address_variants.append(f"{locality}, {street} {number}a")
    if street:
        # 3. Locality + street (for cases where the number is incomplete)
        address_variants.append(f"{locality}, {street}")
    # 4. Locality only (for rural areas)
    address_variants.append(locality)

    # Try each variant until one succeeds.
    result_data = None
    for address in address_variants:
        candidate = find_address(address, expected_teryt=teryt)
        if candidate.get("success"):
            result_data = candidate
            break

    geocoded = result_data is not None

    if geocoded:
        validation_result = validate_point(teryt, result_data.get("geometry"))

        if validation_result is None:
            # Validation was not possible (no geodata or TERYT unknown to it):
            # keep the geocoded point, but flag it as unverified.
            validation_status = "no_teryt_match"
            validation_stats["unknown_teryt"] += 1
        elif validation_result:
            validation_status = "valid"
            validation_stats["validated"] += 1
        else:
            # Point lies outside the expected municipality: replace it with
            # the municipality centroid when one is available.
            centroid = get_centroid(teryt)
            if centroid is not None and not centroid.is_empty:
                validation_status = "centroid_fallback"
                validation_stats["centroid_fallback"] += 1
                result_data["x"] = centroid.x
                result_data["y"] = centroid.y
                result_data["geometry"] = centroid
                result_data["matched_address"] = f"{locality} (centroid)"
            else:
                validation_status = "invalid_location"
                geocoded = False

    # One record shape for both outcomes; key order fixes the column order.
    record = {
        "original_address": f"{locality}, {street} {number}".strip(),
        "matched_address": None,
        "address": None,
        "x": None,
        "y": None,
        "geometry": None,
        "validation_status": validation_status or "failed",
        "geocoded": geocoded,
    }

    if geocoded:
        # Counted only once the point has survived validation, so that
        # geocoded + failed always equals total.
        validation_stats["geocoded"] += 1
        record.update(
            {
                "matched_address": result_data.get("matched_address"),
                "address": result_data.get("matched_address"),
                "x": result_data.get("x"),
                "y": result_data.get("y"),
                "geometry": result_data.get("geometry"),
            }
        )
    else:
        validation_stats["failed"] += 1

    results.append(record)

df_results = pd.DataFrame(results)

# Print validation statistics
print("\n=== Validation Statistics ===")
print(f"Total addresses: {validation_stats['total']}")
print(f"Successfully geocoded: {validation_stats['geocoded']}")
print(f"Validated points: {validation_stats['validated']}")
print(f"Centroid fallback: {validation_stats['centroid_fallback']}")
print(f"Unknown TERYT in geodata: {validation_stats['unknown_teryt']}")
print(f"Failed: {validation_stats['failed']}")

In [None]:
# Show the per-address geocoding results for manual inspection.
print(df_results)

In [None]:
# Recompute the summary directly from the results table.
total = len(df_results)
geocoded = df_results['geocoded'].sum()

status = df_results['validation_status']
validated = status.eq('valid').sum()
centroid_fb = status.eq('centroid_fallback').sum()
unknown_teryt = status.eq('no_teryt_match').sum()

# Avoid dividing by zero when there are no results at all.
percentage = 0 if total <= 0 else (geocoded / total) * 100

report = [
    "Geocoding & Validation Statistics:",
    f"Total addresses: {total}",
    f"Successfully geocoded: {geocoded}",
    f"  - Validated (point in municipality): {validated}",
    f"  - Centroid fallback: {centroid_fb}",
    f"  - Unknown TERYT in geodata: {unknown_teryt}",
    f"Failed: {total - geocoded}",
    f"Success rate: {percentage:.2f}%",
    "\nValidation breakdown:",
]
print("\n".join(report))
print(df_results['validation_status'].value_counts())

The high success rate demonstrates that the fallback strategy effectively handles:
- Urban addresses with full street information
- Rural addresses without street names
- Address variants and typos
- Missing building numbers

The next step is to apply this strategy to the full dataset.