In [None]:
# Mount Google Drive and load the adopter-profile dataset

import pandas as pd
from google.colab import drive

# Expose the Drive contents under /content/drive for the rest of the notebook.
drive.mount('/content/drive')

# Source table used by the geocoding cell (copied there into df_geo).
df_all = pd.read_csv(
    "/content/drive/MyDrive/Dataset_TFM/carto/adoptantes_perfil_geo.csv"
)



Mounted at /content/drive


In [None]:
# GEO → CSV final para CARTO (lat/lon + the_geom)

!pip -q install geopy pandas tqdm

import os, unicodedata
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.notebook import tqdm
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

# Source: work on a copy so the loaded frame (df_all) is never modified.
df_geo = df_all.copy()

# Column preparation: make sure both coordinate columns exist (empty if absent).
for coord_col in ("lat", "lon"):
    if coord_col in df_geo.columns:
        continue
    df_geo[coord_col] = pd.NA

# Normalise the two text fields that make up the geocoding query.
# NOTE(review): astype(str) turns missing values into the literal text "nan",
# so such rows produce queries like "nan, nan, España" — confirm this is intended.
for text_col in ("poblacion", "provincia"):
    df_geo[text_col] = df_geo[text_col].astype(str).str.strip()

# Query key used both for geocoding and as the cache key: "<town>, <province>, España".
query_text = df_geo["poblacion"] + ", " + df_geo["provincia"] + ", España"
df_geo["direccion_busqueda"] = query_text.astype("string")

# Geocoder wrapped in a rate limiter: at least one second between calls, and
# geocoder exceptions are swallowed (the wrapped call then yields no result).
geolocator = Nominatim(user_agent="kartoffelita@gmail.com")
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1, swallow_exceptions=True)

# Geocoding cache on Drive (query key -> lat / lon / display_name).
CACHE_PATH = "/content/drive/MyDrive/Dataset_TFM/adoptantes_cache_geocoding.csv"
if not Path(CACHE_PATH).exists():
    # No cache file yet: start from an empty frame with the expected columns.
    cache = pd.DataFrame(columns=["direccion_busqueda","lat","lon","display_name"])
    cache["direccion_busqueda"] = cache["direccion_busqueda"].astype("string")
else:
    cache = pd.read_csv(CACHE_PATH, dtype={"direccion_busqueda":"string"})
    print(f"Cache cargada ({len(cache)} filas).")

# Only geocode the distinct queries that are not in the cache yet.
unique_q = df_geo[["direccion_busqueda"]].drop_duplicates().astype({"direccion_busqueda":"string"})
with_flag = unique_q.merge(cache[["direccion_busqueda"]], on="direccion_busqueda", how="left", indicator=True)
pend = with_flag.loc[with_flag["_merge"] == "left_only"].drop(columns="_merge").reset_index(drop=True)
print(f"Únicas totales: {len(unique_q)} | Pendientes: {len(pend)}")

batch, save_every = [], 50
n_pend = len(pend)
for i, q in enumerate(tqdm(pend["direccion_busqueda"], desc="Geocodificando únicas"), start=1):
    # Row stored when the query has no result or the lookup fails.
    placeholder = {"direccion_busqueda": q, "lat": pd.NA, "lon": pd.NA, "display_name": pd.NA}
    try:
        loc = geocode(q, exactly_one=True, country_codes="es", addressdetails=False, language="es", timeout=10)
        row = (
            {"direccion_busqueda": q, "lat": loc.latitude, "lon": loc.longitude, "display_name": getattr(loc, "address", None)}
            if loc
            else placeholder
        )
    except Exception:
        row = placeholder
    batch.append(row)

    # Flush to disk every `save_every` queries and once more after the last one.
    if i % save_every != 0 and i != n_pend:
        continue
    cache = pd.concat([cache, pd.DataFrame(batch)], ignore_index=True)
    cache = cache.drop_duplicates(subset=["direccion_busqueda"], keep="last")
    cache.to_csv(CACHE_PATH, index=False)
    batch = []
print("Cache final guardada en:", CACHE_PATH)

# Reintentos sobre claves sin coordenadas
def strip_accents(s):
    """Return `s` without combining marks (accents, tildes, diaeresis).

    The text is decomposed with Unicode NFKD normalisation and every
    combining character is dropped, so "España" becomes "Espana".
    Missing values (anything `pd.isna` reports as missing) are returned
    unchanged.
    """
    if pd.isna(s):
        return s
    decomposed = unicodedata.normalize("NFKD", s)
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return "".join(base_chars)

# Known spelling variants mapped to the form tried in the retry queries.
fix_map = {
    "A Coruña": "Coruña, A",
    "La Coruña": "Coruña, A",
    "L\u2019Hospitalet de Llobregat": "L'Hospitalet de Llobregat",
    "Vitoria": "Vitoria-Gasteiz"
}
# Bounding box for the bounded retry. geopy requires exactly two corner points,
# each given as (latitude, longitude); a flat 4-tuple raises
# GeocoderQueryError("Unsupported format for a bounding box"), which the
# rate limiter swallows, so the bounded retry silently never ran.
# The extents cover mainland Spain plus the Balearic and Canary Islands, Ceuta
# and Melilla (the previous numbers left out the Canaries, Menorca and Melilla);
# country_codes="es" on every call still restricts results to Spain.
viewbox = [(44.2, -18.3), (27.5, 4.6)]

# Cache keys that still have no coordinates are retried with alternative spellings.
sin_coords = cache[cache["lat"].isna() | cache["lon"].isna()].copy()
if not sin_coords.empty:
    print(f"Reintentos sobre {len(sin_coords)} claves sin coordenadas ...")
    again = []
    for q in tqdm(sin_coords["direccion_busqueda"].tolist(), desc="Reintentando"):
        # Keys are built as "<town>, <province>, España"; fall back to the whole
        # key as the town when it does not split into three parts.
        try:
            pobl, prov, _ = [x.strip() for x in q.split(",", 2)]
        except Exception:
            pobl, prov = q, ""
        pobl2 = fix_map.get(pobl, pobl)
        prov2 = fix_map.get(prov, prov)

        # Candidate queries in priority order. dict.fromkeys drops duplicates
        # while keeping the order, so an identical string (e.g. when fix_map
        # changes nothing) is not sent again through the rate-limited geocoder.
        variantes = list(dict.fromkeys([
            q,
            f"{pobl2}, {prov2}, España",
            f"{strip_accents(pobl2)}, {strip_accents(prov2)}, Espana",
            f"{prov2}, {pobl2}, España"
        ]))
        found = None
        for v in variantes:
            # First an unrestricted search, then one restricted to the bounding box.
            loc = geocode(v, exactly_one=True, country_codes="es", addressdetails=False, language="es", timeout=10)
            if not loc:
                loc = geocode(v, exactly_one=True, country_codes="es", addressdetails=False, language="es",
                              timeout=10, viewbox=viewbox, bounded=True)
            if loc:
                # Results are stored under the ORIGINAL key so the later merge still matches.
                found = {"direccion_busqueda": q, "lat": loc.latitude, "lon": loc.longitude, "display_name": getattr(loc, "address", None)}
                break
        if found is None:
            again.append({"direccion_busqueda": q, "lat": pd.NA, "lon": pd.NA, "display_name": pd.NA})
        else:
            again.append(found)

    # Replace the retried keys in the cache with their new rows and persist it.
    tmp = pd.DataFrame(again)
    cache = cache.drop(cache[cache["direccion_busqueda"].isin(tmp["direccion_busqueda"])].index)
    cache = pd.concat([cache, tmp], ignore_index=True)
    cache = cache.drop_duplicates(subset=["direccion_busqueda"], keep="last")
    cache.to_csv(CACHE_PATH, index=False)
    print("Reintentos aplicados y cache actualizada.")

# Bring the cached coordinates into the dataset. Columns that already exist in
# df_geo keep their name; the cache copy arrives with a "_cache" suffix.
cache_view = cache[["direccion_busqueda","lat","lon","display_name"]]
df_geo = df_geo.merge(cache_view, on="direccion_busqueda", how="left", suffixes=("", "_cache"))

# Fill gaps in the existing columns from the cache copy, then drop the copy.
for base_col in ("lat", "lon", "display_name"):
    cached_col = f"{base_col}_cache"
    if cached_col not in df_geo.columns:
        continue
    df_geo[base_col] = df_geo[base_col].fillna(df_geo[cached_col])
    df_geo = df_geo.drop(columns=[cached_col])

# Coordinates must be numeric; anything unparsable becomes NaN.
for coord_col in ("lat", "lon"):
    df_geo[coord_col] = pd.to_numeric(df_geo[coord_col], errors="coerce")

# Fallback: rows without coordinates get the coordinates of their province.
m_miss = df_geo["lat"].isna() | df_geo["lon"].isna()

# The resolution column must exist whether or not the fallback runs; it used to
# be created only inside the `if` below, so the labelling step raised KeyError
# when no row needed the fallback. Rows without coordinates carry no label.
if "geocode_resolution" not in df_geo.columns:
    df_geo["geocode_resolution"] = pd.NA
# Object dtype so string labels can be written even if the column was all-NaN.
df_geo["geocode_resolution"] = df_geo["geocode_resolution"].astype("object")
df_geo.loc[m_miss, "geocode_resolution"] = pd.NA

faltan_prov = df_geo.loc[m_miss, ["provincia"]].drop_duplicates()
if len(faltan_prov):
    print(f"Fallback por provincia para {len(faltan_prov)} provincias...")
    prov_cache = {}
    for prov in tqdm(faltan_prov["provincia"], desc="Provincias"):
        # Candidate queries for the province, most specific first.
        cand = [f"Provincia de {prov}, España", f"{prov}, España", f"{strip_accents(prov)}, Espana"]
        locp = None
        for v in cand:
            # Unrestricted search first, then one restricted to the bounding box.
            locp = geocode(v, exactly_one=True, country_codes="es", addressdetails=False, language="es", timeout=10)
            if not locp:
                locp = geocode(v, exactly_one=True, country_codes="es", addressdetails=False, language="es",
                               timeout=10, viewbox=viewbox, bounded=True)
            if locp:
                break
        if locp:
            prov_cache[prov] = (locp.latitude, locp.longitude)
    # Apply the province coordinates only to rows that were missing them.
    for prov, (la, lo) in prov_cache.items():
        m = m_miss & (df_geo["provincia"] == prov)
        df_geo.loc[m, "lat"] = la
        df_geo.loc[m, "lon"] = lo
        df_geo.loc[m, "geocode_resolution"] = "province"

# Mark as 'locality' only the rows that have coordinates and no label yet.
# Rows that could not be geocoded at all keep an empty resolution instead of
# being reported as locality-level matches.
has_coords = df_geo["lat"].notna() & df_geo["lon"].notna()
df_geo.loc[has_coords & df_geo["geocode_resolution"].isna(), "geocode_resolution"] = "locality"

# the_geom as WKT: "POINT(<lon> <lat>)" where both coordinates are present,
# None otherwise.
has_point = df_geo["lat"].notna() & df_geo["lon"].notna()
wkt_point = "POINT(" + df_geo["lon"].astype(str) + " " + df_geo["lat"].astype(str) + ")"
df_geo["the_geom"] = np.where(has_point, wkt_point, None)

# Export the final CSV for CARTO.
# NOTE(review): this path is the same file the first cell reads into df_all,
# so each run overwrites its own input — confirm that is intended.
OUT_DIR = "/content/drive/MyDrive/Dataset_TFM/carto"
os.makedirs(OUT_DIR, exist_ok=True)

# Desired column order; columns absent from df_geo are skipped below.
cols_out = [
    "id","poblacion","provincia","edad","genero","grupo_edad","vivienda","horario_laboral",
    "experiencia_animales","tiempo_disponible","tipo_animal","devuelto",
    # rule-based category columns:
    "perfil_adoptante","apto",
    # geo:
    "lat","lon","the_geom","display_name","geocode_resolution"
]
cols_out = [name for name in cols_out if name in df_geo.columns]

OUT_PATH = os.path.join(OUT_DIR, "adoptantes_perfil_geo.csv")
df_geo.loc[:, cols_out].to_csv(OUT_PATH, index=False, encoding="utf-8")

# Summary of what was written and at which resolution rows were geocoded.
n_locality = int(df_geo['geocode_resolution'].eq('locality').sum())
n_province = int(df_geo['geocode_resolution'].eq('province').sum())
print("CSV final para CARTO:", OUT_PATH)
print("Filas exportadas:", len(df_geo))
print("Con municipio (locality):", n_locality)
print("Con fallback provincia:", n_province)

Cache cargada (474 filas).
Únicas totales: 474 | Pendientes: 0


Geocodificando únicas: 0it [00:00, ?it/s]

Cache final guardada en: /content/drive/MyDrive/Dataset_TFM/adoptantes_cache_geocoding.csv
Reintentos sobre 34 claves sin coordenadas ...


Reintentando:   0%|          | 0/34 [00:00<?, ?it/s]

[1;30;43mDie letzten 5000 Zeilen der Streamingausgabe wurden abgeschnitten.[0m
    raise GeocoderQueryError("Unsupported format for a bounding box")
geopy.exc.GeocoderQueryError: Unsupported format for a bounding box
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/geopy/extra/rate_limiter.py", line 136, in _retries_gen
    yield i  # Run the function.
    ^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/geopy/extra/rate_limiter.py", line 274, in __call__
    res = self.func(*args, **kwargs)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/geopy/geocoders/nominatim.py", line 250, in geocode
    params['viewbox'] = self._format_bounding_box(
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/geopy/geocoders/base.py", line 326, in _format_bounding_box
    raise GeocoderQueryError("Unsupported format for a bounding box")
geopy.exc.GeocoderQueryError: Unsupported f

Reintentos aplicados y cache actualizada.
Fallback por provincia para 20 provincias...


  cache = pd.concat([cache, tmp], ignore_index=True)
  df_geo[c] = df_geo[c].fillna(df_geo[f"{c}_cache"])
  df_geo[c] = df_geo[c].fillna(df_geo[f"{c}_cache"])


Provincias:   0%|          | 0/20 [00:00<?, ?it/s]

Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/geopy/extra/rate_limiter.py", line 136, in _retries_gen
    yield i  # Run the function.
    ^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/geopy/extra/rate_limiter.py", line 274, in __call__
    res = self.func(*args, **kwargs)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/geopy/geocoders/nominatim.py", line 250, in geocode
    params['viewbox'] = self._format_bounding_box(
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/geopy/geocoders/base.py", line 326, in _format_bounding_box
    raise GeocoderQueryError("Unsupported format for a bounding box")
geopy.exc.GeocoderQueryError: Unsupported format for a bounding box
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/geopy/extra/rate_limiter.py", line 136, in _retries_gen
    yield i  # Run the function.
    ^^^^^^^
  File "/u

CSV final para CARTO: /content/drive/MyDrive/Dataset_TFM/carto/adoptantes_perfil_geo.csv
Filas exportadas: 2299
Con municipio (locality): 2265
Con fallback provincia: 34


In [None]:
import pandas as pd

# Location of the geocoding cache on Drive
cache_path = "/content/drive/MyDrive/Dataset_TFM/adoptantes_cache_geocoding.csv"

# Load the cache, reading the query key as a nullable string column
cache = pd.read_csv(cache_path, dtype={"direccion_busqueda":"string"})

# Pending entries are the ones missing at least one coordinate
sin_lat_o_lon = cache[["lat", "lon"]].isna().any(axis=1)
pendientes = cache.loc[sin_lat_o_lon]

print(f"📌 Total pendientes: {len(pendientes)}")

# Display all of them as a DataFrame with a fresh 0..n-1 index
pendientes.reset_index(drop=True)

📌 Total pendientes: 34


Unnamed: 0,direccion_busqueda,lat,lon,display_name
0,"San Fdo De Henares, Madrid, España",,,
1,"Valle De Luena, Cantabria, España",,,
2,"Bizkaia, Pais Vazco, España",,,
3,"Leuze En Hainaut, Bélgica, España",,,
4,"Barcelona, Catanuna, España",,,
5,"Borgloon, Bélgica, España",,,
6,"Queluz - Lisboa, Portugal, España",,,
7,"Blaco, Soria, España",,,
8,"Villabilla, Madrid, España",,,
9,"Niva, Dinamarca, España",,,
