In [15]:
import geopandas as gpd
from fuzzywuzzy import process
import awswrangler as wr
import os
import json
import io
import dotenv

# Load variables from a local .env file into the process environment so the
# os.getenv() lookups in the configuration cell can see them.
dotenv.load_dotenv()

True

In [16]:
# Runtime configuration read from the environment; each value is None when
# the corresponding variable is not set.
# NOTE(review): AWS_PROFILE is not referenced again in the visible cells.
AWS_PROFILE = os.environ.get("AWS_PROFILE")
# Prefix under which the "schemas/..." objects are read and written via wr.s3.
DATA_PATH = os.environ.get("DATA_PATH")

In [6]:
def normalize_str_choices(value, allowed_values, fz_threshold=70):
    """Map a free-text value onto its closest entry in ``allowed_values``.

    Args:
        value: Text to normalize. Anything that is not a ``str`` (including
            ``None``) is rejected.
        allowed_values: Candidate canonical values handed to
            ``process.extractOne`` for fuzzy matching.
        fz_threshold: Minimum fuzzy-match score (inclusive) the best candidate
            must reach to be accepted.

    Returns:
        The best-matching entry of ``allowed_values``, or ``None`` when
        ``value`` is not a string, no candidate is returned, or the best
        score is below ``fz_threshold``.
    """
    # isinstance() already rejects None, so one guard covers both cases.
    if not isinstance(value, str):
        return None

    best = process.extractOne(value, allowed_values)
    if best is None:
        return None

    # The match comes first in the result, its score second.
    candidate, score = best[0], best[1]
    return candidate if score >= fz_threshold else None

In [17]:
# Download the municipality schema from S3 into an in-memory buffer and parse
# it as JSON (no temporary file is written).
stream = io.BytesIO()
wr.s3.download(
    path=f"{DATA_PATH}/schemas/spain_municipality.json",
    local_file=stream,
)
spain_municipality = json.loads(stream.getvalue())

In [18]:
# Raw boundary layers, read from the GeoJSON files kept next to the notebook.
df_communities = gpd.read_file(filename="./geojsons/spain-communities.geojson")
df_provinces = gpd.read_file(filename="./geojsons/spain-provinces.geojson")

In [20]:
# Canonical names used as fuzzy-matching targets: the schema's keys are the
# communities, and its values are flattened into a single list of provinces.
spain_comunidades = list(spain_municipality)
spain_provinces = [
    province
    for community_provinces in spain_municipality.values()
    for province in community_provinces
]

In [31]:
# Cleaned communities layer: snap every name to its canonical spelling
# (names without a close enough match become missing), cast the code and name
# columns to the pandas string dtype, and keep only the columns to publish.
df_silver_communities = (
    df_communities
    .assign(
        name=lambda gdf: gdf["name"].map(
            lambda raw_name: normalize_str_choices(raw_name, spain_comunidades)
        )
    )
    .astype({"cod_ccaa": "string", "name": "string"})
    [["cod_ccaa", "name", "geometry"]]
)
df_silver_communities.head()

Unnamed: 0,cod_ccaa,name,geometry
0,7,Castilla y León,"MULTIPOLYGON (((-4.89358 43.23848, -4.8392 43...."
1,9,Cataluña,"MULTIPOLYGON (((0.71604 42.85832, 0.85552 42.8..."
2,18,Ceuta (Ciudad de),"MULTIPOLYGON (((-5.34496 35.87106, -5.37291 35..."
3,14,Murcia (Región de),"MULTIPOLYGON (((-1.14985 38.74607, -1.11967 38..."
4,17,Rioja (La),"MULTIPOLYGON (((-2.99723 42.64214, -2.94645 42..."


In [38]:
# Names that fuzzy matching could not resolve end up missing, so fail loudly
# here and report which columns are affected.
# NOTE(review): this does not detect two source names collapsing onto the same
# canonical name; consider also checking uniqueness of "name".
assert df_silver_communities.isna().sum().sum() == 0, (
    "df_silver_communities contains missing values; null count per column:\n"
    f"{df_silver_communities.isna().sum()}"
)

In [32]:
# Cleaned provinces layer: snap every name to its canonical spelling
# (names without a close enough match become missing), cast the code and name
# columns to the pandas string dtype, and keep only the columns to publish.
df_silver_provinces = (
    df_provinces
    .assign(
        name=lambda gdf: gdf["name"].map(
            lambda raw_name: normalize_str_choices(raw_name, spain_provinces)
        )
    )
    .astype({"cod_prov": "string", "cod_ccaa": "string", "name": "string"})
    [["cod_prov", "cod_ccaa", "name", "geometry"]]
)
df_silver_provinces.head()

Unnamed: 0,cod_prov,cod_ccaa,name,geometry
0,7,3,Balears (Illes),"MULTIPOLYGON (((3.21364 39.95751, 3.1544 39.92..."
1,33,18,Asturias,"MULTIPOLYGON (((-5.84083 43.66062, -5.83002 43..."
2,15,11,Coruña (A),"MULTIPOLYGON (((-7.6803 43.78714, -7.66231 43...."
3,17,8,Girona,"MULTIPOLYGON (((1.74871 42.49482, 1.82355 42.4..."
4,35,4,Palmas (Las),"MULTIPOLYGON (((-13.89051 28.75685, -13.83632 ..."


In [37]:
# Names that fuzzy matching could not resolve end up missing, so fail loudly
# here and report which columns are affected.
# NOTE(review): this does not detect two source names collapsing onto the same
# canonical name; consider also checking uniqueness of "name".
assert df_silver_provinces.isna().sum().sum() == 0, (
    "df_silver_provinces contains missing values; null count per column:\n"
    f"{df_silver_provinces.isna().sum()}"
)

# Almacenamos los datos en S3

In [42]:
# Write both cleaned layers to local GeoJSON files; the upload cells read
# them back from these exact paths.
df_silver_communities.to_file(filename="/tmp/spain-communities.geojson", driver="GeoJSON")
df_silver_provinces.to_file(filename="/tmp/spain-provinces.geojson", driver="GeoJSON")

In [46]:
# Publish the cleaned communities layer under the schemas/ prefix in S3.
wr.s3.upload(
    local_file="/tmp/spain-communities.geojson",
    path=f"{DATA_PATH}/schemas/spain-communities.geojson",
)

In [48]:
# Publish the cleaned provinces layer under the schemas/ prefix in S3.
wr.s3.upload(
    local_file="/tmp/spain-provinces.geojson",
    path=f"{DATA_PATH}/schemas/spain-provinces.geojson",
)