In [3]:
import pandas as pd
import glob, os, re

folder_path = r"C:\Users\black\Documents\SINCA files"      # change if needed
out_path = r"C:\Users\black\Documents\SINCA2\Data_Pollution_cleaned2.csv"

all_files = glob.glob(os.path.join(folder_path, "*.csv"))
dfs = []

def merge_split_decimals(df: pd.DataFrame) -> pd.DataFrame:
    """
    Merge adjacent 'Unnamed:*' column into the left neighbor when left looks like integer
    and the unnamed right looks like a fractional part (digits only).
    """
    cols = list(df.columns)
    to_drop = []
    # Work as strings
    for c in cols:
        df[c] = df[c].astype(str)
    df = df.replace({"": pd.NA, "nan": pd.NA, "None": pd.NA})
    for i, c in enumerate(cols):
        if re.match(r"^Unnamed: ?\d+$", str(c)) and i > 0:
            left = cols[i-1]
            left_is_int = df[left].str.fullmatch(r"-?\d+").fillna(False)
            right_is_frac = df[c].str.fullmatch(r"\d+").fillna(False)
            mask = left_is_int & right_is_frac
            if mask.any():
                # preserve leading zeros on fractional part
                df.loc[mask, left] = df.loc[mask, left].astype(str) + "." + df.loc[mask, c].astype(str)
                to_drop.append(c)
    if to_drop:
        df = df.drop(columns=list(dict.fromkeys(to_drop)), errors="ignore")
    return df

def fix_decimal_cell(val):
    """Turn '13,4399' -> '13.4399', '69 9047' -> '69.9047', leave others unchanged."""
    if pd.isna(val):
        return val
    s = str(val).strip()
    # comma decimal
    if re.fullmatch(r"-?\d+,\d+", s):
        return s.replace(",", ".")
    # space decimal like '69 9047' (one or more spaces)
    if re.fullmatch(r"-?\d+\s+\d+", s):
        return re.sub(r"\s+", ".", s)
    return s

for file in all_files:
    fname = os.path.basename(file).replace(".csv", "")
    # split filename into medida and centro (keeps everything after first '__' as centro)
    medida, centro = fname.split("__", 1)

    # Read as strings to avoid early coercion
    df = pd.read_csv(file, sep=";", dtype=str, engine="python")

    # If CSV was ragged and created Unnamed columns, try to merge split decimals
    df = merge_split_decimals(df)

    # Trim whitespace and normalize empties
    df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
    df = df.replace({"": pd.NA})

    # Fix decimal patterns in every column where it's needed
    for col in df.columns:
        # quick check: only act if there is evidence of comma-decimal or spaced-decimal
        col_sample = df[col].dropna().astype(str)
        if col_sample.empty:
            continue
        if col_sample.str.contains(r",").any() or col_sample.str.contains(r"\d+\s+\d+").any():
            df[col] = df[col].apply(fix_decimal_cell)
            # try to convert to numeric (if appropriate)
            df[col] = pd.to_numeric(df[col], errors="ignore")

    # Attach metadata
    df["Medida"] = medida
    df["Centro"] = centro.replace("_", " ")

    dfs.append(df)

# Concatenate all files
combined_df = pd.concat(dfs, ignore_index=True)

# Convert FECHA (YYMMDD) to datetime (Excel will read it fine)
if "FECHA (YYMMDD)" in combined_df.columns:
    combined_df["FECHA (YYMMDD)"] = pd.to_datetime(
        combined_df["FECHA (YYMMDD)"].astype(str).str.strip(), format="%y%m%d", errors="coerce"
    )

# Drop unwanted columns
combined_df = combined_df.drop(columns=["HORA (HHMM)", "Unnamed: 5"], errors="ignore")

# Normalize and convert the registro columns
for col in ["Registros validados", "Registros preliminares", "Registros no validados"]:
    if col in combined_df.columns:
        combined_df[col] = (
            combined_df[col].astype(str)
            .str.replace(",", ".", regex=False)
            .str.replace(r"\s+", ".", regex=True)
        )
        combined_df[col] = pd.to_numeric(combined_df[col], errors="coerce")

# --- NEW: create total registros ---
registro_cols = ["Registros validados", "Registros preliminares", "Registros no validados"]
available_cols = [c for c in registro_cols if c in combined_df.columns]

if available_cols:
    combined_df["Total_registros"] = combined_df[available_cols].sum(axis=1, skipna=True)

    # Drop Medida+Centro groups where all Total_registros are NaN or 0
    combined_df = combined_df.groupby(["Medida", "Centro"], group_keys=False).filter(
        lambda g: not g["Total_registros"].fillna(0).eq(0).all()
    )

# Reset index for a clean output
combined_df = combined_df.reset_index(drop=True)

# --- NEW: aggregate to daily min, max, mean for each Medida–Centro ---
# First identify the column that contains the measure values.
# I’ll assume it’s "Registros validados" (adjust if needed).
measure_col = "Total_registros"

if measure_col in combined_df.columns:
    daily_df = (
        combined_df
        .groupby(["FECHA (YYMMDD)", "Medida", "Centro"], as_index=False)
        .agg(
            Min_val=(measure_col, "min"),
            Max_val=(measure_col, "max"),
            Mean_val=(measure_col, "mean")
        )
    )
else:
    daily_df = combined_df.copy()


# Save cleaned output
daily_df.to_csv(out_path, index=False)
print("Saved cleaned file to:", out_path)




  left_is_int = df[left].str.fullmatch(r"-?\d+").fillna(False)
  right_is_frac = df[c].str.fullmatch(r"\d+").fillna(False)
  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
  left_is_int = df[left].str.fullmatch(r"-?\d+").fillna(False)
  right_is_frac = df[c].str.fullmatch(r"\d+").fillna(False)
  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
  left_is_int = df[left].str.fullmatch(r"-?\d+").fillna(False)
  right_is_frac = df[c].str.fullmatch(r"\d+").fillna(False)
  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
  left_is_int = df[left].str.fullmatch(r"-?\d+").fillna(False)
  right_is_frac = df[c].str.fullmatch(r"\d+").fillna(False)
  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
  left_is_int = df[left].str.fullmatch(r"-?\d+").fillna(False)
  right_is_frac = df[c].str.fullmatch(r"\d+").fillna(False)
  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
  left_is_int = df[left].str.fullmatch(r

Saved cleaned file to: C:\Users\black\Documents\SINCA2\Data_Pollution_cleaned2.csv


In [5]:
import geopandas as gpd
import pandas as pd
from shapely.ops import nearest_points
import numpy as np


def build_df(entidades_path, centros_path, csv_path, radius_km=2):
    # Load shapefiles
    entidades = gpd.read_file(entidades_path)
    centros = gpd.read_file(centros_path)

    # Load allowed Estaciones from CSV
    valid_estaciones = pd.read_csv(csv_path)["Estación"].astype(str).unique()

    # Keep only matching centros
    centros = centros[centros["Estación"].astype(str).isin(valid_estaciones)].copy()

    # Project to metric CRS
    entidades = entidades.to_crs(epsg=32719)  # Adjust CRS if needed
    centros = centros.to_crs(entidades.crs)

    records = []

    for idx, ent in entidades.iterrows():
        ent_geom = ent.geometry

        # Compute distances
        centros["dist_m"] = centros.geometry.distance(ent_geom)
        nearby = centros[centros["dist_m"] <= radius_km * 1000].copy()

        if nearby.empty:
            records.append({
                "Entidad": ent.get("name", idx),  # adjust col name
                "Estación": ".",
                "cod_comuna": ent["CUT"],
                "Pers": ent["TOTAL_PERS"],
                "weight": 0
            })
        else:
            # Inverse distance weights
            nearby["inv_dist"] = 1 / nearby["dist_m"]
            nearby["weight"] = nearby["inv_dist"] / nearby["inv_dist"].sum()

            for _, cen in nearby.iterrows():
                records.append({
                    "Entidad": ent.get("name", idx),
                    "Estación": cen["Estación"],
                    "cod_comuna": ent["CUT"],
                    "Pers": ent["TOTAL_PERS"],  # keep original name
                    "weight": cen["weight"]
                })

    return pd.DataFrame(records)

def build_df2(entidades_path, centros_path, csv_path, radius_km=2):
    # Load shapefiles
    entidades = gpd.read_file(entidades_path)
    centros = gpd.read_file(centros_path)

    # Load allowed Estaciones from CSV
    valid_estaciones = pd.read_csv(csv_path)["Estación"].astype(str).unique()

    # Keep only matching centros
    centros = centros[centros["Estación"].astype(str).isin(valid_estaciones)].copy()

    # Project to metric CRS
    entidades = entidades.to_crs(epsg=32719)  # Adjust CRS if needed
    centros = centros.to_crs(entidades.crs)

    records = []

    for idx, ent in entidades.iterrows():
        ent_geom = ent.geometry

        # Compute distances
        centros["dist_m"] = centros.geometry.distance(ent_geom)
        nearby = centros[centros["dist_m"] <= radius_km * 1000].copy()

        if nearby.empty:
            records.append({
                "Entidad": ent.get("name", idx),  # adjust col name
                "Estación": ".",
                "cod_comuna": ent["COD_COMUNA"],
                "Pers": ent["TOTAL_PERS"],
                "weight": 0
            })
        else:
            # Inverse distance weights
            nearby["inv_dist"] = 1 / nearby["dist_m"]
            nearby["weight"] = nearby["inv_dist"] / nearby["inv_dist"].sum()

            for _, cen in nearby.iterrows():
                records.append({
                    "Entidad": ent.get("name", idx),
                    "Estación": cen["Estación"],
                    "cod_comuna": ent["COD_COMUNA"],
                    "Pers": ent["TOTAL_PERS"],  # keep original name
                    "weight": cen["weight"]
                })

    return pd.DataFrame(records)


# ---- USAGE ----
df21 = build_df(r"C:\Users\black\Dropbox\Proyectos\microdatos_manzana\Centroide\Centroides_Manzana_32718.shp", r"C:\Users\black\Downloads\microdatos_manzana\Centroide\Coordenadas_LatLon_32718.shp", r"C:\Users\black\Documents\SINCA2\centros.csv", radius_km=10)
df22 = build_df(r"C:\Users\black\Dropbox\Proyectos\microdatos_manzana\Centroide\Centroides_Manzana_32719.shp", r"C:\Users\black\Downloads\microdatos_manzana\Centroide\Coordenadas_LatLon_32719.shp", r"C:\Users\black\Documents\SINCA2\centros.csv", radius_km=10)
df23 = build_df2(r"C:\Users\black\Dropbox\Proyectos\microdatos_manzana\Centroide\Entidad_Promedios_32718.shp", r"C:\Users\black\Downloads\microdatos_manzana\Centroide\Coordenadas_LatLon_32718.shp", r"C:\Users\black\Documents\SINCA2\centros.csv", radius_km=10)
df24 = build_df2(r"C:\Users\black\Dropbox\Proyectos\microdatos_manzana\Centroide\Entidad_Promedios_32719.shp", r"C:\Users\black\Downloads\microdatos_manzana\Centroide\Coordenadas_LatLon_32719.shp", r"C:\Users\black\Documents\SINCA2\centros.csv", radius_km=10)
# Append both results
df21["shp"] = 1
df22["shp"] = 2
df23["shp"] = 3
df24["shp"] = 4
final_df2 = pd.concat([df21, df22, df23, df24], ignore_index=True)

# Export to Excel
final_df2.to_excel(r"C:\Users\black\Dropbox\Proyectos\microdatos_manzana\Centroide\entidades_centros3.xlsx", index=False)

In [13]:
import geopandas as gpd
import rioxarray as rxr
import rasterstats
import pandas as pd
import os
import xarray as xr
import tempfile

def process_cr2met(temp_folder, precip_folder, shapefile_path, shp_label):
    """
    Extract daily tmin, tmax, pr for each polygon in shapefile,
    including number of contributing raster cells.
    """
    # --- Read shapefile and reproject ---
    gdf = gpd.read_file(shapefile_path).reset_index()  # Entity = index
    gdf = gdf.to_crs("EPSG:4326")

    results = []

    def prepare_raster(da):
        """Ensure lat increasing, set spatial dims and CRS"""
        da = da.sortby("lat")
        da = da.rio.set_spatial_dims(x_dim="lon", y_dim="lat")
        da = da.rio.write_crs("EPSG:4326")
        return da

    def zonal_extract(gdf, raster_da, tmpfile):
        """Save slice to temp GeoTIFF, run zonal_stats, delete file."""
        raster_da.rio.to_raster(tmpfile)
        stats = rasterstats.zonal_stats(
            gdf,
            tmpfile,
            stats=["mean", "count"],
            all_touched=True,
            nodata=None
        )
        return {i: s for i, s in enumerate(stats)}

    # --- Temperature files ---
    for fname in sorted(os.listdir(temp_folder)):
        if not fname.endswith(".nc"):
            continue
        fpath = os.path.join(temp_folder, fname)
        ds = xr.open_dataset(fpath)

        for t in range(ds.dims["time"]):
            date = pd.to_datetime(ds["time"].values[t]).date()

            tmin = prepare_raster(ds["tmin"].isel(time=t))
            tmax = prepare_raster(ds["tmax"].isel(time=t))

            # Skip empty rasters
            if tmin.shape[0] == 0 or tmin.shape[1] == 0:
                continue

            with tempfile.NamedTemporaryFile(suffix=".tif", delete=False) as tmp:
                vals_tmin = zonal_extract(gdf, tmin, tmp.name)
            with tempfile.NamedTemporaryFile(suffix=".tif", delete=False) as tmp:
                vals_tmax = zonal_extract(gdf, tmax, tmp.name)

            for i, feat in enumerate(gdf.itertuples()):
                results.append({
                    "Entity": i,
                    "Pers": getattr(feat, "Pers", None),
                    "cod_comuna": getattr(feat, "cod_comuna", None),
                    "SHP": shp_label,
                    "date": date,
                    "tmin": vals_tmin[i]["mean"],
                    "tmax": vals_tmax[i]["mean"],
                    "pr": None,
                    "n_cells_tmin": vals_tmin[i]["count"],
                    "n_cells_tmax": vals_tmax[i]["count"],
                    "n_cells_pr": None
                })

        ds.close()

    # --- Precipitation files ---
    for fname in sorted(os.listdir(precip_folder)):
        if not fname.endswith(".nc"):
            continue
        fpath = os.path.join(precip_folder, fname)
        ds = xr.open_dataset(fpath)

        for t in range(ds.dims["time"]):
            date = pd.to_datetime(ds["time"].values[t]).date()

            pr = prepare_raster(ds["pr"].isel(time=t))
            if pr.shape[0] == 0 or pr.shape[1] == 0:
                continue

            with tempfile.NamedTemporaryFile(suffix=".tif", delete=False) as tmp:
                vals_pr = zonal_extract(gdf, pr, tmp.name)

            # Fill precipitation into existing records
            for rec in results:
                if rec["date"] == date:
                    rec["pr"] = vals_pr[rec["Entity"]]["mean"]
                    rec["n_cells_pr"] = vals_pr[rec["Entity"]]["count"]

        ds.close()

    df = pd.DataFrame(results)
    return df


# --- Example usage ---
temp_folder = r"C:\Users\black\temp daily"
precip_folder = r"C:\Users\black\precip"
shapefile = r"C:\Users\black\Dropbox\Proyectos\microdatos_entidad\Microdatos_Entidad.shp"
label = "Rural"

df = process_cr2met(temp_folder, precip_folder, shapefile, label)




  for t in range(ds.dims["time"]):


ValueError: width and height must be > 0

In [11]:
import xarray as xr

ds = xr.open_dataset(r"C:\Users\black\temp daily\CR2MET_tmin_tmax_v2.5_best_day_2024_04_005deg.nc")
print(ds)
print(ds.dims)         # lists dimension names
print(ds.coords)       # lists coordinate names

<xarray.Dataset> Size: 84MB
Dimensions:  (time: 30, lat: 800, lon: 220)
Coordinates:
  * time     (time) datetime64[ns] 240B 2024-04-01 2024-04-02 ... 2024-04-30
  * lon      (lon) float64 2kB -76.97 -76.92 -76.88 ... -66.12 -66.08 -66.03
  * lat      (lat) float64 6kB -56.98 -56.93 -56.88 ... -17.12 -17.07 -17.02
Data variables:
    tmax     (time, lat, lon) float64 42MB ...
    tmin     (time, lat, lon) float64 42MB ...
Attributes: (12/22)
    creation_date:           12-Aug-2024 06:35:20
    original_file:           /mnt/cirrus/cr2met_prodution/data_folder_cr2met_...
    original_file_mod_date:  2024-08-11 21:00:24.427215827 -0400
    av_ens_qtl:              0.0
    agg_metr:                median
    authors:                 Boisier et al.
    ...                      ...
    par_ns_reg_spl:          70
    par_lat_spl_sd:          1.5
    par_ele_spl_rdiff_max:   0.2
    era5_vars_s2:            tmin tmax t2m t2m06 t2m12 t2m18 t850 t700 td2m t...
    nb_months:               0
  