In [3]:
import pandas as pd
import numpy as np
import xarray as xr
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
import os
import statsmodels.formula.api as smf


In [4]:

def load_production_data():
    """Load the monthly production matrix and reshape it to long format.

    Reads ``produccion_mensual_matrix.csv`` (one ``FECHA`` column plus one
    column per ``<puerto>_<especie>`` pair) and melts it so that every row is
    a single (date, port, species) observation.

    Returns:
        pandas.DataFrame with columns ``FECHA`` (datetime64), ``valor``,
        ``puerto`` and ``especie``.
    """
    print("Cargando datos de producción artesanal entre 2013 y 2023 en peru...")
    df = pd.read_csv('../data/imarpe/processed/produccion_mensual_matrix.csv')
    df['FECHA'] = pd.to_datetime(df['FECHA'])

    df_long = df.melt(
        id_vars="FECHA",
        var_name="puerto_especie",
        value_name="valor",
    )

    # Split on the LAST underscore only: a port name that itself contains "_"
    # would otherwise expand to more than two columns and make the assignment
    # below raise. Headers with exactly one "_" are split exactly as before.
    # NOTE(review): assumes the species is always the final "_"-separated token.
    df_long[["puerto", "especie"]] = (
        df_long["puerto_especie"].str.rsplit("_", n=1, expand=True)
    )

    return df_long.drop(columns="puerto_especie")

def add_port_coordinates(df_long, coordinates_file):
    """Attach latitude/longitude to each production row by port name.

    Port names on both sides are compared after casting to ``str``, stripping
    surrounding whitespace and upper-casing.

    Args:
        df_long: long-format production frame with a ``puerto`` column.
            It is NOT modified; the normalised names only appear in the result.
        coordinates_file: path or file-like object accepted by
            ``pandas.read_csv`` holding ``PUERTO``, ``LATITUD`` and
            ``LONGITUD`` columns.

    Returns:
        A new DataFrame equal to ``df_long`` (with normalised ``puerto``) plus
        ``lat`` and ``lon`` columns; ports absent from the coordinates file
        get NaN (left join).
    """
    print("Agregando coordenadas de puertos en el dataframe de pesca artesanal...")
    coordenadas = pd.read_csv(coordinates_file).rename(
        columns={"LATITUD": "lat", "LONGITUD": "lon", "PUERTO": "puerto"}
    )
    coordenadas["puerto"] = coordenadas["puerto"].astype(str).str.strip().str.upper()

    # Normalise on a derived frame so the caller's DataFrame is left untouched.
    produccion = df_long.assign(
        puerto=df_long["puerto"].astype(str).str.strip().str.upper()
    )

    return produccion.merge(
        coordenadas[["puerto", "lat", "lon"]],
        on="puerto",
        how="left",
    )

def load_nearest_sst_data(df_long_geo, type='mean'):
    """Attach the SST anomaly of the nearest grid cell to every production row.

    For each distinct port that has coordinates, the grid cell closest to the
    port is selected pointwise and its time series is joined back on
    (``FECHA``, ``puerto``).

    Args:
        df_long_geo: production frame with ``FECHA``, ``puerto``, ``lat`` and
            ``lon`` columns.
        type: which monthly anomaly file to read: ``'mean'``, ``'max'`` or
            ``'sum'``.

    Returns:
        ``df_long_geo`` left-joined with the dataset's variables, plus
        ``lat_sst`` / ``lon_sst`` (coordinates of the selected grid cell).
        Rows whose port has no coordinates, or whose ``FECHA`` has no matching
        time step, get NaN.

    Raises:
        ValueError: if ``type`` is not one of the supported aggregations.
    """
    sst_files = {
        'mean': '../data/MODIS/processed/sst_anomaly_monthly_2002_2025.nc',
        'max': '../data/MODIS/processed/sst_anomaly_monthly_max_2002_2025.nc',
        'sum': '../data/MODIS/processed/sst_anomaly_monthly_sum_2002_2025.nc',
    }
    if type not in sst_files:
        # Previously an unknown value left ds_sst unbound (UnboundLocalError).
        raise ValueError(f"type must be one of {sorted(sst_files)}, got {type!r}")

    print("Cargando datos de SST y asignando al dataframe de pesca artesanal...")

    # One lookup per port rather than per row; ports without coordinates
    # cannot be located on the grid and are skipped.
    puertos = (
        df_long_geo[["puerto", "lat", "lon"]]
        .dropna(subset=["lat", "lon"])
        .drop_duplicates(subset="puerto")
    )

    with xr.open_dataset(sst_files[type]) as ds_sst:
        # Indexers that share the "puerto" dimension are paired element-wise.
        # Plain arrays/Series would instead be combined as an outer product
        # (every lat with every lon).
        sst_puntos = ds_sst.sel(
            lat=xr.DataArray(puertos["lat"].to_numpy(), dims="puerto"),
            lon=xr.DataArray(puertos["lon"].to_numpy(), dims="puerto"),
            method="nearest",
        ).assign_coords(puerto=("puerto", puertos["puerto"].to_numpy()))
        df_sst = sst_puntos.to_dataframe().reset_index()

    df_sst = df_sst.rename(
        columns={"time": "FECHA", "lat": "lat_sst", "lon": "lon_sst"}
    )

    # Join on port as well as date; joining on FECHA alone would pair each
    # production row with the series of every port.
    # NOTE(review): assumes the dataset's time stamps coincide exactly with
    # FECHA values — confirm (load_sst_using_valid_pixels_monthly matches by
    # year-month instead).
    return df_long_geo.merge(df_sst, on=["FECHA", "puerto"], how="left")


def find_valid_modis_pixel(
    ds_sst,
    lat_puerto,
    lon_puerto,
    search_radius=0.5,
    min_valid_frac=0.7
):
    """Return the closest grid cell to a port that has enough non-null SST data.

    Every (lat, lon) grid combination within ``search_radius`` (in coordinate
    units, applied per axis) of the port is examined. A cell qualifies when
    the fraction of non-null values in its ``sst`` series is at least
    ``min_valid_frac``.

    Returns:
        dict with keys ``lat``, ``lon``, ``valid_frac`` and ``distance``
        (Euclidean distance in coordinate units) for the qualifying cell with
        the smallest distance, or ``None`` when no cell qualifies.
    """
    grid_lat = ds_sst["lat"].values
    grid_lon = ds_sst["lon"].values

    near_lats = grid_lat[np.abs(grid_lat - lat_puerto) <= search_radius]
    near_lons = grid_lon[np.abs(grid_lon - lon_puerto) <= search_radius]

    best = None

    for lat in near_lats:
        for lon in near_lons:
            coverage = ds_sst["sst"].sel(lat=lat, lon=lon).notnull().mean().item()

            # Written as a negated >= so a NaN coverage is rejected as well.
            if not (coverage >= min_valid_frac):
                continue

            offset = np.sqrt((lat - lat_puerto)**2 + (lon - lon_puerto)**2)

            # Strict comparison keeps the first cell found among equal distances.
            if best is None or offset < best["distance"]:
                best = {
                    "lat": lat,
                    "lon": lon,
                    "valid_frac": coverage,
                    "distance": offset
                }

    return best

In [5]:
# Long-format production table, then the same table with port coordinates joined on.
df_long = load_production_data()
df_long_geo = add_port_coordinates(
    df_long,
    coordinates_file="../data/imarpe/processed/coordenadas_puertos.csv",
)

Cargando datos de producción artesanal entre 2013 y 2023 en peru...
Agregando coordenadas de puertos en el dataframe de pesca artesanal...


In [6]:
# Open the monthly SST anomaly dataset (2002-2025 file) for the per-port pixel
# search (find_valid_modis_pixel takes it as its ds_sst argument).
ds_sst = xr.open_dataset('../data/MODIS/processed/sst_anomaly_monthly_2002_2025.nc')

In [7]:
# One row per distinct (port, lat, lon) combination, re-indexed from 0.
puertos = df_long_geo.loc[:, ["puerto", "lat", "lon"]].drop_duplicates(ignore_index=True)


In [8]:
# Assign to every port the closest MODIS pixel that has enough valid data.
# The search issues one `.sel` per candidate pixel, so it is slow; the result
# is cached on disk and the search only runs when the cache file is missing.
PIXEL_CACHE_PATH = '../data/imarpe/processed/pixeles_modis_asignados_a_puertos_artesanales.csv'
PIXEL_COLUMNS = [
    "puerto", "lat_puerto", "lon_puerto",
    "lat_modis", "lon_modis", "distance_deg", "valid_frac",
]

pixel_por_puerto = []

if os.path.exists(PIXEL_CACHE_PATH):
    df_pixel_modis = pd.read_csv(PIXEL_CACHE_PATH)
else:
    for _, row in puertos.iterrows():
        res = find_valid_modis_pixel(
            ds_sst,
            lat_puerto=row["lat"],
            lon_puerto=row["lon"],
            search_radius=0.5,
            min_valid_frac=0.7
        )

        # Ports with no qualifying pixel are kept, with NaN in the pixel fields.
        found = res is not None
        pixel_por_puerto.append({
            "puerto": row["puerto"],
            "lat_puerto": row["lat"],
            "lon_puerto": row["lon"],
            "lat_modis": res["lat"] if found else np.nan,
            "lon_modis": res["lon"] if found else np.nan,
            "distance_deg": res["distance"] if found else np.nan,
            "valid_frac": res["valid_frac"] if found else np.nan
        })

    # Explicit columns keep the schema stable even when `puertos` is empty.
    df_pixel_modis = pd.DataFrame(pixel_por_puerto, columns=PIXEL_COLUMNS)
    df_pixel_modis.to_csv(PIXEL_CACHE_PATH, index=False)


In [9]:
def load_sst_using_valid_pixels_monthly(df_long_geo, df_pixel_modis, type="mean"):
    """Join each port's monthly SST anomaly series onto the production rows.

    The pixel assigned to each port is taken from ``df_pixel_modis``; its
    ``sst`` series is matched to production rows by port and year-month.

    Args:
        df_long_geo: production frame with at least ``FECHA`` and ``puerto``.
            It is copied, not modified.
        df_pixel_modis: one row per port with ``puerto``, ``lat_modis``,
            ``lon_modis``, ``valid_frac`` and ``distance_deg``; ports without
            an assigned pixel carry NaN in ``lat_modis`` / ``lon_modis``.
        type: which monthly anomaly file to read: ``"mean"``, ``"max"`` or
            ``"sum"``.

    Returns:
        A copy of ``df_long_geo`` with normalised ``puerto``, an ``anio_mes``
        key (``YYYY-MM``), the pixel columns and ``sst_anom``. ``sst_anom`` is
        NaN for ports without a pixel or months without data.

    Raises:
        ValueError: if ``type`` is not one of the supported aggregations.
    """
    sst_files = {
        "mean": "../data/MODIS/processed/sst_anomaly_monthly_2002_2025.nc",
        "max": "../data/MODIS/processed/sst_anomaly_monthly_max_2002_2025.nc",
        "sum": "../data/MODIS/processed/sst_anomaly_monthly_sum_2002_2025.nc",
    }
    if type not in sst_files:
        # Previously an unknown value left ds_sst unbound (UnboundLocalError).
        raise ValueError(f"type must be one of {sorted(sst_files)}, got {type!r}")

    df = df_long_geo.copy()
    df["puerto"] = df["puerto"].astype(str).str.strip().str.upper()

    px = df_pixel_modis.copy()
    px["puerto"] = px["puerto"].astype(str).str.strip().str.upper()

    df["FECHA"] = pd.to_datetime(df["FECHA"])
    df["anio_mes"] = df["FECHA"].dt.to_period("M").astype(str)

    df = df.merge(
        px[["puerto", "lat_modis", "lon_modis", "valid_frac", "distance_deg"]],
        on="puerto",
        how="left"
    )

    puertos_validos = (
        px.dropna(subset=["lat_modis", "lon_modis"])[["puerto", "lat_modis", "lon_modis"]]
        .drop_duplicates()
        .reset_index(drop=True)
    )

    if puertos_validos.empty:
        # Nothing to look up: pd.concat on an empty list would raise, so
        # return the frame with an all-NaN anomaly column instead.
        df["sst_anom"] = np.nan
        return df

    sst_rows = []
    with xr.open_dataset(sst_files[type]) as ds_sst:
        for r in puertos_validos.itertuples(index=False):
            # The pixel coordinates were round-tripped through CSV, so they
            # may differ from the grid values by a rounding error; "nearest"
            # still resolves to the pixel they came from.
            da = ds_sst["sst"].sel(lat=r.lat_modis, lon=r.lon_modis, method="nearest")
            tmp = da.to_dataframe(name="sst_anom").reset_index()
            tmp = tmp.rename(columns={"time": "FECHA"})
            tmp["FECHA"] = pd.to_datetime(tmp["FECHA"])
            tmp["anio_mes"] = tmp["FECHA"].dt.to_period("M").astype(str)
            tmp["puerto"] = r.puerto
            sst_rows.append(tmp[["puerto", "anio_mes", "sst_anom"]])

    # Keep a single value per (port, month) so the join below cannot multiply rows.
    df_sst = pd.concat(sst_rows, ignore_index=True).drop_duplicates(subset=["puerto", "anio_mes"])

    return df.merge(df_sst, on=["puerto", "anio_mes"], how="left")

In [10]:
# Port -> MODIS pixel assignment, read back from CSV. The commented-out line
# below is the export that writes this same file.
# df_pixel_modis.to_csv('../data/imarpe/processed/pixeles_modis_asignados_a_puertos_artesanales.csv', index=False)
df_pixel_modis = pd.read_csv('../data/imarpe/processed/pixeles_modis_asignados_a_puertos_artesanales.csv')

In [11]:
# Join the SST anomaly series (type="mean" file) of each port's assigned pixel
# onto the production rows; the result gains an `sst_anom` column.
df_final = load_sst_using_valid_pixels_monthly(df_long_geo, df_pixel_modis, type="mean")

In [12]:

# Fraction of records with valor == 0 for each species, smallest fraction first.
zero_stats = (
    df_long_geo["valor"]
    .eq(0)
    .groupby(df_long_geo["especie"])
    .mean()
    .reset_index(name="frac_zeros")
    .sort_values("frac_zeros")
)

zero_stats


Unnamed: 0,especie,frac_zeros
3,LISA,0.445431
1,CABALLA,0.525067
0,BONITO,0.552931
2,JUREL,0.607553
6,POTA,0.69675
4,MERLUZA,0.745016
5,PERICO,0.755478


In [16]:

# Species in the first row of zero_stats (sorted ascending by frac_zeros).
especie_sel = zero_stats["especie"].iloc[0]

df_e = df_final.loc[df_final["especie"] == especie_sel].copy()

# OLS of catch on SST anomaly with one dummy per port; HC1 robust covariance.
model = smf.ols("valor ~ sst_anom + C(puerto)", data=df_e).fit(cov_type="HC1")

model.summary()

0,1,2,3
Dep. Variable:,valor,R-squared:,0.161
Model:,OLS,Adj. R-squared:,0.155
Method:,Least Squares,F-statistic:,31.83
Date:,"Fri, 02 Jan 2026",Prob (F-statistic):,0.0
Time:,16:40:18,Log-Likelihood:,-48230.0
No. Observations:,8277,AIC:,96590.0
Df Residuals:,8212,BIC:,97050.0
Df Model:,64,,
Covariance Type:,HC1,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,7.6030,1.797,4.231,0.000,4.081,11.125
C(puerto)[T.ANCON],3.7387,2.403,1.556,0.120,-0.971,8.448
C(puerto)[T.ATICO],-7.7448,1.824,-4.247,0.000,-11.319,-4.171
C(puerto)[T.BAYOVAR],-7.6630,1.821,-4.209,0.000,-11.232,-4.094
C(puerto)[T.CABO BLANCO],41.1581,6.009,6.850,0.000,29.381,52.935
C(puerto)[T.CALETA LA CRUZ],4.3108,3.182,1.355,0.176,-1.926,10.548
C(puerto)[T.CALLAO],39.2728,7.760,5.061,0.000,24.063,54.482
C(puerto)[T.CANCAS],15.0297,5.427,2.770,0.006,4.394,25.666
C(puerto)[T.CARQUIN/HUACHO],8.6164,3.717,2.318,0.020,1.331,15.902

0,1,2,3
Omnibus:,17542.078,Durbin-Watson:,1.303
Prob(Omnibus):,0.0,Jarque-Bera (JB):,119341856.918
Skew:,18.261,Prob(JB):,0.0
Kurtosis:,590.12,Cond. No.,78.4
