Cargar librerías y dataset

In [1]:
# ==========================================
# LIMPIEZA Y FUSIÓN FINAL DE EVENTOS DE INCENDIO
# ==========================================

import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import haversine_distances
from collections import defaultdict

# ------------------------------------------
# 1. Load dataset
# ------------------------------------------
df = pd.read_csv("/content/sample_data/Dataset_Incendios_y_clima_Final.csv")

# Convert the three timestamp columns to pandas datetimes so they can be
# sorted, subtracted and aggregated with min/max further down.
for date_col in ("fecha_inicio", "time_start", "time_end"):
    df[date_col] = pd.to_datetime(df[date_col])

# ------------------------------------------
# 2. Haversine distance function (km)
# ------------------------------------------
def haversine_km(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between two points.

    The haversine formula is evaluated directly with NumPy instead of
    building a 2x2 pairwise matrix for a single value, so the call is cheap
    enough to sit inside a nested loop.

    Args:
        lat1, lon1: Latitude and longitude of the first point, in degrees.
        lat2, lon2: Latitude and longitude of the second point, in degrees.
            All four arguments may be scalars or array-likes; arrays are
            combined with normal NumPy broadcasting.

    Returns:
        The distance in km, using a spherical Earth of radius 6371 km.
        A NumPy float for scalar inputs, an ndarray for array inputs.
        NaN coordinates yield NaN rather than raising.
    """
    earth_radius_km = 6371

    lat1, lon1, lat2, lon2 = (
        np.radians(np.asarray(value, dtype=float))
        for value in (lat1, lon1, lat2, lon2)
    )

    sin_half_dlat = np.sin((lat2 - lat1) / 2.0)
    sin_half_dlon = np.sin((lon2 - lon1) / 2.0)
    a = sin_half_dlat ** 2 + np.cos(lat1) * np.cos(lat2) * sin_half_dlon ** 2

    # Rounding can push `a` marginally outside [0, 1] (e.g. antipodal
    # points), which would make sqrt/arcsin return NaN.
    a = np.clip(a, 0.0, 1.0)

    return 2.0 * earth_radius_km * np.arcsin(np.sqrt(a))

# ------------------------------------------
# 3. Detect fragmented events
#    Criteria:
#    - same comuna
#    - start-time difference <= 24 h
#    - distance <= 1.5 km
# ------------------------------------------
pairs_to_merge = []

for comuna, g in df.groupby("comuna"):
    g = g.sort_values("fecha_inicio").reset_index(drop=True)

    # Pull the needed columns out once per group: label-based scalar lookups
    # (g.loc[i, col]) inside the O(n^2) loop below are far slower than
    # plain positional indexing.
    starts = list(g["fecha_inicio"])
    lats = g["latitud"].to_numpy()
    lons = g["longitud"].to_numpy()
    ids = g["event_id"].to_numpy()
    n_events = len(g)

    for i in range(n_events):
        for j in range(i + 1, n_events):
            # Rows are sorted by start time, so the difference is never
            # negative and no abs() is required.
            dt_hours = (starts[j] - starts[i]).total_seconds() / 3600

            # Written as "not <= 24" so that a NaN gap (missing start date)
            # also stops the scan: missing dates are sorted last, hence every
            # remaining j is missing too, and an unknown gap does not satisfy
            # the 24 h criterion. For valid dates this is the usual early
            # exit, since later rows can only be further away in time.
            if not dt_hours <= 24:
                break

            dist_km = haversine_km(lats[i], lons[i], lats[j], lons[j])

            if dist_km <= 1.5:
                pairs_to_merge.append((ids[i], ids[j]))

# ------------------------------------------
# 4. Resolve merges (connected components)
# ------------------------------------------
# Undirected adjacency map: every pair contributes an edge in both
# directions, so each event id that appears in a pair becomes a key.
graph = defaultdict(set)

for a, b in pairs_to_merge:
    for src, dst in ((a, b), (b, a)):
        graph[src].add(dst)

# Shared traversal state: ids already assigned to a component, and the
# list of components (each one a set of event ids) found so far.
components = []
visited = set()

def dfs(node, comp):
    """Add every not-yet-visited node reachable from *node* to *comp*.

    Reads the module-level ``graph`` adjacency map and updates the
    module-level ``visited`` set in place. The caller is expected to have
    already put *node* itself into both ``visited`` and *comp*.

    The traversal uses an explicit stack instead of recursion so that a long
    chain of linked events cannot exceed Python's recursion limit.

    Args:
        node: Starting node (an event id present in ``graph``).
        comp: Set collecting the members of the current component; mutated
            in place.
    """
    stack = [node]
    while stack:
        current = stack.pop()
        for neigh in graph[current]:
            if neigh not in visited:
                # Mark on discovery so a node is never pushed twice.
                visited.add(neigh)
                comp.add(neigh)
                stack.append(neigh)

# Start a new component from every node not yet claimed by an earlier one;
# dfs() then pulls in everything reachable from that starting node.
for start in graph:
    if start in visited:
        continue
    visited.add(start)
    members = {start}
    dfs(start, members)
    components.append(members)

# ------------------------------------------
# 5. Create event_id_final
# ------------------------------------------
# Every original id inside the k-th component (1-based) maps to "FIRE_k".
event_map = {
    old_id: f"FIRE_{position}"
    for position, members in enumerate(components, start=1)
    for old_id in members
}

# Events that were never paired are absent from the map; they keep their
# original event_id as the final identifier.
df["event_id_final"] = df["event_id"].map(event_map).fillna(df["event_id"])

# ------------------------------------------
# 6. Re-aggregate to physical-fire level
# ------------------------------------------
# Output column -> (source column, reducer) applied within each merged event.
# NOTE(review): frp_inicial takes the minimum across merged fragments, not
# the value of the earliest fragment — confirm this is intended.
fire_level_aggs = {
    "comuna": ("comuna", "first"),
    "latitud": ("latitud", "mean"),
    "longitud": ("longitud", "mean"),
    "time_start": ("time_start", "min"),
    "time_end": ("time_end", "max"),
    "frp_inicial": ("frp_inicial", "min"),
    "frp_max": ("frp_max", "max"),
}
final_df = df.groupby("event_id_final").agg(**fire_level_aggs).reset_index()

# Duration in hours between the earliest start and the latest end.
elapsed = final_df["time_end"] - final_df["time_start"]
final_df["duracion_horas"] = elapsed.dt.total_seconds() / 3600

# Binary target: 1 when frp_max exceeds frp_inicial by more than 3.
final_df["delta_frp"] = final_df["frp_max"] - final_df["frp_inicial"]
final_df["target_transicion"] = final_df["delta_frp"].gt(3).astype(int)

# ------------------------------------------
# 7. Drop events without duration (key step)
# ------------------------------------------
print("Eventos antes de filtrar duración 0:", len(final_df))

# Keep only rows whose duration is strictly positive; rows with a missing
# duration fail the comparison and are dropped as well.
has_duration = final_df["duracion_horas"].gt(0)
final_df = final_df.loc[has_duration].reset_index(drop=True)

print("Eventos después de filtrar duración 0:", len(final_df))

# ------------------------------------------
# 8. Recover climate variables
#    (earliest event)
# ------------------------------------------
# Climate columns are recognised by name: anything containing "lag",
# "_mean" or "precipitation".
climate_cols = [
    col for col in df.columns
    if any(token in col for token in ("lag", "_mean", "precipitation"))
]

# Order rows by start time so the first entry of each group comes from the
# earliest fragment.
# NOTE(review): GroupBy.first() returns the first non-null value per column,
# so when the earliest row has a gap the value comes from a later fragment —
# confirm this fallback is intended.
rows_by_start = df.sort_values("fecha_inicio")
first_per_fire = rows_by_start.groupby("event_id_final").first()
climate_df = first_per_fire[climate_cols].reset_index()

# Left join: every row of final_df is kept.
final_df = final_df.merge(climate_df, on="event_id_final", how="left")

# ------------------------------------------
# 9. Final audit
# ------------------------------------------
# Distinct ids in the raw data vs. distinct merged ids that survived the
# duration filter, plus the class balance of the target.
events_before_merge = df["event_id"].nunique()
events_after_merge = final_df["event_id_final"].nunique()
target_shares = final_df["target_transicion"].value_counts(normalize=True)

print("\nEventos antes de fusionar:", events_before_merge)
print("Eventos finales:", events_after_merge)
print("\nDistribución del target:")
print(target_shares)

# ------------------------------------------
# 10. Save final dataset
# ------------------------------------------
output_path = "/content/Dataset_Incendios_Eventos_Fusionados_SinDuracionCero.csv"
final_df.to_csv(output_path, index=False)

# Confirmation message: a blank line, the label, then the file name.
print(
    "\nDataset final guardado como:",
    "Dataset_Incendios_Eventos_Fusionados_SinDuracionCero.csv",
    sep="\n",
)


Eventos antes de filtrar duración 0: 34908
Eventos después de filtrar duración 0: 7778

Eventos antes de fusionar: 67168
Eventos finales: 7778

Distribución del target:
target_transicion
1    0.60774
0    0.39226
Name: proportion, dtype: float64

Dataset final guardado como:
Dataset_Incendios_Eventos_Fusionados_SinDuracionCero.csv
