<a href="https://colab.research.google.com/github/freakezoide/codigos-de-coolab/blob/main/prueba_final_prueba_71.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [29]:
# =============================================
# 🔧 1. CONFIGURACIÓN INICIAL
# =============================================
!pip install -q kmodes imbalanced-learn xgboost

import os
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from kmodes.kprototypes import KPrototypes
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (
    mean_absolute_error, mean_squared_error, r2_score,
    accuracy_score, classification_report, roc_auc_score,
    precision_recall_curve, confusion_matrix
)
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

# Estilo visual
plt.style.use('ggplot')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (14, 8)
sns.set(style="whitegrid")
pd.set_option('display.max_columns', None)


In [30]:
# =============================================
# 📂 2. CARGA Y LIMPIEZA DE DATOS
# =============================================
def load_and_clean(path):
    """Carga robusta de CSV con limpieza básica."""
    for enc in ['utf-8', 'latin1', 'ISO-8859-1']:
        try:
            df = pd.read_csv(path, encoding=enc, on_bad_lines='skip')
            break
        except Exception:
            continue

    df.columns = df.columns.str.replace('\ufeff', '', regex=False).str.strip().str.lower()

    rename_map = {
        'fechayhora': 'fecha y hora',
        'tipodesiniestro': 'tipo de siniestro',
        'fallecidoalosdias': 'fallecidos a los dias',
        'otrovehiculo': 'otro vehiculo',
        'lugardelsiniestro': 'lugar del siniestro'
    }
    df = df.rename(columns={k: v for k, v in rename_map.items() if k in df.columns})

    if 'fecha y hora' in df.columns:
        df['fecha y hora'] = (
            df['fecha y hora'].astype(str)
            .str.replace('#', '', regex=False)
            .str.replace('am|pm', '', case=False, regex=True)
            .str.strip()
        )
        df['fecha y hora'] = pd.to_datetime(df['fecha y hora'], errors='coerce')
    return df

# Cargar todos los CSV
files = glob.glob("*.csv")
df_list = []
for f in files:
    print(f"Cargando {f}...")
    temp_df = load_and_clean(f)
    df_list.append(temp_df)
df = pd.concat(df_list, ignore_index=True)

Cargando Datos_fallecidos_siniestros_2018.csv...
Cargando Datos_fallecidos_siniestros_2014.csv...
Cargando Datos_fallecidos_siniestros_2016.csv...
Cargando siniestros_con_clusters.csv...
Cargando Datos_fallecidos_siniestros_2019.csv...
Cargando Datos_fallecidos_siniestros_2017.csv...


In [31]:
# =============================================
# 🧠 3. FEATURE ENGINEERING
# =============================================
df['año'] = df['fecha y hora'].dt.year
df['mes'] = df['fecha y hora'].dt.month
df['hora'] = df['fecha y hora'].dt.hour
df['dia_semana'] = df['fecha y hora'].dt.day_name()
df['es_fin_de_semana'] = df['dia_semana'].isin(['Saturday', 'Sunday']).astype(int)
df['hora_categoria'] = pd.cut(df['hora'], bins=[0,6,12,18,24],
                              labels=['Madrugada', 'Mañana', 'Tarde', 'Noche'], include_lowest=True)

df['objetivo_clasificacion'] = np.where(df['fallecidos a los dias'] <= 1, 1, 0)

for col in ['vehiculo', 'departamento', 'sexo', 'tipo de siniestro',
            'jurisdiccion', 'rol', 'localidad']:
    if col in df.columns:
        df[col] = df[col].fillna('No especificado')

In [37]:
# =============================================
# 📊 4. VISUALIZACIONES
# =============================================
def plot_interactive_trend(df):
    fig = px.line(df.resample('M', on='fecha y hora').size(),
                  title='Tendencia Mensual de Accidentes Fatales',
                  labels={'value': 'N° de fallecidos', 'fecha y hora': 'Fecha'})
    fig.update_layout(hovermode="x unified")
    fig.show()

def plot_heatmap_hora_dia(df):
    plt.figure(figsize=(14, 8))
    heatmap_data = df.pivot_table(index='hora', columns='dia_semana',
                                  values='objetivo_clasificacion', aggfunc='count', fill_value=0)
    sns.heatmap(heatmap_data, cmap="YlOrRd", annot=True, fmt="d")
    plt.title('Distribución de Accidentes por Hora y Día')
    plt.tight_layout()
    plt.show()

# plot_interactive_trend(df)
# plot_heatmap_hora_dia(df)

In [32]:
# =============================================
# 🔍 5. K-PROTOTYPES
# =============================================
def run_kprototypes(df):
    cols_cat = ['departamento', 'localidad', 'jurisdiccion', 'tipo de siniestro',
                'vehiculo', 'rol', 'sexo']
    cols_num = ['hora', 'edad', 'fallecidos a los dias']

    cols_cat = [c for c in cols_cat if c in df.columns]
    cols_num = [c for c in cols_num if c in df.columns]

    data = df[cols_cat + cols_num].copy()
    for col in cols_num:
        data[col] = data[col].fillna(data[col].median())
    for col in cols_cat:
        data[col] = data[col].fillna(data[col].mode()[0])

    matrix = data.to_numpy()
    cat_pos = list(range(len(cols_cat)))

    model = KPrototypes(n_clusters=4, init='Cao', verbose=2, random_state=42)
    clusters = model.fit_predict(matrix, categorical=cat_pos)
    df['cluster'] = clusters
    return clusters

df['cluster'] = run_kprototypes(df)

Initialization method and algorithm are deterministic. Setting n_init to 1.
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 1, iteration: 1/100, moves: 1170, ncost: 499242.6003133329
Run: 1, iteration: 2/100, moves: 665, ncost: 449578.56198552926
Run: 1, iteration: 3/100, moves: 387, ncost: 433199.4621975839
Run: 1, iteration: 4/100, moves: 250, ncost: 426469.84908881516
Run: 1, iteration: 5/100, moves: 199, ncost: 422692.0776667402
Run: 1, iteration: 6/100, moves: 230, ncost: 417662.6672012461
Run: 1, iteration: 7/100, moves: 377, ncost: 407495.6207936638
Run: 1, iteration: 8/100, moves: 452, ncost: 394828.08046833216
Run: 1, iteration: 9/100, moves: 352, ncost: 385014.3892417711
Run: 1, iteration: 10/100, moves: 377, ncost: 373631.7152252842
Run: 1, iteration: 11/100, moves: 291, ncost: 366493.0890077876
Run: 1, iteration: 12/100, moves: 199, ncost: 363051.3783720058
Run: 1, iteration: 13/100, moves: 181, ncost: 360527.66075548803
Run: 1, iteratio

In [33]:
# =============================================
# 🌲 6. RANDOM FOREST
# =============================================
def run_random_forest(df):
    features = ['departamento', 'localidad', 'jurisdiccion', 'tipo de siniestro',
                'vehiculo', 'rol', 'sexo', 'hora', 'edad', 'cluster']
    target = 'fallecidos a los dias'

    df_rf = df[features + [target]].copy().dropna(subset=[target])

    for col in ['hora', 'edad']:
        df_rf[col] = df_rf[col].fillna(df_rf[col].median())
    for col in set(features) - {'hora', 'edad', 'cluster'}:
        df_rf[col] = df_rf[col].fillna(df_rf[col].mode()[0])

    X = pd.get_dummies(df_rf[features], drop_first=True)
    y = df_rf[target]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    print(f"MAE: {mean_absolute_error(y_test, y_pred):.2f}")
    print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.2f}")
    print(f"R²: {r2_score(y_test, y_pred):.3f}")
    return model

rf_model = run_random_forest(df)


MAE: 1.54
RMSE: 2.57
R²: 0.602


In [34]:
# =============================================
# ⚡ 7. XGBOOST CLASSIFIER
# =============================================
def run_xgboost_classifier(df):
    features = ['tipo de siniestro', 'vehiculo', 'sexo', 'departamento',
                'año', 'mes', 'hora', 'dia_semana', 'es_fin_de_semana',
                'hora_categoria', 'cluster']
    target = 'objetivo_clasificacion'

    X = df[features].copy()
    y = df[target]

    X_encoded = pd.get_dummies(X, drop_first=True)
    imputer = SimpleImputer(strategy='most_frequent')
    X_encoded = imputer.fit_transform(X_encoded)

    smote = SMOTE(random_state=42)
    X_res, y_res = smote.fit_resample(X_encoded, y)

    X_train, X_test, y_train, y_test = train_test_split(
        X_res, y_res, test_size=0.2, random_state=42, stratify=y_res
    )

    xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
    params = {
        'max_depth': [3, 5],
        'learning_rate': [0.1],
        'n_estimators': [100],
        'subsample': [0.9]
    }

    grid = GridSearchCV(xgb, params, cv=3, scoring='roc_auc', n_jobs=-1)
    grid.fit(X_train, y_train)
    best = grid.best_estimator_

    y_pred = best.predict(X_test)
    y_proba = best.predict_proba(X_test)[:, 1]

    print(f"Accuracy: {accuracy_score(y_test, y_pred):.2%}")
    print(f"AUC-ROC: {roc_auc_score(y_test, y_proba):.2%}")
    print(classification_report(y_test, y_pred))

    return best

xgb_model = run_xgboost_classifier(df)



Parameters: { "use_label_encoder" } are not used.




Accuracy: 98.66%
AUC-ROC: 99.91%
              precision    recall  f1-score   support

           0       1.00      0.97      0.99       822
           1       0.97      1.00      0.99       821

    accuracy                           0.99      1643
   macro avg       0.99      0.99      0.99      1643
weighted avg       0.99      0.99      0.99      1643



In [35]:
# =============================================
# 🧭 8. PLANES DE PREVENCIÓN
# =============================================
def generate_prevention_plans(df):
    top_siniestros = df['tipo de siniestro'].value_counts().nlargest(5).index
    strategies = {
        'Colisión': ['Control de velocidad', 'Mejor señalización'],
        'Atropello': ['Cruces peatonales', 'Reducción de velocidad'],
        'Volcamiento': ['Revisión técnica', 'Capacitación'],
        'Caída': ['Señalización de peligro'],
        'Otros': ['Concientización', 'Leyes estrictas']
    }

    fig = make_subplots(rows=1, cols=5, subplot_titles=top_siniestros)
    for i, siniestro in enumerate(top_siniestros, 1):
        s = strategies.get(siniestro, ['Medidas generales'])
        fig.add_trace(go.Bar(y=s, x=[1]*len(s), orientation='h'), row=1, col=i)
    fig.update_layout(title='Prevención por tipo de siniestro', showlegend=False, height=400)
    fig.show()

generate_prevention_plans(df)

In [36]:
# =============================================
# 💾 9. GUARDAR RESULTADOS
# =============================================
df.to_csv('siniestros_con_clusters.csv', index=False)
print("\n✅ Análisis completado. Resultados guardados.")


✅ Análisis completado. Resultados guardados.


In [38]:
# =============================================
# 🥧 10. GRÁFICOS DE TORTA Y TABLAS RESUMEN
# =============================================
def plot_pie_and_summary(df, column, top_n=6):
    """Grafica torta y tabla resumen para una columna categórica."""
    print(f"\n🔍 Resumen para: {column}")
    counts = df[column].value_counts().nlargest(top_n)
    other = df[column].value_counts().iloc[top_n:].sum()
    if other > 0:
        counts['Otros'] = other

    # Pie chart
    fig = go.Figure(data=[go.Pie(
        labels=counts.index,
        values=counts.values,
        textinfo='label+percent',
        insidetextorientation='radial'
    )])
    fig.update_layout(title=f"Distribución de {column.capitalize()}")
    fig.show()

    # Tabla resumen
    summary_table = counts.reset_index()
    summary_table.columns = [column, 'Frecuencia']
    display(summary_table)

# Ejemplos:
plot_pie_and_summary(df, 'tipo de siniestro')
plot_pie_and_summary(df, 'departamento')
plot_pie_and_summary(df, 'sexo')
plot_pie_and_summary(df, 'vehiculo')


🔍 Resumen para: tipo de siniestro


Unnamed: 0,tipo de siniestro,Frecuencia
0,COLISIÓN ENTRE VEHÍCULOS,2496
1,DESPISTE,998
2,ATROPELLO DE PEATÓN,806
3,CAÍDA,356
4,COLISIÓN CON OBSTÁCULO EN CALZADA,78
5,ATROPELLO DE ANIMALES,74



🔍 Resumen para: departamento


Unnamed: 0,departamento,Frecuencia
0,MONTEVIDEO,1260
1,CANELONES,930
2,MALDONADO,342
3,SAN JOSE,284
4,COLONIA,232
5,TACUAREMBO,210
6,Otros,1550



🔍 Resumen para: sexo


Unnamed: 0,sexo,Frecuencia
0,M,3682
1,F,1124
2,No especificado,2



🔍 Resumen para: vehiculo


Unnamed: 0,vehiculo,Frecuencia
0,MOTO,2210
1,AUTO,1016
2,PEATÓN,796
3,CAMIONETA,408
4,BICICLETA,266
5,CAMION,72
6,Otros,40





Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.




In [39]:
# =============================================
# 📋 11. TABLAS DE RESUMEN AGRUPADAS
# =============================================
def resumen_estadistico(df, group_col, metrics=['fallecidos a los dias', 'edad', 'hora']):
    """
    Genera resumen estadístico agrupado por una columna.
    - group_col: columna para agrupar
    - metrics: lista de columnas numéricas a resumir
    """
    print(f"\n📊 Estadísticas agrupadas por: {group_col}")
    resumen = df.groupby(group_col)[metrics].agg(['count', 'mean', 'std', 'min', 'max']).round(2)
    display(resumen)

# Ejemplos:
resumen_estadistico(df, 'departamento')
resumen_estadistico(df, 'tipo de siniestro')
resumen_estadistico(df, 'hora_categoria')
resumen_estadistico(df, 'cluster')


📊 Estadísticas agrupadas por: departamento


Unnamed: 0_level_0,fallecidos a los dias,fallecidos a los dias,fallecidos a los dias,fallecidos a los dias,fallecidos a los dias,edad,edad,edad,edad,edad,hora,hora,hora,hora,hora
Unnamed: 0_level_1,count,mean,std,min,max,count,mean,std,min,max,count,mean,std,min,max
departamento,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
ARTIGAS,18,0.44,0.98,0.0,3.0,110,41.58,22.24,0.0,85.0,56,12.64,7.27,1.0,23.0
CANELONES,152,1.24,3.0,0.0,16.0,918,43.01,21.86,0.0,96.0,464,13.66,6.7,0.0,23.0
CERRO LARGO,42,3.14,6.87,0.0,29.0,178,41.47,21.77,0.0,89.0,90,13.22,6.39,2.0,23.0
COLONIA,40,1.85,4.64,0.0,19.0,230,41.18,21.98,0.0,94.0,134,12.27,6.46,0.0,22.0
DURAZNO,20,0.6,1.85,0.0,6.0,102,43.47,19.31,16.0,88.0,56,11.07,6.23,2.0,23.0
FLORES,8,0.5,0.93,0.0,2.0,58,38.79,17.78,15.0,86.0,28,13.0,5.02,2.0,21.0
FLORIDA,32,1.69,5.13,0.0,21.0,142,50.25,18.94,11.0,89.0,68,13.97,6.72,0.0,23.0
LAVALLEJA,22,1.73,4.64,0.0,16.0,86,44.14,20.0,8.0,84.0,40,12.3,5.41,0.0,20.0
MALDONADO,64,1.25,2.64,0.0,11.0,340,41.18,20.32,3.0,95.0,172,13.09,6.53,2.0,23.0
MONTEVIDEO,202,1.4,3.14,0.0,19.0,1244,44.11,21.64,0.0,93.0,626,12.75,6.38,0.0,23.0



📊 Estadísticas agrupadas por: tipo de siniestro


Unnamed: 0_level_0,fallecidos a los dias,fallecidos a los dias,fallecidos a los dias,fallecidos a los dias,fallecidos a los dias,edad,edad,edad,edad,edad,hora,hora,hora,hora,hora
Unnamed: 0_level_1,count,mean,std,min,max,count,mean,std,min,max,count,mean,std,min,max
tipo de siniestro,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
ATROPELLO DE ANIMALES,8,0.0,0.0,0.0,0.0,74,38.16,16.96,16.0,77.0,36,11.28,7.79,0.0,21.0
ATROPELLO DE PEATÓN,152,1.71,3.73,0.0,19.0,794,56.6,24.05,0.0,96.0,430,13.75,6.54,0.0,23.0
CAÍDA,36,1.94,3.46,0.0,15.0,356,40.51,18.76,3.0,85.0,166,11.9,7.45,0.0,23.0
COLISIÓN CON OBSTÁCULO EN CALZADA,6,0.0,0.0,0.0,0.0,76,38.47,16.13,8.0,70.0,30,15.8,5.82,5.0,23.0
COLISIÓN ENTRE VEHÍCULOS,420,1.64,4.11,0.0,29.0,2482,40.95,19.87,0.0,94.0,1226,12.89,6.45,0.0,23.0
DESPISTE,222,1.32,4.15,0.0,24.0,980,41.53,20.17,0.0,93.0,558,11.7,6.33,0.0,23.0



📊 Estadísticas agrupadas por: hora_categoria






Unnamed: 0_level_0,fallecidos a los dias,fallecidos a los dias,fallecidos a los dias,fallecidos a los dias,fallecidos a los dias,edad,edad,edad,edad,edad,hora,hora,hora,hora,hora
Unnamed: 0_level_1,count,mean,std,min,max,count,mean,std,min,max,count,mean,std,min,max
hora_categoria,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
Madrugada,186,0.83,2.75,0.0,19.0,554,37.0,16.58,0.0,87.0,558,3.58,2.16,0.0,6.0
Mañana,204,1.37,3.5,0.0,24.0,570,47.21,23.03,0.0,93.0,578,9.45,1.76,7.0,12.0
Tarde,230,2.17,5.01,0.0,29.0,688,45.38,23.43,0.0,92.0,696,15.75,1.7,13.0,18.0
Noche,224,1.69,4.04,0.0,27.0,610,44.25,20.1,0.0,94.0,614,20.65,1.35,19.0,23.0



📊 Estadísticas agrupadas por: cluster


Unnamed: 0_level_0,fallecidos a los dias,fallecidos a los dias,fallecidos a los dias,fallecidos a los dias,fallecidos a los dias,edad,edad,edad,edad,edad,hora,hora,hora,hora,hora
Unnamed: 0_level_1,count,mean,std,min,max,count,mean,std,min,max,count,mean,std,min,max
cluster,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
0,220,1.46,3.93,0.0,27.0,1138,57.8,5.25,49.0,68.0,580,13.13,6.59,0.0,23.0
1,246,1.15,3.08,0.0,21.0,1596,20.49,6.18,0.0,30.0,790,11.81,6.89,0.0,23.0
2,232,1.19,3.07,0.0,16.0,1234,38.68,5.36,30.0,48.0,670,12.62,6.85,0.0,23.0
3,146,2.93,6.0,0.0,29.0,794,77.12,6.64,67.0,96.0,406,14.04,4.93,0.0,23.0
