In [25]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.cluster import KMeans
from geopy.distance import geodesic

In [26]:
class DateFeaturesTransformer(BaseEstimator, TransformerMixin):
    """
    Extrae características de fecha y calcula la edad del punto de agua.
    Se encarga de transformar 'date_recorded' y calcular 'waterpoint_age' a partir de 'construction_year'.
    """
    def __init__(self):
        self.waterpoint_age_median = None

    def fit(self, X, y=None):
        X = X.copy()
        X["date_recorded"] = pd.to_datetime(X["date_recorded"])
        X["year_recorded"] = X["date_recorded"].dt.year
        X["month_recorded"] = X["date_recorded"].dt.month
        X["day_recorded"] = X["date_recorded"].dt.day
        X["waterpoint_age"] = X["year_recorded"] - X["construction_year"]
        # Calcula la mediana para imputar cuando construction_year==0
        self.waterpoint_age_median = X.loc[X["construction_year"] != 0, "waterpoint_age"].median()
        return self

    def transform(self, X):
        X = X.copy()
        X["date_recorded"] = pd.to_datetime(X["date_recorded"])
        X["year_recorded"] = X["date_recorded"].dt.year
        X["month_recorded"] = X["date_recorded"].dt.month
        X["day_recorded"] = X["date_recorded"].dt.day
        X["waterpoint_age"] = X["year_recorded"] - X["construction_year"]
        X.loc[X["construction_year"] == 0, "waterpoint_age"] = self.waterpoint_age_median
        bins = [-np.inf, 0, 8, 26, 53]
        labels = ['Negativo/Inconsistente', 'Muy Baja', 'Baja', 'Alta']
        X['waterpoint_age_category'] = pd.cut(X['waterpoint_age'], bins=bins, labels=labels, include_lowest=True)
        return X

In [27]:
class LocationClusterTransformer(BaseEstimator, TransformerMixin):
    """
    Crea un cluster de ubicación usando las columnas 'longitude' y 'latitude'.
    """
    def __init__(self, n_clusters=10, random_state=42):
        self.n_clusters = n_clusters
        self.random_state = random_state
        self.kmeans = None

    def fit(self, X, y=None):
        self.kmeans = KMeans(n_clusters=self.n_clusters, random_state=self.random_state, n_init=10)
        self.kmeans.fit(X[["longitude", "latitude"]])
        return self

    def transform(self, X):
        X = X.copy()
        X["location_cluster"] = self.kmeans.predict(X[["longitude", "latitude"]])
        return X


In [28]:
class RegionDistanceTransformer(BaseEstimator, TransformerMixin):
    """
    Calcula la distancia entre la ubicación del punto y el centro (mediana) de la región.
    """
    def __init__(self):
        self.region_centers = {}

    def fit(self, X, y=None):
        # Calcula el centro (mediana) de cada región
        self.region_centers = X.groupby("region")[["latitude", "longitude"]].median().to_dict('index')
        return self

    def transform(self, X):
        X = X.copy()
        def calc_distance(row):
            region = row["region"]
            if region in self.region_centers:
                center = self.region_centers[region]
                return geodesic((row["latitude"], row["longitude"]),
                                (center["latitude"], center["longitude"])).km
            else:
                return np.nan
        X["distance_to_region_center"] = X.apply(calc_distance, axis=1)
        X['log_distance'] = np.log1p(X['distance_to_region_center'])
        X['log_distance_categoric'] = pd.qcut(X['log_distance'], q=4, 
                                        labels=['Muy corta', 'Corta', 'Media', 'Larga'])
        return X


In [29]:
df_1 = pd.read_csv('data1.csv')
df_2 = pd.read_csv('data2.csv')
df_target = pd.read_csv('objetivo.csv')

# Une los DataFrames de entrenamiento y combina con el target
df = pd.concat([df_1, df_2], axis=0)
df_final = pd.merge(df, df_target, on="id")

# Separa la variable objetivo
y = df_final['status_group']
X = df_final.drop(columns=['status_group'])

#  Elimina columnas que no vayas a usar
columns_to_drop = ['recorded_by']
X = X.drop(columns=columns_to_drop)

numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()


In [30]:
preprocessor = ColumnTransformer(
    transformers=[
        # Transformación para variables numéricas: imputar y escalar
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy="median")),
            ('scaler', MinMaxScaler())
        ]), numeric_features),
        # Transformación para variables categóricas: imputar y aplicar OneHotEncoding
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy="most_frequent")),
            ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'))
        ]), categorical_features)
    ],
    remainder='drop'
)


In [38]:
# Pipeline completo que encadena los pasos de feature engineering y el modelo
pipeline = Pipeline(steps=[
    ('date_features', DateFeaturesTransformer()),
    ('location_cluster', LocationClusterTransformer(n_clusters=10)),
    ('region_distance', RegionDistanceTransformer()),
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

In [39]:
#X = X.drop(columns=["latitude", "longitude","distance_to_region_center","log_distance"])
#X = X.drop(columns=["date_recorded", "year_recorded", "month_recorded", "day_recorded", "waterpoint_age","construction_year"])
# Dividir los datos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Entrenar el pipeline
pipeline.fit(X_train, y_train)

# Evaluar el modelo en el set de prueba
y_pred = pipeline.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# ======================
# PREDICCIÓN SOBRE NUEVOS DATOS
# ======================

# Cargar y procesar el DataFrame de predicción
prediciton_df = pd.read_csv('predicion.csv')

prediciton_df = prediciton_df.drop(columns=columns_to_drop)

# Aplica el pipeline entrenado (se aplicarán todos los pasos de transformación de forma consistente)
predictions = pipeline.predict(prediciton_df)

# Agrega las predicciones y guarda el resultado
prediciton_df["status_group"] = predictions
final_result = prediciton_df[['id', 'status_group']]
final_result.to_csv('resultados_predicciones.csv', index=False)
print("\nPrimeras 5 predicciones:")
print(prediciton_df[['id', 'status_group']].head())



Accuracy: 0.802300785634119
Classification Report:
                          precision    recall  f1-score   support

             functional       0.80      0.89      0.84      9724
functional needs repair       0.58      0.33      0.42      1293
         non functional       0.83      0.77      0.80      6803

               accuracy                           0.80     17820
              macro avg       0.74      0.66      0.69     17820
           weighted avg       0.80      0.80      0.80     17820






Primeras 5 predicciones:
      id    status_group
0  50785      functional
1  51630      functional
2  17168      functional
3  45559  non functional
4  49871      functional


In [33]:
# Extraer los nombres de las features del preprocesador
feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out()

# Obtener las importancias del clasificador
importances = pipeline.named_steps['classifier'].feature_importances_

# Crear un DataFrame para visualizar la importancia de cada feature
feat_importances = pd.DataFrame({
    'feature': feature_names,
    'importance': importances
}).sort_values(by='importance', ascending=False)

print("Importancia de las 10 características principales:")
print(feat_importances.head(20))

Importancia de las 10 características principales:
                                       feature  importance
51857               cat__waterpoint_type_other    0.037508
51862         cat__waterpoint_type_group_other    0.034219
51784         cat__extraction_type_class_other    0.027774
51761               cat__extraction_type_other    0.026434
51827                     cat__quantity_enough    0.025959
9                       num__construction_year    0.024529
51831               cat__quantity_group_enough    0.021328
1                              num__amount_tsh    0.019646
3                               num__longitude    0.014721
51775         cat__extraction_type_group_other    0.014658
51810              cat__payment_type_never pay    0.013968
2                              num__gps_height    0.013699
51826               cat__quality_group_unknown    0.013679
4                                num__latitude    0.012670
51770       cat__extraction_type_group_gravity    0.011272
51755

In [None]:
from sklearn.model_selection import GridSearchCV

# Definir la grilla de parámetros
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Configurar el modelo base
rf = RandomForestClassifier(random_state=42)

# Crear el objeto GridSearchCV
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=3,  # 3-fold cross-validation
    scoring='accuracy',
    n_jobs=-1,  # Usa todos los núcleos disponibles
    verbose=2
)
corr_matrix = X.corr().abs()

# Extraer la parte superior del triángulo de la matriz
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Identificar columnas con alta correlación (> 0.9)
to_drop = [column for column in upper.columns if any(upper[column] > 0.9)]

# Eliminar columnas correlacionadas
X_reduced = X.drop(columns=to_drop)

# Ejecutar la búsqueda
grid_search.fit(X_train, y_train)

# Mostrar los mejores parámetros
print("🔍 Mejores hiperparámetros encontrados:")
print(grid_search.best_params_)

# Evaluar el mejor modelo
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)

# Resultados
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

print(f"\n✅ Accuracy con mejor modelo: {accuracy_score(y_test, y_pred_best):.2%}\n")
print("📊 Reporte de clasificación:\n")
print(classification_report(y_test, y_pred_best))
print("🧩 Matriz de confusión:\n")
print(confusion_matrix(y_test, y_pred_best))

AttributeError: 'Pipeline' object has no attribute 'corr'