In [2]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.cluster import KMeans
from geopy.distance import geodesic

In [3]:
class DateFeaturesTransformer(BaseEstimator, TransformerMixin):
    """
    Extrae características de fecha y calcula la edad del punto de agua.
    Se encarga de transformar 'date_recorded' y calcular 'waterpoint_age' a partir de 'construction_year'.
    """
    def __init__(self):
        self.waterpoint_age_median = None

    def fit(self, X, y=None):
        X = X.copy()
        X["date_recorded"] = pd.to_datetime(X["date_recorded"])
        X["year_recorded"] = X["date_recorded"].dt.year
        X["month_recorded"] = X["date_recorded"].dt.month
        X["day_recorded"] = X["date_recorded"].dt.day
        X["waterpoint_age"] = X["year_recorded"] - X["construction_year"]
        # Calcula la mediana para imputar cuando construction_year==0
        self.waterpoint_age_median = X.loc[X["construction_year"] != 0, "waterpoint_age"].median()
        return self

    def transform(self, X):
        X = X.copy()
        X["date_recorded"] = pd.to_datetime(X["date_recorded"])
        X["year_recorded"] = X["date_recorded"].dt.year
        X["month_recorded"] = X["date_recorded"].dt.month
        X["day_recorded"] = X["date_recorded"].dt.day
        X["waterpoint_age"] = X["year_recorded"] - X["construction_year"]
        X.loc[X["construction_year"] == 0, "waterpoint_age"] = self.waterpoint_age_median
        return X

In [4]:
class LocationClusterTransformer(BaseEstimator, TransformerMixin):
    """
    Crea un cluster de ubicación usando las columnas 'longitude' y 'latitude'.
    """
    def __init__(self, n_clusters=10, random_state=42):
        self.n_clusters = n_clusters
        self.random_state = random_state
        self.kmeans = None

    def fit(self, X, y=None):
        self.kmeans = KMeans(n_clusters=self.n_clusters, random_state=self.random_state, n_init=10)
        self.kmeans.fit(X[["longitude", "latitude"]])
        return self

    def transform(self, X):
        X = X.copy()
        X["location_cluster"] = self.kmeans.predict(X[["longitude", "latitude"]])
        return X


In [5]:
class RegionDistanceTransformer(BaseEstimator, TransformerMixin):
    """
    Calcula la distancia entre la ubicación del punto y el centro (mediana) de la región.
    """
    def __init__(self):
        self.region_centers = {}

    def fit(self, X, y=None):
        # Calcula el centro (mediana) de cada región
        self.region_centers = X.groupby("region")[["latitude", "longitude"]].median().to_dict('index')
        return self

    def transform(self, X):
        X = X.copy()
        def calc_distance(row):
            region = row["region"]
            if region in self.region_centers:
                center = self.region_centers[region]
                return geodesic((row["latitude"], row["longitude"]),
                                (center["latitude"], center["longitude"])).km
            else:
                return np.nan
        X["distance_to_region_center"] = X.apply(calc_distance, axis=1)
        return X


In [None]:
# Carga los datos
df_1 = pd.read_csv('data1.csv')
df_2 = pd.read_csv('data2.csv')
df_target = pd.read_csv('objetivo.csv')

# Une los DataFrames de entrenamiento y combina con el target
df = pd.concat([df_1, df_2], axis=0)
df_final = pd.merge(df, df_target, on="id")

# Separa la variable objetivo
y = df_final['status_group']
X = df_final.drop(columns=['status_group'])

# Opcional: Elimina columnas que no vayas a usar
columns_to_drop = ['recorded_by']
X = X.drop(columns=columns_to_drop)

# Define los nombres de las columnas según el procesamiento:
# Aquí se asume que las variables numéricas y categóricas son aquellas que usarás en el modelo.
# Puedes modificar estas listas según tus necesidades.
numeric_features = ["gps_height", "waterpoint_age", "longitude", "latitude", "distance_to_region_center"]
categorical_features = ["region", "location_cluster", "public_meeting", "permit"]  



In [11]:
# Pipeline para el preprocesamiento de datos
preprocessor = ColumnTransformer(
    transformers=[
        # Transformación para variables numéricas: imputar y escalar
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy="median")),
            ('scaler', MinMaxScaler())
        ]), numeric_features),
        # Transformación para variables categóricas: imputar y aplicar OneHotEncoding
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy="most_frequent")),
            ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore'))
        ]), categorical_features)
    ]
)

In [None]:
# Pipeline completo que encadena los pasos de feature engineering y el modelo
pipeline = Pipeline(steps=[
    ('date_features', DateFeaturesTransformer()),
    ('location_cluster', LocationClusterTransformer(n_clusters=10)),
    ('region_distance', RegionDistanceTransformer()),
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])




In [None]:
# Dividir los datos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Entrenar el pipeline
pipeline.fit(X_train, y_train)

# Evaluar el modelo en el set de prueba
y_pred = pipeline.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# ======================
# PREDICCIÓN SOBRE NUEVOS DATOS
# ======================

# Cargar y procesar el DataFrame de predicción
prediciton_df = pd.read_csv('predicion.csv')

prediciton_df = prediciton_df.drop(columns=columns_to_drop)

# Aplica el pipeline entrenado (se aplicarán todos los pasos de transformación de forma consistente)
predictions = pipeline.predict(prediciton_df)

# Agrega las predicciones y guarda el resultado
prediciton_df["status_group"] = predictions
final_result = prediciton_df[['id', 'status_group']]
final_result.to_csv('resultados_predicciones.csv', index=False)
print("\nPrimeras 5 predicciones:")
print(prediciton_df[['id', 'status_group']].head())

Accuracy: 0.7047138047138047
Classification Report:
                          precision    recall  f1-score   support

             functional       0.74      0.79      0.76      9724
functional needs repair       0.42      0.27      0.33      1293
         non functional       0.68      0.67      0.68      6803

               accuracy                           0.70     17820
              macro avg       0.62      0.58      0.59     17820
           weighted avg       0.70      0.70      0.70     17820


Primeras 5 predicciones:
      id    status_group
0  50785  non functional
1  51630      functional
2  17168  non functional
3  45559  non functional
4  49871  non functional


In [21]:
# Aplicar los transformers personalizados manualmente
X_temp = X_train.copy()
X_temp = pipeline.named_steps['date_features'].transform(X_temp)
X_temp = pipeline.named_steps['location_cluster'].transform(X_temp)
X_temp = pipeline.named_steps['region_distance'].transform(X_temp)

# Imprimir las columnas de X_temp
print("Columnas en X_temp:", X_temp.columns.tolist())

Columnas en X_temp: ['id', 'amount_tsh', 'date_recorded', 'funder', 'gps_height', 'installer', 'longitude', 'latitude', 'wpt_name', 'num_private', 'basin', 'subvillage', 'region', 'region_code', 'district_code', 'lga', 'ward', 'population', 'public_meeting', 'scheme_management', 'scheme_name', 'permit', 'construction_year', 'extraction_type', 'extraction_type_group', 'extraction_type_class', 'management', 'management_group', 'payment', 'payment_type', 'water_quality', 'quality_group', 'quantity', 'quantity_group', 'source', 'source_type', 'source_class', 'waterpoint_type', 'waterpoint_type_group', 'year_recorded', 'month_recorded', 'day_recorded', 'waterpoint_age', 'location_cluster', 'distance_to_region_center']


In [22]:
X_train_preprocessed = pipeline.named_steps['preprocessor'].transform(X_temp)
feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out()

print("Shape:", X_train_preprocessed.shape, "número de columnas:", len(feature_names))

Shape: (41580, 36) número de columnas: 36


In [23]:
X_train_preprocessed = pipeline.named_steps['preprocessor'].transform(X_temp)
feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out()

print("Shape:", X_train_preprocessed.shape, "número de columnas:", len(feature_names))

Shape: (41580, 36) número de columnas: 36


In [26]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Suponiendo que ya tienes X_train_preprocessed (con shape (41580, 36)) y feature_names definidos
# Convertir el array preprocesado a DataFrame
X_train_preprocessed_df = pd.DataFrame(X_train_preprocessed, columns=feature_names)
# 2. Extraer la Importancia de las Características del Modelo
importances = pipeline.named_steps['classifier'].feature_importances_
feat_importances = pd.DataFrame({
    'feature': feature_names,
    'importance': importances
}).sort_values(by='importance', ascending=False)

print("Importancia de las 10 características principales:")
print(feat_importances.head(10))


ValueError: Shape of passed values is (41580, 1), indices imply (41580, 36)