In [1]:
# Importamos librerías
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, MinMaxScaler,  OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from joblib import dump

In [2]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Cargamos los datos
Data = pd.read_csv("../Datos/data_adults.csv")

Data_cop = Data.drop("fnlwgt", axis=1)
Data_cop = Data_cop.drop("education-num", axis=1)

X = Data_cop.drop("income", axis=1)
y = Data_cop['income'].isin(['>50K.','>50K'])

# Definimos las variables categóricas y numéricas
categorical_features = X.select_dtypes(include=['object']).columns
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns

# Creamos pipelines de preprocesamiento
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Usamos ColumnTransformer para combinar ambas transformaciones
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [3]:
model_pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

In [4]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Dividimos los datos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Creamos un nuevo pipeline que incluye el preprocesamiento y el modelo
model_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                 ('classifier', GradientBoostingClassifier(random_state=42))])

# Entrenamos el pipeline completo en los datos de entrenamiento
model_pipeline.fit(X_train, y_train)

# Hacemos predicciones en el conjunto de prueba
y_pred = model_pipeline.predict(X_test)

# Evaluamos el rendimiento del modelo
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.88      0.96      0.91      7414
        True       0.80      0.57      0.67      2355

    accuracy                           0.86      9769
   macro avg       0.84      0.76      0.79      9769
weighted avg       0.86      0.86      0.85      9769



In [5]:
# Guardamos el pipeline completo (preprocesamiento + modelo entrenado)
dump(model_pipeline, 'model_pipeline_gb.joblib')

['model_pipeline_gb.joblib']

## Optimización de hiperparámetros

In [6]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Cargamos los datos
Data = pd.read_csv("../Datos/data_adults.csv")

Data_cop = Data.drop("fnlwgt", axis=1)
Data_cop = Data_cop.drop("education-num", axis=1)

X = Data_cop.drop("income", axis=1)
y = Data_cop['income'].isin(['>50K.','>50K'])

# Definimos las variables categóricas y numéricas
categorical_features = X.select_dtypes(include=['object']).columns
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns

# Creamos pipelines de preprocesamiento
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Usamos ColumnTransformer para combinar ambas transformaciones
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])


In [7]:
param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth' : [2,5,10],
    'learning_rate': [0.01, 0.05, 0.1, 0.2]
}

model_gb = GradientBoostingClassifier(random_state = 2024)

##Instanciar la busqueda del mejor modelo
model_busqueda = GridSearchCV(estimator=model_gb, 
                              param_grid = param_grid, 
                              cv= 3, 
                              verbose = 4,
                              scoring = 'f1',
                              n_jobs=-1)

model_pipeline_train = Pipeline(steps=[('preprocessor', preprocessor),
                       ('busqueda', model_busqueda)])

model_pipeline_train.fit(X_train, y_train)

Fitting 3 folds for each of 48 candidates, totalling 144 fits


In [8]:
busqueda_resultado = model_pipeline_train['busqueda']
busqueda_resultado.best_estimator_.get_params()

{'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.2,
 'loss': 'log_loss',
 'max_depth': 5,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 150,
 'n_iter_no_change': None,
 'random_state': 2024,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [9]:
y_pred = model_pipeline_train.predict(X_test)

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

       False       0.89      0.95      0.92      7414
        True       0.79      0.64      0.71      2355

    accuracy                           0.87      9769
   macro avg       0.84      0.79      0.81      9769
weighted avg       0.87      0.87      0.87      9769



In [10]:
y_pred_train = model_pipeline_train.predict(X_train)

print(classification_report(y_train,y_pred_train))

              precision    recall  f1-score   support

       False       0.91      0.95      0.93     29741
        True       0.82      0.70      0.75      9332

    accuracy                           0.89     39073
   macro avg       0.87      0.82      0.84     39073
weighted avg       0.89      0.89      0.89     39073

