In [2]:
pip install imblearn

Collecting imblearnNote: you may need to restart the kernel to use updated packages.

  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Collecting imbalanced-learn (from imblearn)
  Downloading imbalanced_learn-0.12.4-py3-none-any.whl.metadata (8.3 kB)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Downloading imbalanced_learn-0.12.4-py3-none-any.whl (258 kB)
Installing collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.12.4 imblearn-0.0


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE

# Kodigo
print("Gabriel Guzmán - CSV\n")

# Cargar el archivo CSV desde GitHub
url = "https://raw.githubusercontent.com/nzepedacc/Pandas/main/bank-full.csv"
df = pd.read_csv(url, sep=';')

# Mostrar las primeras filas del DataFrame
print("DataFrame original:\n", df.head())

# ----------------- SEPARAR VARIABLES DEPENDIENTE E INDEPENDIENTES -----------------
# El objetivo es predecir la columna 'y' (si el cliente aceptó el depósito)
X = df.drop(columns=['y'])  # Variables predictoras
y = df['y']  # Variable objetivo

# Convertir la variable objetivo a binaria (yes=1, no=0)
y = y.map({'yes': 1, 'no': 0})

# ----------------- CREAR EL PIPELINE -----------------
# Seleccionar características numéricas y categóricas
numeric_features = ['age', 'balance', 'duration', 'campaign', 'pdays', 'previous']
categorical_features = [col for col in X.columns if col not in numeric_features]

# Pipeline para columnas numéricas: imputar valores faltantes y escalar
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Imputar valores faltantes con la media
    ('scaler', StandardScaler())  # Escalar las columnas numéricas
])

# Pipeline para columnas categóricas: imputar valores faltantes y aplicar One-Hot Encoding
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),  # Imputar valores faltantes con 'Unknown'
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))  # Aplicar One-Hot Encoding
])

# Usar ColumnTransformer para aplicar las transformaciones correspondientes a cada tipo de columna
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

# Dividir el dataset en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ----------------- APLICAR TRANSFORMACIONES ANTES DE SMOTE -----------------
# Aplicar las transformaciones (One-Hot Encoding y escalado) en el conjunto de entrenamiento
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Aplicar SMOTE para balancear el conjunto de entrenamiento transformado
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_transformed, y_train)

# ----------------- ENTRENAR EL MODELO -----------------
# Entrenar el modelo de regresión logística
model = LogisticRegression(max_iter=1000)
model.fit(X_train_smote, y_train_smote)

# Predecir los resultados en el conjunto de prueba
y_pred = model.predict(X_test_transformed)

# ----------------- EVALUACIÓN DEL MODELO -----------------
# Mostrar la precisión del modelo
accuracy = accuracy_score(y_test, y_pred)
print("\nPrecisión del modelo (accuracy):", accuracy)

# Mostrar reporte de clasificación
print("\nReporte de clasificación:\n", classification_report(y_test, y_pred))


Gabriel Guzmán - CSV

DataFrame original:
    age           job  marital  education default  balance housing loan  \
0   58    management  married   tertiary      no     2143     yes   no   
1   44    technician   single  secondary      no       29     yes   no   
2   33  entrepreneur  married  secondary      no        2     yes  yes   
3   47   blue-collar  married    unknown      no     1506     yes   no   
4   33       unknown   single    unknown      no        1      no   no   

   contact  day month  duration  campaign  pdays  previous poutcome   y  
0  unknown    5   may       261         1     -1         0  unknown  no  
1  unknown    5   may       151         1     -1         0  unknown  no  
2  unknown    5   may        76         1     -1         0  unknown  no  
3  unknown    5   may        92         1     -1         0  unknown  no  
4  unknown    5   may       198         1     -1         0  unknown  no  

Precisión del modelo (accuracy): 0.8462899480260976

Reporte de cla

In [5]:
import joblib

# Guardar el pipeline de preprocesamiento
joblib.dump(preprocessor, 'preprocessor_pipeline.pkl')

# Guardar el modelo entrenado
joblib.dump(model, 'logistic_regression_model.pkl')


['logistic_regression_model.pkl']

In [6]:
# Cargar el pipeline de preprocesamiento
preprocessor_loaded = joblib.load('preprocessor_pipeline.pkl')

# Cargar el modelo entrenado
model_loaded = joblib.load('logistic_regression_model.pkl')
