In [268]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score


In [269]:
original = pd.read_csv('Data_Train.csv', sep = ';')
df = original.copy()

In [270]:
df.dropna(subset=['educacion'], inplace=True)

df.isnull().sum()

ID                                   0
edad                                 0
trabajo                              0
estado_civil                         0
educacion                            0
deuda                                0
saldo                                0
vivienda                             0
prestamo                             0
tipo_contacto                    12072
duracion                             0
fecha_contacto                       0
campaign                             0
tiempo_transcurrido                  0
contactos_anteriores                 0
resultado_campanas_anteriores    34670
target                               0
dtype: int64

In [271]:
# Reemplaza los valores nulos en las columnas especificadas con "Sin Información"
df[['tipo_contacto', 'resultado_campanas_anteriores']] = df[['tipo_contacto', 'resultado_campanas_anteriores']].fillna("Sin Información")

df.isnull().sum()

ID                               0
edad                             0
trabajo                          0
estado_civil                     0
educacion                        0
deuda                            0
saldo                            0
vivienda                         0
prestamo                         0
tipo_contacto                    0
duracion                         0
fecha_contacto                   0
campaign                         0
tiempo_transcurrido              0
contactos_anteriores             0
resultado_campanas_anteriores    0
target                           0
dtype: int64

In [272]:
df_train = df.copy()
df_train.sample(3)

Unnamed: 0,ID,edad,trabajo,estado_civil,educacion,deuda,saldo,vivienda,prestamo,tipo_contacto,duracion,fecha_contacto,campaign,tiempo_transcurrido,contactos_anteriores,resultado_campanas_anteriores,target
18616,18617,45,blue-collar,casado,priamaria,no,0,si,no,movil,137,24-jul-2021,1,-1,0,Sin Información,no
34632,34633,33,blue-collar,soltero,universitarios,no,350,si,no,movil,148,6-feb-2021,7,227,5,sin_exito,no
26190,26191,33,blue-collar,soltero,priamaria,no,-104,si,no,movil,34,15-may-2021,10,331,16,otro,no


In [273]:
df_train['educacion'] = df_train['educacion'].replace('priamaria', 'primaria')


In [274]:
columns_to_delete = ['fecha_contacto', 'ID']
df_train.drop(columns=columns_to_delete,inplace=True)

df_train.sample(3)

Unnamed: 0,edad,trabajo,estado_civil,educacion,deuda,saldo,vivienda,prestamo,tipo_contacto,duracion,campaign,tiempo_transcurrido,contactos_anteriores,resultado_campanas_anteriores,target
15186,48,blue-collar,casado,secundaria/superiores,no,5078,si,no,fijo,139,1,-1,0,Sin Información,no
19095,58,blue-collar,casado,secundaria/superiores,no,590,no,si,movil,136,1,167,1,sin_exito,no
25597,28,services,casado,secundaria/superiores,no,-494,si,no,Sin Información,232,2,-1,0,Sin Información,no


In [275]:
#Instancio el OneHot
onehot=OneHotEncoder()

# Aplico el OneHot a la columna job y guardo el resultao en a
onehot.fit(df_train[['trabajo','estado_civil','educacion','deuda','vivienda','prestamo','tipo_contacto','resultado_campanas_anteriores']])
a=onehot.transform(df_train[['trabajo','estado_civil','educacion','deuda','vivienda','prestamo','tipo_contacto','resultado_campanas_anteriores']])

# Convierto el a en data frame y lo llamo encoded_df
encoded_df = pd.DataFrame(a.toarray(), columns=onehot.get_feature_names_out(['trabajo','estado_civil','educacion','deuda','vivienda','prestamo','tipo_contacto','resultado_campanas_anteriores']))

# Reseteo el índice de los dos data frames antes de concatenarlos
df_train.reset_index(drop=True, inplace=True)
encoded_df.reset_index(drop=True, inplace=True)

# concateno los dos data frames y los guardo machacando el df
df_train=pd.concat([df_train,encoded_df],axis=1)

# Elimino la columna original antigua, la categórica
df_train.drop(columns=['trabajo','estado_civil','educacion','deuda','vivienda','prestamo','tipo_contacto','resultado_campanas_anteriores'],inplace=True)

# Muestro tamaño yu sample del data frame df transformado
print(df_train.shape)
df_train.sample(5)

(42446, 38)


Unnamed: 0,edad,saldo,duracion,campaign,tiempo_transcurrido,contactos_anteriores,target,trabajo_admin.,trabajo_blue-collar,trabajo_entrepreneur,...,vivienda_si,prestamo_no,prestamo_si,tipo_contacto_Sin Información,tipo_contacto_fijo,tipo_contacto_movil,resultado_campanas_anteriores_Sin Información,resultado_campanas_anteriores_exito,resultado_campanas_anteriores_otro,resultado_campanas_anteriores_sin_exito
22407,46,965,177,1,-1,0,no,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
22687,36,60,156,2,-1,0,si,1.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
38438,49,3677,276,3,-1,0,no,0.0,1.0,0.0,...,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
11502,53,4994,62,6,-1,0,no,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
33852,34,1702,22,4,-1,0,no,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0


In [276]:
df_train['target'] = df_train['target'].map({'no': 0, 'si': 1})

In [277]:
X = df_train.drop(columns=["target"],inplace=False)
y = df_train["target"]

In [278]:
df_train

Unnamed: 0,edad,saldo,duracion,campaign,tiempo_transcurrido,contactos_anteriores,target,trabajo_admin.,trabajo_blue-collar,trabajo_entrepreneur,...,vivienda_si,prestamo_no,prestamo_si,tipo_contacto_Sin Información,tipo_contacto_fijo,tipo_contacto_movil,resultado_campanas_anteriores_Sin Información,resultado_campanas_anteriores_exito,resultado_campanas_anteriores_otro,resultado_campanas_anteriores_sin_exito
0,40,580,192,1,-1,0,0,0.0,1.0,0.0,...,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1,47,3644,83,2,-1,0,0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2,25,538,226,1,-1,0,0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
3,42,1773,311,1,336,1,0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,56,217,121,2,-1,0,0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42441,38,-323,250,1,-1,0,0,0.0,0.0,0.0,...,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
42442,49,1982,73,15,-1,0,0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
42443,51,794,747,1,-1,0,0,0.0,1.0,0.0,...,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
42444,29,98,170,2,-1,0,0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0


In [279]:
# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [280]:
# Estandarización controlada

estandarizador = StandardScaler()
estandarizador.fit(X_train)
X_train_std=estandarizador.transform(X_train)
X_test_std=estandarizador.transform(X_test)

In [281]:
# Entrenar el modelo de Regresión Logística
model = LogisticRegression(max_iter=1000)  # Se utiliza max_iter=1000 para asegurar que converja
model.fit(X_train, y_train)

# Predecir las probabilidades de compra para el conjunto de prueba
probabilidades_compra = model.predict_proba(X_test)[:, 1]

# Agregar las probabilidades de compra al DataFrame original
df_train['probabilidad_compra'] = model.predict_proba(X)[:, 1].round(2)

# Evaluación del modelo (opcional)
auc = roc_auc_score(y_test, probabilidades_compra)
print("AUC Score:", auc)


AUC Score: 0.8850129681194574


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [282]:
df_train.sample(10)

Unnamed: 0,edad,saldo,duracion,campaign,tiempo_transcurrido,contactos_anteriores,target,trabajo_admin.,trabajo_blue-collar,trabajo_entrepreneur,...,prestamo_no,prestamo_si,tipo_contacto_Sin Información,tipo_contacto_fijo,tipo_contacto_movil,resultado_campanas_anteriores_Sin Información,resultado_campanas_anteriores_exito,resultado_campanas_anteriores_otro,resultado_campanas_anteriores_sin_exito,probabilidad_compra
9401,38,4771,217,1,-1,0,0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.05
33219,36,402,86,2,154,1,0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.03
40019,44,-75,104,1,-1,0,0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.01
7256,26,288,49,2,-1,0,0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.01
7036,30,5359,427,1,191,3,1,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.85
10900,45,1309,367,1,-1,0,1,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.19
1652,30,371,124,2,-1,0,0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.06
6315,40,12409,196,2,-1,0,0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.07
5035,40,10406,348,2,127,4,0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.26
41279,41,621,111,1,-1,0,0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.05


### Utilizamos el modelo con 'Data_TEST'

In [283]:
original2 = pd.read_csv('Data_Test.csv', sep = ';')
df2 = original2.copy()

In [284]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Suponiendo que tienes tu DataFrame llamado 'df' con 50000 personas y diferentes columnas

# Separar las características categóricas y numéricas
categorical_features = df2.select_dtypes(include=['object']).columns
numeric_features = df2.select_dtypes(include=['int', 'float']).columns

# Construir el preprocesamiento de datos con codificación one-hot para características categóricas
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features)])

# Crear el pipeline de preprocesamiento y modelado
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                        ('kmeans', KMeans(n_clusters=5, random_state=42))])

# Ajustar el pipeline al DataFrame
pipeline.fit(df2)

# Obtener las etiquetas de los clústeres asignadas a cada muestra
cluster_labels = pipeline.named_steps['kmeans'].labels_

# Agregar las etiquetas de los clústeres al DataFrame original
df2['Cluster'] = cluster_labels

# Analizar los resultados del clustering
print(df2['Cluster'].value_counts())

Cluster
0    1637
3    1573
1     688
2     382
4     161
Name: count, dtype: int64


In [285]:
df2.dropna(subset=['educacion'], inplace=True)

df2.isnull().sum()

ID                                  0
edad                                0
trabajo                             0
estado_civil                        0
educacion                           0
deuda                               0
saldo                               0
vivienda                            0
prestamo                            0
tipo_contacto                    1233
duracion                            0
fecha_contacto                      0
campaign                            0
tiempo_transcurrido                 0
contactos_anteriores                0
resultado_campanas_anteriores    3494
target                              0
Cluster                             0
dtype: int64

In [286]:
# Reemplaza los valores nulos en las columnas especificadas con "Sin Información"
df2[['tipo_contacto', 'resultado_campanas_anteriores']] = df2[['tipo_contacto', 'resultado_campanas_anteriores']].fillna("Sin Información")

df2.isnull().sum()

ID                               0
edad                             0
trabajo                          0
estado_civil                     0
educacion                        0
deuda                            0
saldo                            0
vivienda                         0
prestamo                         0
tipo_contacto                    0
duracion                         0
fecha_contacto                   0
campaign                         0
tiempo_transcurrido              0
contactos_anteriores             0
resultado_campanas_anteriores    0
target                           0
Cluster                          0
dtype: int64

In [287]:
df_test = df2.copy()
df_test.sample(3)

Unnamed: 0,ID,edad,trabajo,estado_civil,educacion,deuda,saldo,vivienda,prestamo,tipo_contacto,duracion,fecha_contacto,campaign,tiempo_transcurrido,contactos_anteriores,resultado_campanas_anteriores,target,Cluster
430,50431,34,blue-collar,casado,secundaria/superiores,no,584,si,no,movil,154,18-jul-2021,2,-1,0,Sin Información,no,0
1879,51880,40,management,casado,universitarios,no,-17,si,si,movil,474,11-may-2021,1,256,1,exito,si,1
3188,53189,47,entrepreneur,casado,priamria,no,668,no,no,movil,908,12-may-2021,1,-1,0,Sin Información,no,2


In [288]:
df_test['educacion'] = df_test['educacion'].replace('priamaria', 'primaria')


In [289]:
columns_to_delete = ['fecha_contacto', 'ID','target']
df_test.drop(columns=columns_to_delete,inplace=True)

df_test.sample(3)

Unnamed: 0,edad,trabajo,estado_civil,educacion,deuda,saldo,vivienda,prestamo,tipo_contacto,duracion,campaign,tiempo_transcurrido,contactos_anteriores,resultado_campanas_anteriores,Cluster
1893,42,blue-collar,soltero,priamria,no,92,no,no,movil,86,2,-1,0,Sin Información,0
1546,53,admin.,casado,secundaria/superiores,no,225,si,no,movil,304,1,340,1,sin_exito,1
701,69,retired,soltero,universitarios,no,2144,no,no,movil,417,1,184,4,exito,1


In [290]:
#Instancio el OneHot
onehot=OneHotEncoder()

# Aplico el OneHot a la columna job y guardo el resultao en a
onehot.fit(df_test[['trabajo','estado_civil','educacion','deuda','vivienda','prestamo','tipo_contacto','resultado_campanas_anteriores']])
a=onehot.transform(df_test[['trabajo','estado_civil','educacion','deuda','vivienda','prestamo','tipo_contacto','resultado_campanas_anteriores']])

# Convierto el a en data frame y lo llamo encoded_df
encoded_df = pd.DataFrame(a.toarray(), columns=onehot.get_feature_names_out(['trabajo','estado_civil','educacion','deuda','vivienda','prestamo','tipo_contacto','resultado_campanas_anteriores']))

# Reseteo el índice de los dos data frames antes de concatenarlos
df_test.reset_index(drop=True, inplace=True)
encoded_df.reset_index(drop=True, inplace=True)

# concateno los dos data frames y los guardo machacando el df
df_test=pd.concat([df_test,encoded_df],axis=1)

# Elimino la columna original antigua, la categórica
df_test.drop(columns=['trabajo','estado_civil','educacion','deuda','vivienda','prestamo','tipo_contacto','resultado_campanas_anteriores'],inplace=True)

# Muestro tamaño yu sample del data frame df transformado
print(df_test.shape)
df_test.sample(5)

(4255, 38)


Unnamed: 0,edad,saldo,duracion,campaign,tiempo_transcurrido,contactos_anteriores,Cluster,trabajo_admin.,trabajo_blue-collar,trabajo_entrepreneur,...,vivienda_si,prestamo_no,prestamo_si,tipo_contacto_Sin Información,tipo_contacto_fijo,tipo_contacto_movil,resultado_campanas_anteriores_Sin Información,resultado_campanas_anteriores_exito,resultado_campanas_anteriores_otro,resultado_campanas_anteriores_sin_exito
4188,42,-105,60,2,-1,0,3,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
6,41,5110,231,1,-1,0,0,1.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
3390,46,2420,405,2,-1,0,3,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
3047,54,0,220,1,-1,0,3,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2779,40,782,224,2,-1,0,3,0.0,0.0,0.0,...,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [291]:
df_test

Unnamed: 0,edad,saldo,duracion,campaign,tiempo_transcurrido,contactos_anteriores,Cluster,trabajo_admin.,trabajo_blue-collar,trabajo_entrepreneur,...,vivienda_si,prestamo_no,prestamo_si,tipo_contacto_Sin Información,tipo_contacto_fijo,tipo_contacto_movil,resultado_campanas_anteriores_Sin Información,resultado_campanas_anteriores_exito,resultado_campanas_anteriores_otro,resultado_campanas_anteriores_sin_exito
0,51,-2082,123,6,-1,0,0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
1,50,2881,510,2,2,5,1,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,50,1412,131,3,-1,0,0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
3,37,0,247,13,-1,0,4,0.0,0.0,0.0,...,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,31,757,343,2,-1,0,0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4250,34,992,301,1,88,2,3,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4251,37,0,236,1,-1,0,3,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4252,34,943,96,4,-1,0,3,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
4253,51,1315,303,2,-1,0,3,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


In [292]:
# Estandarización controlada

estandarizador = StandardScaler()
estandarizador.fit(df_test[['edad','saldo','duracion','campaign','tiempo_transcurrido','contactos_anteriores']])
X_train_std = estandarizador.transform(df_test[['edad','saldo','duracion','campaign','tiempo_transcurrido','contactos_anteriores']])
X_test_std = estandarizador.transform(df_test[['edad','saldo','duracion','campaign','tiempo_transcurrido','contactos_anteriores']])

In [296]:
# Entrenar el modelo de Regresión Logística
model = LogisticRegression(max_iter=1000)  # Se utiliza max_iter=1000 para asegurar que converja

# Ajustar el modelo a tus datos de entrenamiento (X_train, y_train)
# Ajustar el modelo a tus datos de entrenamiento (X_train, y_train)

# Predecir las probabilidades de compra para el conjunto de prueba
df_test[['NO_%','SI_%']] = model.predict_proba(X)

# Agregar las probabilidades de compra al DataFrame original
# df_test['probabilidad_compra'] = model.predict_proba(X)[:, 1]

# Evaluación del modelo (opcional)
# auc = roc_auc_score(y_test, probabilidades_compra)
# print("AUC Score:", auc)


NotFittedError: This LogisticRegression instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [None]:
df_train.sample(10)