****| RETO 3: MERCADOTECNIA TELEFÓNICA CON APRENDIZAJE SUPERVISADO |****

====================================================================

In [3]:
# Manipulación y análisis de datos
import pandas as pd
import numpy as np

# Visualización
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, classification_report

**1. Preparación del entorno y carga de los datos**

In [4]:
# Carga de los datos
data = pd.read_csv("../data/bank_marketing.csv")

In [5]:
data.head(8)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,31,self-employed,married,tertiary,no,2666,no,no,cellular,10,nov,318,2,97,6,success,yes
1,29,unemployed,single,unknown,no,1584,no,no,cellular,6,sep,245,1,-1,0,unknown,yes
2,41,blue-collar,married,secondary,no,2152,yes,no,cellular,17,nov,369,1,-1,0,unknown,no
3,50,blue-collar,married,secondary,no,84,yes,no,cellular,17,jul,18,8,-1,0,unknown,no
4,40,admin.,married,secondary,no,0,no,no,cellular,28,jul,496,2,182,11,success,yes
5,58,retired,married,secondary,no,8332,no,no,cellular,13,aug,740,4,-1,0,unknown,no
6,36,services,single,secondary,no,198,yes,no,cellular,11,may,496,1,-1,0,unknown,no
7,26,technician,single,secondary,no,1231,yes,no,cellular,3,jun,385,2,96,5,success,yes


In [6]:
# Dimensiones del dataframe
print("Número de registros:", data.shape[0])
print("Número de variables:", data.shape[1])

Número de registros: 9000
Número de variables: 17


In [7]:
# Tipos de variables
print("\nTipos de variables:")
print(data.dtypes)


Tipos de variables:
age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object


In [8]:
# Datos faltantes
print("\nValores nulos por variable:")
print(data.isnull().sum())


Valores nulos por variable:
age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64


In [9]:
# Descripción general de los datos
data.describe()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,9000.0,9000.0,9000.0,9000.0,9000.0,9000.0,9000.0
mean,41.090556,1482.262778,15.619556,353.832778,2.520111,50.511333,0.788889
std,11.664253,3031.013197,8.345305,336.945158,2.737758,107.691963,2.210273
min,18.0,-3058.0,1.0,3.0,1.0,-1.0,0.0
25%,32.0,109.0,8.0,131.0,1.0,-1.0,0.0
50%,39.0,519.0,15.0,240.5,2.0,-1.0,0.0
75%,49.0,1646.5,21.0,462.0,3.0,-1.0,0.0
max,95.0,81204.0,31.0,3253.0,58.0,850.0,58.0


Identificamos las siguientes variables

***Numéricas:*** age, balance, day, duration, campaing, pdays, previous

***Categóricas:*** job, marital, education, default, housing, loan, contact, month, poutcome, y

___

**2. Procesamiento de variables**

Optamos por emplear **LabelEncoder** para:

*Variables con orden* **->** Existe una jerarquía lógica

Y **OneHotEncoder** para:

*Variables sin orden* **->** Para evitar introducir un orden 'ficticio' y sesgo en el modelo

In [10]:
# == Variable de salida == #
# Revisamos valores únicos de 'y'
print("Valores únicos en 'y':", data['y'].unique())
print("Distribución de 'y':")
print(data['y'].value_counts())

Valores únicos en 'y': ['yes' 'no']
Distribución de 'y':
y
no     5213
yes    3787
Name: count, dtype: int64


In [109]:
print("\nClientes suscritos:", data['y'].value_counts())


Clientes suscritos: y
0    5213
1    3787
Name: count, dtype: int64


In [112]:
# == Variables binarias simples == #
print("\nDefault:", data['default'].unique())
print("\nHousing:", data['housing'].unique())
print("\nLoan:",data['loan'].unique())


Default: ['no' 'yes']

Housing: ['no' 'yes']

Loan: ['no' 'yes']


In [115]:
# == Aplicamos OneHotEncoder a variables nominales == #
# Lista de columnas nominales
onehot_cols = ['job', 'marital', 'contact', 'month', 'poutcome']

In [118]:
data_encoded.head()

Unnamed: 0,age,education,default,balance,housing,loan,day,duration,campaign,pdays,...,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_other,poutcome_success,poutcome_unknown
0,31,tertiary,0,2666,0,0,10,318,2,97,...,False,False,False,False,True,False,False,False,True,False
1,29,unknown,0,1584,0,0,6,245,1,-1,...,False,False,False,False,False,False,True,False,False,True
2,41,secondary,0,2152,1,0,17,369,1,-1,...,False,False,False,False,True,False,False,False,False,True
3,50,secondary,0,84,1,0,17,18,8,-1,...,True,False,False,False,False,False,False,False,False,True
4,40,secondary,0,0,0,0,28,496,2,182,...,True,False,False,False,False,False,False,False,True,False


En el caso de 'education', se identifica un orden implícito:
'primary' < 'secondary' < 'tertiary', con 'unknown' como caso especial.

In [122]:
print("\neducation:", data_encoded['education'].unique())


education: [3 0 2 1]


In [123]:
print("Shape final del dataset:", data_encoded.shape)
data_encoded.head()

Shape final del dataset: (9000, 41)


Unnamed: 0,age,education,default,balance,housing,loan,day,duration,campaign,pdays,...,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_other,poutcome_success,poutcome_unknown
0,31,3,0,2666,0,0,10,318,2,97,...,False,False,False,False,True,False,False,False,True,False
1,29,0,0,1584,0,0,6,245,1,-1,...,False,False,False,False,False,False,True,False,False,True
2,41,2,0,2152,1,0,17,369,1,-1,...,False,False,False,False,True,False,False,False,False,True
3,50,2,0,84,1,0,17,18,8,-1,...,True,False,False,False,False,False,False,False,False,True
4,40,2,0,0,0,0,28,496,2,182,...,True,False,False,False,False,False,False,False,True,False
