# Preprocesamiento de datos

A continuación haremos udo de OneHotEncoder para las columnas categoricas, pues nos será necesario más adelante para trabajar con los modelos.

In [25]:
#Comenzamos importando las librerías necesarias
# Tratamiento de datos
# ==============================================================================
import pandas as pd
import numpy as np

# Gráficos
# ==============================================================================
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns

#Preprocesamiento de datos
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoLarsCV
# Configuración warnings
# ==============================================================================
import warnings
warnings.filterwarnings('ignore')

In [7]:
#Cargamos el conjunto de datos
filename = "../data/base_preprocesamiento.csv"
data = pd.read_csv(filename)

Comprobamos que se ha cargado bien el conjunto de datos

In [8]:
data.head(10)

Unnamed: 0,fraud_bool,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,payment_type,zip_count_4w,...,bank_months_count,has_other_cards,proposed_credit_limit,foreign_request,source,session_length_in_minutes,device_os,keep_alive_session,device_distinct_emails_8w,month
0,0,0.3,0.986506,-1,25,40,0.006735,102.453711,AA,1059,...,9,0,1500.0,0,INTERNET,16.224843,linux,1,1,0
1,0,0.8,0.617426,-1,89,20,0.010095,-0.849551,AD,1658,...,2,0,1500.0,0,INTERNET,3.363854,other,1,1,0
2,0,0.8,0.996707,9,14,40,0.012316,-1.490386,AB,1095,...,30,0,200.0,0,INTERNET,22.730559,windows,0,1,0
3,0,0.6,0.4751,11,14,30,0.006991,-1.863101,AB,3483,...,1,0,200.0,0,INTERNET,15.215816,linux,1,1,0
4,0,0.9,0.842307,-1,29,40,5.742626,47.152498,AA,2339,...,26,0,200.0,0,INTERNET,3.743048,other,0,1,0
5,0,0.6,0.29484,-1,369,30,0.024232,-1.232556,AD,1204,...,30,0,200.0,0,INTERNET,6.987316,linux,1,1,0
6,0,0.2,0.773085,22,4,40,0.006919,-0.544676,AB,1998,...,1,0,200.0,0,INTERNET,28.199923,x11,1,1,0
7,0,0.8,0.15388,-1,103,40,0.045122,-1.101184,AB,1548,...,25,1,200.0,0,INTERNET,11.234264,other,1,1,0
8,0,0.3,0.523655,21,2,30,0.035206,-0.955737,AB,1781,...,2,0,200.0,0,INTERNET,5.329387,other,1,1,0
9,0,0.8,0.834475,-1,134,20,0.017245,-1.356393,AD,3113,...,15,0,1500.0,0,INTERNET,4.10397,other,1,1,0


Dividimos el conjunto de datos en datos de entranimento y datos de prueba.

In [9]:
X=data.drop(['fraud_bool'],axis=1)
y=data['fraud_bool']

print("Shape of X:",X.shape)
print("Shape of y:",y.shape)
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=123,shuffle=True)

Shape of X: (1000000, 29)
Shape of y: (1000000,)


In [17]:
X_train.shape

(800000, 29)

Reseteamos los indices para poder trabajar más adelante.

In [18]:
X_train=X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

In [19]:
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

Comenzamos haciendo el one hot encoding para las columnas categoricas del conjunto de datos, en este caso X, usando OneHotEncoder de la sibrería Scikit-Learn. Para ello vamos realizarlo primero para el conjunto de entrenamientos y después para el conjunto de prueba.

In [20]:
from sklearn.preprocessing import OneHotEncoder

#Extraemos las columnas categoricas del DataFrame
#De esta manera extraemos las columnas cuyos tipos de datos son objeto ('object')
categorical_columns = X_train.select_dtypes(include=['object']).columns.tolist()

#Inicializamos OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)

# Aplicamos el one-hot encoding a las columnas categoricas
one_hot_encoded_train = encoder.fit_transform(X_train[categorical_columns])

#Creamos un nuevo DataFrame con las columnas one-hot encoded
#Usamos get_feature_names_out() para obtener los nombres de las columnas para los datos encoded
one_hot_train = pd.DataFrame(one_hot_encoded_train, columns=encoder.get_feature_names_out(categorical_columns))

#Concatenamos este nuevo Dataframe con el original
train_encoded = pd.concat([X_train, one_hot_train], axis=1)

#Hacemos un Drop para eliminar las columnas categoricas originales
train_encoded = train_encoded.drop(categorical_columns, axis=1)


# Mostramos el DataFrame resultante
print(f"Encoded Employee data : \n{train_encoded}")

Encoded Employee data : 
        income  name_email_similarity  prev_address_months_count  \
0          0.6               0.430393                         -1   
1          0.8               0.526791                         10   
2          0.9               0.261854                         25   
3          0.3               0.234989                         -1   
4          0.7               0.388970                         34   
...        ...                    ...                        ...   
799995     0.2               0.239980                         -1   
799996     0.1               0.862807                         -1   
799997     0.7               0.496823                         50   
799998     0.1               0.850088                         55   
799999     0.2               0.756882                         -1   

        current_address_months_count  customer_age  days_since_request  \
0                                 29            50            0.007139   
1         

In [21]:
#Comprobamos que se ha realizado correctamente
train_encoded.head(20)

Unnamed: 0,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,zip_count_4w,velocity_6h,velocity_24h,...,housing_status_BE,housing_status_BF,housing_status_BG,source_INTERNET,source_TELEAPP,device_os_linux,device_os_macintosh,device_os_other,device_os_windows,device_os_x11
0,0.6,0.430393,-1,29,50,0.007139,-0.770661,548,6840.253749,5208.1698,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.8,0.526791,10,6,40,0.024797,-1.353757,3190,6464.371764,7830.345921,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.9,0.261854,25,31,20,70.728204,-1.404875,989,9299.042157,5436.987186,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.3,0.234989,-1,112,40,0.001179,-0.730689,1227,4571.730988,3245.309667,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.7,0.38897,34,12,30,0.031244,16.776693,1120,5850.679056,4604.766939,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
5,0.1,0.417593,-1,38,40,0.015301,-1.368188,1249,9334.186209,6114.691297,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
6,0.9,0.993716,22,8,30,0.023317,48.601249,748,2611.810064,4913.913953,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
7,0.1,0.878703,11,4,20,0.008045,-1.256807,1061,9357.613997,5568.994706,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
8,0.7,0.221928,-1,23,50,0.029497,-0.646195,612,5120.631406,5185.149192,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
9,0.2,0.258522,-1,133,40,0.010357,-1.444077,1172,1612.101787,2406.880905,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0


In [22]:
from sklearn.preprocessing import OneHotEncoder

#Extraemos las columnas categoricas del DataFrame
#De esta manera extraemos las columnas cuyos tipos de datos son objeto ('object')
categorical_columns = X_test.select_dtypes(include=['object']).columns.tolist()

#Inicializamos OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)

# Aplicamos el one-hot encoding a las columnas categoricas
one_hot_encoded_test = encoder.fit_transform(X_test[categorical_columns])

#Creamos un nuevo DataFrame con las columnas one-hot encoded
#Usamos get_feature_names_out() para obtener los nombres de las columnas para los datos encoded
one_hot_test = pd.DataFrame(one_hot_encoded_test, columns=encoder.get_feature_names_out(categorical_columns))

#Concatenamos este nuevo Dataframe con el original
test_encoded = pd.concat([X_test, one_hot_test], axis=1)

#Hacemos un Drop para eliminar las columnas categoricas originales
test_encoded = test_encoded.drop(categorical_columns, axis=1)

# Mostramos el DataFrame resultante
print(f"Encoded Employee data : \n{test_encoded}")

Encoded Employee data : 
        income  name_email_similarity  prev_address_months_count  \
0          0.1               0.879216                         -1   
1          0.6               0.857586                         -1   
2          0.2               0.871257                         -1   
3          0.2               0.844386                         -1   
4          0.1               0.428755                         -1   
...        ...                    ...                        ...   
199995     0.6               0.728395                         -1   
199996     0.5               0.460383                         -1   
199997     0.4               0.731218                         -1   
199998     0.6               0.889546                         -1   
199999     0.2               0.645401                         11   

        current_address_months_count  customer_age  days_since_request  \
0                                 90            40            0.009005   
1         

In [23]:
#Comprobemos que se ha realizado correctamente
test_encoded.head()

Unnamed: 0,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,zip_count_4w,velocity_6h,velocity_24h,...,housing_status_BE,housing_status_BF,housing_status_BG,source_INTERNET,source_TELEAPP,device_os_linux,device_os_macintosh,device_os_other,device_os_windows,device_os_x11
0,0.1,0.879216,-1,90,40,0.009005,41.048629,1083,1152.844779,6955.17743,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.6,0.857586,-1,34,30,0.018432,15.536027,2514,6702.980154,4103.172344,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.2,0.871257,-1,233,50,0.003572,-0.295622,991,3130.756088,4135.792367,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.2,0.844386,-1,118,50,0.016426,-1.0158,579,8836.448311,5872.19222,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.1,0.428755,-1,56,20,0.016087,6.060433,1811,9918.660366,8871.35757,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [24]:
test_encoded.shape

(200000, 50)

Vamos a entrenar un modelo de regresión Lasso utilizando la técnica de LassoLars con validación cruzada.  Este modelopara seleccionar las variables que resultan más relevantes para nuestro modelo cuando tenemos un gran número de variables predictoras, como en nuestro caso. LassoLars es una variante de Lasso más eficiente cuando tenemos un gran número de variables. La validación cruzada se utiliza para evitar el sobreajuste y para  seleccionar el mejor valor del parámetro de regularización (alpha).


In [26]:
# Creamos un modelo Lasso con validación cruzada para seleccionar el mejor valor de alpha
lasso_lars_ic = LassoLarsCV(cv=5)
lasso_lars_ic.fit(train_encoded, y_train)

# Obtenemos el valor óptimo de alpha
alpha_optimo = lasso_lars_ic.alpha_

# Los coeficientes resultantes después de aplicar Lasso
coeficientes_lasso = lasso_lars_ic.coef_

# Examinamos los coeficientes para identificar las variables más importantes
print("Coeficientes Lasso:")
for nombre_variable, coef in zip(train_encoded.columns, coeficientes_lasso):
    print(f"{nombre_variable}: {coef:.5f}")

Coeficientes Lasso:
income: 0.00931
name_email_similarity: -0.01309
prev_address_months_count: -0.00005
current_address_months_count: 0.00001
customer_age: 0.00026
days_since_request: 0.00008
intended_balcon_amount: -0.00008
zip_count_4w: 0.00000
velocity_6h: -0.00000
velocity_24h: 0.00000
bank_branch_count_8w: -0.00000
date_of_birth_distinct_emails_4w: -0.00029
credit_risk_score: 0.00003
email_is_free: 0.00676
phone_home_valid: -0.00936
phone_mobile_valid: -0.00128
bank_months_count: 0.00015
has_other_cards: -0.01063
proposed_credit_limit: 0.00001
foreign_request: 0.00894
session_length_in_minutes: 0.00002
keep_alive_session: -0.00754
device_distinct_emails_8w: 0.01830
month: 0.00037
payment_type_AA: 0.00008
payment_type_AB: -0.00046
payment_type_AC: 0.00821
payment_type_AD: 0.00000
payment_type_AE: 0.00000
employment_status_CA: 0.00273
employment_status_CB: -0.00216
employment_status_CC: 0.00784
employment_status_CD: -0.00073
employment_status_CE: 0.00000
employment_status_CF: 0.0000

Después de entrenar el modelo, obtenemos el valor óptimo de alpha que minimiza el error de validación cruzada. A continuación, aplicamos el modelo Lasso con este valor óptimo de alpha a nuestros datos y obtenemos los coeficientes del modelo, estos coeficientes van a representar la relevancia de cada variable predictora en la predicción de la variable objetivo. Finalmente, imprimimos estos coeficientes para cada variable para poder analizar cuáles son las variables más importantes en nuestro modelo.

In [27]:
#Guardamos en una lista los nombres de las columnas con coeficientes Lasso suficientemente pequeños
Lista = []
for nombre_variable, coef in zip(train_encoded.columns, coeficientes_lasso):
    if abs(coef)<0.000005:
        Lista.append(nombre_variable)


In [28]:
#Comprobamos que la lista se ha creado bien
Lista

['zip_count_4w',
 'velocity_6h',
 'velocity_24h',
 'bank_branch_count_8w',
 'payment_type_AD',
 'payment_type_AE',
 'employment_status_CE',
 'employment_status_CF',
 'employment_status_CG',
 'housing_status_BE',
 'housing_status_BF',
 'housing_status_BG',
 'source_INTERNET',
 'device_os_x11']

In [29]:
#Eliminamos los datos de las columnas cuyo coeficiente LASSO es menor que 0.000001 menos las que son producto del OneHotEncoder de train_encoded

train_encoded = train_encoded.drop(['zip_count_4w'], axis=1, errors='ignore')
train_encoded = train_encoded.drop(['velocity_6h'], axis=1, errors='ignore')
train_encoded = train_encoded.drop(['velocity_4w'], axis=1, errors='ignore')
train_encoded = train_encoded.drop(['bank_branch_count_8w'], axis=1, errors='ignore')
train_encoded = train_encoded.drop(['source_INTERNET'], axis=1, errors='ignore')
train_encoded = train_encoded.drop(['device_os_x11'], axis=1, errors='ignore')

#Comprobamos que se ha hecho bien
train_encoded.shape

(800000, 45)

In [30]:
#Eliminamos los datosde las columnas cuyo coeficiente LASSO es menor que 0.000001 menos las que son producto del OneHotEncoder de test_encoded
test_encoded = test_encoded.drop(['zip_count_4w'], axis=1, errors='ignore')
test_encoded = test_encoded.drop(['velocity_6h'], axis=1, errors='ignore')
test_encoded = test_encoded.drop(['velocity_4w'], axis=1, errors='ignore')
test_encoded = test_encoded.drop(['bank_branch_count_8w'], axis=1, errors='ignore')
test_encoded = test_encoded.drop(['source_INTERNET'], axis=1, errors='ignore')
test_encoded =test_encoded.drop(['device_os_x11'], axis=1, errors='ignore')

#Comprobamos que se ha hecho bien
test_encoded.shape

(200000, 45)

In [31]:
#Comprobamos que se ha realizado correctamente y vemos una breve descripción matemática
train_encoded.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
income,800000.0,0.562608,0.290395,0.1,0.3,0.6,0.8,0.9
name_email_similarity,800000.0,0.493611,0.289204,1.015545e-05,0.224993,0.492218,0.755661,0.999999
prev_address_months_count,800000.0,16.688051,43.992924,-1.0,-1.0,-1.0,12.0,383.0
current_address_months_count,800000.0,86.530909,88.339448,-1.0,19.0,52.0,130.0,428.0
customer_age,800000.0,33.688925,12.019568,10.0,20.0,30.0,40.0,90.0
days_since_request,800000.0,1.021128,5.377205,4.03686e-09,0.007196,0.01519,0.026341,78.456904
intended_balcon_amount,800000.0,8.662381,20.232818,-15.53055,-1.181503,-0.830376,4.976656,112.756111
velocity_24h,800000.0,4769.835995,1478.467471,1300.307,3594.365962,4750.134839,5752.603322,9506.896596
date_of_birth_distinct_emails_4w,800000.0,9.506533,5.034181,0.0,6.0,9.0,13.0,39.0
credit_risk_score,800000.0,130.996193,69.654158,-170.0,83.0,122.0,178.0,389.0


Guardamos todos los conjuntos de datos creados en el fichero data para los siguientes notebooks.

In [33]:
data.to_csv('../data/base_preprocesado.csv', index=False)

In [35]:
train_encoded.to_csv('../data/X_train.csv', index=False)
test_encoded.to_csv('../data/X_test.csv', index=False)
y_train.to_csv('../data/y_train.csv', index=False)
y_test.to_csv('../data/y_test.csv', index=False)

In [36]:
train_encoded.head(10)

Unnamed: 0,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,velocity_24h,date_of_birth_distinct_emails_4w,credit_risk_score,...,housing_status_BC,housing_status_BD,housing_status_BE,housing_status_BF,housing_status_BG,source_TELEAPP,device_os_linux,device_os_macintosh,device_os_other,device_os_windows
0,0.6,0.430393,-1,29,50,0.007139,-0.770661,5208.1698,2,50,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.8,0.526791,10,6,40,0.024797,-1.353757,7830.345921,21,131,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.9,0.261854,25,31,20,70.728204,-1.404875,5436.987186,7,87,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.3,0.234989,-1,112,40,0.001179,-0.730689,3245.309667,4,103,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.7,0.38897,34,12,30,0.031244,16.776693,4604.766939,6,72,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5,0.1,0.417593,-1,38,40,0.015301,-1.368188,6114.691297,14,221,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
6,0.9,0.993716,22,8,30,0.023317,48.601249,4913.913953,24,173,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
7,0.1,0.878703,11,4,20,0.008045,-1.256807,5568.994706,19,102,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
8,0.7,0.221928,-1,23,50,0.029497,-0.646195,5185.149192,5,28,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
9,0.2,0.258522,-1,133,40,0.010357,-1.444077,2406.880905,3,38,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
