# Preprocesamiento de datos

In [1]:
#Comenzamos importando las librerías necesarias
# Tratamiento de datos
# ==============================================================================
import pandas as pd
import numpy as np

# Gráficos
# ==============================================================================
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns

#Preprocesamiento de datos
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoLarsCV
# Configuración warnings
# ==============================================================================
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the dataset produced for the preprocessing stage
filename = "../data/data_preprocesamiento.csv"
data = pd.read_csv(filepath_or_buffer=filename)

Comprobamos que se ha cargado bien el conjunto de datos

In [3]:
# Show the first ten rows as a quick check that the file was parsed correctly
data.iloc[:10]

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0
5,2.0,-0.425966,0.960523,1.141109,-0.168252,0.420987,-0.029728,0.476201,0.260314,-0.568671,...,-0.208254,-0.559825,-0.026398,-0.371427,-0.232794,0.105915,0.253844,0.08108,3.67,0
6,4.0,1.229658,0.141004,0.045371,1.202613,0.191881,0.272708,-0.005159,0.081213,0.46496,...,-0.167716,-0.27071,-0.154104,-0.780055,0.750137,-0.257237,0.034507,0.005168,4.99,0
7,7.0,-0.644269,1.417964,1.07438,-0.492199,0.948934,0.428118,1.120631,-3.807864,0.615375,...,1.943465,-1.015455,0.057504,-0.649709,-0.415267,-0.051634,-1.206921,-1.085339,40.8,0
8,7.0,-0.894286,0.286157,-0.113192,-0.271526,2.669599,3.721818,0.370145,0.851084,-0.392048,...,-0.073425,-0.268092,-0.204233,1.011592,0.373205,-0.384157,0.011747,0.142404,93.2,0
9,9.0,-0.338262,1.119593,1.044367,-0.222187,0.499361,-0.246761,0.651583,0.069539,-0.736727,...,-0.246914,-0.633753,-0.120794,-0.38505,-0.069733,0.094199,0.246219,0.083076,3.68,0


Dividimos el conjunto de datos en datos de entrenamiento y datos de prueba.

In [4]:
# Separate the target column from the predictors
y = data['Class']
X = data.drop(columns=['Class'])

print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
# NOTE(review): this split is not stratified, while the later cell that splits the
# credit-card data passes stratify=y — confirm whether that is wanted here too.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
    shuffle=True,
)

Shape of X: (283726, 30)
Shape of y: (283726,)


In [5]:
# Display (rows, columns) of the training feature frame
X_train.shape

(226980, 30)

Reseteamos los índices para poder trabajar más adelante.

In [6]:
# Replace the shuffled row labels of both feature frames with a fresh 0..n-1 index
X_train, X_test = (frame.reset_index(drop=True) for frame in (X_train, X_test))

In [7]:
# Same index reset for the target series, so they stay positionally aligned with the features
y_train, y_test = (target.reset_index(drop=True) for target in (y_train, y_test))

Vamos a entrenar un modelo de regresión Lasso utilizando la técnica de LassoLars con validación cruzada. Este modelo sirve para seleccionar las variables que resultan más relevantes cuando tenemos un gran número de variables predictoras, como en nuestro caso. LassoLars es una variante de Lasso más eficiente cuando tenemos un gran número de variables. La validación cruzada se utiliza para evitar el sobreajuste y para seleccionar el mejor valor del parámetro de regularización (alpha).


In [8]:
# Fit a LassoLars model; 5-fold cross-validation picks the regularisation strength
lasso_lars_ic = LassoLarsCV(cv=5).fit(X_train, y_train)

# Regularisation strength selected by the cross-validation
alpha_optimo = lasso_lars_ic.alpha_

# One fitted coefficient per column of X_train
# NOTE(review): X_train is fitted as loaded (no scaling step in this notebook), so the
# coefficients are on the scale of the raw columns — confirm that comparing them
# against a fixed threshold across columns is intended.
coeficientes_lasso = lasso_lars_ic.coef_

# Print each column name next to its coefficient to see which ones the model kept
print("Coeficientes Lasso:")
for posicion, nombre in enumerate(X_train.columns):
    print(f"{nombre}: {coeficientes_lasso[posicion]:.5f}")

Coeficientes Lasso:
Time: -0.00000
V1: -0.00189
V2: 0.00296
V3: -0.00509
V4: 0.00377
V5: -0.00229
V6: -0.00171
V7: -0.00694
V8: 0.00102
V9: -0.00363
V10: -0.00817
V11: 0.00629
V12: -0.01103
V13: -0.00018
V14: -0.01321
V15: -0.00016
V16: -0.00945
V17: -0.01631
V18: -0.00570
V19: 0.00185
V20: 0.00014
V21: 0.00168
V22: 0.00031
V23: 0.00000
V24: -0.00039
V25: 0.00023
V26: 0.00017
V27: 0.00117
V28: 0.00077
Amount: 0.00001


Después de entrenar el modelo, obtenemos el valor óptimo de alpha que minimiza el error de validación cruzada. A continuación, aplicamos el modelo Lasso con este valor óptimo de alpha a nuestros datos y obtenemos los coeficientes del modelo. Estos coeficientes representan la relevancia de cada variable predictora en la predicción de la variable objetivo. Finalmente, imprimimos los coeficientes de cada variable para analizar cuáles son las más importantes en nuestro modelo.

In [9]:
# Collect the names of the columns whose Lasso coefficient is small enough to be treated as zero
Lista = [
    nombre
    for nombre, peso in zip(X_train.columns, coeficientes_lasso)
    if abs(peso) < 0.00000005
]


In [10]:
# Display the list to check that it was built correctly
Lista

['Time', 'V23']

In [30]:
# Remove from the training features the columns collected in `Lista`, i.e. those whose
# Lasso coefficient fell below the threshold applied when `Lista` was built.
# Using `Lista` instead of repeating the column names by hand keeps this cell in sync
# with the selection if the data or the threshold change.
# errors='ignore' keeps the cell safe to re-run: X_train is overwritten, so on a second
# execution the columns are already gone.
X_train = X_train.drop(columns=Lista, errors='ignore')

# Check the resulting shape
X_train.shape

(226980, 28)

In [33]:
# Remove from the test features the same columns collected in `Lista` (selected on the
# training split), so train and test keep identical columns.
# Using `Lista` instead of repeating the column names by hand keeps this cell in sync
# with the selection if the data or the threshold change.
# errors='ignore' keeps the cell safe to re-run: X_test is overwritten, so on a second
# execution the columns are already gone.
X_test = X_test.drop(columns=Lista, errors='ignore')

# Check the resulting shape
X_test.shape

(56746, 28)

In [13]:
# Summary statistics of the training features, transposed so each column becomes a row
# NOTE(review): the recorded output still lists `Time`, which suggests this cell was last
# run before the column drop — confirm by re-running the notebook top to bottom.
X_train.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Time,226980.0,94808.392242,47490.225298,0.0,54198.0,84686.0,139308.25,172792.0
V1,226980.0,0.005618,1.949692,-56.40751,-0.917988,0.02038,1.317083,2.45493
V2,226980.0,-0.005226,1.652517,-72.715728,-0.600785,0.063624,0.798695,21.467203
V3,226980.0,0.001128,1.507595,-48.325589,-0.891539,0.17828,1.026482,9.382558
V4,226980.0,-0.004324,1.41319,-5.683171,-0.850602,-0.023257,0.73682,16.875344
V5,226980.0,0.003735,1.386681,-113.743307,-0.688585,-0.053879,0.613414,34.801666
V6,226980.0,-0.001602,1.336474,-26.160506,-0.768712,-0.276044,0.395324,73.301626
V7,226980.0,0.000512,1.23681,-41.506796,-0.552089,0.039653,0.568193,120.589494
V8,226980.0,-0.00141,1.195866,-73.216718,-0.2075,0.022618,0.326694,20.007208
V9,226980.0,-0.001495,1.093765,-13.434066,-0.643066,-0.052702,0.59608,15.594995


Guardamos todos los conjuntos de datos creados en la carpeta data para los siguientes notebooks.

In [14]:
# Write the full dataset (as loaded, without the row index) for the following notebooks
data.to_csv(path_or_buf='../data/base_preprocesado.csv', index=False)

In [37]:
# Write every split to its own CSV file, without the row index
salidas = {
    '../data/X_train.csv': X_train,
    '../data/X_test.csv': X_test,
    '../data/y_train.csv': y_train,
    '../data/y_test.csv': y_test,
}
for ruta, conjunto in salidas.items():
    conjunto.to_csv(ruta, index=False)

In [36]:
# Display the final training feature frame (the notebook shows a truncated view)
X_train

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V19,V20,V21,V22,V24,V25,V26,V27,V28,Amount
0,1.012595,0.185411,1.857002,2.625675,-0.813686,0.741198,-1.050510,0.411449,1.078229,0.410744,...,-1.766324,-0.240469,0.167722,0.665253,0.149377,0.061452,0.073725,0.024623,0.023365,12.16
1,1.929110,-0.637219,0.002467,0.277613,-0.835194,0.335546,-1.121181,0.360743,1.303539,0.155439,...,-0.034408,-0.226746,0.040977,0.198945,0.697337,-0.624296,0.310175,-0.013563,-0.042880,4.99
2,1.182737,-1.156850,1.079621,-0.413579,-1.671783,0.040173,-1.272085,0.263822,0.278398,0.423978,...,0.314613,0.043047,0.162604,0.385426,0.072680,0.203746,-0.167881,0.046775,0.030950,71.92
3,2.178247,-1.472885,-0.697008,-1.419590,-1.475410,-0.852524,-1.043807,-0.290200,-1.240592,1.502822,...,-0.282191,-0.285858,-0.111241,0.151666,-0.005200,-0.255401,-0.183330,0.020958,-0.036176,66.00
4,1.616164,-0.915927,-1.687828,0.220307,-0.110584,-0.593453,0.272640,-0.171075,0.249951,0.256631,...,-0.007905,0.233313,0.383392,0.599900,0.825674,-0.072261,0.710245,-0.147041,-0.035975,229.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
226975,2.228543,-1.567987,-1.581274,-1.638552,-0.723433,-0.019712,-0.973196,-0.075987,-1.329295,1.691269,...,0.548810,-0.342555,0.016198,0.538773,0.223731,0.246228,0.099140,-0.020681,-0.065602,59.29
226976,-3.143138,-0.375485,1.110079,1.580071,1.888033,0.092566,1.149447,-0.380869,-1.100038,1.135804,...,-1.577824,-1.352575,-0.228987,0.340582,-0.234832,0.580277,-0.023141,-0.780843,0.363664,111.06
226977,-0.581294,1.327858,1.206441,0.007123,0.067904,-1.006753,0.718263,-0.096415,-0.294980,-0.216522,...,-0.124130,0.275089,-0.288499,-0.661535,0.348269,-0.089580,0.074641,0.359116,0.159540,1.78
226978,-0.771670,1.589985,-0.865454,-0.658085,0.841181,-0.765771,0.897101,0.009378,0.154664,-0.170615,...,-0.346774,0.266341,-0.392098,-0.712547,0.535096,-0.272581,0.088535,0.046118,-0.083461,12.99


In [None]:
# Credit Card Fraud Detection - Data Preprocessing

# Importar librerías necesarias
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from collections import Counter

# Load the data
# NOTE(review): this rebinds `data`, and reads from a different path than the
# '../data/...' file loaded at the top of the notebook — confirm which source is intended.
data = pd.read_csv('../input/creditcard.csv')

# Preview of the data
print("Vista preliminar de los datos:")
print(data.head())
print("\nDescripción de los datos:")
print(data.describe())
print("\nInformación de los datos:")
# DataFrame.info() writes its report to stdout and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
data.info()

# Check for missing values
print("\nVerificación de valores nulos:")
print(data.isnull().sum())

# Exploratory data analysis (EDA): one box plot per column, split by class.
# Every column except the last one is plotted; the 8x4 grid provides 32 slots.
columnas_a_graficar = data.columns[:-1]
plt.figure(figsize=(16, 12))
for posicion, nombre_columna in enumerate(columnas_a_graficar, start=1):
    plt.subplot(8, 4, posicion)
    sns.boxplot(data=data, x='Class', y=nombre_columna, palette='viridis')
    plt.title(f'{nombre_columna} por Clase')
plt.tight_layout()
plt.show()

# Distribution of the target variable (Class)
plt.figure(figsize=(6, 4))
sns.countplot(data=data, x='Class')
plt.title('Distribución de la variable objetivo (Class)')
plt.show()

# Separate predictors and target, and report the class counts before balancing
y = data['Class']
X = data.drop(columns='Class')
print("\nDistribución de clases antes del balanceo:")
print(Counter(y))

# Stratified 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Feature scaling: the scaler is fitted on the training split only and then
# applied to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Balance the classes with SMOTE, using the training data only
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_scaled, y_train)
print("\nDistribución de clases después del balanceo con SMote:")
print(Counter(y_train_res))

# Plot the class counts of the resampled training target to check the balancing
plt.figure(figsize=(6, 4))
ax = sns.countplot(x=y_train_res)
ax.set_title('Distribución de la variable objetivo (Class) después del balanceo con SMOTE')
plt.show()

# Rebuild a labelled frame from the resampled training array, append the target
# column and write the result to CSV (without the row index)
preprocessed_data = pd.DataFrame(X_train_res, columns=X.columns).assign(Class=y_train_res)
preprocessed_data.to_csv('preprocessed_creditcard_data.csv', index=False)

print("\nDatos preprocesados guardados exitosamente.")

# Final check: preview of the saved frame
print("\nVista preliminar de los datos preprocesados:")
print(preprocessed_data.head())

Explicación del Notebook
Importación de Librerías:

Se importan las librerías necesarias para el análisis y preprocesamiento de los datos, como pandas, numpy, seaborn, matplotlib, y herramientas de sklearn e imblearn.
Carga de Datos:

Se cargan los datos de la base de datos desde un archivo CSV.
Vista Preliminar de los Datos:

Se muestra una vista preliminar de los datos, su descripción estadística y su información básica.
Verificación de Valores Nulos:

Se verifica la presencia de valores nulos en los datos.
Análisis Exploratorio de Datos (EDA):

Se crean gráficos boxplot para visualizar la distribución de cada característica por clase (fraudulenta y no fraudulenta).
Se muestra la distribución de la variable objetivo (Class).
Balanceo de Clases utilizando SMOTE:

Se separan las características (X) de la variable objetivo (y).
Se dividen los datos en conjuntos de entrenamiento y prueba.
Se escalan las características utilizando StandardScaler.
Se aplica SMOTE para balancear las clases en los datos de entrenamiento.
Guardado de Datos Preprocesados:

Se guardan los datos preprocesados en un archivo CSV para su posterior uso en el modelado.
Verificación Final:

Se muestra una vista preliminar de los datos preprocesados.
Este notebook realiza un preprocesamiento completo de los datos, preparando un conjunto de datos balanceado y escalado, listo para ser utilizado en la construcción de modelos de detección de fraude.

In [20]:
# Importar librerías necesarias
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from collections import Counter

ModuleNotFoundError: No module named 'imblearn'

In [25]:
!pip uninstall imbalanced-learn


^C
