In [37]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Perceptron
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import log_loss, accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from imblearn.over_sampling import SMOTE

import pandas as pd

from datetime import datetime

from rich import print
from rich.console import Console
from rich.traceback import install

# Install rich's traceback handler for this session; show_locals=True asks it to
# include local variable values in each rendered frame.
# NOTE(review): local values can end up in saved cell outputs — confirm nothing
# sensitive is exposed before sharing the notebook.
install(show_locals=True)
# rich Console instance; not referenced in the cells shown here, presumably used later.
console = Console()

In [38]:
# Credit card dataset loaded from creditcard.csv: the full table, before any
# balancing, scaling or splitting is applied.
cc_df = pd.read_csv('./data/creditcard.csv')

# Row count and column count; the column count includes the 'Class' label column.
n_samples, n_features = len(cc_df), len(cc_df.columns)

print(f"n_samples: {n_samples}, n_features: {n_features}")

#### Preprocesamiento

##### Balanceamiento

In [39]:
# Split the raw table into the label column and the predictor columns.
TARGET = cc_df['Class']
FEATURES = cc_df.drop(columns=['Class'])

# SMOTE oversampler with a fixed seed (42) so the resampling is repeatable.
# NOTE(review): the whole dataset is resampled here, before the
# train/validation/test split performed later in the notebook — confirm that
# synthetic rows in the evaluation sets are acceptable.
smote = SMOTE(random_state=42)
X_sm, y_sm = smote.fit_resample(FEATURES, TARGET)

# Re-attach the label as the last column of the resampled table.
resampled_parts = [pd.DataFrame(X_sm), pd.DataFrame(y_sm)]
cc_balanced = pd.concat(resampled_parts, axis=1)

print('Valores de la columna Class:')
cc_balanced['Class'].describe()

count    568630.0
mean          0.5
std           0.5
min           0.0
25%           0.0
50%           0.5
75%           1.0
max           1.0
Name: Class, dtype: float64

In [40]:
# Pairwise correlation matrix of every column in the balanced table.
data_corr = cc_balanced.corr()
corr_names = data_corr.columns

# Walk the strict lower triangle so each pair of columns is inspected exactly once,
# and report the pairs whose absolute correlation exceeds 0.88.
for row in range(len(cc_balanced.columns)):
    for col in range(row):
        coefficient = data_corr.iloc[row, col]
        if abs(coefficient) > 0.88:
            print(corr_names[row], corr_names[col], coefficient)

In [41]:
# Remove a hand-picked set of columns from the balanced table.
# NOTE(review): presumably selected from the correlation scan in the previous cell,
# but its printed pairs are not kept in the notebook — confirm the selection.
columns_to_drop = ['V1', 'V7', 'V10', 'V11', 'V14', 'V16', 'V18']
cc_balanced = cc_balanced.drop(columns=columns_to_drop)

##### Normalización

In [42]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Standardize every column except the last one ('Class', the label appended when
# the balanced table was assembled) and write the result back in place.
# NOTE(review): the scaler is fitted on the whole table, before the
# train/validation/test split performed later — confirm this is intended.
scaled_values = scaler.fit_transform(cc_balanced.iloc[:, :-1].to_numpy())
cc_balanced.iloc[:, :-1] = scaled_values

# Persist the standardized table so that later cells can reload it from disk.
cc_balanced.to_pickle('./pickled/cc_preprocesados.pkl')

In [43]:
import math

def convert_to_hours(time_diff):
    """
    Convert an elapsed time in seconds to its hour slot within a 24-hour cycle.

    The elapsed time is floored to whole hours and wrapped modulo 24, so the
    result is always an int in the range 0..23 and every slot covers the same
    3600-second width (e.g. 3600 -> 1, 50400 -> 14, 90000 -> 1).

    Args:
        time_diff: Elapsed seconds (int or float) measured from the time origin.

    Returns:
        int: Hour slot, 0 through 23.

    NOTE(review): whether the time origin coincides with midnight is not visible
    here, so the slot is relative to that origin rather than a wall-clock hour.
    """
    seconds_per_hour = 3600
    hours_per_day = 24

    # Floor (not round) so that slot h covers [h * 3600, (h + 1) * 3600).
    elapsed_hours = int(math.floor(time_diff / seconds_per_hour))

    # Wrap instead of halving values above 12 h: halving mapped distinct times onto
    # the same value (7 h and 14 h both became 7) and jumped from 12 down to 6
    # right after 43200 s.
    return elapsed_hours % hours_per_day

# Raw CSV (original rows only). It is no longer the source of Hour, but the name is
# kept defined in case later cells reference it.
df_temp = pd.read_csv('./data/creditcard.csv')
df_prep = pd.read_pickle('./pickled/cc_preprocesados.pkl')

# Derive Hour from X_sm['Time'], the resampled and still-unscaled Time column.
# Assigning df_temp['Time'] aligned on the index and left Hour as NaN for every row
# whose index is absent from df_temp (the rows added by the oversampling). The
# 'Time' column stored in df_prep cannot be used either, because it has already
# been standardized.
if not df_prep.index.equals(X_sm.index):
    # Guard against a stale kernel where X_sm does not match the pickled table.
    raise ValueError(
        "X_sm and the pickled table have different indexes; "
        "re-run the balancing and normalization cells before computing Hour"
    )

df_prep['Hour'] = X_sm['Time'].apply(convert_to_hours)

In [44]:
# Replace the Time column with Hour: drop Time in place, then move Hour from the
# end of the table to the first position.
df_prep.drop(columns=['Time'], inplace=True)
last_column = df_prep.pop('Hour')
df_prep.insert(loc=0, column='Hour', value=last_column)

In [45]:
# Persist the final preprocessed table (Hour first, Time removed) for later cells.
df_prep.to_pickle(path='./pickled/cc_prep_final.pkl')

In [46]:
# Summary statistics of the derived Hour column, as a quick sanity check.
hour_summary = df_prep['Hour'].describe()
print(hour_summary)

In [47]:
# Reload the final preprocessed table from disk.
df_prep = pd.read_pickle('./pickled/cc_prep_final.pkl')

print(len(df_prep))

# Two-stage split with a fixed seed: half of the rows go to training, then the
# held-out half is divided evenly, giving 50% / 25% / 25% for
# training / validation / test.
training_sample, holdout_sample = train_test_split(df_prep, test_size = 0.5, random_state = 42)
validation_sample, test_sample = train_test_split(holdout_sample, test_size = 0.5, random_state = 42)

# The three parts must account for every row exactly once.
assert len(training_sample) + len(validation_sample) + len(test_sample) == len(df_prep)

print(f"Training sample: {training_sample.shape}")
print(f"Validation sample: {validation_sample.shape}")
print(f"Test sample: {test_sample.shape}")