# PREPROCESSING
Preprocesando el dataset de "Credit card fraud detection"

In [1]:
import pandas as pd
import numpy as np
import os, sys, warnings
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
## configs
# Keep FutureWarning messages out of the cell outputs.
warnings.filterwarnings(action="ignore", category=FutureWarning)
# Optional float display format (disabled); uncomment to enable:
# pd.options.display.float_format = '{:,.2f}'.format
# Allow up to 200 columns to be shown when a DataFrame is displayed.
pd.set_option("display.max_columns", 200)

In [3]:
# Read the raw transactions; pandas infers gzip decompression from the ".gz" suffix.
data = pd.read_csv(filepath_or_buffer="../datasets/creditcard.csv.gz")
# Preview the first five rows.
data.head(5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,-0.5516,-0.617801,-0.99139,-0.311169,1.468177,-0.470401,0.207971,0.025791,0.403993,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,1.612727,1.065235,0.489095,-0.143772,0.635558,0.463917,-0.114805,-0.183361,-0.145783,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,0.624501,0.066084,0.717293,-0.165946,2.345865,-2.890083,1.109969,-0.121359,-2.261857,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,-0.226487,0.178228,0.507757,-0.287924,-0.631418,-1.059647,-0.684093,1.965775,-1.232622,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,-0.822843,0.538196,1.345852,-1.11967,0.175121,-0.451449,-0.237033,-0.038195,0.803487,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [4]:
# Name of the label column.
target = "Class"
# Feature columns: every column whose name begins with "V" (so "Time" and "Amount" are not selected).
X_cols = [col for col in data.columns if col.startswith("V")]
print(f"Target: {target}")
print(f"Feature: {','.join(X_cols)}")

Target: Class
Feature: V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28


In [5]:
## Anomalous mask
# Boolean Series that is True for the rows whose target equals 1 (the fraud records).
mask = data[target].eq(1)
print("Mask = ", dict(mask.value_counts()))

# Current number of anomalies and their share of all records.
n_records = len(data)
n_fraud = int(mask.sum())  # number of True entries == len(data[mask])
PercFraud = n_fraud / n_records
print(f"The proportion of REAL fraud in dataset is {n_fraud} [{PercFraud=:,.5f}]")
print(f"If we use 1% the records with Fraud will be {round(n_records*0.01,0):n} records.")

Mask =  {False: 284315, True: 492}
The proportion of REAL fraud in dataset is 492 [PercFraud=0.00173]
If we use 1% the records with Fraud will be 2848 records.


### Preprocessing (Feature Engineering)

#### scale data

In [6]:
def scale_data(data: pd.DataFrame, target: str, scaler: str) -> pd.DataFrame:
    """Return a new frame in which every feature column of ``data`` is rescaled.

    The "Time" column and the ``target`` column are carried over unchanged and
    placed first; all remaining columns are scaled and keep their relative order.

    Parameters
    ----------
    data : pd.DataFrame
        Input frame. Must contain a "Time" column and the ``target`` column.
        It is not modified.
    target : str
        Name of the label column; excluded from scaling.
    scaler : str
        * ``"minmax"`` -> scikit-learn ``MinMaxScaler`` with feature range (0, 1).
        * ``"standar"`` (historical spelling) or ``"standard"`` -> scikit-learn
          ``StandardScaler``.
        * Any other value: the features are returned unscaled and a
          ``UserWarning`` is emitted (previously this happened silently).

    Returns
    -------
    pd.DataFrame
        Frame with the same index and the same number of rows as ``data``.

    Raises
    ------
    KeyError
        If "Time" or ``target`` is not a column of ``data``.
    """
    # Function-local import, as in the original cell, so the definition is self-contained.
    from sklearn.preprocessing import MinMaxScaler, StandardScaler

    passthrough_cols = ["Time", target]
    features = data.drop(columns=passthrough_cols)  # drop() already returns a new frame

    if scaler == "minmax":
        scaled = MinMaxScaler(feature_range=(0, 1)).fit_transform(features)
    elif scaler in ("standar", "standard"):
        scaled = StandardScaler().fit_transform(features)
    else:
        # Backward compatible pass-through, but no longer silent.
        warnings.warn(
            f"Unknown scaler {scaler!r}: expected 'minmax' or 'standard'; "
            "features are returned unscaled.",
            stacklevel=2,
        )
        scaled = features

    # fit_transform returns a bare array, so the original row labels must be re-attached.
    # Without index=data.index the frame would get a fresh RangeIndex and the concat below
    # (which aligns on index) would produce extra NaN rows for any non-default index.
    scaled_df = pd.DataFrame(scaled, index=data.index, columns=features.columns)
    return pd.concat([data[passthrough_cols], scaled_df], axis=1)

In [7]:
# Rescale every feature column into [0, 1]; "Time" and the target are carried over unscaled.
data1 = scale_data(data=data, scaler="minmax", target=target)
# Transposed summary (one row per column) to check the range of each scaled feature.
data1.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Time,284807.0,94813.859575,47488.145955,0.0,54201.5,84692.0,139320.5,172792.0
Class,284807.0,0.001727,0.041527,0.0,0.0,0.0,0.0,1.0
V1,284807.0,0.958294,0.033276,0.0,0.942658,0.958601,0.980645,1.0
V2,284807.0,0.767258,0.017424,0.0,0.760943,0.767949,0.775739,1.0
V3,284807.0,0.837414,0.026275,0.0,0.821985,0.84053,0.855213,1.0
V4,284807.0,0.25193,0.062764,0.0,0.214311,0.25105,0.284882,1.0
V5,284807.0,0.765716,0.009292,0.0,0.76106,0.765351,0.769836,1.0
V6,284807.0,0.26302,0.013395,0.0,0.255295,0.260263,0.267027,1.0
V7,284807.0,0.265356,0.007537,0.0,0.26198,0.2656,0.268831,1.0
V8,284807.0,0.785385,0.012812,0.0,0.783148,0.785625,0.788897,1.0
