In [91]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler

In [92]:
# Load the raw credit-card transactions; the path is relative to the working directory.
df = pd.read_csv(filepath_or_buffer='Data/creditcard.csv')

In [93]:
## split by time (70% train - 15% val - 15% test)

# Order the rows chronologically before cutting. A stable sort keeps rows that
# share the same 'Time' value in their original file order, so the rows that
# end up on each side of a split boundary are reproducible. Rebinding `df`
# (instead of inplace=True) still leaves `df` sorted for any later cell.
df = df.sort_values('Time', kind='mergesort') #bc we split by time

#train/test
test_ratio = 0.3
index = int((1-test_ratio) * df.shape[0]) #number of rows in the train split

# Columns 1:-1 skip the first column (time, following paper) and the last
# column (label). .copy() makes every split an independent frame, so later
# cells can assign columns without SettingWithCopyWarning and without any
# chance of writing through to `df`. Use .values if you want a np array.
x_train = df.iloc[:index, 1:-1].copy()
x_test = df.iloc[index:, 1:-1]

y_train = df.iloc[:index, -1].copy()  #labels
y_test = df.iloc[index:, -1]          #labels

#val from test: the held-out 30% is cut in half, earlier half -> val, later half -> test
val_ratio = 0.5
index = int((1-val_ratio) * x_test.shape[0]) #number of rows in the val split

x_val = x_test.iloc[:index, :].copy() #all cols
y_val = y_test.iloc[:index].copy()

x_test = x_test.iloc[index:, :].copy()
y_test = y_test.iloc[index:].copy()

# The three splits must partition the data: no row lost, none duplicated.
assert len(x_train) + len(x_val) + len(x_test) == len(df)
assert (len(x_train), len(x_val), len(x_test)) == (len(y_train), len(y_val), len(y_test))

##now we have train-val-test
print(x_train.shape, x_val.shape, x_test.shape) #train is used for training, val for param tuning and test for testing

(199364, 29) (42721, 29) (42722, 29)


## Setting 1: ANV (only normalizing amount variable)

In [94]:
## setting 1: only normalizing Amount

# Fit the scaler on the TRAINING split only and reuse that fitted min/max for
# val and test. Fitting separately per split would put each split on its own
# scale and would let val/test statistics leak into the preprocessing.
# Consequence: val/test 'Amount' values can fall outside [0, 1] when they
# exceed the range seen in train.
Scaler = MinMaxScaler()
Scaler.fit(x_train[['Amount']])

# Build the export frames on copies so x_train / x_val / x_test stay untouched
# (the other setting starts from the same unscaled features). The loop variable
# is deliberately not called `df`, which would overwrite the raw data frame.
anv_splits = []
for features, labels in [(x_train, y_train), (x_val, y_val), (x_test, y_test)]:
    split = features.copy()
    split[['Amount']] = Scaler.transform(split[['Amount']])
    # adding labels, so that we can export the full dataframes
    # this is needed to calculate the AUC-ROC and AV-PR later on
    split['class'] = labels
    anv_splits.append(split)

train, val, test = anv_splits

In [95]:
# Summary statistics of the training frame (features plus the 'class' label) as a sanity check of the scaling.
train.describe()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,class
count,199364.0,199364.0,199364.0,199364.0,199364.0,199364.0,199364.0,199364.0,199364.0,199364.0,...,199364.0,199364.0,199364.0,199364.0,199364.0,199364.0,199364.0,199364.0,199364.0,199364.0
mean,-0.105411,0.003754,0.320783,0.075491,-0.112147,0.045975,-0.050588,0.017137,0.008822,-0.01124,...,-0.014674,-0.047919,-0.015519,0.004285,0.060814,0.006248,0.001588,0.001916,0.004567,0.001926
std,1.891043,1.621781,1.447531,1.399229,1.361533,1.311005,1.218068,1.209774,1.135743,1.092698,...,0.738184,0.691323,0.610755,0.603586,0.491248,0.488392,0.394051,0.313844,0.012663,0.043845
min,-56.40751,-72.715728,-33.680984,-5.683171,-42.147898,-26.160506,-43.557242,-73.216718,-13.434066,-24.588262,...,-34.830382,-10.933144,-44.807735,-2.836627,-10.295397,-2.604551,-22.565679,-11.710896,0.0,0.0
25%,-0.962291,-0.58151,-0.336367,-0.79265,-0.784262,-0.710519,-0.576456,-0.180954,-0.6639,-0.519549,...,-0.227103,-0.537661,-0.168573,-0.341725,-0.240214,-0.331146,-0.067411,-0.041606,0.000305,0.0
50%,-0.09206,0.081545,0.492453,0.076332,-0.169569,-0.219188,-0.010263,0.043241,-0.07399,-0.10019,...,-0.042975,-0.035025,-0.028892,0.052733,0.103368,-0.060899,0.005966,0.018491,0.00117,0.0
75%,1.213894,0.7835,1.222798,0.886439,0.46485,0.444175,0.500768,0.338716,0.626407,0.442543,...,0.150726,0.423196,0.115063,0.421865,0.384386,0.261533,0.089145,0.078257,0.004035,0.0
max,2.45493,22.057729,9.382558,16.875344,34.801666,22.529298,36.677268,20.007208,15.594995,23.745136,...,27.202839,10.50309,19.002942,4.022866,7.519589,3.517346,12.152401,33.847808,1.0,1.0


In [67]:
# Write the three ANV (setting 1) splits to disk, dropping the row index.
for frame, csv_path in [
    (train, 'Data/creditcard_train_1.csv'),
    (val, 'Data/creditcard_val_1.csv'),
    (test, 'Data/creditcard_test_1.csv'),
]:
    frame.to_csv(csv_path, index=False)

In [71]:
# Fraction of fraudulent rows (class == 1) per split, printed in the order
# train, val, test. In the recorded run this corresponds to 384 fraud rows in
# train, 56 in val and 52 in test.
for split in (train, val, test):
    fraud_rows = (split['class'] == 1).sum()
    print(fraud_rows / len(split))

0.0019261250777472363
0.001310830738980829
0.0012171714807359207


## Setting 2: FNV (normalize all columns)

In [57]:
## setting 2: normalize all columns

# ** NOTE ** x_train / x_val / x_test must still hold the UNSCALED features
# here. If an earlier cell has modified them, first re-run the load and split
# cells.

# V1..V28 plus Amount: every feature column that gets min-max scaled.
feature_cols = [f'V{i}' for i in range(1, 29)] + ['Amount']

# Fit on the TRAINING split only and apply the same fitted min/max to val and
# test, so all splits share one scale and no val/test statistics leak into the
# preprocessing. Val/test values can therefore fall outside [0, 1].
Scaler = MinMaxScaler()
Scaler.fit(x_train[feature_cols])

#full df's for exporting to RDP script
# Work on copies so the x_* frames are not mutated, and avoid naming the loop
# variable `df`, which would overwrite the raw data frame.
fnv_splits = []
for features, labels in [(x_train, y_train), (x_val, y_val), (x_test, y_test)]:
    split = features.copy()
    split[feature_cols] = Scaler.transform(split[feature_cols])
    split['class'] = labels
    fnv_splits.append(split)

train, val, test = fnv_splits

train.to_csv('Data/creditcard_train_2.csv', index=False) #FNV
val.to_csv('Data/creditcard_val_2.csv', index=False) #FNV
test.to_csv('Data/creditcard_test_2.csv', index=False) #FNV

(199364, 29) (42721, 29) (42722, 29)


In [None]:
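## Note: the variable 'Time' is already left out of the exported CSVs (the split cell selects columns with iloc[:, 1:-1], which skips the first column), so the later notebooks/scripts of the methods do not need to delete it again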