In [1]:
import csv
import numpy as np

np.set_printoptions(precision=6, suppress=True, linewidth=140)

- The datasets contains transactions made by credit cards in __September 2013__ by european cardholders.

- This dataset presents transactions that occurred in __two days__, where we have 492 frauds out of 284,807 transactions. 

- The dataset is __highly unbalanced__, the positive class (frauds) account for 0.172% of all transactions.

- It contains only numerical input variables which are the result of a __PCA transformation__. 

- Unfortunately, due to confidentiality issues, we cannot provide the original features and more background information about the data. 

- Features V1, V2, … V28 are the principal components obtained with PCA, 

- the only features which have not been transformed with PCA are 'Time' and 'Amount'. 

- Feature __'Time'__ contains the seconds elapsed between each transaction and the first transaction in the dataset. 

- The feature __'Amount'__ is the transaction Amount, this feature can be used for example-dependant cost-senstive learning. 

- Feature __'Class'__ is the response variable and it takes value 1 in case of fraud and 0 otherwise.

In [2]:
fname = r'D:\MYLEARN\datasets\creditcard.csv'

In [3]:
all_features = []
all_targets  = []

with open(fname) as f:
    
    for i, line in enumerate(f):
        
        if i == 0:
            print("HEADER:", line.strip())
            continue  # Skip header
            
        fields = line.strip().split(",")
        all_features.append([float(v.replace('"', "")) for v in fields[:-1]])
        all_targets.append([int(fields[-1].replace('"', ""))])
        
        if i == 1:
            print("EXAMPLE FEATURES:", all_features[-1])

features = np.array(all_features, dtype="float32")
targets  = np.array(all_targets, dtype="uint8")

HEADER: "Time","V1","V2","V3","V4","V5","V6","V7","V8","V9","V10","V11","V12","V13","V14","V15","V16","V17","V18","V19","V20","V21","V22","V23","V24","V25","V26","V27","V28","Amount","Class"
EXAMPLE FEATURES: [0.0, -1.3598071336738, -0.0727811733098497, 2.53634673796914, 1.37815522427443, -0.338320769942518, 0.462387777762292, 0.239598554061257, 0.0986979012610507, 0.363786969611213, 0.0907941719789316, -0.551599533260813, -0.617800855762348, -0.991389847235408, -0.311169353699879, 1.46817697209427, -0.470400525259478, 0.207971241929242, 0.0257905801985591, 0.403992960255733, 0.251412098239705, -0.018306777944153, 0.277837575558899, -0.110473910188767, 0.0669280749146731, 0.128539358273528, -0.189114843888824, 0.133558376740387, -0.0210530534538215, 149.62]


In [4]:
print("features.shape:", features.shape)
print("targets.shape:",  targets.shape)

features.shape: (284807, 30)
targets.shape: (284807, 1)


#### Prepare a validation set

In [5]:
num_val_samples = int(len(features) * 0.2)

train_features  = features[:-num_val_samples]
train_targets   =  targets[:-num_val_samples]

val_features    = features[-num_val_samples:]
val_targets     =  targets[-num_val_samples:]

print("Number of training samples:",   len(train_features))
print("Number of validation samples:", len(val_features))

Number of training samples: 227846
Number of validation samples: 56961


#### Analyze class imbalance in the targets

In [6]:
counts = np.bincount(train_targets[:, 0])

print(
    "Number of positive samples in training data: {} ({:.2f}% of total)".format(
        counts[1], 100 * float(counts[1]) / len(train_targets)
    )
)

weight_for_0 = 1.0 / counts[0]
weight_for_1 = 1.0 / counts[1]

Number of positive samples in training data: 417 (0.18% of total)


In [7]:
np.unique(train_targets, return_counts=True)

(array([0, 1], dtype=uint8), array([227429,    417], dtype=int64))

In [8]:
train_features

array([[     0.      ,     -1.359807,     -0.072781, ...,      0.133558,     -0.021053,    149.62    ],
       [     0.      ,      1.191857,      0.266151, ...,     -0.008983,      0.014724,      2.69    ],
       [     1.      ,     -1.358354,     -1.340163, ...,     -0.055353,     -0.059752,    378.66    ],
       ...,
       [145245.      ,     -0.061507,      1.024901, ...,     -0.048916,      0.023871,     39.4     ],
       [145247.      ,      2.050034,     -0.103557, ...,     -0.074529,     -0.074279,      0.17    ],
       [145248.      ,      1.914027,     -0.490068, ...,     -0.023843,     -0.037139,     50.      ]], dtype=float32)

#### Normalize the data using training set statistics

In [9]:
mean = np.mean(train_features, axis=0)

train_features -= mean
val_features   -= mean

std = np.std(train_features, axis=0)

train_features /= std
val_features   /= std

#### Build a binary classification model

In [10]:
from tensorflow import keras

model = keras.Sequential(
    [
        keras.layers.Dense(256, activation="relu", input_shape=(train_features.shape[-1],)),
        
        keras.layers.Dense(256, activation="relu"),
        keras.layers.Dropout(0.3),
        
        keras.layers.Dense(256, activation="relu"),
        keras.layers.Dropout(0.3),
        
        keras.layers.Dense(1, activation="sigmoid"),
    ]
)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 256)               7936      
_________________________________________________________________
dense_1 (Dense)              (None, 256)               65792     
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 256)               65792     
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 257       
Total params: 139,777
Trainable params: 139,777
Non-trainable params: 0
__________________________________________________

#### Train the model with class_weight argument

In [11]:
metrics = [
    keras.metrics.FalseNegatives(name="fn"),
    keras.metrics.FalsePositives(name="fp"),
    keras.metrics.TrueNegatives(name="tn"),
    keras.metrics.TruePositives(name="tp"),
    keras.metrics.Precision(name="precision"),
    keras.metrics.Recall(name="recall"),
]

In [18]:
model.compile(
            optimizer=keras.optimizers.Adam(1e-2), 
            loss="binary_crossentropy", 
            metrics=metrics
)

In [19]:
callbacks    = [keras.callbacks.ModelCheckpoint("fraud_model_at_epoch_{epoch}.h5")]
class_weight = {0: weight_for_0, 1: weight_for_1}

In [20]:
%%time
model.fit(
    train_features,
    train_targets,
    batch_size=2048,
    epochs=30,
    verbose=1,
    callbacks=callbacks,
    validation_data=(val_features, val_targets),
    class_weight=class_weight,
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Wall time: 3min 27s


<tensorflow.python.keras.callbacks.History at 0x18b169866c8>

In [21]:
predictions = model.predict(val_features)

In [22]:
val_targets

array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]], dtype=uint8)

In [23]:
predictions = predictions.astype('int64')
predictions

array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]], dtype=int64)

In [24]:
from sklearn import metrics

In [25]:
metrics.accuracy_score(val_targets, predictions)

0.998490195045733

In [26]:
metrics.confusion_matrix(val_targets, predictions)

array([[56843,    43],
       [   43,    32]], dtype=int64)

In [None]:
from sklearn.utils import class_weight
class_weights = dict(zip(np.unique(y_train), class_weight.compute_class_weight('balanced', np.unique(y_train), 
                y_train))) 