In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import (
    LabelEncoder, MinMaxScaler
    )
from sklearn.metrics import (
    precision_score, recall_score, roc_auc_score, f1_score, confusion_matrix, accuracy_score
    )
from sklearn.ensemble import ExtraTreesClassifier
from category_encoders import (
    HashingEncoder, BinaryEncoder
    )
%matplotlib inline

In [2]:
%load_ext autoreload

%autoreload 2

In [3]:
%%javascript
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

In [4]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Synthetic Fraud data

### Load data

In [5]:
fraud_data = pd.read_csv('data/PS_20174392719_1491204439457_log.csv')

## Prepare data

In [6]:
fraud_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


I think the 

In [7]:
categorical_dict = {'type': {'CASH_OUT' : 0, 'TRANSFER' : 1, 'DEBIT' : 2, 'CASH_IN' : 3, 'PAYMENT' : 4}}
fraud_data.replace(categorical_dict, inplace=True)

benc = BinaryEncoder(cols=['type'])
fraud_data = benc.fit_transform(fraud_data)

In [8]:
fraud_data.columns

Index(['step', 'type_0', 'type_1', 'type_2', 'type_3', 'amount', 'nameOrig',
       'oldbalanceOrg', 'newbalanceOrig', 'nameDest', 'oldbalanceDest',
       'newbalanceDest', 'isFraud', 'isFlaggedFraud'],
      dtype='object')

In [9]:
fraud_data.head(20)

Unnamed: 0,step,type_0,type_1,type_2,type_3,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,0,0,0,1,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,0,0,0,1,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,0,0,1,0,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,0,0,1,1,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,0,0,0,1,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0
5,1,0,0,0,1,7817.71,C90045638,53860.0,46042.29,M573487274,0.0,0.0,0,0
6,1,0,0,0,1,7107.77,C154988899,183195.0,176087.23,M408069119,0.0,0.0,0,0
7,1,0,0,0,1,7861.64,C1912850431,176087.23,168225.59,M633326333,0.0,0.0,0,0
8,1,0,0,0,1,4024.36,C1265012928,2671.0,0.0,M1176932104,0.0,0.0,0,0
9,1,0,1,0,0,5337.77,C712410124,41720.0,36382.23,C195600860,41898.0,40348.79,0,0


In [10]:
fraud_data.drop(['nameOrig', 'nameDest', 'isFlaggedFraud'], axis=1, inplace=True)

In [11]:
train_cols = ['step', 'type_0', 'type_1', 'type_2', 'type_3', 'amount',
       'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']
label_col = ['isFraud']

In [12]:
scaler = MinMaxScaler(feature_range=(-1,1))
fraud_data = pd.DataFrame(scaler.fit_transform(fraud_data), columns=fraud_data.columns)

In [13]:
train_length = np.round(len(fraud_data.index) * 0.9, 0)
# train_X = fraud_data.loc[:train_length, train_cols]
# train_y = fraud_data.loc[:train_length, label_col]
# valid_X = fraud_data.loc[train_length:, train_cols]
# valid_y = fraud_data.loc[train_length:, label_col]

gan_set = fraud_data.loc[:train_length, train_cols + label_col]
gan_set_frauds = gan_set[gan_set['isFraud'] > 0]

In [14]:
gan_set_frauds.reset_index(drop=True, inplace=True)
gan_set.reset_index(drop=True, inplace=True)

In [15]:
len(gan_set_frauds.columns)

11

Hmmm.... the tricky thing is to train intermittently with the two separate sources. I suppose I could pre-generate noise, but that would be very unflexible. I think the most sensible while easy to read is to pre-shuffle the list of integers for the index and then pull that list from the dataset as a batch. That's the most flexible. I'm not sure how I would do dataset slices from tf, except maybe with .take()? Might also be worth a try.  
Sadly no. The function itself is pretty great, but its numpy iterator is persistent, it doesn't act like an iterator - the 'next' function appears broken? It only repeats the first entry.  
Else this would be it!  
I think instead it's better to have a list that's randomised and accesses a dataframe that doesn't change. The list gets deleted bit by bit - its deleted pieces could be added to the performance summary checklist, but might not have to. That seems dangerous.

In [16]:
# generate n real samples with class labels
def generate_real_samples(data, batch_numbers, n):
    # generate inputs 
    X = data.iloc[batch_numbers,:].values
    # generate class labels
    y = np.ones((n, 1))
    return X, y


In [17]:
# train a generative adversarial network on a one-dimensional function
from numpy import hstack
from numpy import zeros
from numpy import ones
from numpy.random import rand
from numpy.random import randn
from keras.models import Sequential
from keras.layers import Dense
from matplotlib import pyplot

# define the standalone discriminator model
def define_discriminator(n_inputs=2):
    model = Sequential()
    model.add(Dense(500, activation='relu', kernel_initializer='he_uniform', input_dim=n_inputs))
    model.add(Dense(300, activation='relu', kernel_initializer='he_uniform'))
#    model.add(Dense(200, activation='relu', kernel_initializer='he_uniform'))
    model.add(Dense(200, activation='relu', kernel_initializer='he_uniform'))
    model.add(Dense(150, activation='relu', kernel_initializer='he_uniform'))
    model.add(Dense(50, activation='relu', kernel_initializer='he_uniform'))
    model.add(Dense(1, activation='sigmoid'))
    # compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# define the standalone generator model
def define_generator(latent_dim, n_outputs=2):
    model = Sequential()
    model.add(Dense(200, activation='relu', kernel_initializer='he_uniform', input_dim=latent_dim))
#    model.add(Dense(200, activation='relu', kernel_initializer='he_uniform'))
    model.add(Dense(100, activation='relu', kernel_initializer='he_uniform'))
    model.add(Dense(80, activation='relu', kernel_initializer='he_uniform'))
    model.add(Dense(n_outputs, activation='tanh'))
    return model

# define the combined generator and discriminator model, for updating the generator
def define_gan(generator, discriminator):
    # make weights in the discriminator not trainable
    discriminator.trainable = False
    # connect them
    model = Sequential()
    # add generator
    model.add(generator)
    # add the discriminator
    model.add(discriminator)
    # compile model
    model.compile(loss='binary_crossentropy', optimizer='adam')
    return model

# generate n real samples with class labels
def generate_real_samples(data, batch_numbers):
    # generate inputs 
    X = data.iloc[batch_numbers,:].values
    # generate class labels
    y = np.ones((len(batch_numbers), 1))
    return X, y

# generate points in latent space as input for the generator
def generate_latent_points(latent_dim, n):
    # generate points in the latent space
    x_input = randn(latent_dim * n)
    # reshape into a batch of inputs for the network
    x_input = x_input.reshape(n, latent_dim)
    return x_input

# use the generator to generate n fake examples, with class labels
def generate_fake_samples(generator, latent_dim, n):
    # generate points in latent space
    x_input = generate_latent_points(latent_dim, n)
    # predict outputs
    X = generator.predict(x_input)
    # create class labels
    y = zeros((n, 1))
    return X, y

# evaluate the discriminator and plot real and fake points
def summarize_performance(epoch, generator, discriminator, latent_dim, data, eval_numbers, n=100):
    # prepare real samples
    eval_sample = np.random.choice(eval_numbers, n)
    x_real, y_real = generate_real_samples(data, eval_sample)
    # evaluate discriminator on real examples
    _, acc_real = discriminator.evaluate(x_real, y_real, verbose=0)
    # prepare fake examples
    x_fake, y_fake = generate_fake_samples(generator, latent_dim, n)
    # evaluate discriminator on fake examples
    _, acc_fake = discriminator.evaluate(x_fake, y_fake, verbose=0)
    # summarize discriminator performance
    print("Epoch: {}".format(epoch))
    print("Accuracy real: {}".format(acc_real))
    print("Accuracy fake: {}".format(acc_fake))

# train the generator and discriminator
def train(g_model, d_model, gan_model, latent_dim, data, n_epochs=50, n_batch=4000, n_eval=50):
    # determine half the size of one batch, for updating the discriminator
    half_batch = int(n_batch / 2)
    data_length = len(data.index)
    shuffled_data_index = list(np.random.choice(range(data_length),
                                                data_length,
                                                replace=False)
                              )
    eval_numbers = []
    # manually enumerate epochs
    for i in range(n_epochs):
        # prepare real samples
        batch_numbers = shuffled_data_index[-n_batch:]
        x_real, y_real = generate_real_samples(data, batch_numbers)
        del shuffled_data_index[-n_batch:]
        eval_numbers.extend(batch_numbers)
        # prepare fake examples
        x_fake, y_fake = generate_fake_samples(g_model, latent_dim, half_batch)
        # update discriminator
        d_model.train_on_batch(x_real, y_real)
        d_model.train_on_batch(x_fake, y_fake)
        # prepare points in latent space as input for the generator
        x_gan = generate_latent_points(latent_dim, n_batch)
        # create inverted labels for the fake samples
        y_gan = ones((n_batch, 1))
        # update the generator via the discriminator's error
        gan_model.train_on_batch(x_gan, y_gan)
        # evaluate the model every n_eval epochs
        if (i+1) % n_eval == 0:
            summarize_performance(i, g_model, d_model, latent_dim, data, eval_numbers)

# size of the latent space
dataset_width = len(gan_set_frauds.columns)
latent_dim = dataset_width * 2
# create the discriminator
discriminator = define_discriminator(n_inputs=dataset_width)
# create the generator
generator = define_generator(latent_dim, dataset_width)
# create the gan
gan_model = define_gan(generator, discriminator)
# train model
train(generator, discriminator, gan_model, latent_dim, gan_set_frauds, n_epochs=500)

Using TensorFlow backend.


Epoch: 49
Accuracy real: 0.0
Accuracy fake: 1.0
Epoch: 99
Accuracy real: 0.0
Accuracy fake: 1.0
Epoch: 149
Accuracy real: 0.0
Accuracy fake: 1.0
Epoch: 199
Accuracy real: 0.0
Accuracy fake: 1.0
Epoch: 249
Accuracy real: 0.0
Accuracy fake: 1.0
Epoch: 299
Accuracy real: 0.0
Accuracy fake: 1.0
Epoch: 349
Accuracy real: 0.0
Accuracy fake: 1.0
Epoch: 399
Accuracy real: 0.0
Accuracy fake: 1.0
Epoch: 449
Accuracy real: 0.0
Accuracy fake: 1.0
Epoch: 499
Accuracy real: 0.0
Accuracy fake: 1.0


In [18]:
generate_fake_samples(generator, 25, 4)

ValueError: Error when checking input: expected dense_7_input to have shape (22,) but got array with shape (25,)

In [20]:
gan_set_frauds.head()

Unnamed: 0,step,type_enc,amount,nameOrig_enc,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,nameDest_enc,isFraud
0,1,4,181.0,1002156,181.0,0.0,0.0,0.0,439685,1
1,1,1,181.0,5828262,181.0,0.0,21182.0,0.0,391696,1
2,1,4,2806.0,1379875,2806.0,0.0,0.0,0.0,563886,1
3,1,1,2806.0,3619815,2806.0,0.0,26202.0,0.0,2134,1
4,1,4,20128.0,1232211,20128.0,0.0,0.0,0.0,251089,1


Yeah, hm. Obviously, that's not exactly useful. I reckon (which makes sense, duh) the model struggles with the ordinal encoding of the names, which are crazy large values that kill the weights. Given that sum encoding and backward difference encoding work much better than ordinal encoding according to this paper https://www.researchgate.net/profile/Kedar_Potdar/publication/320465713_A_Comparative_Study_of_Categorical_Variable_Encoding_Techniques_for_Neural_Network_Classifiers/links/59e6f9554585151e5465859c/A-Comparative-Study-of-Categorical-Variable-Encoding-Techniques-for-Neural-Network-Classifiers.pdf, I think it's worth giving this a shot and to see the difference. Then again, the training went crazy super fast and we only trained for 50 epochs, which is nothing.  
I'm also not very happy with the GAN overall, of course. I want to make it a proper class with all the methods needed to generate data after training etc.  
Okay, further looking into the attributes of the encoding parameters shows that they require as many columns as one hot encoding. That's no good. Target encoding seems more useful, but the classes are so imbalanced, overfitting is going to be a problem. Now, I think I have to explore further into the dataset to see how many of the doubles and triples in the names coincide with the frauds.