# GAN

The real samples have three types of features: 
- non-sequential features, which have a shape of (14,)
- sequential features, which have a shape of (60, 17)
- image features (MTF along the features), which have a shape of (31, 31, 1). 

There are three corresponding input layers in the discriminator, which are followed by Dense layers, LSTM layers, or CNN layers respectively and then merged together. After that, several dense layers perform the classification. 


Correspondingly, we have three generators to generate fake samples from noise. 


## Import data

In [1]:
import pandas as pd
import numpy as np
import json
from tqdm import tqdm
import matplotlib.pyplot as plt
import keras
from keras import layers
from keras_self_attention import SeqSelfAttention
from tensorflow.keras.optimizers import Adam,RMSprop,SGD
from keras.layers import Input, Embedding, multiply, BatchNormalization
from keras.models import Model, Sequential
from keras.layers.core import Reshape, Dense, Dropout, Flatten
from keras.layers.advanced_activations import LeakyReLU
from keras.layers import Dense, Dropout, Flatten,concatenate,LSTM,Input,Bidirectional
from keras.layers import Conv2D, MaxPooling2D,AveragePooling2D,Input, Dense, Reshape, Flatten, Embedding, Dropout,Conv2DTranspose
from keras import backend as K
from keras import initializers
from keras.utils import to_categorical
from keras import backend as K
from keras import regularizers
from keras import metrics


# Load the labels and the three feature sets from their JSON exports.
with open('label.json') as f:
    labels = json.load(f)
with open('non_sequential_features.json') as f:
    non_sequential_features = json.load(f)
with open('padded_sequential_features_3.json') as f:
    sequential_features = json.load(f)
with open('featurematrix.json') as f:
    arr_ = json.load(f)

# Feature matrices: one (31, 31, 1) single-channel image per sample.
arr_ = np.array(arr_).reshape(-1, 31, 31, 1)

# Dict values -> numpy arrays. Rows follow each dict's own insertion order.
# Sequential features: drop the first entry along the last axis.
feature1 = np.array(list(sequential_features.values()))[:, :, 1:]
feature2 = np.array(list(non_sequential_features.values()))
label = np.array(list(labels.values()))


Using TensorFlow backend.


## Preprocessing
- We will only use benign samples to train the model

In [2]:
from sklearn.model_selection import train_test_split

# Boolean row selectors: label 0 = benign, label 1 = fraud.
_benign_mask = label == 0
_fraud_mask = label == 1

# The training set is drawn from benign users only.
arr_benign, arr_fraud = arr_[_benign_mask], arr_[_fraud_mask]
feature1_benign, feature1_fraud = feature1[_benign_mask], feature1[_fraud_mask]
feature2_benign, feature2_fraud = feature2[_benign_mask], feature2[_fraud_mask]
label_benign, label_fraud = label[_benign_mask], label[_fraud_mask]

# 60% of the benign samples go to training; the remaining 40% are held out.
(X_train_arr, X_test_arr,
 X_train_f1, X_test_f1,
 X_train_f2, X_test_f2,
 y_train, y_test) = train_test_split(
    arr_benign,
    feature1_benign,
    feature2_benign,
    label_benign,
    test_size=0.40,
    random_state=42,
)

# The held-out set contains the benign hold-out plus every fraud sample.
X_test_arr = np.concatenate((X_test_arr, arr_fraud), axis=0)
X_test_f1 = np.concatenate((X_test_f1, feature1_fraud), axis=0)
X_test_f2 = np.concatenate((X_test_f2, feature2_fraud), axis=0)
y_test = np.concatenate((y_test, label_fraud), axis=0)

# Shuffle the held-out set, applying one shared permutation to all four arrays
# so that features and labels stay aligned.
randomize = np.arange(len(X_test_arr))
np.random.shuffle(randomize)
X_test_arr, X_test_f1, X_test_f2, y_test = (
    split[randomize] for split in (X_test_arr, X_test_f1, X_test_f2, y_test)
)

In [3]:
# Display the shapes of the four training arrays.
tuple(split.shape for split in (X_train_arr, X_train_f1, X_train_f2, y_train))

((83878, 31, 31, 1), (83878, 60, 17), (83878, 14), (83878,))

In [3]:
# Halve the held-out data: the validation half is for model selection,
# the test half is for reporting.
_held_out_halves = train_test_split(
    X_test_arr,
    X_test_f1,
    X_test_f2,
    y_test,
    test_size=0.50,
    random_state=42,
)
(X_valid_arr, X_test_arr,
 X_valid_f1, X_test_f1,
 X_valid_f2, X_test_f2,
 y_valid, y_test) = _held_out_halves

In [8]:
# Display the shape of the validation feature-matrix split.
X_valid_arr.shape

(33022, 31, 31, 1)

## Build multi-inputs GAN

In [4]:
class GAN():
    """Multi-input GAN whose discriminator is used to score user samples.

    The discriminator takes three inputs, in this order:
    non-sequential features, feature matrix (image), sequential features.
    Three separate generators map noise to a fake version of each input.
    ``combined`` chains the three generators into the frozen discriminator
    and is the model used for generator updates.
    """

    def __init__(self):
        """Build and compile the discriminator, the generators and the combined model."""
        self.feature_matrix_shape=(31,31,1)
        self.mtf_shape=(60,60,1)  # not referenced elsewhere in this class; kept for compatibility
        self.lstm_features_cnt=17
        self.non_seq_shape=(14,)
        # Shape of one sequential sample: 60 steps x lstm_features_cnt features.
        self.seq_shape=(60,self.lstm_features_cnt)
        # Noise fed to the feature-matrix generator, which turns it into a
        # feature_matrix_shape output.
        self.fm_noise_shape=(32,32,1)
        optimizer_SGD= SGD()
        optimizer_adam= Adam()

        # Build and compile the discriminator (SGD, binary cross-entropy).
        self.discriminator = self.build_discriminator()
        self.discriminator.compile(loss='binary_crossentropy',optimizer=optimizer_SGD, metrics=['accuracy'])

        # Build the three generators and wire each one to its own noise input.
        self.generator1 = self.build_generator1()
        noise_non_sequential=Input(shape=self.non_seq_shape,name="dense_input")
        non_sequential_fake=self.generator1(noise_non_sequential)

        self.generator2 = self.build_generator2()
        noise_fm=Input(shape=self.fm_noise_shape,name="cnn_input")
        fm_fake=self.generator2(noise_fm)

        self.generator3 = self.build_generator3()
        noise_rnn=Input(shape=self.seq_shape,name="rnn_input")
        rnn_fake=self.generator3(noise_rnn)

        # Combined model for generator training. The discriminator is frozen
        # here, after its own compile above, so that training ``combined``
        # only updates the generators.
        self.discriminator.trainable = False
        validity = self.discriminator([non_sequential_fake,fm_fake,rnn_fake])
        self.combined = Model([noise_non_sequential,noise_fm,noise_rnn], validity)
        self.combined.compile(loss='binary_crossentropy', optimizer=optimizer_adam)

    def build_discriminator(self):
        """Return the three-branch discriminator (uncompiled).

        Inputs are ordered ``[non_sequential, feature_matrix, sequential]``;
        the output is a single sigmoid unit.
        """
        # CNN branch over the feature matrix.
        input_fm=Input(shape=self.feature_matrix_shape,name="cnn_input")
        fm=BatchNormalization()(input_fm)
        cnn1=Conv2D(64, (3, 3), padding="same")(fm)
        cnn1=LeakyReLU(alpha=0.2)(cnn1)
        bn1=BatchNormalization()(cnn1)
        bn1=Dropout(0.3)(bn1)
        pool1=AveragePooling2D(pool_size=(2,2),strides=2)(bn1)
        cnn2=Conv2D(32, (3, 3), padding="same")(pool1)
        cnn2=LeakyReLU(alpha=0.2)(cnn2)
        bn2=BatchNormalization()(cnn2)
        pool2=AveragePooling2D(pool_size=(2,2),strides=2)(bn2)
        pool2=Dropout(0.3)(pool2)
        cnn3=Conv2D(1, (3, 3), padding="same")(pool2)
        cnn3=LeakyReLU(alpha=0.2)(cnn3)
        fm_output=Flatten()(cnn3)

        # Bidirectional LSTM branch over the sequential features.
        input_rnn=Input(shape=self.seq_shape,name="rnn_input")
        lstm1=Bidirectional(keras.layers.LSTM(64, activation='tanh', return_sequences=True))(input_rnn)
        lstm1=Dropout(0.3)(lstm1)
        rnn_output=Bidirectional(keras.layers.LSTM(32, activation='tanh', return_sequences=False))(lstm1)

        # Dense branch over the non-sequential features.
        input_non_sequential=Input(shape=self.non_seq_shape,name="dense_input")
        dense1=Dense(32)(input_non_sequential)
        dense1=LeakyReLU(alpha=0.2)(dense1)
        bn3=BatchNormalization()(dense1)
        drop2=Dropout(0.3)(bn3)
        dense2_output=Dense(32)(drop2)
        dense2_output=LeakyReLU(alpha=0.2)(dense2_output)

        # Merge the three branches and classify.
        merged = concatenate([dense2_output,fm_output, rnn_output])
        dense3=Dense(128)(merged)
        dense3=LeakyReLU(alpha=0.2)(dense3)
        bn4=BatchNormalization()(dense3)
        bn4=Dropout(0.3)(bn4)
        dense4=Dense(64)(bn4)
        dense4=LeakyReLU(alpha=0.2)(dense4)
        bn5=BatchNormalization()(dense4)
        out = Dense(1, activation='sigmoid', name='output_layer')(bn5)

        model = Model(inputs=[input_non_sequential,input_fm,input_rnn], outputs=[out])
        return model

    def build_generator1(self):
        """Return the generator for fake non-sequential features (14 tanh outputs)."""
        model = Sequential()

        model.add(Dense(32))
        model.add(LeakyReLU(alpha=0.2))
        model.add(BatchNormalization(momentum=0.8))
        model.add(Dropout(0.25))
        model.add(Dense(64))
        model.add(LeakyReLU(alpha=0.2))
        model.add(BatchNormalization(momentum=0.8))
        model.add(Dense(14,activation='tanh'))

        noise = Input(shape=self.non_seq_shape)
        img = model(noise)

        return Model(noise, img)

    def build_generator2(self):
        """Return the generator for fake feature matrices.

        The noise is upsampled by a strided transposed convolution and then
        reduced by a strided convolution; with the default 'valid' padding the
        spatial size goes 32 -> 65 -> 31, matching ``feature_matrix_shape``.
        """
        model = Sequential()

        model.add(Conv2DTranspose(8,strides=(2, 2), kernel_size=(3,3)))
        model.add(LeakyReLU(alpha=0.2))
        model.add(keras.layers.BatchNormalization(momentum=0.8))
        model.add(Dropout(0.25))
        model.add(Conv2D(filters=4,kernel_size=4,strides=2,padding='valid'))
        model.add(LeakyReLU(alpha=0.2))
        model.add(keras.layers.BatchNormalization(momentum=0.8))
        model.add(Conv2D(filters=1,kernel_size=4,strides=1,padding='same',activation='tanh'))
        noise = Input(shape=self.fm_noise_shape)
        img = model(noise)
        return Model(noise, img)

    def build_generator3(self):
        """Return the generator for fake sequential features (stacked LSTMs, tanh output)."""
        model = Sequential()
        model.add(keras.layers.LSTM(32, activation="tanh", return_sequences=True))
        model.add(Dropout(0.25))
        model.add(keras.layers.LSTM(64, activation="tanh", return_sequences=True))
        model.add(Dropout(0.25))
        model.add(keras.layers.LSTM(128, activation="tanh", return_sequences=True))
        model.add(Dropout(0.25))
        model.add(keras.layers.LSTM(17, activation="tanh",return_sequences=True))
        noise = Input(shape=self.seq_shape)
        img = model(noise)
        return Model(noise, img)

    def _sample_noise(self, n):
        """Draw ``n`` standard-normal noise samples for each of the three generators."""
        noise1 = np.random.normal(0, 1, (n,) + self.non_seq_shape)
        noise2 = np.random.normal(0, 1, (n,) + self.fm_noise_shape)
        noise3 = np.random.normal(0, 1, (n,) + self.seq_shape)
        return noise1, noise2, noise3

    def train(self,X_train_arr,X_train_f1,X_train_f2,y_train,X_test_arr,X_test_f1,X_test_f2,y_test,
              epochs=200, batch_size=128):
        """Alternate discriminator and generator updates and evaluate periodically.

        Note that one "epoch" here is a single batch update, not a full pass
        over the training data.

        Parameters
        ----------
        X_train_arr, X_train_f1, X_train_f2 :
            Real training feature matrices, sequential features and
            non-sequential features, indexed along the first axis.
        y_train :
            Only its length is used, to bound the random batch indices.
        X_test_arr, X_test_f1, X_test_f2, y_test :
            Evaluation set scored by the discriminator every 50 iterations.
        epochs :
            Number of training iterations.
        batch_size :
            Generator batch size; the discriminator sees half a batch of real
            and half a batch of fake samples per iteration.

        Returns
        -------
        (auc_list, ks_list) :
            AUC and maximum KS statistic, one entry per evaluation.
        """
        import os  # local import: the file's import cells do not bring in os

        auc_list=[]
        ks_list=[]
        half_batch = int(batch_size / 2)

        # Targets are loop-invariant: 1 = real sample, 0 = generated sample.
        valid = np.ones((half_batch, 1))
        fake = np.zeros((half_batch, 1))
        validity = np.ones((batch_size, 1))

        # The checkpoint save below fails if the target directory is missing.
        os.makedirs("gans_model_saved", exist_ok=True)

        for epoch in range(epochs):
            # Select a random half batch of real training samples.
            idx = np.random.randint(0, y_train.shape[0], half_batch)
            arr=X_train_arr[idx]
            sequential=X_train_f1[idx]
            non_sequential=X_train_f2[idx]

            # Sample noise and generate a half batch of fake data.
            noise1, noise2, noise3 = self._sample_noise(half_batch)
            non_sequential_fake=self.generator1.predict(noise1)
            fm_fake=self.generator2.predict(noise2)
            rnn_fake=self.generator3.predict(noise3)

            # Train the discriminator on the real and the fake half batches.
            d_loss_real = self.discriminator.train_on_batch([non_sequential,arr,sequential], valid)
            d_loss_fake = self.discriminator.train_on_batch([non_sequential_fake,fm_fake,rnn_fake], fake)
            d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

            # Train the generators: they are rewarded when the frozen
            # discriminator labels their output as real.
            noise1, noise2, noise3 = self._sample_noise(batch_size)
            g_loss = self.combined.train_on_batch([noise1,noise2,noise3], validity)

            if epoch % 5 == 0:
                print ("%d [D loss: %f, acc: %.2f%%] [G loss: %f]" % (epoch, d_loss[0], 100*d_loss[1], g_loss))

            if epoch % 50 == 0:
                # Score the evaluation set passed to this method. This used to
                # read the notebook-level X_valid_arr instead of X_test_arr.
                y_pred = self.discriminator.predict([X_test_f2,X_test_arr,X_test_f1],batch_size=batch_size)
                # NOTE(review): the discriminator is trained with 1 = real, so
                # y_pred is high for samples that look real. If y_test marks
                # fraud as 1, this AUC may be inverted -- confirm the intent.
                (fpr, tpr, thresholds) = roc_curve(y_test,y_pred)
                area = auc(fpr,tpr)
                auc_list.append(area)

                ks=(tpr-fpr)
                max_ks=np.max(ks)
                ks_list.append(max_ks)
                print('Epoch: {}, auc: {:.5f}, ks: {}'.format(epoch,area,max_ks))
                self.discriminator.save("gans_model_saved/"+str(epoch)+"_model_gans.h5")

        return auc_list,ks_list


In [5]:
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt


In [None]:
if __name__ == '__main__':
    # Build the networks, then train on the training split while scoring the
    # validation split at each periodic evaluation.
    gan = GAN()
    auc_list, ks_list = gan.train(
        X_train_arr, X_train_f1, X_train_f2, y_train,
        X_valid_arr, X_valid_f1, X_valid_f2, y_valid,
        epochs=10000,
        batch_size=512,
    )