# Identify Fraud Accounts with Anomaly Detection: Day 7

## Environment Setup

In [1]:
# Library import
import pandas as pd
import numpy as np
import pickle

import matplotlib.pyplot as plt
%matplotlib inline
from scipy import stats

import tensorflow as tf

from pylab import rcParams
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from keras.models import Model, load_model
from keras.layers import Input, Dense
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras import regularizers

import seaborn as sns
# Global seaborn theme applied to the plots drawn in this notebook
sns.set(style='whitegrid', palette='muted', font_scale=1.5)

# Optional larger default figure size (currently disabled)
# rcParams['figure.figsize'] = 14, 8

# Seed handed to train_test_split so the train/test partition is repeatable
RANDOM_SEED = 42
# Display names for the two classes; not referenced by the cells visible in this notebook
LABELS = ["Normal", "Fraud"]


Using TensorFlow backend.


## Loading Data

In [2]:
# Loading data: read the tab-separated day-7 file into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook
# will not run elsewhere without editing it.
day_7_tsv_path = "/Users/dwahid/Documents/GitHub/fraud_detection/data_clusters/gmm_day_7_k6.tsv"
df_day_7 = pd.read_csv(day_7_tsv_path, sep="\t")


In [36]:
# Remove the 'Unnamed: 0' column (presumably a row index saved with the file).
# errors='ignore' keeps this cell idempotent: because df_day_7 is reassigned,
# a second run would otherwise raise KeyError once the column is gone.
df_day_7 = df_day_7.drop(columns=['Unnamed: 0'], errors='ignore')

In [9]:
# schema = {
#     'integer': 'systemid',
#     'numeric(20,2)':[col for col in df_day_7.columns if col not in ['systemid', 'Unnamed: 0']]
# }

In [23]:
# df_test = df_day_7.copy()

In [41]:
# df_test.columns = [
#     col.replace('-', '_').replace('/', '_')
#     for col in df_test.columns
# ]

In [27]:
# list_base = [
#     'CREATE TABLE data_science.dewan_demo (',
#     '    systemid integer,',
# ]

# list_numeric = [
#     '    ' + str(col) +  ' numeric(20, 3),'
#     for col in df_test.columns
#     if col not in ['systemid', 'Unnamed: 0']
# ]

# list_base.extend(list_numeric)

In [28]:
# create_query = '\n'.join(list_base)

In [30]:
# print(create_query)

In [None]:
# df_test.to_csv('')


In [40]:
# Column labels of the loaded frame, as a plain list
df_day_7.columns.tolist()

['systemid',
 'activateestimate',
 'activateexpense',
 'activateotherincome',
 'activatepayment',
 'activateproject',
 'activaterecurringprofile',
 'activatestaff',
 'adminde-activation',
 'adminonlinepaymentattempt',
 'adminpayinvoiceonline-invoice',
 'adminpayinvoiceonline-listview',
 'archiveclient',
 'archiveexpense',
 'archiveotherincome',
 'archiveproject',
 'archivetask',
 'autobillpayment',
 'banktransferdisabled',
 'banktransferenabled',
 'bulkimportclientscomplete',
 'bulkimportitemsandservicescomplete',
 'clientimportcsvsucceeded',
 'clientlimitupgradenudge',
 'createbankaccount',
 'createbanktransaction',
 'createbanktransfer',
 'createcategory',
 'createcontact',
 'createcontractor',
 'createcreditnote',
 'createdexpense',
 'createestimate',
 'createexpense',
 'createitem',
 'createotherincome',
 'createreceipt',
 'createservice',
 'creditcardclientaccessgranted',
 'creditcardsystemaccessrevoked',
 'customemailsignature',
 'declinedonlinepaymentnotification',
 'deletebusin

## Further Feature Selection

In [39]:
# True if any cell of df_day_7 holds a missing value
df_day_7.isna().values.any()


False

In [None]:
################################ Keep Only the Important Features ################################

# Day 7: read the file that lists the important feature names.
# NOTE(review): sep="\n," presumably exists only so that each line is read as a
# single field (one feature name per row) — confirm against the file layout.
important_features = pd.read_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_imp_features_names/model_anoml_important_features_day_7.tsv",
    sep="\n,",
)

# Day 7: the 'important_feature' column as a Python list
imp_features_list = list(important_features['important_feature'])

# Columns of df_day_7 that also appear in the important-feature list
kept_columns = df_day_7.columns.intersection(imp_features_list)
df_day_7_imp = df_day_7[kept_columns]

# 'systemid' plus the feature columns treated as not so important
columns_to_discard = [
    'systemid',
    'activateestimate',
    'activateproject',
    'activaterecurringprofile',
    'activatestaff',
    'banktransferdisabled',
    'banktransferenabled',
    'bulkimportitemsandservicescomplete',
    'creditcardsystemaccessrevoked',
    'deletetimeentry',
    'deleterecurringexpense',
    'deleteretainerprofile',
    'deletebusinessaccountant',
    'deletesystemgateway',
    'exportjournalentries',
    'generateinvoicefromrecurringprofile',
    'fbpayuserconnectedbank',
]

# Raises KeyError if any listed column is missing from df_day_7_imp
df_day_7_imp_noid = df_day_7_imp.drop(columns=columns_to_discard)


In [None]:
# Preview the first rows of the filtered feature frame
df_day_7_imp_noid.head()

## Data Standardization

In [None]:
# Standardization per feature column: z = (x - mean) / std
scaler_anml_day_7 = StandardScaler()

# Fit and transform the same frame. The original cell fitted on
# `df_day_7_c1_imp_noid`, a name that is not defined in the preceding cells
# (NameError on a fresh kernel), and also contained a pasted
# `StandardScaler(...)` output repr that only built a throwaway object.
standardized_values = scaler_anml_day_7.fit_transform(df_day_7_imp_noid)

# As before, df_res has default integer column labels and a fresh RangeIndex;
# the feature names of df_day_7_imp_noid are not carried over.
df_res = pd.DataFrame(standardized_values)


In [None]:
# Preview the first rows of the standardized frame
df_res.head()

In [None]:
# Split into train and test sets, holding out 20% of the rows for testing.
# NOTE(review): `data` is not assigned in any cell visible above; it must be a
# frame that still contains the 'cluster_id_k6' column — confirm where it is built.
X_train, X_test = train_test_split(data, test_size=0.2, random_state=RANDOM_SEED)

# Keep the cluster ids of the held-out rows before the column is removed
y_test = X_test['cluster_id_k6']

# Strip the cluster-id column and hand both splits on as NumPy arrays
X_train = X_train.drop(columns=['cluster_id_k6']).values
X_test = X_test.drop(columns=['cluster_id_k6']).values

In [None]:
# Number of feature columns in the training matrix
X_train.shape[1]



## Building Model
Our autoencoder uses 4 fully connected layers with 14, 7, 7 and `input_dim` neurons respectively (the output layer has one neuron per input feature). The first two layers form the encoder and the last two form the decoder. Additionally, L1 activity regularization is applied to the first encoder layer during training:

In [None]:
# Width of the first encoder layer
encoding_dim = 14
# One input unit per feature column of the training matrix
input_dim = X_train.shape[1]


In [None]:
# Width of the two narrowest layers: half of the first encoder layer
bottleneck_dim = int(encoding_dim / 2)

# L1 activity penalty for the first encoder layer (10e-5 is the same value as 1e-4)
sparse_penalty = regularizers.l1(10e-5)

input_layer = Input(shape=(input_dim, ))

# Encoder: input_dim -> encoding_dim -> bottleneck_dim
encoder = Dense(
    encoding_dim,
    activation="tanh",
    activity_regularizer=sparse_penalty,
)(input_layer)
encoder = Dense(bottleneck_dim, activation="relu")(encoder)

# Decoder: bottleneck_dim -> bottleneck_dim -> input_dim
# NOTE(review): the output layer uses ReLU, so it cannot emit negative values;
# if X_train holds standardized features (which can be negative) those cannot
# be reconstructed — confirm this is intended.
decoder = Dense(bottleneck_dim, activation='tanh')(encoder)
decoder = Dense(input_dim, activation='relu')(decoder)

autoencoder = Model(inputs=input_layer, outputs=decoder)


Let's train our model for 100 epochs with a batch size of 32 samples and save the best performing model to a file. The ModelCheckpoint provided by Keras is really handy for such tasks. Additionally, the training progress will be exported in a format that TensorBoard understands.

In [None]:
# Training schedule
nb_epoch = 100
batch_size = 32

# Callbacks: write "model.h5" only when the monitored quantity improves,
# and log the run under ./logs for TensorBoard.
checkpointer = ModelCheckpoint(filepath="model.h5", verbose=0, save_best_only=True)
tensorboard = TensorBoard(
    log_dir='./logs',
    histogram_freq=0,
    write_graph=True,
    write_images=True,
)

autoencoder.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=['accuracy'],
)

# Input and target are the same array: the network is trained to reproduce
# its own input. X_test doubles as the validation set here.
training_run = autoencoder.fit(
    X_train, X_train,
    epochs=nb_epoch,
    batch_size=batch_size,
    shuffle=True,
    validation_data=(X_test, X_test),
    verbose=1,
    callbacks=[checkpointer, tensorboard],
)

# Per-epoch metric values, keyed by metric name
history = training_run.history



In [None]:
# Replace the in-memory model with the one stored in model.h5, the file the
# ModelCheckpoint callback writes with save_best_only=True during training
autoencoder = load_model('model.h5')


## Evaluation

In [None]:
# Training loss and validation loss per epoch; the validation curve is
# computed on X_test and carries the legend label 'test'.
fig, ax = plt.subplots()
ax.plot(history['loss'])
ax.plot(history['val_loss'])
ax.set_title('model loss')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(['train', 'test'], loc='upper right');


The reconstruction error on our training and test data seems to converge nicely. Is it low enough? Let's have a closer look at the error distribution:

In [None]:
# Reconstruct the held-out rows with the autoencoder
predictions = autoencoder.predict(X_test)


In [None]:
# Mean squared reconstruction error, one value per test row
residuals = X_test - predictions
mse = np.power(residuals, 2).mean(axis=1)

# Pair each row's error with its 'cluster_id_k6' value (stored as true_class)
error_df = pd.DataFrame({
    'reconstruction_error': mse,
    'true_class': y_test,
})


In [None]:
# Summary statistics of the reconstruction error and class columns
error_df.describe()
