# Neural Netwok Based Classifier

In [1]:
# Library import
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report,confusion_matrix

## Data Import

In [2]:
## Data file path and name
day = '91'
k = '_k7'
path1 = '/Users/dwahid/Documents/GitHub/fraud_detection/data/model_outputs_gmm_for_nn_training/'
file_name1 = path1 + 'gmm_clutering_outputs_day_' + day + k + '.tsv'


In [3]:
# Loding data from the GMM outputs 
df = pd.read_csv(file_name1, sep="\t")

In [4]:
# Drop row with NaN
df = df.dropna()

# Keep a copy of the original data
df_orig = df.copy()

# Checking null in data
df.isnull().values.any()

False

In [5]:
## Get the column index
col_names = list(df)
L = len(col_names)

for i in range(0, L):
    print i, col_names[i]

0 activateexpense
1 activateotherincome
2 activatepayment
3 admindeactivation
4 adminonlinepaymentattempt
5 adminpayinvoiceonlineinvoice
6 adminpayinvoiceonlinelistview
7 archiveclient
8 archiveexpense
9 archiveotherincome
10 archiveproject
11 archivetask
12 autobillpayment
13 avg_wc_address_day_91
14 avg_wc_description_day_91
15 avg_wc_notes_day_91
16 avg_wc_terms_day_91
17 bulkimportclientscomplete
18 client_count_day_91
19 clientimportcsvsucceeded
20 clientlimitupgradenudge
21 createbankaccount
22 createbanktransaction
23 createbanktransfer
24 createcategory
25 createcontact
26 createcontractor
27 createcreditnote
28 createdexpense
29 createestimate
30 createexpense
31 createitem
32 createotherincome
33 createreceipt
34 createservice
35 creditcardclientaccessgranted
36 customemailsignature
37 declinedonlinepaymentnotification
38 deletebusinesspartner
39 deletecollaborator
40 deletecreditnote
41 deleteestimate
42 deleteexpense
43 deletehours
44 deleteinvoice
45 deleteitem
46 deleteot

## Data Standarization

In [6]:
# Normalized all features columns except the 'systemid'
column_names_to_normalize = ['systemid', 'cluster_id']
column_names_to_normalize = [x for x in list(df) if x not in column_names_to_normalize ]

# Normalized all features columns except the 'systemid'
min_max_scaler = MinMaxScaler()
x = df[column_names_to_normalize].values
x_scaled = min_max_scaler.fit_transform(x)
df_temp = pd.DataFrame(x_scaled, columns=column_names_to_normalize, index = df.index)
df[column_names_to_normalize] = df_temp


## Split Data: Train and Test

In [7]:
X = df.drop(columns=['systemid', 'cluster_id'],axis=1)
y = df['cluster_id']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

## Training Model

In [9]:
mlp = MLPClassifier(hidden_layer_sizes=(13,13,13),max_iter=500)

In [10]:
mlp.fit(X_train,y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(13, 13, 13), learning_rate='constant',
       learning_rate_init=0.001, max_iter=500, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [11]:
MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(13, 13, 13), learning_rate='constant',
       learning_rate_init=0.001, max_iter=500, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(13, 13, 13), learning_rate='constant',
       learning_rate_init=0.001, max_iter=500, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

## Saving Model

In [12]:
##################################### Saving the Train Model #################################################

## Saving pickled model file path and name
path2 = '/Users/dwahid/Documents/GitHub/fraud_detection/src/saved_models/'
file_name2 = path2 + 'fraud_detection_nn_classifier_day_' + day + k + '_model.sav'

# save the model to disk  
pickle.dump(mlp, open(file_name2, 'wb'))

## Prediction and Evalution

In [13]:
predictions = mlp.predict(X_test)

In [14]:
print(confusion_matrix(y_test,predictions))

[[35914     0     0     0    23     0     1]
 [    0  4233     0     0     4     2     0]
 [    0     0 50714     0    65     1     0]
 [    0     0     0   263     0     2    47]
 [   12     0   162     0 14514     0     0]
 [    0    14     0     1     0   104     0]
 [    0     0     0     9     0     0  3603]]


In [15]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     35938
           1       1.00      1.00      1.00      4239
           2       1.00      1.00      1.00     50780
           3       0.96      0.84      0.90       312
           4       0.99      0.99      0.99     14688
           5       0.95      0.87      0.91       119
           6       0.99      1.00      0.99      3612

   micro avg       1.00      1.00      1.00    109688
   macro avg       0.98      0.96      0.97    109688
weighted avg       1.00      1.00      1.00    109688



In [16]:
# list(predictions)