# Fraud Detection Day 70 Model: Gaussian Mixture Model (GMM)

In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
sns.set(color_codes=True)

import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline

from scipy import stats
get_ipython().magic(u'config IPCompleter.greedy=True')

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn import mixture
import pickle

## 1. Import and Filter Features Data for Day 70

### 1.1 Import day 70 final features data


In [2]:
# Load the final day-70 features table (tab-separated) into a dataframe
features_path = "/Users/dwahid/Documents/GitHub/fraud_detection/data/training_data/final_features_day_70_no_fbtest_onlyimp.tsv"
df = pd.read_csv(features_path, sep="\t")

In [3]:
# Sanity check of the loaded dataframe: display its (rows, columns) dimensions.
# Optional extra inspection (disabled): df.tail()  -> last rows
df.shape
# Optional extra inspection (disabled): list(df)  -> column names


(452565, 104)

In [4]:
# Clean the dataframe column names: strip every '-' and '/' character
cleaned_columns = []
for col in df.columns:
    without_dashes = col.replace('-', '')
    cleaned_columns.append(without_dashes.replace('/', ''))
df.columns = cleaned_columns

In [5]:
## Put the dataframe columns in sorted (alphabetical) order
sorted_column_names = sorted(df.columns)
df = df.reindex(columns=sorted_column_names)

In [6]:
## Print every column name together with its positional index
col_names = list(df)
L = len(col_names)

# A single pre-formatted string argument prints identically under the
# Python 2 print statement and the Python 3 print() function.
for i, name in enumerate(col_names):
    print("{} {}".format(i, name))

0 activateestimate
1 activateexpense
2 activateotherincome
3 activatepayment
4 activateproject
5 activaterecurringprofile
6 activatestaff
7 admindeactivation
8 adminonlinepaymentattempt
9 adminpayinvoiceonlineinvoice
10 adminpayinvoiceonlinelistview
11 archiveclient
12 archiveexpense
13 archiveotherincome
14 archiveproject
15 archivetask
16 autobillpayment
17 avg_wc_address_day_70
18 avg_wc_description_day_70
19 avg_wc_notes_day_70
20 avg_wc_terms_day_70
21 banktransferdisabled
22 banktransferenabled
23 bulkimportclientscomplete
24 bulkimportitemsandservicescomplete
25 client_count_day_70
26 clientimportcsvsucceeded
27 clientlimitupgradenudge
28 createbankaccount
29 createbanktransaction
30 createbanktransfer
31 createcategory
32 createcontact
33 createcontractor
34 createcreditnote
35 createdexpense
36 createestimate
37 createexpense
38 createitem
39 createotherincome
40 createreceipt
41 createservice
42 creditcardclientaccessgranted
43 creditcardsystemaccessrevoked
44 customemailsign

## 2. Cleaning Data for GMM Clustering
### 2.1 Drop row with missing 'systemid'

In [8]:
# Drop the feature columns that are excluded from the clustering input.
# NOTE(review): the previous comment on this cell mentioned column variances, so these are
# presumably low-variance features -- confirm the selection criterion.
columns_to_drop = [
    'activateestimate',
    'activateproject',
    'activaterecurringprofile',
    'activatestaff',
    'banktransferdisabled',
    'banktransferenabled',
    'bulkimportitemsandservicescomplete',
    'creditcardsystemaccessrevoked',
    'deletetimeentry',
    'deleterecurringexpense',
    'deleteretainerprofile',
    'deletebusinessaccountant',
    'deletesystemgateway',
    'exportjournalentries',
    'generateinvoicefromrecurringprofile',
    'fbpayuserconnectedbank',
]

# `columns=` already selects the column axis; passing `axis=1` as well was redundant.
# A KeyError is raised if any listed column is absent from the dataframe.
df = df.drop(columns=columns_to_drop)

In [10]:
## Print every remaining column name together with its positional index
col_names = list(df)
L = len(col_names)

# A single pre-formatted string argument prints identically under the
# Python 2 print statement and the Python 3 print() function.
for i, name in enumerate(col_names):
    print("{} {}".format(i, name))

0 activateexpense
1 activateotherincome
2 activatepayment
3 admindeactivation
4 adminonlinepaymentattempt
5 adminpayinvoiceonlineinvoice
6 adminpayinvoiceonlinelistview
7 archiveclient
8 archiveexpense
9 archiveotherincome
10 archiveproject
11 archivetask
12 autobillpayment
13 avg_wc_address_day_70
14 avg_wc_description_day_70
15 avg_wc_notes_day_70
16 avg_wc_terms_day_70
17 bulkimportclientscomplete
18 client_count_day_70
19 clientimportcsvsucceeded
20 clientlimitupgradenudge
21 createbankaccount
22 createbanktransaction
23 createbanktransfer
24 createcategory
25 createcontact
26 createcontractor
27 createcreditnote
28 createdexpense
29 createestimate
30 createexpense
31 createitem
32 createotherincome
33 createreceipt
34 createservice
35 creditcardclientaccessgranted
36 customemailsignature
37 declinedonlinepaymentnotification
38 deletebusinesspartner
39 deletecollaborator
40 deletecreditnote
41 deleteestimate
42 deleteexpense
43 deletehours
44 deleteinvoice
45 deleteitem
46 deleteot

In [11]:
# Snapshot the dataframe (deep copy) before any further transformation is applied to `df`
df_orig = df.copy(deep=True)

# Put the columns of the working dataframe in alphabetical order
df = df.sort_index(axis='columns')

### 2.3 Feature column normalization (model)

In [12]:
# Min-max scale every feature column; 'systemid' is excluded from scaling
column_names_to_not_normalize = ['systemid']
column_names_to_normalize = []
for column_name in list(df):
    if column_name not in column_names_to_not_normalize:
        column_names_to_normalize.append(column_name)

# Scaler with default settings (maps each column onto the range 0..1)
min_max_scaler = MinMaxScaler()

# Fit the scaler on the raw feature matrix and transform it in one step
x = df[column_names_to_normalize].values
x_scaled = min_max_scaler.fit_transform(x)

# Rebuild a frame with the original row index and column labels, then write it back
df_temp = pd.DataFrame(
    x_scaled,
    index=df.index,
    columns=column_names_to_normalize,
)
df[column_names_to_normalize] = df_temp


In [13]:
# Save the fitted min-max scaler to disk
filename_minmax_scaler = '/Users/dwahid/Documents/GitHub/fraud_detection/src/saved_models/minmax_scaler_gmm_day_70.sav'

# The context manager closes (and flushes) the file even if pickling fails;
# the bare open() used before left the handle open for the life of the kernel.
with open(filename_minmax_scaler, 'wb') as scaler_file:
    pickle.dump(min_max_scaler, scaler_file)

In [14]:
# Replace every remaining NaN value with 0 (modifies `df` in place)
df.fillna(value=0, inplace=True)

# Build the clustering input: all columns except 'systemid'
df_nosystemid = df.drop(columns=['systemid'])

In [15]:
# Summary statistics of the clustering input (per-column count, mean, std, min, quartiles, max)
df_nosystemid.describe()

Unnamed: 0,activateexpense,activateotherincome,activatepayment,admindeactivation,adminonlinepaymentattempt,adminpayinvoiceonlineinvoice,adminpayinvoiceonlinelistview,archiveclient,archiveexpense,archiveotherincome,...,updatecategory,updateclient,updatecompanyprofile,updatecontractor,updatecreditnote,updateestimate,updateexpense,updateinvoicesample,updateitem,updateservice
count,452565.0,452565.0,452565.0,452565.0,452565.0,452565.0,452565.0,452565.0,452565.0,452565.0,...,452565.0,452565.0,452565.0,452565.0,452565.0,452565.0,452565.0,452565.0,452565.0,452565.0
mean,2.6e-05,1.2e-05,1.8e-05,0.000277,6e-06,5e-06,7e-06,1.6e-05,2.9e-05,1.6e-05,...,0.000181,0.000356,0.000735,0.000853,9e-06,0.000471,0.000249,0.038345,4.2e-05,8.2e-05
std,0.002611,0.002442,0.002005,0.011912,0.001901,0.001535,0.002007,0.002068,0.002383,0.002619,...,0.004611,0.003157,0.012203,0.012626,0.001959,0.007533,0.005003,0.091065,0.001704,0.001797
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## 3. Determine Number of Clusters: BIC Score Method

In [16]:
############################# Determine the Number of Clusters ########################################
# from sklearn.mixture import GaussianMixture

# gm_bic= []
# gm_score=[]
# for i in range(5,10):
#     gm = GaussianMixture(n_components=i,n_init=10,tol=1e-3,max_iter=1000).fit(df_nosystemid)
#     print("BIC for number of cluster(s) {}: {}".format(i,gm.bic(df_nosystemid)))
#     print("Log-likelihood score for number of cluster(s) {}: {}".format(i,gm.score(df_nosystemid)))
#     print("-"*100)
#     gm_bic.append(-gm.bic(df_nosystemid))
#     gm_score.append(gm.score(df_nosystemid))

In [17]:
# plt.title("The Gaussian Mixture model BIC \nfor determining number of clusters\n",fontsize=16)
# plt.scatter(x=[i for i in range(5,10)],y=np.log(gm_bic),s=150,edgecolor='k')
# plt.grid(True)
# plt.xlabel("Number of clusters",fontsize=14)
# plt.ylabel("Log of Gaussian mixture BIC score",fontsize=15)
# plt.xticks([i for i in range(2,12)],fontsize=14)
# plt.yticks(fontsize=15)
# plt.show()

## 4. Fitting GMM Clustering
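Based on the BIC elbow graph from Section 3 (those cells are commented out in this notebook; re-enable them to regenerate the plot), we fix the number of clusters at **n = 6, 7, 8** and fit one model for each value.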

In [18]:
##################################### GMM Training (n = 6, 7, 8) ###############################################
from sklearn.mixture import GaussianMixture

# Fixed seed: without it every run initialises EM differently, so the exported
# cluster ids change from run to run.
GMM_RANDOM_STATE = 42

# Number of clusters to fit, one model per value
GMM_CLUSTER_COUNTS = (6, 7, 8)

# Output locations; '{}' is replaced by the number of clusters.
# The 'clutering' spelling in the TSV name is kept on purpose so the file names stay unchanged.
GMM_MODEL_PATH_TEMPLATE = '/Users/dwahid/Documents/GitHub/fraud_detection/src/saved_models/fraud_detection_clustering_day_70_k{}_model.sav'
GMM_OUTPUT_PATH_TEMPLATE = "/Users/dwahid/Documents/GitHub/fraud_detection/data/model_outputs_gmm_for_nn_training/gmm_clutering_outputs_day_70_k{}.tsv"


def fit_and_export_gmm(features, accounts, n_components, random_state=GMM_RANDOM_STATE):
    """Fit a Gaussian mixture model, label every row, and save the model and the labelled data.

    Parameters
    ----------
    features : DataFrame
        Input passed to ``GaussianMixture.fit`` and ``GaussianMixture.predict``.
    accounts : DataFrame
        Frame that receives the predicted labels in its 'cluster_id' column.
        It is modified in place (an existing 'cluster_id' column is overwritten)
        and then written to disk as a tab-separated file without the index.
    n_components : int
        Number of mixture components (clusters).
    random_state : int, optional
        Seed handed to ``GaussianMixture`` so repeated runs give the same labels.

    Returns
    -------
    tuple
        ``(model, labels, model_path)``: the fitted model, the predicted cluster
        id per row of ``features``, and the path the model was pickled to.
    """
    model = GaussianMixture(n_components=n_components, random_state=random_state)
    model.fit(features)

    # Cluster id of each row
    labels = model.predict(features)
    accounts['cluster_id'] = labels

    # Save the model; the context manager guarantees the file is flushed and closed
    model_path = GMM_MODEL_PATH_TEMPLATE.format(n_components)
    with open(model_path, 'wb') as model_file:
        pickle.dump(model, model_file)

    # Export the accounts frame together with its cluster id labels (clustering output)
    accounts.to_csv(GMM_OUTPUT_PATH_TEMPLATE.format(n_components), sep="\t", index=False)

    return model, labels, model_path


# Fit, save and export one model per cluster count. After the loop `gmm`,
# `cluster_predict`, `filename_clustering` and df_orig['cluster_id'] hold the
# results of the last fit (8 clusters).
for n_clusters in GMM_CLUSTER_COUNTS:
    gmm, cluster_predict, filename_clustering = fit_and_export_gmm(
        df_nosystemid, df_orig, n_clusters)