# Fraud Detection Day 14 Model: Gaussian Mixture Model (GMM)

In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
sns.set(color_codes=True)

import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline

from scipy import stats
# Enable greedy tab-completion. run_line_magic() is the supported API;
# InteractiveShell.magic() is deprecated.
get_ipython().run_line_magic('config', 'IPCompleter.greedy=True')

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn import mixture
import pickle

## 1. Import and Filter Features Data for Day 14

### 1.1 Import day 14 final features data


In [2]:
# Load the day-14 final features table (tab-separated file) into `df`.
features_path = "/Users/dwahid/Documents/GitHub/fraud_detection/data/training_data/final_features_day_14_no_fbtest_onlyimp_noinactive.tsv"
df = pd.read_csv(features_path, sep="\t")

In [3]:
# Sanity check on the loaded frame: display its (n_rows, n_columns) shape.
# The commented-out lines are alternative quick inspections (last rows / column names).
# df.tail()
df.shape
# list(df)


(438298, 105)

In [4]:
# Clean the column names by stripping every '-' and '/' character.
_strip_separators = str.maketrans('', '', '-/')
df.columns = [name.translate(_strip_separators) for name in df.columns]

In [5]:
## Reorder the dataframe columns alphabetically by name
sorted_columns = sorted(df.columns)
df = df.reindex(columns=sorted_columns)

In [10]:
## Get the column index
# col_names = list(df)
# L = len(col_names)

# for i in range(0, L):
#     print(i, col_names[i])
# # Checking
# # df.head()
# # df.shape
# # list(df)

## 2. Cleaning Data for GMM Clustering
### 2.1 Drop selected feature columns

In [7]:
# Remove the feature columns that are excluded from the clustering input.
columns_to_drop = [
    'activateestimate',
    'activateproject',
    'activaterecurringprofile',
    'activatestaff',
    'banktransferdisabled',
    'banktransferenabled',
    'bulkimportitemsandservicescomplete',
    'creditcardsystemaccessrevoked',
    'deletetimeentry',
    'deleterecurringexpense',
    'deleteretainerprofile',
    'deletebusinessaccountant',
    'deletesystemgateway',
    'exportjournalentries',
    'generateinvoicefromrecurringprofile',
    'fbpayuserconnectedbank',
    'is_sales_managed',
]
df = df.drop(columns=columns_to_drop)

In [11]:
## Get the column index
# col_names = list(df)
# L = len(col_names)

# for i in range(0, L):
#     print(i, col_names[i])
# # Checking
# # df.head()
# # df.shape
# # list(df)

In [9]:
# Snapshot the frame before normalization; the cluster ids are attached to
# this un-normalized copy later on.
df_orig = df.copy(deep=True)

# Put the columns in alphabetical order.
df = df.sort_index(axis="columns")

### 2.3 Feature column normalization (model)

In [12]:
# Min-max scale every feature column; 'systemid' is excluded from scaling.
column_names_to_not_normalize = ['systemid']
column_names_to_normalize = [
    name for name in df.columns
    if name not in column_names_to_not_normalize
]

# Scaler instance; kept in `min_max_scaler` so it can be persisted afterwards.
min_max_scaler = MinMaxScaler()

# Fit on the raw feature matrix, then write the scaled values back into `df`
# under the same row index and column names.
scaled_features = pd.DataFrame(
    min_max_scaler.fit_transform(df[column_names_to_normalize].values),
    columns=column_names_to_normalize,
    index=df.index,
)
df[column_names_to_normalize] = scaled_features


In [13]:
# Save the fitted min-max scaler to disk.
# The context manager guarantees the file is flushed and closed once the
# pickle has been written, instead of leaving the handle open.
filename_minmax_scaler = '/Users/dwahid/Documents/GitHub/fraud_detection/src/saved_models/minmax_scaler_gmm_day_14.sav'
with open(filename_minmax_scaler, 'wb') as scaler_file:
    pickle.dump(min_max_scaler, scaler_file)

In [14]:
# Replace every remaining NaN value with 0.
df = df.fillna(0)

# Feature-only frame used for clustering: all columns except 'systemid'.
df_nosystemid = df.drop(columns=['systemid'])

In [15]:
# Summary statistics of the clustering input; after min-max scaling the
# values are expected to lie in [0, 1].
df_nosystemid.describe()

Unnamed: 0,activateexpense,activateotherincome,activatepayment,admindeactivation,adminonlinepaymentattempt,adminpayinvoiceonlineinvoice,adminpayinvoiceonlinelistview,archiveclient,archiveexpense,archiveotherincome,...,updatecategory,updateclient,updatecompanyprofile,updatecontractor,updatecreditnote,updateestimate,updateexpense,updateinvoicesample,updateitem,updateservice
count,438298.0,438298.0,438298.0,438298.0,438298.0,438298.0,438298.0,438298.0,438298.0,438298.0,...,438298.0,438298.0,438298.0,438298.0,438298.0,438298.0,438298.0,438298.0,438298.0,438298.0
mean,2.2e-05,5e-06,2.6e-05,0.000192,4e-06,8e-06,4e-06,8e-06,4.6e-05,8e-06,...,0.00017,0.0004,0.000624,0.000911,1e-05,0.00027,0.000211,0.036708,0.000189,5.6e-05
std,0.002568,0.001594,0.002445,0.009903,0.001647,0.001815,0.001557,0.001771,0.003928,0.001837,...,0.005237,0.003214,0.012867,0.013621,0.002297,0.00402,0.005532,0.089437,0.003944,0.001646
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## 3. Determine Number of Clusters: BIC Score Method

In [16]:
############################# Determine the Number of Clusters ########################################
# from sklearn.mixture import GaussianMixture

# gm_bic= []
# gm_score=[]
# for i in range(2,12):
#     gm = GaussianMixture(n_components=i,n_init=10,tol=1e-3,max_iter=1000).fit(df_nosystemid)
#     print("BIC for number of cluster(s) {}: {}".format(i,gm.bic(df_nosystemid)))
#     print("Log-likelihood score for number of cluster(s) {}: {}".format(i,gm.score(df_nosystemid)))
#     print("-"*100)
#     gm_bic.append(-gm.bic(df_nosystemid))
#     gm_score.append(gm.score(df_nosystemid))

In [17]:
# plt.title("The Gaussian Mixture model BIC \nfor determining number of clusters\n",fontsize=16)
# plt.scatter(x=[i for i in range(2,12)],y=np.log(gm_bic),s=150,edgecolor='k')
# plt.grid(True)
# plt.xlabel("Number of clusters",fontsize=14)
# plt.ylabel("Log of Gaussian mixture BIC score",fontsize=15)
# plt.xticks([i for i in range(2,12)],fontsize=14)
# plt.yticks(fontsize=15)
# plt.show()

## 4. Fitting GMM Clustering
From the elbow in the BIC plot of Section 3, we select n = 6 clusters. Models with n = 7 and n = 8 components are also fitted and exported below.

In [18]:
##################################### GMM Training (n=6) #######################################################

# Fit a 6-component Gaussian mixture on the normalized feature frame.
# random_state pins the otherwise random initialization, so re-running the
# notebook reproduces the same model and the same cluster ids.
from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components=6, random_state=42)
gmm.fit(df_nosystemid)

# Predict the cluster (mixture component) label of every row
cluster_predict = gmm.predict(df_nosystemid)

# Attach the cluster id of each account to the un-normalized copy of the data.
# df_orig and df_nosystemid have the same rows in the same order, so the
# labels line up positionally. This overwrites any existing 'cluster_id'.
df_orig['cluster_id'] = cluster_predict

# Save the fitted model to disk; the context manager flushes and closes the
# file handle once the pickle has been written.
filename_clustering = '/Users/dwahid/Documents/GitHub/fraud_detection/src/saved_models/fraud_detection_clustering_day_14_k6_model.sav'
with open(filename_clustering, 'wb') as model_file:
    pickle.dump(gmm, model_file)

# Export the original users data with corresponding cluster id label (clustering output)
df_orig.to_csv("/Users/dwahid/Documents/GitHub/fraud_detection/data/model_outputs_gmm_for_nn_training/gmm_clutering_outputs_day_14_k6.tsv", 
               sep="\t", index=False)

In [19]:
##################################### GMM Training (n=7) #######################################################

# Fit a 7-component Gaussian mixture on the normalized feature frame.
# random_state pins the otherwise random initialization, so re-running the
# notebook reproduces the same model and the same cluster ids.
from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components=7, random_state=42)
gmm.fit(df_nosystemid)

# Predict the cluster (mixture component) label of every row
cluster_predict = gmm.predict(df_nosystemid)

# Attach the cluster id of each account to the un-normalized copy of the data.
# df_orig and df_nosystemid have the same rows in the same order, so the
# labels line up positionally. This overwrites any existing 'cluster_id'.
df_orig['cluster_id'] = cluster_predict

# Save the fitted model to disk; the context manager flushes and closes the
# file handle once the pickle has been written.
filename_clustering = '/Users/dwahid/Documents/GitHub/fraud_detection/src/saved_models/fraud_detection_clustering_day_14_k7_model.sav'
with open(filename_clustering, 'wb') as model_file:
    pickle.dump(gmm, model_file)

# Export the original users data with corresponding cluster id label (clustering output)
df_orig.to_csv("/Users/dwahid/Documents/GitHub/fraud_detection/data/model_outputs_gmm_for_nn_training/gmm_clutering_outputs_day_14_k7.tsv", 
               sep="\t", index=False)

In [21]:
##################################### GMM Training (n=8) #######################################################

# Fit an 8-component Gaussian mixture on the normalized feature frame.
# random_state pins the otherwise random initialization, so re-running the
# notebook reproduces the same model and the same cluster ids.
from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components=8, random_state=42)
gmm.fit(df_nosystemid)

# Predict the cluster (mixture component) label of every row
cluster_predict = gmm.predict(df_nosystemid)

# Attach the cluster id of each account to the un-normalized copy of the data.
# df_orig and df_nosystemid have the same rows in the same order, so the
# labels line up positionally. This overwrites any existing 'cluster_id'.
df_orig['cluster_id'] = cluster_predict

# Save the fitted model to disk; the context manager flushes and closes the
# file handle once the pickle has been written.
filename_clustering = '/Users/dwahid/Documents/GitHub/fraud_detection/src/saved_models/fraud_detection_clustering_day_14_k8_model.sav'
with open(filename_clustering, 'wb') as model_file:
    pickle.dump(gmm, model_file)

# Export the original users data with corresponding cluster id label (clustering output)
df_orig.to_csv("/Users/dwahid/Documents/GitHub/fraud_detection/data/model_outputs_gmm_for_nn_training/gmm_clutering_outputs_day_14_k8.tsv", 
               sep="\t", index=False)