# Fraud Detection Day 91 Model: Gaussian Mixture Model (GMM)

In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
sns.set(color_codes=True)

import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline

from scipy import stats
get_ipython().magic(u'config IPCompleter.greedy=True')

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn import mixture
import pickle

## 1. Import and Filter Features Data for Day 91

### 1.1 Import day 91 final features data


In [2]:
# Import the day 91 final features data (tab-separated file) into a DataFrame.
# NOTE(review): the path is absolute and machine-specific; adjust it before running elsewhere.
df = pd.read_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data/training_data/final_features_day_91_no_fbtest_onlyimp.tsv", sep="\t")

In [3]:
# Sanity-check the loaded dataframe by displaying its (rows, columns) shape.
# df.tail()   # optional: preview the last rows
df.shape
# list(df)    # optional: list all column names


(452565, 104)

In [4]:
# Strip '-' and '/' characters out of every column name
cleaned_column_names = []
for raw_name in df.columns:
    without_dash = raw_name.replace('-', '')
    cleaned_column_names.append(without_dash.replace('/', ''))
df.columns = cleaned_column_names

In [5]:
## Put the dataframe columns into sorted (alphabetical) order
ordered_column_names = sorted(df.columns)
df = df.reindex(columns=ordered_column_names)

## 2. Cleaning Data for GMM Clustering
### 2.1 Drop unused feature columns

In [6]:
# Feature columns excluded from the clustering input.
# NOTE(review): the earlier comment on this cell mentioned column variances, so these
# were presumably identified as low-variance features -- confirm the selection criterion.
columns_to_drop = [
    'activateestimate',
    'activateproject',
    'activaterecurringprofile',
    'activatestaff',
    'banktransferdisabled',
    'banktransferenabled',
    'bulkimportitemsandservicescomplete',
    'creditcardsystemaccessrevoked',
    'deletetimeentry',
    'deleterecurringexpense',
    'deleteretainerprofile',
    'deletebusinessaccountant',
    'deletesystemgateway',
    'exportjournalentries',
    'generateinvoicefromrecurringprofile',
    'fbpayuserconnectedbank',
]

# Drop them in one call; a missing column name raises KeyError
df = df.drop(columns=columns_to_drop)

In [18]:
## Get the column index (disabled debugging aid: prints each column's position and name)
# col_names = list(df)
# L = len(col_names)

# for i in range(0, L):
#     print(i, col_names[i])

In [8]:
# Snapshot the unscaled dataframe; the cluster labels are attached to this copy later on
df_orig = df.copy(deep=True)

# Order the columns alphabetically by name
df = df.sort_index(axis='columns')

### 2.3 Feature column normalization (model)

In [9]:
# Every column is min-max scaled except the ones listed here
column_names_to_not_normalize = ['systemid']
column_names_to_normalize = [
    name for name in df.columns
    if name not in column_names_to_not_normalize
]

# Min-max scaler; kept in `min_max_scaler` so the fitted object can be saved afterwards
min_max_scaler = MinMaxScaler()

# Learn the per-column min/max from the data, then apply the scaling to the same data
x = df[column_names_to_normalize].values
x_scaled = min_max_scaler.fit(x).transform(x)

# Re-wrap the scaled array with the original row index and column labels,
# then write the scaled columns back into df
df_temp = pd.DataFrame(
    data=x_scaled,
    index=df.index,
    columns=column_names_to_normalize,
)
df[column_names_to_normalize] = df_temp



In [10]:
# Save the fitted min-max scaler to disk.
# The context manager closes the file handle as soon as the dump finishes, so the
# pickle is fully flushed to disk (the bare open() call never closed it).
filename_minmax_scaler = '/Users/dwahid/Documents/GitHub/fraud_detection/src/saved_models/minmax_scaler_gmm_day_91.sav'
with open(filename_minmax_scaler, 'wb') as scaler_file:
    pickle.dump(min_max_scaler, scaler_file)

In [11]:
# Replace any remaining NaN values with 0 (df is modified in place)
df.fillna(value=0, inplace=True)

# Feature matrix used for clustering: all columns except 'systemid'
df_nosystemid = df.drop(columns=['systemid'])

In [12]:
# Summary statistics of the scaled feature matrix, to inspect the per-column value ranges
df_nosystemid.describe()

Unnamed: 0,activateexpense,activateotherincome,activatepayment,admindeactivation,adminonlinepaymentattempt,adminpayinvoiceonlineinvoice,adminpayinvoiceonlinelistview,archiveclient,archiveexpense,archiveotherincome,...,updatecategory,updateclient,updatecompanyprofile,updatecontractor,updatecreditnote,updateestimate,updateexpense,updateinvoicesample,updateitem,updateservice
count,452565.0,452565.0,452565.0,452565.0,452565.0,452565.0,452565.0,452565.0,452565.0,452565.0,...,452565.0,452565.0,452565.0,452565.0,452565.0,452565.0,452565.0,452565.0,452565.0,452565.0
mean,2.7e-05,1.3e-05,2.1e-05,0.000293,6e-06,4e-06,8e-06,1.8e-05,3.3e-05,1.1e-05,...,0.000177,0.000324,0.000758,0.000551,1.1e-05,0.000347,0.000232,0.038349,4.6e-05,8.6e-05
std,0.002769,0.002517,0.002081,0.012232,0.001899,0.001521,0.002225,0.002114,0.002616,0.002056,...,0.004381,0.003052,0.012496,0.008441,0.002068,0.00596,0.004627,0.091068,0.001821,0.001872
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## 3. Determine Number of Clusters: BIC Score Method

In [13]:
############################# Determine the Number of Clusters ########################################
# Disabled BIC sweep: fits one GaussianMixture per candidate cluster count (5 to 9) and
# records the negated BIC and the log-likelihood score. Uncomment to re-run.
# from sklearn.mixture import GaussianMixture

# gm_bic= []
# gm_score=[]
# for i in range(5,10):
#     gm = GaussianMixture(n_components=i,n_init=10,tol=1e-3,max_iter=1000).fit(df_nosystemid)
#     print("BIC for number of cluster(s) {}: {}".format(i,gm.bic(df_nosystemid)))
#     print("Log-likelihood score for number of cluster(s) {}: {}".format(i,gm.score(df_nosystemid)))
#     print("-"*100)
#     gm_bic.append(-gm.bic(df_nosystemid))
#     gm_score.append(gm.score(df_nosystemid))

In [14]:
# Disabled plot of log(gm_bic) against the number of clusters; it needs the `gm_bic`
# list built by the (also disabled) BIC sweep cell. Uncomment both cells to re-run.
# plt.title("The Gaussian Mixture model BIC \nfor determining number of clusters\n",fontsize=16)
# plt.scatter(x=[i for i in range(5,10)],y=np.log(gm_bic),s=150,edgecolor='k')
# plt.grid(True)
# plt.xlabel("Number of clusters",fontsize=14)
# plt.ylabel("Log of Gaussian mixture BIC score",fontsize=15)
# plt.xticks([i for i in range(2,12)],fontsize=14)
# plt.yticks(fontsize=15)
# plt.show()

## 4. Fitting GMM Clustering
From the elbow in the BIC plot above, we choose the number of clusters **n**; the cells below fit models for n = 6, 7, and 8.

In [15]:
##################################### GMM Training n = 6 #######################################################

# Fit a 6-component Gaussian mixture to the scaled feature matrix.
# random_state pins the otherwise random initialisation, so the saved model and the
# exported cluster labels are reproducible after a kernel restart.
from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components=6, random_state=42)
gmm.fit(df_nosystemid)


# Predict the cluster of every row
cluster_predict = gmm.predict(df_nosystemid)

# Attach the cluster id of each row to the unscaled dataframe.
# The array is assigned positionally; df_orig and df_nosystemid are in the same row order.
df_orig['cluster_id'] = cluster_predict


# Save the fitted model to disk; the context manager closes (and flushes) the file
filename_clustering = '/Users/dwahid/Documents/GitHub/fraud_detection/src/saved_models/fraud_detection_clustering_day_91_k6_model.sav'
with open(filename_clustering, 'wb') as model_file:
    pickle.dump(gmm, model_file)

# Export the original users data with corresponding cluster id label (clustering output)
df_orig.to_csv("/Users/dwahid/Documents/GitHub/fraud_detection/data/model_outputs_gmm_for_nn_training/gmm_clutering_outputs_day_91_k6.tsv", 
               sep="\t", index=False)

In [16]:
##################################### GMM Training n = 7 #######################################################

# Fit a 7-component Gaussian mixture to the scaled feature matrix.
# random_state pins the otherwise random initialisation, so the saved model and the
# exported cluster labels are reproducible after a kernel restart.
from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components=7, random_state=42)
gmm.fit(df_nosystemid)


# Predict the cluster of every row
cluster_predict = gmm.predict(df_nosystemid)

# Attach the cluster id of each row to the unscaled dataframe.
# This overwrites any 'cluster_id' column written by a previously run training cell.
df_orig['cluster_id'] = cluster_predict


# Save the fitted model to disk; the context manager closes (and flushes) the file
filename_clustering = '/Users/dwahid/Documents/GitHub/fraud_detection/src/saved_models/fraud_detection_clustering_day_91_k7_model.sav'
with open(filename_clustering, 'wb') as model_file:
    pickle.dump(gmm, model_file)

# Export the original users data with corresponding cluster id label (clustering output)
df_orig.to_csv("/Users/dwahid/Documents/GitHub/fraud_detection/data/model_outputs_gmm_for_nn_training/gmm_clutering_outputs_day_91_k7.tsv", 
               sep="\t", index=False)

In [17]:
##################################### GMM Training n = 8 #######################################################

# Fit an 8-component Gaussian mixture to the scaled feature matrix.
# random_state pins the otherwise random initialisation, so the saved model and the
# exported cluster labels are reproducible after a kernel restart.
from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components=8, random_state=42)
gmm.fit(df_nosystemid)


# Predict the cluster of every row
cluster_predict = gmm.predict(df_nosystemid)

# Attach the cluster id of each row to the unscaled dataframe.
# This overwrites any 'cluster_id' column written by a previously run training cell.
df_orig['cluster_id'] = cluster_predict


# Save the fitted model to disk; the context manager closes (and flushes) the file
filename_clustering = '/Users/dwahid/Documents/GitHub/fraud_detection/src/saved_models/fraud_detection_clustering_day_91_k8_model.sav'
with open(filename_clustering, 'wb') as model_file:
    pickle.dump(gmm, model_file)

# Export the original users data with corresponding cluster id label (clustering output)
df_orig.to_csv("/Users/dwahid/Documents/GitHub/fraud_detection/data/model_outputs_gmm_for_nn_training/gmm_clutering_outputs_day_91_k8.tsv", 
               sep="\t", index=False)