# Fraud Detection Day 35 Model: Gaussian Mixture Model (GMM)

In [2]:
import pandas as pd
import numpy as np

import seaborn as sns
sns.set(color_codes=True)

import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline

from scipy import stats
# Enable greedy tab completion. run_line_magic() is the supported replacement
# for the deprecated InteractiveShell.magic() helper.
get_ipython().run_line_magic('config', 'IPCompleter.greedy=True')

from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn import mixture

## 1. Importing Features Data for Day 35

In [3]:
# Load the day-35 features table (tab-separated) into a DataFrame
features_day_35_path = "/Users/dwahid/Documents/GitHub/fraud_detection/data_final/final_features_day_35.tsv"
df = pd.read_csv(features_day_35_path, sep="\t")

In [4]:
# Preview the first rows of the loaded features table
df.head()

Unnamed: 0,systemid,acceptestimate,accesstokencreated,activateclient,activateestimate,activateexpense,activateinvoice,activateotherincome,activatepayment,activateproject,...,avg_wc_description_day_35,avg_wc_notes_day_35,avg_wc_terms_day_35,avg_wc_address_day_35,invoice_count_day_35,client_count_day_35,is_freshbooks_account_active,is_paying,base_subscription_amount_first_upgrade,upgrade_ever
0,3592461,0,0,0,0,0,0,0,0,0,...,2.0,6.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0
1,3592463,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
2,3592465,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
3,3592467,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
4,3592469,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0


In [5]:
# (rows, columns) of the loaded features table
df.shape

(452565, 234)

## 2. GMM Clustering
### 2.1 Drop the 'systemid' column

In [6]:
# Remove the 'systemid' column so it is not used as a clustering input
df_nosystemid = df.drop(columns=['systemid'])

In [7]:
# Preview the table after dropping 'systemid'
df_nosystemid.head()

Unnamed: 0,acceptestimate,accesstokencreated,activateclient,activateestimate,activateexpense,activateinvoice,activateotherincome,activatepayment,activateproject,activaterecurringprofile,...,avg_wc_description_day_35,avg_wc_notes_day_35,avg_wc_terms_day_35,avg_wc_address_day_35,invoice_count_day_35,client_count_day_35,is_freshbooks_account_active,is_paying,base_subscription_amount_first_upgrade,upgrade_ever
0,0,0,0,0,0,0,0,0,0,0,...,2.0,6.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0
1,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
2,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
3,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
4,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0


### 2.2 Analyzing variance of each feature

In [8]:
# Per-column variance, ordered from smallest to largest, as a one-column DataFrame
feature_variances = df_nosystemid.var()
df_var = pd.DataFrame(feature_variances.sort_values())

In [11]:
# Persist the sorted per-feature variances as a tab-separated file
features_var_path = "/Users/dwahid/Documents/GitHub/fraud_detection/data_v1/features_var_day_35.tsv"
df_var.to_csv(features_var_path, sep="\t")

In [12]:
# df_var is sorted ascending, so the tail shows the highest-variance features
df_var.tail()

Unnamed: 0,0
client_count_day_35,2238.070845
createjournalentry,4494.521338
createbanktransaction,11120.110054
createexpense,21051.713338
api,21711.104339


### 2.3 Feature column normalization

In [13]:
# Min-max scale every column: (value - column min) / (column max - column min).
# A constant column has a zero range, so its scaled values come out as NaN.
column_min = df_nosystemid.min()
column_range = df_nosystemid.max() - column_min
normed_df = (df_nosystemid - column_min) / column_range

In [14]:
# Replace NaN values in the normalized table with 0.
# Rebinding instead of inplace=True keeps the step an explicit assignment.
normed_df = normed_df.fillna(0)

### 2.4 Determining the number of components (K) with BIC

In [None]:
# NOTE: disabled BIC sweep over four covariance types and 1-14 components,
# followed by a bar plot of the BIC scores.
# Before re-enabling: `itertools` is used below but is not imported in the
# notebook's import cell, and `np.infty` is an alias of `np.inf` that was
# removed in NumPy 2.0.
# lowest_bic = np.infty
# bic = []
# n_components_range = range(1, 15)
# cv_types = ['spherical', 'tied', 'diag', 'full']
# for cv_type in cv_types:
#     for n_components in n_components_range:
#         # Fit a Gaussian mixture with EM
#         gmm = mixture.GaussianMixture(n_components=n_components,
#                                       covariance_type=cv_type)
#         gmm.fit(normed_df)
#         bic.append(gmm.bic(normed_df))
#         if bic[-1] < lowest_bic:
#             lowest_bic = bic[-1]
#             best_gmm = gmm

# bic = np.array(bic)
# color_iter = itertools.cycle(['navy', 'turquoise', 'cornflowerblue',
#                               'darkorange'])
# clf = best_gmm
# bars = []

# # Plot the BIC scores
# plt.figure(figsize=(8, 6))
# spl = plt.subplot(2, 1, 1)
# for i, (cv_type, color) in enumerate(zip(cv_types, color_iter)):
#     xpos = np.array(n_components_range) + .2 * (i - 2)
#     bars.append(plt.bar(xpos, bic[i * len(n_components_range):
#                                   (i + 1) * len(n_components_range)],
#                         width=.2, color=color))
# plt.xticks(n_components_range)
# plt.ylim([bic.min() * 1.01 - .01 * bic.max(), bic.max()])
# plt.title('BIC score per model')
# xpos = np.mod(bic.argmin(), len(n_components_range)) + .65 +\
#     .2 * np.floor(bic.argmin() / len(n_components_range))
# plt.text(xpos, bic.min() * 0.97 + .03 * bic.max(), '*', fontsize=14)
# spl.set_xlabel('Number of components')
# spl.legend([b[0] for b in bars], cv_types)

### 2.5 Fitting GMM clustering

In [15]:
from sklearn.mixture import GaussianMixture

# Fit a 7-component Gaussian mixture on the normalized features.
# random_state pins the initialization so the fitted components, and therefore
# the cluster ids assigned to accounts, are reproducible across kernel restarts.
gmm = GaussianMixture(n_components=7, random_state=42)
cluster_fit = gmm.fit(normed_df)

In [16]:
# Assign each row of the normalized table to a mixture component
cluster_id = gmm.predict(normed_df)

In [17]:
# Show the predicted component label per row
cluster_id

array([5, 4, 1, ..., 0, 4, 1])

In [18]:
# Adding the cluster labels as a column.
# NOTE(review): this mutates `df` in place, so re-running the earlier
# `df.drop('systemid', axis=1)` cell afterwards would carry 'cluster_id'
# into the feature table.
df['cluster_id'] = cluster_id

In [19]:
# Preview the table with the new 'cluster_id' column
df.head()

Unnamed: 0,systemid,acceptestimate,accesstokencreated,activateclient,activateestimate,activateexpense,activateinvoice,activateotherincome,activatepayment,activateproject,...,avg_wc_notes_day_35,avg_wc_terms_day_35,avg_wc_address_day_35,invoice_count_day_35,client_count_day_35,is_freshbooks_account_active,is_paying,base_subscription_amount_first_upgrade,upgrade_ever,cluster_id
0,3592461,0,0,0,0,0,0,0,0,0,...,6.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,5
1,3592463,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,4
2,3592465,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1
3,3592467,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1
4,3592469,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1


In [20]:
# Keep just the account key and its assigned cluster label
df_clf = df.loc[:, ['systemid', 'cluster_id']]

In [None]:
# Inspect the last rows of the systemid / cluster_id table
df_clf.tail()

In [None]:
# Separating 'systemid' according to 'cluster_id': one frame per cluster label 0-6
# (the mixture is fitted with 7 components).
# Cluster 6 gets its own frame; previously it was assigned to df_c5, which
# discarded cluster 5 and mislabeled cluster 6.
df_c0 = df_clf[df_clf.cluster_id == 0]
df_c1 = df_clf[df_clf.cluster_id == 1]
df_c2 = df_clf[df_clf.cluster_id == 2]
df_c3 = df_clf[df_clf.cluster_id == 3]
df_c4 = df_clf[df_clf.cluster_id == 4]
df_c5 = df_clf[df_clf.cluster_id == 5]
df_c6 = df_clf[df_clf.cluster_id == 6]

In [None]:
# Export each cluster's accounts (systemid + cluster_id) to its own TSV file.
# Rows are selected straight from df_clf so every fitted component gets a file,
# and the file names carry day_35 to match this notebook's data instead of the
# day_7 suffix, which would overwrite the day-7 exports.
clusters_dir = "/Users/dwahid/Documents/GitHub/fraud_detection/data_clusters"
for cluster_label in range(gmm.n_components):
    cluster_accounts = df_clf[df_clf.cluster_id == cluster_label]
    cluster_accounts.to_csv(
        "{}/cluster_{:02d}_accounts_gmm_day_35.tsv".format(clusters_dir, cluster_label),
        sep="\t")


## 3. Analyzing identified clusters 

In [None]:
# Accounts list (systemid) in each cluster, for cluster labels 0-6.
# Each list is selected directly from df_clf by cluster label, so list N always
# holds cluster N and cluster 6 is included.
c0_systemid_list = list(df_clf.loc[df_clf.cluster_id == 0, 'systemid'])
c1_systemid_list = list(df_clf.loc[df_clf.cluster_id == 1, 'systemid'])
c2_systemid_list = list(df_clf.loc[df_clf.cluster_id == 2, 'systemid'])
c3_systemid_list = list(df_clf.loc[df_clf.cluster_id == 3, 'systemid'])
c4_systemid_list = list(df_clf.loc[df_clf.cluster_id == 4, 'systemid'])
c5_systemid_list = list(df_clf.loc[df_clf.cluster_id == 5, 'systemid'])
c6_systemid_list = list(df_clf.loc[df_clf.cluster_id == 6, 'systemid'])


In [None]:
# Checking (disabled): length of each per-cluster systemid list
# len(c0_systemid_list)
# len(c1_systemid_list)
# len(c2_systemid_list)
# len(c3_systemid_list)
# len(c4_systemid_list)
# len(c5_systemid_list)


In [None]:
# Store cluster sizes for every fitted component.
# Sizes are counted from df_clf by cluster label, so all 7 clusters are listed
# and each row reports the size of the cluster it names; a component with no
# assigned accounts shows 0.
cluster_row_counts = (
    df_clf['cluster_id']
    .value_counts()
    .reindex(range(gmm.n_components), fill_value=0)
)
cls_sizes = {'cluster_id': ['cluster {:02d}'.format(label) for label in cluster_row_counts.index],
             'size': cluster_row_counts.tolist()
            }
df_cls_sizes = pd.DataFrame(cls_sizes, columns=['cluster_id', 'size'])

In [None]:
# Display the cluster size table
df_cls_sizes

In [None]:
# Load the fraud accounts list (tab-separated; the file name covers 20180801 to 20190730)
fraud_accounts_path = "/Users/dwahid/Documents/GitHub/fraud_detection/data_final/fraud_accounts_20180801_to_20190730.tsv"
fraud_accounts_12months = pd.read_csv(fraud_accounts_path, sep="\t")

In [None]:
########################## Checking the number of fraud accounts exist in each Cluster ##########################

fraud_accounts_12months_list = list(fraud_accounts_12months['systemid'])

# systemid -> cluster_id lookup built from df_clf. If a systemid appeared more
# than once, the lowest cluster id wins, matching the precedence of the former
# if/elif chain. An indexed lookup replaces scanning Python lists once per
# fraud account.
systemid_to_cluster = (
    df_clf.sort_values('cluster_id')
    .drop_duplicates('systemid')
    .set_index('systemid')['cluster_id']
)

# Cluster label for every fraud account; NaN when the account is in no cluster
fraud_cluster_labels = pd.Series(fraud_accounts_12months_list).map(systemid_to_cluster)

# Fraud accounts per cluster label 0-6 (0 for clusters without any fraud account)
fraud_counts_by_cluster = (
    fraud_cluster_labels.dropna()
    .astype(int)
    .value_counts()
    .reindex(range(7), fill_value=0)
)

# The number of fraud accounts in each cluster (cluster 6 now included)
fraud_accounts_num_c0 = int(fraud_counts_by_cluster.loc[0])
fraud_accounts_num_c1 = int(fraud_counts_by_cluster.loc[1])
fraud_accounts_num_c2 = int(fraud_counts_by_cluster.loc[2])
fraud_accounts_num_c3 = int(fraud_counts_by_cluster.loc[3])
fraud_accounts_num_c4 = int(fraud_counts_by_cluster.loc[4])
fraud_accounts_num_c5 = int(fraud_counts_by_cluster.loc[5])
fraud_accounts_num_c6 = int(fraud_counts_by_cluster.loc[6])

# Report unmatched fraud accounts once instead of one 'NOT' line per account
fraud_accounts_num_unmatched = int(fraud_cluster_labels.isna().sum())
if fraud_accounts_num_unmatched:
    print('NOT: {} fraud accounts not found in any cluster'.format(fraud_accounts_num_unmatched))

In [None]:
# Store the number of fraud accounts found in each cluster, for every fitted component.
# Counts are computed here from fraud_accounts_12months and df_clf so the table
# covers all 7 clusters, and each row reports the cluster it names.
account_cluster_lookup = (
    df_clf.sort_values('cluster_id')
    .drop_duplicates('systemid')
    .set_index('systemid')['cluster_id']
)
fraud_per_cluster = (
    fraud_accounts_12months['systemid']
    .map(account_cluster_lookup)
    .dropna()  # fraud accounts that are not in any cluster
    .astype(int)
    .value_counts()
    .reindex(range(gmm.n_components), fill_value=0)
)
fraud_accounts_num = {'cluster_id': ['cluster {:02d}'.format(label) for label in fraud_per_cluster.index],
             '#fraud_accounts': fraud_per_cluster.tolist()
            }
df_cls_fraud_accounts_num = pd.DataFrame(fraud_accounts_num, columns=['cluster_id', '#fraud_accounts'])
        

In [None]:
# Display the per-cluster fraud account counts
df_cls_fraud_accounts_num