# Fraud Detection Day 7 Model: Gaussian Mixture Model (GMM) - K6

In [38]:
import pandas as pd
import numpy as np

import seaborn as sns
sns.set(color_codes=True)

import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline

from scipy import stats
get_ipython().magic(u'config IPCompleter.greedy=True')

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn import mixture
import pickle

## 1. Import and Filter Features Data for Day 7

### 1.1 Import day 7 final features data


In [51]:
# Import day 7 features data 
df = pd.read_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_final/final_features_day_7_no_fbtest_onlyimp_noinactive.tsv", sep="\t")

In [25]:
# checking dataframe
# df.tail()
# df.shape
# list(df)

In [52]:
# Editing dataframe colume names
df.columns = [
    col.replace('-', '').replace('/', '')
    for col in df.columns
]

## 2. Cleaning Data for GMM Clustering
### 2.1 Drop row with missing 'systemid'

In [53]:
# Finding the features columns varinaces
df = df.drop(columns=[
                                 'activateestimate',
                                 'activateproject',
                                 'activaterecurringprofile',
                                 'activatestaff',
                                 'banktransferdisabled',
                                 'banktransferenabled',
                                 'bulkimportitemsandservicescomplete',
                                 'creditcardsystemaccessrevoked',
                                 'deletetimeentry',
                                 'deleterecurringexpense',
                                 'deleteretainerprofile',
                                 'deletebusinessaccountant',
                                 'deletesystemgateway',
                                 'exportjournalentries',
                                 'generateinvoicefromrecurringprofile',
                                 'fbpayuserconnectedbank' 
                                ], axis=1)

In [54]:
# Checking
df.head()
# df.shape

Unnamed: 0,systemid,activateexpense,activateotherincome,activatepayment,admindeactivation,adminonlinepaymentattempt,adminpayinvoiceonlineinvoice,adminpayinvoiceonlinelistview,archiveclient,archiveexpense,...,updateservice,is_sales_managed,is_freshbooks_account_active,is_paying,avg_wc_description_day_7,avg_wc_notes_day_7,avg_wc_terms_day_7,avg_wc_address_day_7,invoice_count_day_7,client_count_day_7
0,3592461,0,0,0,0,0,0,0,0,0,...,0,0.0,0.0,0.0,2.0,6.0,0.0,0.0,1.0,2.0
1,3592463,0,0,0,0,0,0,0,0,0,...,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,3592465,0,0,0,0,0,0,0,0,0,...,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,3592467,0,0,0,0,0,0,0,0,0,...,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,3592469,0,0,0,0,0,0,0,0,0,...,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [62]:
df_orig = df.copy()

In [63]:
df_orig.head()

Unnamed: 0,systemid,activateexpense,activateotherincome,activatepayment,admindeactivation,adminonlinepaymentattempt,adminpayinvoiceonlineinvoice,adminpayinvoiceonlinelistview,archiveclient,archiveexpense,...,updateservice,is_sales_managed,is_freshbooks_account_active,is_paying,avg_wc_description_day_7,avg_wc_notes_day_7,avg_wc_terms_day_7,avg_wc_address_day_7,invoice_count_day_7,client_count_day_7
0,3592461,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.001113,0.003446,0.0,0.0,0.000151,5.9e-05
1,3592463,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3592465,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3592467,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3592469,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [64]:
# Normalized all features columns except the 'systemid'
column_names_to_not_normalize = ['systemid']
column_names_to_normalize = [x for x in list(df) if x not in column_names_to_not_normalize ]

In [65]:
min_max_scaler = MinMaxScaler()

In [66]:
x = df[column_names_to_normalize].values
x_scaled = min_max_scaler.fit_transform(x)
df_temp = pd.DataFrame(x_scaled, columns=column_names_to_normalize, index = df.index)
df[column_names_to_normalize] = df_temp

In [67]:
# Save the standarization model: min-max scalar
filename_minmax_scaler = '/Users/dwahid/Documents/GitHub/fraud_detection/trained_models/v1_fraud_detection_clustering_day_07_k6_minmax_scaler.sav'
pickle.dump(min_max_scaler, open(filename_minmax_scaler, 'wb'))

In [68]:
df.fillna(0, inplace=True)

In [93]:
min_max_scaler

MinMaxScaler(copy=True, feature_range=(0, 1))

In [72]:
# Drop the 'systemid'
df_nosystemid = df.drop(['systemid'], axis=1)

In [73]:
df_nosystemid.head()

Unnamed: 0,activateexpense,activateotherincome,activatepayment,admindeactivation,adminonlinepaymentattempt,adminpayinvoiceonlineinvoice,adminpayinvoiceonlinelistview,archiveclient,archiveexpense,archiveotherincome,...,updateservice,is_sales_managed,is_freshbooks_account_active,is_paying,avg_wc_description_day_7,avg_wc_notes_day_7,avg_wc_terms_day_7,avg_wc_address_day_7,invoice_count_day_7,client_count_day_7
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.001113,0.003446,0.0,0.0,0.000151,5.9e-05
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 2.3 Feature column normalization (model)

In [14]:
##################################### Data Standarizationå #################################################

# Standarization with min-max: (x-mean)/(max-mean)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

# Fit data 
scaler.fit(df_noid)

# Rescale the data 
df_noid_rescaled = pd.DataFrame(scaler.transform(df_noid))
df_noid_rescaled.fillna(0, inplace=True)

# Save the standarization model
filename_scaler = '/Users/dwahid/Documents/GitHub/fraud_detection/trained_models/v1_fraud_detection_clustering_day_07_k6_scaler.sav'
pickle.dump(scaler, open(filename_scaler, 'wb'))

  return self.partial_fit(X, y)


In [15]:
# Checking
# dfv = pd.DataFrame(df_noid_rescaled)

In [16]:
# dfv.shape

## 3. Fitting GMM Clustering (K=6)

In [76]:


# GMM fitting to the data 
from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components=6)
gmm.fit(df_nosystemid)


# Predicting clustering
cluster_predict = gmm.predict(df_nosystemid)

# Adding clusters id of each account to the dataframe
df_orig['cluster_id'] = cluster_predict

# Separating 'systemid' according to 'cluster_id'
df_c0_k6 = df_orig[df_orig.cluster_id == 0]
df_c1_k6 = df_orig[df_orig.cluster_id == 1]
df_c2_k6 = df_orig[df_orig.cluster_id == 2]
df_c3_k6 = df_orig[df_orig.cluster_id == 3]
df_c4_k6 = df_orig[df_orig.cluster_id == 4]
df_c5_k6 = df_orig[df_orig.cluster_id == 5]


# Export Cluster's systemid
df_orig.to_csv("/Users/dwahid/Documents/GitHub/fraud_detection/data_clusters/gmm_day_7_k6.tsv", sep="\t")
df_c0_k6.to_csv("/Users/dwahid/Documents/GitHub/fraud_detection/data_clusters/gmm_day_7_cluster_00_k6.tsv", sep="\t")
df_c1_k6.to_csv("/Users/dwahid/Documents/GitHub/fraud_detection/data_clusters/gmm_day_7_cluster_01_k6.tsv", sep="\t")
df_c2_k6.to_csv("/Users/dwahid/Documents/GitHub/fraud_detection/data_clusters/gmm_day_7_cluster_02_k6.tsv", sep="\t")
df_c3_k6.to_csv("/Users/dwahid/Documents/GitHub/fraud_detection/data_clusters/gmm_day_7_cluster_03_k6.tsv", sep="\t")
df_c4_k6.to_csv("/Users/dwahid/Documents/GitHub/fraud_detection/data_clusters/gmm_day_7_cluster_04_k6.tsv", sep="\t")
df_c5_k6.to_csv("/Users/dwahid/Documents/GitHub/fraud_detection/data_clusters/gmm_day_7_cluster_05_k6.tsv", sep="\t")


In [82]:
##################################### Saving the Train Model #################################################

# save the model to disk
filename = '/Users/dwahid/Documents/GitHub/fraud_detection/trained_models/v1_fraud_detection_clustering_day_07_k6_model.sav'
pickle.dump(gmm, open(filename, 'wb'))

array([5, 0, 2, ..., 0, 0, 2])

## 6. Predict Cluster ID for New Accounts

### 6.1 Required Function

In [87]:
#### Function: Filtering FB test account by using admin email
from difflib import SequenceMatcher

def email_match(em, email_list):
    
    L = len(email_list)
    match_score = 0
    
    for i in range(0, L):
        if pd.isnull(em):
            match_score = 0
            break;
        else: 
            match_score =  max(match_score, SequenceMatcher(None,em, email_list[i]).ratio())
            #print(i, em, email_list[i], match_score)

    return match_score

##### Function for aggregating selected column values

def cell_value_sum (row, ex_cols_list):
#     print(row[0])
    sum = 0
    cols = range(len(row))
    new_cols = list(set(cols) - set(ex_cols_list))
    #print(new_cols)
    
    for i in new_cols:
        #print(row[i])
        sum =+ row[i]
    
    #print('Final sum: ', sum)
    return sum

def check_sales_manage(sm):
    #print(sm)
    
    sm_flg = 0
    
    if pd.isnull(sm):
        sm_flg = 0
    else:
        sm_val = int(sm)
        
        if sm_val == 1:
            sm_flg = 1
        else:
            sm_flg = 0
    return sm_flg


### 6.1 Import data for new accounts

In [84]:
################################# Predict Cluster For a recent new account #####################################

# Import data of the day features 
df_day_7_new = pd.read_csv("/Users/dwahid/Documents/GitHub/fraud_detection/data_new_final/new_final_features_day_7.tsv", sep="\t")

In [85]:
df_day_7_new.shape


(114906, 242)

### 6.3 Get only important features

In [88]:
# ################################# Filter if Sale is managed #################################################

# Filtering all accounts if 'is_sales_managed' == 1 (i.e. it already checked by the sales team)
# df_new_v2 = df_new.apply(lambda x: int(x['is_sales_managed']) < 1)

df_day_7_new = df_day_7_new[df_day_7_new.apply(lambda x: check_sales_manage(x['is_sales_managed']) < 1, axis=1)]



In [90]:
df_day_7_new.shape

(114827, 242)

In [94]:
################################ Filtering Only Import Important Features ###########################################

# New Day 7: Importing importing features list
important_features = pd.read_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_imp_features_names/model_important_features_day_7.tsv", sep="\n,")

# New Day 7: Get the important feature as a list
imp_features_list = list(important_features['important_feature'])

# Filtering only important features 
df_day_7_new_imp = df_day_7_new[df_day_7_new.columns.intersection(imp_features_list)]

# Drop not so important feature columns
df_day_7_new_noid = df_day_7_new_imp.drop(columns=[
                                 'activateestimate',
                                 'activateproject',
                                 'activaterecurringprofile',
                                 'activatestaff',
                                 'banktransferdisabled',
                                 'banktransferenabled',
                                 'bulkimportitemsandservicescomplete',
                                 'creditcardsystemaccessrevoked',
                                 'deletetimeentry',
                                 'deleterecurringexpense',
                                 'deleteretainerprofile',
                                 'deletebusinessaccountant',
                                 'deletesystemgateway',
                                 'exportjournalentries',
                                 'generateinvoicefromrecurringprofile',
                                 'fbpayuserconnectedbank'
                                 
                                ], axis=1)

  """


In [104]:
list(df_day_7_new_noid)


['systemid',
 'activateexpense',
 'activateotherincome',
 'activatepayment',
 'adminde-activation',
 'adminonlinepaymentattempt',
 'adminpayinvoiceonline-invoice',
 'adminpayinvoiceonline-listview',
 'archiveclient',
 'archiveexpense',
 'archiveotherincome',
 'archiveproject',
 'archivetask',
 'autobillpayment',
 'bulkimportclientscomplete',
 'clientlimitupgradenudge',
 'createbankaccount',
 'createbanktransaction',
 'createcategory',
 'createcontact',
 'createcontractor',
 'createcreditnote',
 'createdexpense',
 'createestimate',
 'createexpense',
 'createitem',
 'createotherincome',
 'createreceipt',
 'createservice',
 'creditcardclientaccessgranted',
 'customemailsignature',
 'declinedonlinepaymentnotification',
 'deletebusinesspartner',
 'deletecollaborator',
 'deletecreditnote',
 'deleteestimate',
 'deleteexpense',
 'deletehours',
 'deleteinvoice',
 'deleteitem',
 'deleteotherincome',
 'deleteproject',
 'deletestaff',
 'deleteuser',
 'disconnectbankaccount',
 'disconnectpaymentgat

### 6.4 Data standarization

In [99]:
##################################### Data Standarizationå #################################################

# Load standarization parameter from the disk
filename_scaler = '/Users/dwahid/Documents/GitHub/fraud_detection/trained_models/v1_fraud_detection_clustering_day_07_k6_scaler.sav'
loaded_scaler = pickle.load(open(filename_scaler, 'rb'))



In [100]:
# Normalized all features columns except the 'systemid'
column_names_to_not_normalize = ['systemid']
column_names_to_normalize = [x for x in list(df_day_7_new_noid) if x not in column_names_to_not_normalize ]


In [101]:
x2 = df_day_7_new_noid[column_names_to_normalize].values
x2_scaled = min_max_scaler.fit_transform(x2)
df_day_7_new_noid_temp = pd.DataFrame(x2_scaled, columns=column_names_to_normalize, index = df_day_7_new_noid.index)
df_day_7_new_noid[column_names_to_normalize] = df_day_7_new_noid_temp

In [102]:
df_day_7_new_noid.head()

Unnamed: 0,systemid,activateexpense,activateotherincome,activatepayment,adminde-activation,adminonlinepaymentattempt,adminpayinvoiceonline-invoice,adminpayinvoiceonline-listview,archiveclient,archiveexpense,...,emailcreditnote,is_sales_managed,is_freshbooks_account_active,is_paying,avg_wc_description_day_7,avg_wc_notes_day_7,avg_wc_terms_day_7,avg_wc_address_day_7,invoice_count_day_7,client_count_day_7
0,4504870,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4504872,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.00198,0.0,0.0,0.0,0.004082,5.1e-05
2,4504874,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4504876,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4504878,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
# New data standarization
df_day_7_new_noid_rescaled = pd.DataFrame(loaded_scaler.transform(df_day_7_new_noid))

# Fill 'NaN' cell with zero
type(df_day_7_new_noid_rescaled)

pandas.core.frame.DataFrame

In [31]:
df_day_7_new_noid_rescaled.fillna(0,inplace=True)

In [32]:
# df_day_7_new_noid_rescaled.head()

### 6.5 Predict cluster for new data

In [33]:
# load the model from disk
filename = '/Users/dwahid/Documents/GitHub/fraud_detection/trained_models/v1_fraud_detection_clustering_day_07_k6_model.sav'
loaded_model_gmm_day_07_k6 = pickle.load(open(filename, 'rb'))

# Predicting clustering
cluster_id_k6_day_7_new = loaded_model_gmm_day_07_k6.predict(df_day_7_new_noid_rescaled)

# Adding clusters id of each account to the dataframe
df_day_7_new_imp['cluster_id_k6'] = cluster_id_k6_day_7_new


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [34]:
df_day_7_new_imp.head()

Unnamed: 0,systemid,activateestimate,activateexpense,activateotherincome,activatepayment,activateproject,activaterecurringprofile,adminde-activation,adminonlinepaymentattempt,adminpayinvoiceonline-invoice,...,is_sales_managed,is_freshbooks_account_active,is_paying,avg_wc_description_day_7,avg_wc_notes_day_7,avg_wc_terms_day_7,avg_wc_address_day_7,invoice_count_day_7,client_count_day_7,cluster_id_k6
0,4504870,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2
1,4504872,0,0,0,0,0,0,0,0,0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,2.0,3
2,4504874,0,0,0,0,0,0,0,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3
3,4504876,0,0,0,0,0,0,0,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3
4,4504878,0,0,0,0,0,0,0,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3


In [35]:
# Separating risky accounts (cluster 01 and cluster 05)

# Separating 'systemid' according to 'cluster_id'
df_day_7_new_imp_c0_k6 = df_day_7_new_imp[df_day_7_new_imp.cluster_id_k6 == 0]
df_day_7_new_imp_c1_k6 = df_day_7_new_imp[df_day_7_new_imp.cluster_id_k6 == 1]
df_day_7_new_imp_c2_k6 = df_day_7_new_imp[df_day_7_new_imp.cluster_id_k6 == 2]
df_day_7_new_imp_c3_k6 = df_day_7_new_imp[df_day_7_new_imp.cluster_id_k6 == 3]
df_day_7_new_imp_c4_k6 = df_day_7_new_imp[df_day_7_new_imp.cluster_id_k6 == 4]
df_day_7_new_imp_c5_k6 = df_day_7_new_imp[df_day_7_new_imp.cluster_id_k6 == 5]


# Export Cluster's systemid
df_day_7_new_imp_c0_k6.to_csv("/Users/dwahid/Documents/GitHub/fraud_detection/data_new_clusters/gmm_day_7_new_cluster_00_k6.tsv", sep="\t")
df_day_7_new_imp_c1_k6.to_csv("/Users/dwahid/Documents/GitHub/fraud_detection/data_new_clusters/gmm_day_7_new_cluster_01_k6.tsv", sep="\t")
df_day_7_new_imp_c2_k6.to_csv("/Users/dwahid/Documents/GitHub/fraud_detection/data_new_clusters/gmm_day_7_new_cluster_02_k6.tsv", sep="\t")
df_day_7_new_imp_c3_k6.to_csv("/Users/dwahid/Documents/GitHub/fraud_detection/data_new_clusters/gmm_day_7_new_cluster_03_k6.tsv", sep="\t")
df_day_7_new_imp_c4_k6.to_csv("/Users/dwahid/Documents/GitHub/fraud_detection/data_new_clusters/gmm_day_7_new_cluster_04_k6.tsv", sep="\t")


In [36]:
df_day_7_new_imp_c1_k6.shape

(5185, 106)