# Fraud Detection Day 14 Model: Gaussian Mixture Model (GMM)

In [2]:
import pandas as pd
import numpy as np

import seaborn as sns
sns.set(color_codes=True)

import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline

from scipy import stats
# Turn on greedy tab-completion for this kernel.
# run_line_magic() is the supported replacement for the deprecated InteractiveShell.magic().
get_ipython().run_line_magic('config', 'IPCompleter.greedy=True')

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn import mixture
import pickle

## 1. Import and Filter Features Data for Day 14

### 1.1 Import day 14 final features data


In [3]:
# Load the day-14 final features table (tab-separated) into a DataFrame
features_path = "/Users/dwahid/Documents/GitHub/fraud_detection/data/training_data/final_features_day_14_no_fbtest_onlyimp_noinactive.tsv"
df = pd.read_csv(features_path, sep="\t")

In [1]:
# checking dataframe
# df.tail()
# df.shape
# list(df)


In [4]:
# Clean the column names: remove every '-' and '/' character from each name
_chars_to_strip = str.maketrans('', '', '-/')
df.columns = [name.translate(_chars_to_strip) for name in df.columns]

## 2. Cleaning Data for GMM Clustering
### 2.1 Drop unused feature columns

In [5]:
# Drop the feature columns that are excluded from the clustering input.
# NOTE(review): the previous comment mentioned column variances, so these were presumably
# selected for having (near-)zero variance — confirm the selection criterion.
columns_to_drop = [
    'activateestimate',
    'activateproject',
    'activaterecurringprofile',
    'activatestaff',
    'banktransferdisabled',
    'banktransferenabled',
    'bulkimportitemsandservicescomplete',
    'creditcardsystemaccessrevoked',
    'deletetimeentry',
    'deleterecurringexpense',
    'deleteretainerprofile',
    'deletebusinessaccountant',
    'deletesystemgateway',
    'exportjournalentries',
    'generateinvoicefromrecurringprofile',
    'fbpayuserconnectedbank',
]

# `columns=` already selects the column axis, so the redundant `axis=1` is omitted.
# A missing column still raises KeyError, exactly as before.
df = df.drop(columns=columns_to_drop)

In [None]:
# Checking
# df.head()
# df.shape
# list(df)

In [6]:
# Snapshot the dataframe as it is now, before the columns are reordered
df_orig = df.copy(deep=True)

# Reorder the columns by name (alphabetically); rows are left untouched
df = df.sort_index(axis='columns')

### 2.3 Feature column normalization (model)

In [7]:
# Split the column names: every column except 'systemid' will be normalized
column_names_to_not_normalize = ['systemid']
column_names_to_normalize = list(
    filter(lambda name: name not in column_names_to_not_normalize, df.columns)
)

In [8]:
# Scaler that maps each feature column onto the [0, 1] range (MinMaxScaler's default range)
min_max_scaler = MinMaxScaler(feature_range=(0, 1))

In [9]:
# Fit the scaler on the feature columns, then overwrite those columns in df with the scaled values
feature_values = df[column_names_to_normalize].values
scaled_values = min_max_scaler.fit_transform(feature_values)
df[column_names_to_normalize] = pd.DataFrame(
    data=scaled_values,
    index=df.index,
    columns=column_names_to_normalize,
)

In [10]:
# Save the fitted min-max scaler to disk so the same scaling can be reloaded later
filename_minmax_scaler = '/Users/dwahid/Documents/GitHub/fraud_detection/data/saved_models/minmax_scaler_gmm_day_14.sav'

# Use a context manager so the file handle is flushed and closed even if pickling fails
with open(filename_minmax_scaler, 'wb') as scaler_file:
    pickle.dump(min_max_scaler, scaler_file)

In [11]:
# Replace every remaining NaN cell with 0 (no rows or columns are removed).
# Reassigning instead of using inplace=True keeps the step an explicit transformation.
df = df.fillna(0)

In [12]:
# Model input: all columns of df except the 'systemid' column
df_nosystemid = df.drop(columns=['systemid'])

In [13]:
# Summary statistics (count, mean, std, min, quartiles, max) of the model input columns
df_nosystemid.describe()

Unnamed: 0,activateexpense,activateotherincome,activatepayment,admindeactivation,adminonlinepaymentattempt,adminpayinvoiceonlineinvoice,adminpayinvoiceonlinelistview,archiveclient,archiveexpense,archiveotherincome,...,updatecategory,updateclient,updatecompanyprofile,updatecontractor,updatecreditnote,updateestimate,updateexpense,updateinvoicesample,updateitem,updateservice
count,438298.0,438298.0,438298.0,438298.0,438298.0,438298.0,438298.0,438298.0,438298.0,438298.0,...,438298.0,438298.0,438298.0,438298.0,438298.0,438298.0,438298.0,438298.0,438298.0,438298.0
mean,2.2e-05,5e-06,2.6e-05,0.000192,4e-06,8e-06,4e-06,8e-06,4.6e-05,8e-06,...,0.00017,0.0004,0.000624,0.000911,1e-05,0.00027,0.000211,0.036708,0.000189,5.6e-05
std,0.002568,0.001594,0.002445,0.009903,0.001647,0.001815,0.001557,0.001771,0.003928,0.001837,...,0.005237,0.003214,0.012867,0.013621,0.002297,0.00402,0.005532,0.089437,0.003944,0.001646
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
##################################### Data Standardization #################################################

# Min-max scaling: (x - min) / (max - min)
# from sklearn.preprocessing import MinMaxScaler
# scaler = MinMaxScaler()

# # Fit data 
# scaler.fit(df_noid)

# # Rescale the data 
# df_noid_rescaled = pd.DataFrame(scaler.transform(df_noid))
# df_noid_rescaled.fillna(0, inplace=True)

# # Save the standarization model
# filename_scaler = '/Users/dwahid/Documents/GitHub/fraud_detection/trained_models/v1_fraud_detection_clustering_day_07_k6_scaler.sav'
# pickle.dump(scaler, open(filename_scaler, 'wb'))

## 3. Fitting GMM Clustering (K=6)

In [14]:
##################################### GMM Training #######################################################

# Fit a 6-component Gaussian mixture to the scaled feature columns.
# random_state is fixed so the fit, and with it the arbitrary numbering of the clusters,
# is repeatable across kernel restarts; without it each run may label the clusters differently.
from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components=6, random_state=42)
gmm.fit(df_nosystemid)


# Assign every row to its most probable mixture component
cluster_predict = gmm.predict(df_nosystemid)

# Add the cluster id of each account to the copy kept in df_orig.
# The array is assigned positionally, so df_orig must have the same row order as df_nosystemid.
df_orig['cluster_id'] = cluster_predict


In [15]:
##################################### Saving the Trained Model ###############################################

# Save the fitted GMM to disk
filename_clustering = '/Users/dwahid/Documents/GitHub/fraud_detection/data/saved_models/fraud_detection_clustering_day_14_k6_model.sav'

# Use a context manager so the file handle is flushed and closed even if pickling fails
with open(filename_clustering, 'wb') as model_file:
    pickle.dump(gmm, model_file)

In [16]:
# Preview the first rows of df_orig, which now carries the 'cluster_id' column
df_orig.head()

Unnamed: 0,systemid,activateexpense,activateotherincome,activatepayment,admindeactivation,adminonlinepaymentattempt,adminpayinvoiceonlineinvoice,adminpayinvoiceonlinelistview,archiveclient,archiveexpense,...,is_sales_managed,is_freshbooks_account_active,is_paying,avg_wc_description_day_14,avg_wc_notes_day_14,avg_wc_terms_day_14,avg_wc_address_day_14,invoice_count_day_14,client_count_day_14,cluster_id
0,3592461,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,2.0,6.0,0.0,0.0,1.0,2.0,1
1,3592463,0,0,0,0,0,0,0,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
2,3592465,0,0,0,0,0,0,0,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3
3,3592467,0,0,0,0,0,0,0,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3
4,3592469,0,0,0,0,0,0,0,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3


In [17]:
# Write df_orig (account data plus its 'cluster_id' label) to a tab-separated file, without the index
clustering_output_path = "/Users/dwahid/Documents/GitHub/fraud_detection/data/model_outputs_gmm/gmm_clutering_outputs_day_14_k6.tsv"
df_orig.to_csv(clustering_output_path, sep="\t", index=False)

## 6. Predict Cluster ID for New Accounts

### 6.1 Required Function

In [None]:
# #### Function: Filtering FB test account by using admin email
# from difflib import SequenceMatcher

# def email_match(em, email_list):
    
#     L = len(email_list)
#     match_score = 0
    
#     for i in range(0, L):
#         if pd.isnull(em):
#             match_score = 0
#             break;
#         else: 
#             match_score =  max(match_score, SequenceMatcher(None,em, email_list[i]).ratio())
#             #print(i, em, email_list[i], match_score)

#     return match_score

# ##### Function for aggregating selected column values

# def cell_value_sum (row, ex_cols_list):
# #     print(row[0])
#     sum = 0
#     cols = range(len(row))
#     new_cols = list(set(cols) - set(ex_cols_list))
#     #print(new_cols)
    
#     for i in new_cols:
#         #print(row[i])
#         sum =+ row[i]
    
#     #print('Final sum: ', sum)
#     return sum

# def check_sales_manage(sm):
#     #print(sm)
    
#     sm_flg = 0
    
#     if pd.isnull(sm):
#         sm_flg = 0
#     else:
#         sm_val = int(sm)
        
#         if sm_val == 1:
#             sm_flg = 1
#         else:
#             sm_flg = 0
#     return sm_flg


### 6.2 Import data for new accounts

In [None]:
# ################################# Predict Cluster For a recent new account #####################################

# # Import data of the day features 
# df_day_7_new = pd.read_csv("/Users/dwahid/Documents/GitHub/fraud_detection/data_new_final/new_final_features_day_7.tsv", sep="\t")

In [None]:
# Disabled like the other cells in this section: df_day_7_new is only created by the
# commented-out import cell above, so this line raises NameError on a fresh kernel.
# df_day_7_new.shape


### 6.3 Get only important features

In [None]:
# # ################################# Filter if Sale is managed #################################################

# # Filtering all accounts if 'is_sales_managed' == 1 (i.e. it already checked by the sales team)
# # df_new_v2 = df_new.apply(lambda x: int(x['is_sales_managed']) < 1)

# df_day_7_new = df_day_7_new[df_day_7_new.apply(lambda x: check_sales_manage(x['is_sales_managed']) < 1, axis=1)]



In [None]:
# df_day_7_new.shape

In [None]:
# ################################ Filtering Only Import Important Features ###########################################

# # New Day 7: Importing the important-features list
# important_features = pd.read_csv(
#     "/Users/dwahid/Documents/GitHub/fraud_detection/data_imp_features_names/model_important_features_day_7.tsv", sep="\n,")

# # New Day 7: Get the important feature as a list
# imp_features_list = list(important_features['important_feature'])

# # Filtering only important features 
# df_day_7_new_imp = df_day_7_new[df_day_7_new.columns.intersection(imp_features_list)]

# # Drop not so important feature columns
# df_day_7_new_noid = df_day_7_new_imp.drop(columns=[
#                                  'activateestimate',
#                                  'activateproject',
#                                  'activaterecurringprofile',
#                                  'activatestaff',
#                                  'banktransferdisabled',
#                                  'banktransferenabled',
#                                  'bulkimportitemsandservicescomplete',
#                                  'creditcardsystemaccessrevoked',
#                                  'deletetimeentry',
#                                  'deleterecurringexpense',
#                                  'deleteretainerprofile',
#                                  'deletebusinessaccountant',
#                                  'deletesystemgateway',
#                                  'exportjournalentries',
#                                  'generateinvoicefromrecurringprofile',
#                                  'fbpayuserconnectedbank'
                                 
#                                 ], axis=1)

In [None]:
# list(df_day_7_new_noid)


### 6.4 Data standardization

In [None]:
# ##################################### Data Standardization #################################################

# # Load standarization parameter from the disk
# filename_scaler = '/Users/dwahid/Documents/GitHub/fraud_detection/trained_models/v1_fraud_detection_clustering_day_07_k6_scaler.sav'
# loaded_scaler = pickle.load(open(filename_scaler, 'rb'))



In [None]:
# # Normalized all features columns except the 'systemid'
# column_names_to_not_normalize = ['systemid']
# column_names_to_normalize = [x for x in list(df_day_7_new_noid) if x not in column_names_to_not_normalize ]


In [None]:
# x2 = df_day_7_new_noid[column_names_to_normalize].values
# x2_scaled = min_max_scaler.fit_transform(x2)
# df_day_7_new_noid_temp = pd.DataFrame(x2_scaled, columns=column_names_to_normalize, index = df_day_7_new_noid.index)
# df_day_7_new_noid[column_names_to_normalize] = df_day_7_new_noid_temp

In [None]:
# df_day_7_new_noid.head()

In [None]:
# # New data standarization
# df_day_7_new_noid_rescaled = pd.DataFrame(loaded_scaler.transform(df_day_7_new_noid))

# # Fill 'NaN' cell with zero
# type(df_day_7_new_noid_rescaled)

In [None]:
# df_day_7_new_noid_rescaled.fillna(0,inplace=True)

In [None]:
# df_day_7_new_noid_rescaled.head()

### 6.5 Predict cluster for new data

In [None]:
# # load the model from disk
# filename = '/Users/dwahid/Documents/GitHub/fraud_detection/trained_models/v1_fraud_detection_clustering_day_07_k6_model.sav'
# loaded_model_gmm_day_07_k6 = pickle.load(open(filename, 'rb'))

# # Predicting clustering
# cluster_id_k6_day_7_new = loaded_model_gmm_day_07_k6.predict(df_day_7_new_noid_rescaled)

# # Adding clusters id of each account to the dataframe
# df_day_7_new_imp['cluster_id_k6'] = cluster_id_k6_day_7_new


In [None]:
# df_day_7_new_imp.head()

In [None]:
# # Separating risky accounts (cluster 01 and cluster 05)

# # Separating 'systemid' according to 'cluster_id'
# df_day_7_new_imp_c0_k6 = df_day_7_new_imp[df_day_7_new_imp.cluster_id_k6 == 0]
# df_day_7_new_imp_c1_k6 = df_day_7_new_imp[df_day_7_new_imp.cluster_id_k6 == 1]
# df_day_7_new_imp_c2_k6 = df_day_7_new_imp[df_day_7_new_imp.cluster_id_k6 == 2]
# df_day_7_new_imp_c3_k6 = df_day_7_new_imp[df_day_7_new_imp.cluster_id_k6 == 3]
# df_day_7_new_imp_c4_k6 = df_day_7_new_imp[df_day_7_new_imp.cluster_id_k6 == 4]
# df_day_7_new_imp_c5_k6 = df_day_7_new_imp[df_day_7_new_imp.cluster_id_k6 == 5]


# # Export Cluster's systemid
# df_day_7_new_imp_c0_k6.to_csv("/Users/dwahid/Documents/GitHub/fraud_detection/data_new_clusters/gmm_day_7_new_cluster_00_k6.tsv", sep="\t")
# df_day_7_new_imp_c1_k6.to_csv("/Users/dwahid/Documents/GitHub/fraud_detection/data_new_clusters/gmm_day_7_new_cluster_01_k6.tsv", sep="\t")
# df_day_7_new_imp_c2_k6.to_csv("/Users/dwahid/Documents/GitHub/fraud_detection/data_new_clusters/gmm_day_7_new_cluster_02_k6.tsv", sep="\t")
# df_day_7_new_imp_c3_k6.to_csv("/Users/dwahid/Documents/GitHub/fraud_detection/data_new_clusters/gmm_day_7_new_cluster_03_k6.tsv", sep="\t")
# df_day_7_new_imp_c4_k6.to_csv("/Users/dwahid/Documents/GitHub/fraud_detection/data_new_clusters/gmm_day_7_new_cluster_04_k6.tsv", sep="\t")


In [None]:
# df_day_7_new_imp_c1_k6.shape