# Fraud Detection Day 7 Model: Gaussian Mixture Model (GMM) - K6

In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
sns.set(color_codes=True)

import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline

from scipy import stats
get_ipython().magic(u'config IPCompleter.greedy=True')

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn import mixture
import pickle

## 1. Import and Filter Features Data for Day 7

### 1.1 Import day 7 final features data


In [2]:
# Import day 7 features data 
df = pd.read_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data/training_data/final_features_day_7_no_fbtest_onlyimp_noinactive.tsv", sep="\t")

In [3]:
# checking dataframe
# df.tail()
# df.shape
# list(df)


In [4]:
# Editing dataframe colume names
df.columns = [
    col.replace('-', '').replace('/', '')
    for col in df.columns
]

## 2. Cleaning Data for GMM Clustering
### 2.1 Drop row with missing 'systemid'

In [5]:
# Finding the features columns varinaces
df = df.drop(columns=[
                                 'activateestimate',
                                 'activateproject',
                                 'activaterecurringprofile',
                                 'activatestaff',
                                 'banktransferdisabled',
                                 'banktransferenabled',
                                 'bulkimportitemsandservicescomplete',
                                 'creditcardsystemaccessrevoked',
                                 'deletetimeentry',
                                 'deleterecurringexpense',
                                 'deleteretainerprofile',
                                 'deletebusinessaccountant',
                                 'deletesystemgateway',
                                 'exportjournalentries',
                                 'generateinvoicefromrecurringprofile',
                                 'fbpayuserconnectedbank' 
                                ], axis=1)

In [6]:
df.shape

(438297, 89)

In [7]:
df.describe()

Unnamed: 0,systemid,activateexpense,activateotherincome,activatepayment,admindeactivation,adminonlinepaymentattempt,adminpayinvoiceonlineinvoice,adminpayinvoiceonlinelistview,archiveclient,archiveexpense,...,updateservice,is_sales_managed,is_freshbooks_account_active,is_paying,avg_wc_description_day_7,avg_wc_notes_day_7,avg_wc_terms_day_7,avg_wc_address_day_7,invoice_count_day_7,client_count_day_7
count,438297.0,438297.0,438297.0,438297.0,438297.0,438297.0,438297.0,438297.0,438297.0,438297.0,...,438297.0,438297.0,438297.0,438297.0,438297.0,438297.0,438297.0,438297.0,438297.0,438297.0
mean,4047850.0,0.015706,0.000128,0.000559,0.000349,0.000219,0.000201,7.3e-05,0.007942,0.033404,...,0.068527,0.000739,0.959258,0.035558,1.374157,1.725432,1.713881,2.5e-05,0.738522,2.271715
std,262713.5,1.962485,0.026595,0.060209,0.018923,0.089463,0.030955,0.026851,1.799962,3.094477,...,2.203347,0.027179,0.197692,0.185186,7.838523,9.248244,31.488272,0.007601,13.279213,46.523454
min,3592461.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,3820277.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,4048082.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,4275416.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0
max,4502194.0,790.0,11.0,26.0,2.0,56.0,12.0,17.0,1017.0,861.0,...,1362.0,1.0,1.0,1.0,1797.0,1741.2,7123.0,3.0,6607.0,16972.0


In [8]:
# Checking
# df.head()
# df.shape
# list(df)

In [9]:
df_orig = df.copy()

In [10]:
df_orig.head()

Unnamed: 0,systemid,activateexpense,activateotherincome,activatepayment,admindeactivation,adminonlinepaymentattempt,adminpayinvoiceonlineinvoice,adminpayinvoiceonlinelistview,archiveclient,archiveexpense,...,updateservice,is_sales_managed,is_freshbooks_account_active,is_paying,avg_wc_description_day_7,avg_wc_notes_day_7,avg_wc_terms_day_7,avg_wc_address_day_7,invoice_count_day_7,client_count_day_7
0,3592461,0,0,0,0,0,0,0,0,0,...,0,0.0,0.0,0.0,2.0,6.0,0.0,0.0,1.0,2.0
1,3592463,0,0,0,0,0,0,0,0,0,...,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,3592465,0,0,0,0,0,0,0,0,0,...,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,3592467,0,0,0,0,0,0,0,0,0,...,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,3592469,0,0,0,0,0,0,0,0,0,...,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [11]:
# Rearranging Columns (alphabatically)
df = df.sort_index(axis=1)

In [12]:
df.head()

Unnamed: 0,activateexpense,activateotherincome,activatepayment,admindeactivation,adminonlinepaymentattempt,adminpayinvoiceonlineinvoice,adminpayinvoiceonlinelistview,archiveclient,archiveexpense,archiveotherincome,...,updatecategory,updateclient,updatecompanyprofile,updatecontractor,updatecreditnote,updateestimate,updateexpense,updateinvoicesample,updateitem,updateservice
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
df.shape

(438297, 89)

In [14]:
df.describe()

Unnamed: 0,activateexpense,activateotherincome,activatepayment,admindeactivation,adminonlinepaymentattempt,adminpayinvoiceonlineinvoice,adminpayinvoiceonlinelistview,archiveclient,archiveexpense,archiveotherincome,...,updatecategory,updateclient,updatecompanyprofile,updatecontractor,updatecreditnote,updateestimate,updateexpense,updateinvoicesample,updateitem,updateservice
count,438297.0,438297.0,438297.0,438297.0,438297.0,438297.0,438297.0,438297.0,438297.0,438297.0,...,438297.0,438297.0,438297.0,438297.0,438297.0,438297.0,438297.0,438297.0,438297.0,438297.0
mean,0.015706,0.000128,0.000559,0.000349,0.000219,0.000201,7.3e-05,0.007942,0.033404,0.000315,...,0.124413,0.571101,0.005864,0.064132,6.4e-05,0.135981,0.635081,0.14617,0.095661,0.068527
std,1.962485,0.026595,0.060209,0.018923,0.089463,0.030955,0.026851,1.799962,3.094477,0.084452,...,3.828618,4.628231,0.12396,0.914618,0.017485,1.832369,17.560821,0.357074,1.971122,2.203347
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,790.0,11.0,26.0,2.0,56.0,12.0,17.0,1017.0,861.0,48.0,...,704.0,1755.0,10.0,85.0,8.0,368.0,3523.0,4.0,530.0,1362.0


### 2.3 Feature column normalization (model)

In [15]:
# Normalized all features columns except the 'systemid'
column_names_to_not_normalize = ['systemid']
column_names_to_normalize = [x for x in list(df) if x not in column_names_to_not_normalize ]

In [16]:
min_max_scaler = MinMaxScaler()

In [19]:
x = df[column_names_to_normalize].values
x_scaled = min_max_scaler.fit_transform(x)
df_temp = pd.DataFrame(x_scaled, columns=column_names_to_normalize, index = df.index)
df[column_names_to_normalize] = df_temp

In [20]:
# Save the standarization model: min-max scalar
filename_minmax_scaler = '/Users/dwahid/Documents/GitHub/fraud_detection/data/saved_models/minmax_scaler_gmm_day_07_k6.sav'
pickle.dump(min_max_scaler, open(filename_minmax_scaler, 'wb'))

In [21]:
df.fillna(0, inplace=True)

In [22]:
# df['activatepayment']

In [23]:
# min_max_scaler

In [24]:
# Drop the 'systemid'
df_nosystemid = df.drop(['systemid'], axis=1)

In [25]:
df_nosystemid.head()

Unnamed: 0,activateexpense,activateotherincome,activatepayment,admindeactivation,adminonlinepaymentattempt,adminpayinvoiceonlineinvoice,adminpayinvoiceonlinelistview,archiveclient,archiveexpense,archiveotherincome,...,updatecategory,updateclient,updatecompanyprofile,updatecontractor,updatecreditnote,updateestimate,updateexpense,updateinvoicesample,updateitem,updateservice
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
df_nosystemid.describe()

Unnamed: 0,activateexpense,activateotherincome,activatepayment,admindeactivation,adminonlinepaymentattempt,adminpayinvoiceonlineinvoice,adminpayinvoiceonlinelistview,archiveclient,archiveexpense,archiveotherincome,...,updatecategory,updateclient,updatecompanyprofile,updatecontractor,updatecreditnote,updateestimate,updateexpense,updateinvoicesample,updateitem,updateservice
count,438297.0,438297.0,438297.0,438297.0,438297.0,438297.0,438297.0,438297.0,438297.0,438297.0,...,438297.0,438297.0,438297.0,438297.0,438297.0,438297.0,438297.0,438297.0,438297.0,438297.0
mean,2e-05,1.2e-05,2.1e-05,0.000175,4e-06,1.7e-05,4e-06,8e-06,3.9e-05,7e-06,...,0.000177,0.000325,0.000586,0.000754,8e-06,0.00037,0.00018,0.036543,0.00018,5e-05
std,0.002484,0.002418,0.002316,0.009462,0.001598,0.00258,0.001579,0.00177,0.003594,0.001759,...,0.005438,0.002637,0.012396,0.01076,0.002186,0.004979,0.004985,0.089268,0.003719,0.001618
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [14]:
##################################### Data Standarizationå #################################################

# Standarization with min-max: (x-mean)/(max-mean)
# from sklearn.preprocessing import MinMaxScaler
# scaler = MinMaxScaler()

# # Fit data 
# scaler.fit(df_noid)

# # Rescale the data 
# df_noid_rescaled = pd.DataFrame(scaler.transform(df_noid))
# df_noid_rescaled.fillna(0, inplace=True)

# # Save the standarization model
# filename_scaler = '/Users/dwahid/Documents/GitHub/fraud_detection/trained_models/v1_fraud_detection_clustering_day_07_k6_scaler.sav'
# pickle.dump(scaler, open(filename_scaler, 'wb'))

  return self.partial_fit(X, y)


In [15]:
# Checking
# dfv = pd.DataFrame(df_noid_rescaled)

In [16]:
# dfv.shape

## 3. Fitting GMM Clustering (K=6)

In [28]:
# GMM fitting to the data 
from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components=6)
gmm.fit(df_nosystemid)


# Predicting clustering
cluster_predict = gmm.predict(df_nosystemid)

# Adding clusters id of each account to the dataframe
df_orig['cluster_id'] = cluster_predict


In [29]:
gmm


GaussianMixture(covariance_type='full', init_params='kmeans', max_iter=100,
        means_init=None, n_components=6, n_init=1, precisions_init=None,
        random_state=None, reg_covar=1e-06, tol=0.001, verbose=0,
        verbose_interval=10, warm_start=False, weights_init=None)

In [30]:
##################################### Saving the Train Model #################################################

# save the model to disk
filename_clustering = '/Users/dwahid/Documents/GitHub/fraud_detection/data/saved_models/fraud_detection_clustering_day_07_k6_model.sav'
pickle.dump(gmm, open(filename_clustering, 'wb'))

In [31]:
df_orig.head()

Unnamed: 0,systemid,activateexpense,activateotherincome,activatepayment,admindeactivation,adminonlinepaymentattempt,adminpayinvoiceonlineinvoice,adminpayinvoiceonlinelistview,archiveclient,archiveexpense,...,is_sales_managed,is_freshbooks_account_active,is_paying,avg_wc_description_day_7,avg_wc_notes_day_7,avg_wc_terms_day_7,avg_wc_address_day_7,invoice_count_day_7,client_count_day_7,cluster_id
0,3592461,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,2.0,6.0,0.0,0.0,1.0,2.0,1
1,3592463,0,0,0,0,0,0,0,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,5
2,3592465,0,0,0,0,0,0,0,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
3,3592467,0,0,0,0,0,0,0,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
4,3592469,0,0,0,0,0,0,0,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0


In [32]:
# Export the original users data with corresponding cluster id label (clustering output)
df_orig.to_csv("/Users/dwahid/Documents/GitHub/fraud_detection/data/model_outputs_gmm/gmm_clutering_outputs_day_07_k6.tsv", 
               sep="\t", index=False)

## 6. Predict Cluster ID for New Accounts

### 6.1 Required Function

In [33]:
#### Function: Filtering FB test account by using admin email
from difflib import SequenceMatcher

def email_match(em, email_list):
    
    L = len(email_list)
    match_score = 0
    
    for i in range(0, L):
        if pd.isnull(em):
            match_score = 0
            break;
        else: 
            match_score =  max(match_score, SequenceMatcher(None,em, email_list[i]).ratio())
            #print(i, em, email_list[i], match_score)

    return match_score

##### Function for aggregating selected column values

def cell_value_sum (row, ex_cols_list):
#     print(row[0])
    sum = 0
    cols = range(len(row))
    new_cols = list(set(cols) - set(ex_cols_list))
    #print(new_cols)
    
    for i in new_cols:
        #print(row[i])
        sum =+ row[i]
    
    #print('Final sum: ', sum)
    return sum

def check_sales_manage(sm):
    #print(sm)
    
    sm_flg = 0
    
    if pd.isnull(sm):
        sm_flg = 0
    else:
        sm_val = int(sm)
        
        if sm_val == 1:
            sm_flg = 1
        else:
            sm_flg = 0
    return sm_flg


### 6.1 Import data for new accounts

In [84]:
################################# Predict Cluster For a recent new account #####################################

# Import data of the day features 
df_day_7_new = pd.read_csv("/Users/dwahid/Documents/GitHub/fraud_detection/data_new_final/new_final_features_day_7.tsv", sep="\t")

In [85]:
df_day_7_new.shape


(114906, 242)

### 6.3 Get only important features

In [88]:
# ################################# Filter if Sale is managed #################################################

# Filtering all accounts if 'is_sales_managed' == 1 (i.e. it already checked by the sales team)
# df_new_v2 = df_new.apply(lambda x: int(x['is_sales_managed']) < 1)

df_day_7_new = df_day_7_new[df_day_7_new.apply(lambda x: check_sales_manage(x['is_sales_managed']) < 1, axis=1)]



In [90]:
df_day_7_new.shape

(114827, 242)

In [94]:
################################ Filtering Only Import Important Features ###########################################

# New Day 7: Importing importing features list
important_features = pd.read_csv(
    "/Users/dwahid/Documents/GitHub/fraud_detection/data_imp_features_names/model_important_features_day_7.tsv", sep="\n,")

# New Day 7: Get the important feature as a list
imp_features_list = list(important_features['important_feature'])

# Filtering only important features 
df_day_7_new_imp = df_day_7_new[df_day_7_new.columns.intersection(imp_features_list)]

# Drop not so important feature columns
df_day_7_new_noid = df_day_7_new_imp.drop(columns=[
                                 'activateestimate',
                                 'activateproject',
                                 'activaterecurringprofile',
                                 'activatestaff',
                                 'banktransferdisabled',
                                 'banktransferenabled',
                                 'bulkimportitemsandservicescomplete',
                                 'creditcardsystemaccessrevoked',
                                 'deletetimeentry',
                                 'deleterecurringexpense',
                                 'deleteretainerprofile',
                                 'deletebusinessaccountant',
                                 'deletesystemgateway',
                                 'exportjournalentries',
                                 'generateinvoicefromrecurringprofile',
                                 'fbpayuserconnectedbank'
                                 
                                ], axis=1)

  """


In [104]:
list(df_day_7_new_noid)


['systemid',
 'activateexpense',
 'activateotherincome',
 'activatepayment',
 'adminde-activation',
 'adminonlinepaymentattempt',
 'adminpayinvoiceonline-invoice',
 'adminpayinvoiceonline-listview',
 'archiveclient',
 'archiveexpense',
 'archiveotherincome',
 'archiveproject',
 'archivetask',
 'autobillpayment',
 'bulkimportclientscomplete',
 'clientlimitupgradenudge',
 'createbankaccount',
 'createbanktransaction',
 'createcategory',
 'createcontact',
 'createcontractor',
 'createcreditnote',
 'createdexpense',
 'createestimate',
 'createexpense',
 'createitem',
 'createotherincome',
 'createreceipt',
 'createservice',
 'creditcardclientaccessgranted',
 'customemailsignature',
 'declinedonlinepaymentnotification',
 'deletebusinesspartner',
 'deletecollaborator',
 'deletecreditnote',
 'deleteestimate',
 'deleteexpense',
 'deletehours',
 'deleteinvoice',
 'deleteitem',
 'deleteotherincome',
 'deleteproject',
 'deletestaff',
 'deleteuser',
 'disconnectbankaccount',
 'disconnectpaymentgat

### 6.4 Data standarization

In [99]:
##################################### Data Standarizationå #################################################

# Load standarization parameter from the disk
filename_scaler = '/Users/dwahid/Documents/GitHub/fraud_detection/trained_models/v1_fraud_detection_clustering_day_07_k6_scaler.sav'
loaded_scaler = pickle.load(open(filename_scaler, 'rb'))



In [100]:
# Normalized all features columns except the 'systemid'
column_names_to_not_normalize = ['systemid']
column_names_to_normalize = [x for x in list(df_day_7_new_noid) if x not in column_names_to_not_normalize ]


In [101]:
x2 = df_day_7_new_noid[column_names_to_normalize].values
x2_scaled = min_max_scaler.fit_transform(x2)
df_day_7_new_noid_temp = pd.DataFrame(x2_scaled, columns=column_names_to_normalize, index = df_day_7_new_noid.index)
df_day_7_new_noid[column_names_to_normalize] = df_day_7_new_noid_temp

In [102]:
df_day_7_new_noid.head()

Unnamed: 0,systemid,activateexpense,activateotherincome,activatepayment,adminde-activation,adminonlinepaymentattempt,adminpayinvoiceonline-invoice,adminpayinvoiceonline-listview,archiveclient,archiveexpense,...,emailcreditnote,is_sales_managed,is_freshbooks_account_active,is_paying,avg_wc_description_day_7,avg_wc_notes_day_7,avg_wc_terms_day_7,avg_wc_address_day_7,invoice_count_day_7,client_count_day_7
0,4504870,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4504872,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.00198,0.0,0.0,0.0,0.004082,5.1e-05
2,4504874,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4504876,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4504878,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
# New data standarization
df_day_7_new_noid_rescaled = pd.DataFrame(loaded_scaler.transform(df_day_7_new_noid))

# Fill 'NaN' cell with zero
type(df_day_7_new_noid_rescaled)

pandas.core.frame.DataFrame

In [31]:
df_day_7_new_noid_rescaled.fillna(0,inplace=True)

In [32]:
# df_day_7_new_noid_rescaled.head()

### 6.5 Predict cluster for new data

In [33]:
# load the model from disk
filename = '/Users/dwahid/Documents/GitHub/fraud_detection/trained_models/v1_fraud_detection_clustering_day_07_k6_model.sav'
loaded_model_gmm_day_07_k6 = pickle.load(open(filename, 'rb'))

# Predicting clustering
cluster_id_k6_day_7_new = loaded_model_gmm_day_07_k6.predict(df_day_7_new_noid_rescaled)

# Adding clusters id of each account to the dataframe
df_day_7_new_imp['cluster_id_k6'] = cluster_id_k6_day_7_new


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [34]:
df_day_7_new_imp.head()

Unnamed: 0,systemid,activateestimate,activateexpense,activateotherincome,activatepayment,activateproject,activaterecurringprofile,adminde-activation,adminonlinepaymentattempt,adminpayinvoiceonline-invoice,...,is_sales_managed,is_freshbooks_account_active,is_paying,avg_wc_description_day_7,avg_wc_notes_day_7,avg_wc_terms_day_7,avg_wc_address_day_7,invoice_count_day_7,client_count_day_7,cluster_id_k6
0,4504870,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2
1,4504872,0,0,0,0,0,0,0,0,0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,2.0,3
2,4504874,0,0,0,0,0,0,0,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3
3,4504876,0,0,0,0,0,0,0,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3
4,4504878,0,0,0,0,0,0,0,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3


In [35]:
# Separating risky accounts (cluster 01 and cluster 05)

# Separating 'systemid' according to 'cluster_id'
df_day_7_new_imp_c0_k6 = df_day_7_new_imp[df_day_7_new_imp.cluster_id_k6 == 0]
df_day_7_new_imp_c1_k6 = df_day_7_new_imp[df_day_7_new_imp.cluster_id_k6 == 1]
df_day_7_new_imp_c2_k6 = df_day_7_new_imp[df_day_7_new_imp.cluster_id_k6 == 2]
df_day_7_new_imp_c3_k6 = df_day_7_new_imp[df_day_7_new_imp.cluster_id_k6 == 3]
df_day_7_new_imp_c4_k6 = df_day_7_new_imp[df_day_7_new_imp.cluster_id_k6 == 4]
df_day_7_new_imp_c5_k6 = df_day_7_new_imp[df_day_7_new_imp.cluster_id_k6 == 5]


# Export Cluster's systemid
df_day_7_new_imp_c0_k6.to_csv("/Users/dwahid/Documents/GitHub/fraud_detection/data_new_clusters/gmm_day_7_new_cluster_00_k6.tsv", sep="\t")
df_day_7_new_imp_c1_k6.to_csv("/Users/dwahid/Documents/GitHub/fraud_detection/data_new_clusters/gmm_day_7_new_cluster_01_k6.tsv", sep="\t")
df_day_7_new_imp_c2_k6.to_csv("/Users/dwahid/Documents/GitHub/fraud_detection/data_new_clusters/gmm_day_7_new_cluster_02_k6.tsv", sep="\t")
df_day_7_new_imp_c3_k6.to_csv("/Users/dwahid/Documents/GitHub/fraud_detection/data_new_clusters/gmm_day_7_new_cluster_03_k6.tsv", sep="\t")
df_day_7_new_imp_c4_k6.to_csv("/Users/dwahid/Documents/GitHub/fraud_detection/data_new_clusters/gmm_day_7_new_cluster_04_k6.tsv", sep="\t")


In [36]:
df_day_7_new_imp_c1_k6.shape

(5185, 106)