# Cluster Analysis: GMM

In [None]:
import pandas as pd
import numpy as np

import seaborn as sns
sns.set(color_codes=True)

import matplotlib.pyplot as plt
# plt.style.use('ggplot')
%matplotlib inline

from scipy import stats
# Enable greedy tab-completion in this kernel. `run_line_magic` is the
# supported replacement for the deprecated `InteractiveShell.magic()` call.
get_ipython().run_line_magic('config', 'IPCompleter.greedy=True')



In [None]:
## Load the list of fraud accounts from the last 12 months (tab-separated file)
fraud_accounts = pd.read_csv(
    filepath_or_buffer="data/fraud_risk_acc_historic/fraud_accounts_12months.tsv",
    delimiter="\t",
)


In [None]:
# Load the GMM clustering outputs for k=6, k=7 and k=8 (one tab-separated
# file per k), in that order.
df_k6, df_k7, df_k8 = (
    pd.read_csv(tsv_path, sep="\t")
    for tsv_path in (
        "data/model_outputs_gmm_for_nn_training/gmm_clutering_outputs_day_91_k6.tsv",
        "data/model_outputs_gmm_for_nn_training/gmm_clutering_outputs_day_91_k7.tsv",
        "data/model_outputs_gmm_for_nn_training/gmm_clutering_outputs_day_91_k8.tsv",
    )
)

## GMM Cluster Analysis (n = 6)

In [None]:
## Separating 'id' according to 'cluster_id'
# Rows are split by cluster first and NaN cells are zero-filled afterwards, so
# rows whose cluster_id is missing stay out of every cluster. `fillna(0)`
# returns a new frame, which avoids mutating a filtered slice in place
# (the source of SettingWithCopyWarning).
k6_cluster_frames = [df_k6[df_k6.cluster_id == k].fillna(0) for k in range(6)]
(df_k6_c0, df_k6_c1, df_k6_c2,
 df_k6_c3, df_k6_c4, df_k6_c5) = k6_cluster_frames

# Accounts list (id) in each cluster
k6_cluster_id_lists = [list(frame['id']) for frame in k6_cluster_frames]
(c0_id_list, c1_id_list, c2_id_list,
 c3_id_list, c4_id_list, c5_id_list) = k6_cluster_id_lists

# Display labels: 'cluster 00' ... 'cluster 05'
k6_cluster_labels = ['cluster {:02d}'.format(k) for k in range(6)]

# Cluster sizes (number of rows per cluster)
cls_sizes = {'cluster_id': list(k6_cluster_labels),
             'size': [frame.shape[0] for frame in k6_cluster_frames]}
df_k6_cls_sizes = pd.DataFrame(cls_sizes, columns=['cluster_id', 'size'])


########################## Checking the number of fraud accounts exist in each Cluster ##########################

fraud_accounts_12months_list = list(fraud_accounts['id'])

# Map every clustered id to the first cluster that contains it. This gives the
# same "first match wins" result as testing the clusters in order, but with a
# constant-time lookup per fraud account instead of scanning every id list.
k6_cluster_of_id = {}
for k, id_list in enumerate(k6_cluster_id_lists):
    for account_id in id_list:
        k6_cluster_of_id.setdefault(account_id, k)

# Fraud accounts found in each cluster; duplicates in the fraud list are
# counted once per occurrence, ids absent from every cluster are skipped.
k6_fraud_by_cluster = [[] for _ in range(6)]
for account_id in fraud_accounts_12months_list:
    k = k6_cluster_of_id.get(account_id)
    if k is not None:
        k6_fraud_by_cluster[k].append(account_id)

(fraud_accounts_c0, fraud_accounts_c1, fraud_accounts_c2,
 fraud_accounts_c3, fraud_accounts_c4, fraud_accounts_c5) = k6_fraud_by_cluster

(fraud_accounts_num_c0, fraud_accounts_num_c1, fraud_accounts_num_c2,
 fraud_accounts_num_c3, fraud_accounts_num_c4, fraud_accounts_num_c5) = [
    len(hits) for hits in k6_fraud_by_cluster]

# Number of fraud accounts found in each cluster
fraud_accounts_num = {'cluster_id': list(k6_cluster_labels),
                      '#fraud_accounts': [len(hits) for hits in k6_fraud_by_cluster]}
df_k6_cls_fraud_accounts_num = pd.DataFrame(fraud_accounts_num, columns=['cluster_id', '#fraud_accounts'])

# Add the fraud-account counts next to the cluster size and id
df_k6_cls_sizes['#fraud_accounts'] = df_k6_cls_fraud_accounts_num['#fraud_accounts']


## GMM Cluster Analysis (n = 7)

In [None]:
## Separating 'id' according to 'cluster_id'
# Rows are split by cluster first and NaN cells are zero-filled afterwards, so
# rows whose cluster_id is missing stay out of every cluster. `fillna(0)`
# returns a new frame, which avoids mutating a filtered slice in place
# (the source of SettingWithCopyWarning).
k7_cluster_frames = [df_k7[df_k7.cluster_id == k].fillna(0) for k in range(7)]
(df_k7_c0, df_k7_c1, df_k7_c2, df_k7_c3,
 df_k7_c4, df_k7_c5, df_k7_c6) = k7_cluster_frames

# Accounts list (id) in each cluster
k7_cluster_id_lists = [list(frame['id']) for frame in k7_cluster_frames]
(c0_id_list, c1_id_list, c2_id_list, c3_id_list,
 c4_id_list, c5_id_list, c6_id_list) = k7_cluster_id_lists

# Display labels: 'cluster 00' ... 'cluster 06'
k7_cluster_labels = ['cluster {:02d}'.format(k) for k in range(7)]

# Cluster sizes (number of rows per cluster)
cls_sizes = {'cluster_id': list(k7_cluster_labels),
             'size': [frame.shape[0] for frame in k7_cluster_frames]}
df_k7_cls_sizes = pd.DataFrame(cls_sizes, columns=['cluster_id', 'size'])


########################## Checking the number of fraud accounts exist in each Cluster ##########################

fraud_accounts_12months_list = list(fraud_accounts['id'])

# Map every clustered id to the first cluster that contains it. Every cluster,
# including cluster 06, is looked up in its own id list, and the lookup is
# constant-time per fraud account instead of a scan over every id list.
k7_cluster_of_id = {}
for k, id_list in enumerate(k7_cluster_id_lists):
    for account_id in id_list:
        k7_cluster_of_id.setdefault(account_id, k)

# Fraud accounts found in each cluster; duplicates in the fraud list are
# counted once per occurrence, ids absent from every cluster are skipped.
k7_fraud_by_cluster = [[] for _ in range(7)]
for account_id in fraud_accounts_12months_list:
    k = k7_cluster_of_id.get(account_id)
    if k is not None:
        k7_fraud_by_cluster[k].append(account_id)

(fraud_accounts_c0, fraud_accounts_c1, fraud_accounts_c2, fraud_accounts_c3,
 fraud_accounts_c4, fraud_accounts_c5, fraud_accounts_c6) = k7_fraud_by_cluster

(fraud_accounts_num_c0, fraud_accounts_num_c1, fraud_accounts_num_c2,
 fraud_accounts_num_c3, fraud_accounts_num_c4, fraud_accounts_num_c5,
 fraud_accounts_num_c6) = [len(hits) for hits in k7_fraud_by_cluster]

# Number of fraud accounts found in each cluster
fraud_accounts_num = {'cluster_id': list(k7_cluster_labels),
                      '#fraud_accounts': [len(hits) for hits in k7_fraud_by_cluster]}
df_k7_cls_fraud_accounts_num = pd.DataFrame(fraud_accounts_num, columns=['cluster_id', '#fraud_accounts'])

# Add the fraud-account counts next to the cluster size and id
df_k7_cls_sizes['#fraud_accounts'] = df_k7_cls_fraud_accounts_num['#fraud_accounts']



## GMM Cluster Analysis (n = 8)

In [None]:
## Separating 'id' according to 'cluster_id'
# Rows are split by cluster first and NaN cells are zero-filled afterwards, so
# rows whose cluster_id is missing stay out of every cluster. `fillna(0)`
# returns a new frame, which avoids mutating a filtered slice in place
# (the source of SettingWithCopyWarning).
k8_cluster_frames = [df_k8[df_k8.cluster_id == k].fillna(0) for k in range(8)]
(df_k8_c0, df_k8_c1, df_k8_c2, df_k8_c3,
 df_k8_c4, df_k8_c5, df_k8_c6, df_k8_c7) = k8_cluster_frames

# Accounts list (id) in each cluster
k8_cluster_id_lists = [list(frame['id']) for frame in k8_cluster_frames]
(c0_id_list, c1_id_list, c2_id_list, c3_id_list,
 c4_id_list, c5_id_list, c6_id_list, c7_id_list) = k8_cluster_id_lists

# Display labels: 'cluster 00' ... 'cluster 07'
k8_cluster_labels = ['cluster {:02d}'.format(k) for k in range(8)]

# Cluster sizes (number of rows per cluster)
cls_sizes = {'cluster_id': list(k8_cluster_labels),
             'size': [frame.shape[0] for frame in k8_cluster_frames]}
df_k8_cls_sizes = pd.DataFrame(cls_sizes, columns=['cluster_id', 'size'])


########################## Checking the number of fraud accounts exist in each Cluster ##########################

fraud_accounts_12months_list = list(fraud_accounts['id'])

# Map every clustered id to the first cluster that contains it. Every cluster,
# including cluster 06, is looked up in its own id list, and the lookup is
# constant-time per fraud account instead of a scan over every id list.
k8_cluster_of_id = {}
for k, id_list in enumerate(k8_cluster_id_lists):
    for account_id in id_list:
        k8_cluster_of_id.setdefault(account_id, k)

# Fraud accounts found in each cluster; duplicates in the fraud list are
# counted once per occurrence, ids absent from every cluster are skipped.
k8_fraud_by_cluster = [[] for _ in range(8)]
for account_id in fraud_accounts_12months_list:
    k = k8_cluster_of_id.get(account_id)
    if k is not None:
        k8_fraud_by_cluster[k].append(account_id)

(fraud_accounts_c0, fraud_accounts_c1, fraud_accounts_c2, fraud_accounts_c3,
 fraud_accounts_c4, fraud_accounts_c5, fraud_accounts_c6,
 fraud_accounts_c7) = k8_fraud_by_cluster

(fraud_accounts_num_c0, fraud_accounts_num_c1, fraud_accounts_num_c2,
 fraud_accounts_num_c3, fraud_accounts_num_c4, fraud_accounts_num_c5,
 fraud_accounts_num_c6, fraud_accounts_num_c7) = [
    len(hits) for hits in k8_fraud_by_cluster]

# Number of fraud accounts found in each cluster
fraud_accounts_num = {'cluster_id': list(k8_cluster_labels),
                      '#fraud_accounts': [len(hits) for hits in k8_fraud_by_cluster]}
df_k8_cls_fraud_accounts_num = pd.DataFrame(fraud_accounts_num, columns=['cluster_id', '#fraud_accounts'])

# Add the fraud-account counts next to the cluster size and id
df_k8_cls_sizes['#fraud_accounts'] = df_k8_cls_fraud_accounts_num['#fraud_accounts']



In [None]:
# Display the k=6 summary table: cluster label, cluster size, and number of fraud accounts
df_k6_cls_sizes

In [None]:
# Display the k=7 summary table: cluster label, cluster size, and number of fraud accounts
df_k7_cls_sizes

In [None]:
# Display the k=8 summary table: cluster label, cluster size, and number of fraud accounts
df_k8_cls_sizes