# Fraud Detection Day 14 Model: Gaussian Mixture Model (GMM)

In [None]:
import pandas as pd
import numpy as np

import seaborn as sns
sns.set(color_codes=True)

import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline

from scipy import stats
get_ipython().magic(u'config IPCompleter.greedy=True')

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn import mixture
import pickle

## 1. Import and Filter Features Data for Day 14

### 1.1 Import day 14 final features data


In [None]:
# Load the day-14 features table from a tab-separated file
features_path = "path"
df = pd.read_csv(features_path, sep="\t")

In [None]:
# Clean up the dataframe column names: delete every '-' and '/' character
chars_to_strip = str.maketrans('', '', '-/')
df.columns = [name.translate(chars_to_strip) for name in df.columns]

In [None]:
## Reorder the dataframe columns by sorted column name
ordered_columns = sorted(df.columns)
df = df.reindex(columns=ordered_columns)

## 2. Cleaning Data for GMM Clustering
### 2.1 Drop the 'feature_list' column

In [None]:
# Drop the 'feature_list' column; every other column is kept as-is
df = df.drop(columns=['feature_list'])

In [None]:
# Snapshot the dataframe before any further modification
df_orig = df.copy(deep=True)

# Order the columns by their labels
df = df.sort_index(axis='columns')

### 2.3 Feature column normalization (model)

In [None]:
# Scale every feature column with a min-max scaler; 'id' is left untouched
column_names_to_not_normalize = ['id']
column_names_to_normalize = [
    name for name in df.columns
    if name not in column_names_to_not_normalize
]

# Min-max scaler with default settings
min_max_scaler = MinMaxScaler()

# Fit the scaler on the feature matrix, then transform that same matrix
x = df[column_names_to_normalize].values
x_scaled = min_max_scaler.fit(x).transform(x)

# Write the scaled values back under the original row index and column labels
df_temp = pd.DataFrame(
    data=x_scaled,
    index=df.index,
    columns=column_names_to_normalize,
)
df[column_names_to_normalize] = df_temp


In [None]:
# Save the fitted min-max scaler to disk.
# The context manager guarantees the file is flushed and closed even if
# pickling raises, instead of leaving the handle open.
filename_minmax_scaler = 'path'
with open(filename_minmax_scaler, 'wb') as scaler_file:
    pickle.dump(min_max_scaler, scaler_file)

In [None]:
# Replace every remaining NaN value with 0 (df is modified in place)
df.fillna(value=0, inplace=True)

# Feature-only dataframe: same rows, without the 'id' column
df_noid = df.drop(columns=['id'])

## 3. Determine Number of Clusters: BIC Score Method

In [None]:
############################# Determine the Number of Clusters ########################################
# from sklearn.mixture import GaussianMixture

# gm_bic= []
# gm_score=[]
# for i in range(2,12):
#     gm = GaussianMixture(n_components=i,n_init=10,tol=1e-3,max_iter=1000).fit(df_noid)
#     print("BIC for number of cluster(s) {}: {}".format(i,gm.bic(df_noid)))
#     print("Log-likelihood score for number of cluster(s) {}: {}".format(i,gm.score(df_noid)))
#     print("-"*100)
#     gm_bic.append(-gm.bic(df_noid))
#     gm_score.append(gm.score(df_noid))

In [None]:
# plt.title("The Gaussian Mixture model BIC \nfor determining number of clusters\n",fontsize=16)
# plt.scatter(x=[i for i in range(2,12)],y=np.log(gm_bic),s=150,edgecolor='k')
# plt.grid(True)
# plt.xlabel("Number of clusters",fontsize=14)
# plt.ylabel("Log of Gaussian mixture BIC score",fontsize=15)
# plt.xticks([i for i in range(2,12)],fontsize=14)
# plt.yticks(fontsize=15)
# plt.show()

## 4. Fitting GMM Clustering
Based on the elbow in the BIC plot above, we fix the number of clusters at n = 6; models with n = 7 and n = 8 are also trained below.

In [None]:
##################################### GMM Training (n=6) #######################################################

# Fit a 6-component Gaussian mixture to the feature dataframe (no 'id' column)
from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components=6)
gmm.fit(df_noid)

# Predict the cluster label of every row
cluster_predict = gmm.predict(df_noid)

# Attach the cluster id of each row to the copy of the original dataframe
df_orig['cluster_id'] = cluster_predict

# Save the fitted model to disk; the context manager flushes and closes the
# file even if pickling raises, instead of leaving the handle open.
# NOTE(review): 'path' looks like a placeholder - the model file and the CSV
# export below must point to different locations.
filename_clustering = 'path'
with open(filename_clustering, 'wb') as model_file:
    pickle.dump(gmm, model_file)

# Export the original users data with corresponding cluster id label (clustering output)
df_orig.to_csv("path", sep="\t", index=False)

In [None]:
##################################### GMM Training (n=7) #######################################################

# Fit a 7-component Gaussian mixture to the feature dataframe (no 'id' column)
from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components=7)
gmm.fit(df_noid)

# Predict the cluster label of every row
cluster_predict = gmm.predict(df_noid)

# Attach the cluster id of each row to the copy of the original dataframe
# (this overwrites any 'cluster_id' column already present)
df_orig['cluster_id'] = cluster_predict

# Save the fitted model to disk; the context manager flushes and closes the
# file even if pickling raises, instead of leaving the handle open.
# NOTE(review): 'path' looks like a placeholder - the model file and the CSV
# export below must point to different locations.
filename_clustering = 'path'
with open(filename_clustering, 'wb') as model_file:
    pickle.dump(gmm, model_file)

# Export the original users data with corresponding cluster id label (clustering output)
df_orig.to_csv("path", sep="\t", index=False)

In [None]:
##################################### GMM Training (n=8) #######################################################

# Fit an 8-component Gaussian mixture to the feature dataframe (no 'id' column)
from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components=8)
gmm.fit(df_noid)

# Predict the cluster label of every row
cluster_predict = gmm.predict(df_noid)

# Attach the cluster id of each row to the copy of the original dataframe
# (this overwrites any 'cluster_id' column already present)
df_orig['cluster_id'] = cluster_predict

# Save the fitted model to disk; the context manager flushes and closes the
# file even if pickling raises, instead of leaving the handle open.
# NOTE(review): 'path' looks like a placeholder - the model file and the CSV
# export below must point to different locations.
filename_clustering = 'path'
with open(filename_clustering, 'wb') as model_file:
    pickle.dump(gmm, model_file)

# Export the original users data with corresponding cluster id label (clustering output)
df_orig.to_csv("path", sep="\t", index=False)