In [32]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.decomposition import FastICA, PCA
from sklearn import random_projection
from sklearn.decomposition import TruncatedSVD
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans
from yellowbrick.cluster import SilhouetteVisualizer
from yellowbrick.cluster import InterclusterDistance
from yellowbrick.cluster import KElbowVisualizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
import time
from scipy import linalg
import matplotlib as mpl
import itertools

from sklearn.metrics import v_measure_score, homogeneity_score, adjusted_mutual_info_score


import matplotlib.cm as cm

from sklearn.utils import shuffle
from sklearn.utils import check_random_state
from sklearn.cluster import MiniBatchKMeans

from sklearn.metrics import accuracy_score
from sklearn.model_selection import (GridSearchCV, train_test_split, validation_curve)   
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import preprocessing
from yellowbrick.model_selection import LearningCurve, ValidationCurve

from sklearn.model_selection import cross_val_score
import time

# Seed NumPy's global random generator once, up front. None of the estimators
# or train/test splits in the cells below pass an explicit random_state, so
# (per scikit-learn's convention for random_state=None) they draw from this
# global state. Results are therefore only repeatable when the notebook is
# executed top to bottom from a fresh kernel.
np.random.seed(42)

In [33]:
def load_data(ds):
    """Read ``data/<ds>`` as CSV and split it into features and target.

    Parameters
    ----------
    ds : str
        File name of the CSV, relative to the ``data/`` directory.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, y)`` where ``X`` holds every column except the last and ``y``
        holds the last column.
    """
    frame = pd.read_csv("data/" + ds)
    # The last column is treated as the target; everything before it is a feature.
    features = frame.iloc[:, :-1].to_numpy()
    target = frame.iloc[:, -1].to_numpy()
    return features, target

## PCA + K-means

In [34]:
# PCA -> K-means -> MLP: reduce the bank-loan features with PCA, append the
# K-means cluster id as an extra feature, then train and score a small MLP.
X, y = load_data('bank_personal_loan_modelling.csv')

# NOTE(review): PCA and K-means are fitted on every row before the train/test
# split further down, so the held-out rows influence the derived features.
pca = PCA(n_components=10).fit(X)
X_pca = pca.transform(X)

# time.perf_counter() is the clock the time module provides for measuring
# short durations; the calls timed here finish in milliseconds.
start = time.perf_counter()
kmeans = KMeans(3).fit(X_pca)
train_time = time.perf_counter() - start
print("Train time: " + str(train_time))

# Keep the labels produced by the timed call rather than predicting twice.
start = time.perf_counter()
y_pca = kmeans.predict(X_pca)
query_time = time.perf_counter() - start
print("Query time: " + str(query_time))

# Feature table = PCA components followed by the cluster id as the last column.
result = pd.concat([pd.DataFrame(X_pca), pd.DataFrame(y_pca)], axis=1, sort=False)
# concat leaves two columns labelled 0; relabel 0..n-1 from the actual width so
# the labels are unique and cannot drift out of step with n_components.
result.columns = list(range(result.shape[1]))

clf = MLPClassifier(max_iter=5000, hidden_layer_sizes=(5, 2), activation='logistic',
                    verbose=False, learning_rate_init=0.0001)
X_train, X_test, y_train, y_test = train_test_split(result, y, test_size=0.2)

# Mean accuracy over 20 folds of the training portion only.
cv_score = cross_val_score(clf, X_train, y_train, cv=20).mean()
print("Cross validation score: " + str(cv_score))

# From here on train_time / query_time refer to the MLP, not the clusterer.
start = time.perf_counter()
clf.fit(X_train, y_train)
train_time = time.perf_counter() - start
print("Train time: " + str(train_time))

start = time.perf_counter()
y_pred = clf.predict(X_test)
query_time = time.perf_counter() - start
print("Query time: " + str(query_time))

score = accuracy_score(y_test, y_pred)
print("Test Accuracy: " + str(score))



Train time: 0.046709537506103516
Query time: 0.001999378204345703
Cross validation score: 0.7105
Train time: 0.0521848201751709
Query time: 0.0020062923431396484
Test Accuracy: 0.688


## ICA + K-means

In [35]:
# ICA -> K-means -> MLP: transform the bank-loan features with FastICA, append
# the K-means cluster id as an extra feature, then train and score a small MLP.
X, y = load_data('bank_personal_loan_modelling.csv')

# NOTE(review): FastICA and K-means are fitted on every row before the
# train/test split further down, so the held-out rows influence the features.
ica = FastICA(n_components=11).fit(X)
X_ica = ica.transform(X)

# time.perf_counter() is the clock the time module provides for measuring
# short durations; the calls timed here finish in milliseconds.
start = time.perf_counter()
kmeans = KMeans(3).fit(X_ica)
train_time = time.perf_counter() - start
print("Train time: " + str(train_time))

# Keep the labels produced by the timed call rather than predicting twice.
start = time.perf_counter()
y_ica = kmeans.predict(X_ica)
query_time = time.perf_counter() - start
print("Query time: " + str(query_time))

# Feature table = independent components followed by the cluster id.
result = pd.concat([pd.DataFrame(X_ica), pd.DataFrame(y_ica)], axis=1, sort=False)
# concat leaves two columns labelled 0; relabel 0..n-1 from the actual width so
# the labels are unique and cannot drift out of step with n_components.
result.columns = list(range(result.shape[1]))

clf = MLPClassifier(max_iter=5000, hidden_layer_sizes=(5, 2), activation='logistic',
                    verbose=False, learning_rate_init=0.0001)
X_train, X_test, y_train, y_test = train_test_split(result, y, test_size=0.2)

# Mean accuracy over 20 folds of the training portion only.
cv_score = cross_val_score(clf, X_train, y_train, cv=20).mean()
print("Cross validation score: " + str(cv_score))

# From here on train_time / query_time refer to the MLP, not the clusterer.
start = time.perf_counter()
clf.fit(X_train, y_train)
train_time = time.perf_counter() - start
print("Train time: " + str(train_time))

start = time.perf_counter()
y_pred = clf.predict(X_test)
query_time = time.perf_counter() - start
print("Query time: " + str(query_time))

score = accuracy_score(y_test, y_pred)
print("Test Accuracy: " + str(score))



Train time: 0.13132977485656738
Query time: 0.0010008811950683594
Cross validation score: 0.7014999999999998
Train time: 1.0304443836212158
Query time: 0.0010004043579101562
Test Accuracy: 0.724


## RP + K-means

In [36]:
# RP -> K-means -> MLP: project the bank-loan features with a sparse random
# projection, append the K-means cluster id as an extra feature, then train
# and score a small MLP.
X, y = load_data('bank_personal_loan_modelling.csv')

# NOTE(review): the projection and K-means are fitted on every row before the
# train/test split further down, so the held-out rows influence the features.
rp = random_projection.SparseRandomProjection(n_components=6).fit(X)
X_rp = rp.transform(X)

# time.perf_counter() is the clock the time module provides for measuring
# short durations; the calls timed here finish in milliseconds.
start = time.perf_counter()
kmeans = KMeans(3).fit(X_rp)
train_time = time.perf_counter() - start
print("Train time: " + str(train_time))

# Keep the labels produced by the timed call rather than predicting twice.
start = time.perf_counter()
y_rp = kmeans.predict(X_rp)
query_time = time.perf_counter() - start
print("Query time: " + str(query_time))

# Feature table = projected components followed by the cluster id.
result = pd.concat([pd.DataFrame(X_rp), pd.DataFrame(y_rp)], axis=1, sort=False)
# concat leaves two columns labelled 0; relabel 0..n-1 from the actual width so
# the labels are unique and cannot drift out of step with n_components.
result.columns = list(range(result.shape[1]))

clf = MLPClassifier(max_iter=5000, hidden_layer_sizes=(5, 2), activation='logistic',
                    verbose=False, learning_rate_init=0.0001)
X_train, X_test, y_train, y_test = train_test_split(result, y, test_size=0.2)

# Mean accuracy over 20 folds of the training portion only.
cv_score = cross_val_score(clf, X_train, y_train, cv=20).mean()
print("Cross validation score: " + str(cv_score))

# From here on train_time / query_time refer to the MLP, not the clusterer.
start = time.perf_counter()
clf.fit(X_train, y_train)
train_time = time.perf_counter() - start
print("Train time: " + str(train_time))

start = time.perf_counter()
y_pred = clf.predict(X_test)
query_time = time.perf_counter() - start
print("Query time: " + str(query_time))

score = accuracy_score(y_test, y_pred)
print("Test Accuracy: " + str(score))

Train time: 0.06789302825927734
Query time: 0.0010018348693847656




Cross validation score: 0.7032499999999998
Train time: 0.9021775722503662
Query time: 0.0012564659118652344
Test Accuracy: 0.717


## SVD + K-means

In [37]:
# SVD -> K-means -> MLP: reduce the bank-loan features with truncated SVD,
# append the K-means cluster id as an extra feature, then train and score a
# small MLP.
X, y = load_data('bank_personal_loan_modelling.csv')

# NOTE(review): TruncatedSVD and K-means are fitted on every row before the
# train/test split further down, so the held-out rows influence the features.
tsvd = TruncatedSVD(n_components=4).fit(X)
X_tsvd = tsvd.transform(X)

# time.perf_counter() is the clock the time module provides for measuring
# short durations; the calls timed here finish in milliseconds.
start = time.perf_counter()
kmeans = KMeans(3).fit(X_tsvd)
train_time = time.perf_counter() - start
print("Train time: " + str(train_time))

# Keep the labels produced by the timed call rather than predicting twice.
start = time.perf_counter()
y_tsvd = kmeans.predict(X_tsvd)
query_time = time.perf_counter() - start
print("Query time: " + str(query_time))

# Feature table = SVD components followed by the cluster id.
result = pd.concat([pd.DataFrame(X_tsvd), pd.DataFrame(y_tsvd)], axis=1, sort=False)
# concat leaves two columns labelled 0; relabel 0..n-1 from the actual width so
# the labels are unique and cannot drift out of step with n_components.
result.columns = list(range(result.shape[1]))

clf = MLPClassifier(max_iter=5000, hidden_layer_sizes=(5, 2), activation='logistic',
                    verbose=False, learning_rate_init=0.0001)
X_train, X_test, y_train, y_test = train_test_split(result, y, test_size=0.2)

# Mean accuracy over 20 folds of the training portion only.
cv_score = cross_val_score(clf, X_train, y_train, cv=20).mean()
print("Cross validation score: " + str(cv_score))

# From here on train_time / query_time refer to the MLP, not the clusterer.
start = time.perf_counter()
clf.fit(X_train, y_train)
train_time = time.perf_counter() - start
print("Train time: " + str(train_time))

start = time.perf_counter()
y_pred = clf.predict(X_test)
query_time = time.perf_counter() - start
print("Query time: " + str(query_time))

score = accuracy_score(y_test, y_pred)
print("Test Accuracy: " + str(score))

Train time: 0.04311037063598633
Query time: 0.0030579566955566406




Cross validation score: 0.7070000000000002
Train time: 0.9463660717010498
Query time: 0.0016164779663085938
Test Accuracy: 0.702


## PCA + EM

In [38]:
# PCA -> EM -> MLP: reduce the bank-loan features with PCA, append the
# Gaussian-mixture component id as an extra feature, then train and score a
# small MLP.
X, y = load_data('bank_personal_loan_modelling.csv')

# NOTE(review): PCA and the mixture model are fitted on every row before the
# train/test split further down, so the held-out rows influence the features.
pca = PCA(n_components=10).fit(X)
X_pca = pca.transform(X)

# time.perf_counter() is the clock the time module provides for measuring
# short durations; the calls timed here finish in milliseconds.
# The model is a GaussianMixture, so it is named `gmm` rather than `kmeans`.
start = time.perf_counter()
gmm = GaussianMixture(2).fit(X_pca)
train_time = time.perf_counter() - start
print("Train time: " + str(train_time))

# Keep the labels produced by the timed call rather than predicting twice.
start = time.perf_counter()
y_pca = gmm.predict(X_pca)
query_time = time.perf_counter() - start
print("Query time: " + str(query_time))

# Feature table = PCA components followed by the mixture component id.
result = pd.concat([pd.DataFrame(X_pca), pd.DataFrame(y_pca)], axis=1, sort=False)
# concat leaves two columns labelled 0; relabel 0..n-1 from the actual width so
# the labels are unique and cannot drift out of step with n_components.
result.columns = list(range(result.shape[1]))

clf = MLPClassifier(max_iter=5000, hidden_layer_sizes=(5, 2), activation='logistic',
                    verbose=False, learning_rate_init=0.0001)
X_train, X_test, y_train, y_test = train_test_split(result, y, test_size=0.2)

# Mean accuracy over 20 folds of the training portion only.
cv_score = cross_val_score(clf, X_train, y_train, cv=20).mean()
print("Cross validation score: " + str(cv_score))

# From here on train_time / query_time refer to the MLP, not the clusterer.
start = time.perf_counter()
clf.fit(X_train, y_train)
train_time = time.perf_counter() - start
print("Train time: " + str(train_time))

start = time.perf_counter()
y_pred = clf.predict(X_test)
query_time = time.perf_counter() - start
print("Query time: " + str(query_time))

score = accuracy_score(y_test, y_pred)
print("Test Accuracy: " + str(score))

Train time: 0.09575152397155762
Query time: 0.003027200698852539
Cross validation score: 0.70175
Train time: 1.3839733600616455
Query time: 0.00099945068359375
Test Accuracy: 0.723


## ICA + EM

In [39]:
# ICA -> EM -> MLP: transform the bank-loan features with FastICA, append the
# Gaussian-mixture component id as an extra feature, then train and score a
# small MLP.
X, y = load_data('bank_personal_loan_modelling.csv')

# NOTE(review): FastICA and the mixture model are fitted on every row before
# the train/test split further down, so the held-out rows influence the features.
ica = FastICA(n_components=11).fit(X)
X_ica = ica.transform(X)

# time.perf_counter() is the clock the time module provides for measuring
# short durations; the calls timed here finish in milliseconds.
# The model is a GaussianMixture, so it is named `gmm` rather than `kmeans`.
start = time.perf_counter()
gmm = GaussianMixture(2).fit(X_ica)
train_time = time.perf_counter() - start
print("Train time: " + str(train_time))

# Keep the labels produced by the timed call rather than predicting twice.
start = time.perf_counter()
y_ica = gmm.predict(X_ica)
query_time = time.perf_counter() - start
print("Query time: " + str(query_time))

# Feature table = independent components followed by the mixture component id.
result = pd.concat([pd.DataFrame(X_ica), pd.DataFrame(y_ica)], axis=1, sort=False)
# concat leaves two columns labelled 0; relabel 0..n-1 from the actual width so
# the labels are unique and cannot drift out of step with n_components.
result.columns = list(range(result.shape[1]))

clf = MLPClassifier(max_iter=5000, hidden_layer_sizes=(5, 2), activation='logistic',
                    verbose=False, learning_rate_init=0.0001)
X_train, X_test, y_train, y_test = train_test_split(result, y, test_size=0.2)

# Mean accuracy over 20 folds of the training portion only.
cv_score = cross_val_score(clf, X_train, y_train, cv=20).mean()
print("Cross validation score: " + str(cv_score))

# From here on train_time / query_time refer to the MLP, not the clusterer.
start = time.perf_counter()
clf.fit(X_train, y_train)
train_time = time.perf_counter() - start
print("Train time: " + str(train_time))

start = time.perf_counter()
y_pred = clf.predict(X_test)
query_time = time.perf_counter() - start
print("Query time: " + str(query_time))

score = accuracy_score(y_test, y_pred)
print("Test Accuracy: " + str(score))

Train time: 0.027276992797851562
Query time: 0.0020101070404052734




Cross validation score: 0.7060000000000001
Train time: 0.09665298461914062
Query time: 0.0009987354278564453
Test Accuracy: 0.706


## RP + EM

In [40]:
# RP -> EM -> MLP: project the bank-loan features with a sparse random
# projection, append the Gaussian-mixture component id as an extra feature,
# then train and score a small MLP.
X, y = load_data('bank_personal_loan_modelling.csv')

# NOTE(review): the projection and the mixture model are fitted on every row
# before the train/test split further down, so the held-out rows influence
# the features.
rp = random_projection.SparseRandomProjection(n_components=6).fit(X)
X_rp = rp.transform(X)

# time.perf_counter() is the clock the time module provides for measuring
# short durations; the calls timed here finish in milliseconds.
# The model is a GaussianMixture, so it is named `gmm` rather than `kmeans`.
start = time.perf_counter()
gmm = GaussianMixture(2).fit(X_rp)
train_time = time.perf_counter() - start
print("Train time: " + str(train_time))

# Keep the labels produced by the timed call rather than predicting twice.
start = time.perf_counter()
y_rp = gmm.predict(X_rp)
query_time = time.perf_counter() - start
print("Query time: " + str(query_time))

# Feature table = projected components followed by the mixture component id.
result = pd.concat([pd.DataFrame(X_rp), pd.DataFrame(y_rp)], axis=1, sort=False)
# concat leaves two columns labelled 0; relabel 0..n-1 from the actual width so
# the labels are unique and cannot drift out of step with n_components.
result.columns = list(range(result.shape[1]))

clf = MLPClassifier(max_iter=5000, hidden_layer_sizes=(5, 2), activation='logistic',
                    verbose=False, learning_rate_init=0.0001)
X_train, X_test, y_train, y_test = train_test_split(result, y, test_size=0.2)

# Mean accuracy over 20 folds of the training portion only.
cv_score = cross_val_score(clf, X_train, y_train, cv=20).mean()
print("Cross validation score: " + str(cv_score))

# From here on train_time / query_time refer to the MLP, not the clusterer.
start = time.perf_counter()
clf.fit(X_train, y_train)
train_time = time.perf_counter() - start
print("Train time: " + str(train_time))

start = time.perf_counter()
y_pred = clf.predict(X_test)
query_time = time.perf_counter() - start
print("Query time: " + str(query_time))

score = accuracy_score(y_test, y_pred)
print("Test Accuracy: " + str(score))

Train time: 0.04836583137512207
Query time: 0.0020537376403808594
Cross validation score: 0.7097500000000001
Train time: 2.1209890842437744
Query time: 0.0010008811950683594
Test Accuracy: 0.691


## SVD + EM

In [41]:
# SVD -> EM -> MLP: reduce the bank-loan features with truncated SVD, append
# the Gaussian-mixture component id as an extra feature, then train and score
# a small MLP.
X, y = load_data('bank_personal_loan_modelling.csv')

# NOTE(review): TruncatedSVD and the mixture model are fitted on every row
# before the train/test split further down, so the held-out rows influence
# the features.
tsvd = TruncatedSVD(n_components=6).fit(X)
X_tsvd = tsvd.transform(X)

# time.perf_counter() is the clock the time module provides for measuring
# short durations; the calls timed here finish in milliseconds.
# The model is a GaussianMixture, so it is named `gmm` rather than `kmeans`.
start = time.perf_counter()
gmm = GaussianMixture(2).fit(X_tsvd)
train_time = time.perf_counter() - start
print("Train time: " + str(train_time))

# Keep the labels produced by the timed call rather than predicting twice.
start = time.perf_counter()
y_tsvd = gmm.predict(X_tsvd)
query_time = time.perf_counter() - start
print("Query time: " + str(query_time))

# Feature table = SVD components followed by the mixture component id.
result = pd.concat([pd.DataFrame(X_tsvd), pd.DataFrame(y_tsvd)], axis=1, sort=False)
# concat leaves two columns labelled 0; relabel 0..n-1 from the actual width so
# the labels are unique and cannot drift out of step with n_components.
result.columns = list(range(result.shape[1]))

clf = MLPClassifier(max_iter=5000, hidden_layer_sizes=(5, 2), activation='logistic',
                    verbose=False, learning_rate_init=0.0001)
X_train, X_test, y_train, y_test = train_test_split(result, y, test_size=0.2)

# Mean accuracy over 20 folds of the training portion only.
cv_score = cross_val_score(clf, X_train, y_train, cv=20).mean()
print("Cross validation score: " + str(cv_score))

# From here on train_time / query_time refer to the MLP, not the clusterer.
start = time.perf_counter()
clf.fit(X_train, y_train)
train_time = time.perf_counter() - start
print("Train time: " + str(train_time))

start = time.perf_counter()
y_pred = clf.predict(X_test)
query_time = time.perf_counter() - start
print("Query time: " + str(query_time))

score = accuracy_score(y_test, y_pred)
print("Test Accuracy: " + str(score))

Train time: 0.03817915916442871
Query time: 0.0010001659393310547
Cross validation score: 0.7077500000000002
Train time: 0.5260059833526611
Query time: 0.0010089874267578125
Test Accuracy: 0.699


In [42]:

# K-means with k = 2 on the 2-component PCA projection of the audit-risk data,
# scored against the true labels `y`. Nothing is timed in this cell, so the
# unused `start = time.time()` assignment has been removed.
X, y = load_data('audit_risk.csv')
pca = PCA(n_components=2).fit(X)
X_pca = pca.transform(X)
kmeans = KMeans(2).fit(X_pca)
labels = kmeans.predict(X_pca)

# Printed in this order: V-measure, adjusted mutual information, homogeneity.
# Each call takes the true labels first and the cluster assignments second.
print(v_measure_score(y, labels))
print(adjusted_mutual_info_score(y, labels))
print(homogeneity_score(y, labels))

0.01703257895487564
0.0149246238661465
0.009012509131575818




In [43]:
# Gaussian mixture with 2 components on the 2-component PCA projection of the
# audit-risk data, scored against the true labels `y`. Nothing is timed in
# this cell, so the unused `start = time.time()` assignment has been removed.
X, y = load_data('audit_risk.csv')
pca = PCA(n_components=2).fit(X)
X_pca = pca.transform(X)
gmm = GaussianMixture(2).fit(X_pca)
# The model used to be stored under the misleading name `kmeans`; the alias is
# kept in case a later cell still reads that name.
kmeans = gmm
labels = gmm.predict(X_pca)

# Printed in this order: V-measure, adjusted mutual information, homogeneity.
# Each call takes the true labels first and the cluster assignments second.
print(v_measure_score(y, labels))
print(adjusted_mutual_info_score(y, labels))
print(homogeneity_score(y, labels))

0.00353731589969135
0.0009843862577776188
0.0017947731571505909
