In [77]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.decomposition import FastICA, PCA
from sklearn import random_projection
from sklearn.decomposition import TruncatedSVD

from sklearn.metrics import accuracy_score
from sklearn.model_selection import (GridSearchCV, train_test_split, validation_curve)   
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import preprocessing
from yellowbrick.model_selection import LearningCurve, ValidationCurve

from sklearn.model_selection import cross_val_score
import time

np.random.seed(42)

# Utilities

In [78]:
def load_data(ds):
    df = pd.read_csv("data/" + ds)
    X, y = df.iloc[:, :-1], df.iloc[:, -1]
    X, y = X.to_numpy(), y.to_numpy()
    return X, y

# Personal Loan Dataset

## PCA

In [79]:
X, y = load_data('bank_personal_loan_modelling.csv')
pca = PCA(n_components= 10).fit(X)
X_pca = pca.transform(X)

clf = MLPClassifier(max_iter= 5000, hidden_layer_sizes=(5,2), activation='logistic', verbose=False, learning_rate_init=0.0001)
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2)

cv_score = cross_val_score(clf, X_train, y_train, cv=20).mean()
print("Cross validation score: " + str(cv_score))

start = time.time()
clf.fit(X_train, y_train)
train_time = time.time() - start
print("Train time: " + str(train_time))

start = time.time()
y_pred = clf.predict(X_test)
query_time = time.time() - start
print("Query time: " + str(query_time))

score = accuracy_score(y_test, y_pred)
print("Test Accuracy: " + str(score))

Cross validation score: 0.7055
Train time: 0.9665329456329346
Query time: 0.0010025501251220703
Test Accuracy: 0.708


## ICA

In [80]:
X, y = load_data('bank_personal_loan_modelling.csv')
ica = FastICA(n_components= 11, max_iter=10000, tol=0.0001).fit(X)
X_ica = ica.transform(X)

clf = MLPClassifier(max_iter= 5000, hidden_layer_sizes=(5,2), activation='logistic', verbose=False, learning_rate_init=0.0001)
X_train, X_test, y_train, y_test = train_test_split(X_ica, y, test_size=0.2)

cv_score = cross_val_score(clf, X_train, y_train, cv=20).mean()
print("Cross validation score: " + str(cv_score))

start = time.time()
clf.fit(X_train, y_train)
train_time = time.time() - start
print("Train time: " + str(train_time))

start = time.time()
y_pred = clf.predict(X_test)
query_time = time.time() - start
print("Query time: " + str(query_time))

score = accuracy_score(y_test, y_pred)
print("Test Accuracy: " + str(score))



Cross validation score: 0.70875
Train time: 2.3448617458343506
Query time: 0.0008947849273681641
Test Accuracy: 0.695


## Randomized Projections

In [81]:
X, y = load_data('bank_personal_loan_modelling.csv')
rp = random_projection.SparseRandomProjection(n_components=6)
X_rp=rp.fit_transform(X)

clf = MLPClassifier(max_iter= 5000, hidden_layer_sizes=(5,2), activation='logistic', verbose=False, learning_rate_init=0.0001)
X_train, X_test, y_train, y_test = train_test_split(X_rp, y, test_size=0.2)

cv_score = cross_val_score(clf, X_train, y_train, cv=20).mean()
print("Cross validation score: " + str(cv_score))

start = time.time()
clf.fit(X_train, y_train)
train_time = time.time() - start
print("Train time: " + str(train_time))

start = time.time()
y_pred = clf.predict(X_test)
query_time = time.time() - start
print("Query time: " + str(query_time))

score = accuracy_score(y_test, y_pred)
print("Test Accuracy: " + str(score))

Cross validation score: 0.7034999999999998
Train time: 1.311185359954834
Query time: 0.000995635986328125
Test Accuracy: 0.716


## SVD

In [82]:
X, y = load_data('bank_personal_loan_modelling.csv')
tsvd = TruncatedSVD(n_components=4)
X_tsvd = tsvd.fit_transform(X)

clf = MLPClassifier(max_iter= 5000, hidden_layer_sizes=(5,2), activation='logistic', verbose=False, learning_rate_init=0.0001)
X_train, X_test, y_train, y_test = train_test_split(X_tsvd, y, test_size=0.2)

cv_score = cross_val_score(clf, X_train, y_train, cv=20).mean()
print("Cross validation score: " + str(cv_score))

start = time.time()
clf.fit(X_train, y_train)
train_time = time.time() - start
print("Train time: " + str(train_time))

start = time.time()
y_pred = clf.predict(X_test)
query_time = time.time() - start
print("Query time: " + str(query_time))

score = accuracy_score(y_test, y_pred)
print("Test Accuracy: " + str(score))

Cross validation score: 0.7065
Train time: 2.059286594390869
Query time: 0.0
Test Accuracy: 0.704


## No Reduction

In [83]:
X, y = load_data('bank_personal_loan_modelling.csv')

clf = MLPClassifier(max_iter= 5000, hidden_layer_sizes=(5,2), activation='logistic', verbose=False, learning_rate_init=0.0001)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

cv_score = cross_val_score(clf, X_train, y_train, cv=20).mean()
print("Cross validation score: " + str(cv_score))

start = time.time()
clf.fit(X_train, y_train)
train_time = time.time() - start
print("Train time: " + str(train_time))

start = time.time()
y_pred = clf.predict(X_test)
query_time = time.time() - start
print("Query time: " + str(query_time))

score = accuracy_score(y_test, y_pred)
print("Test Accuracy: " + str(score))

Cross validation score: 0.71325
Train time: 0.21257829666137695
Query time: 0.0009968280792236328
Test Accuracy: 0.677
