## ETL

In [67]:
# Importing Modules
import getpass    # For users
import h5py       # For data files input
import numpy as np
import time
from sklearn.preprocessing import normalize

# Kept so that any cell relying on these names still finds them; the data
# loading below no longer depends on who is running the notebook.
user = getpass.getuser()
desktop_or_laptop = 'd' # 'l' # This is for Dan

# Only the first 2000 test images have labels (labels_testing_2000.h5).
n_test_labelled = 2000


def _read_h5(path, key, n_rows=None):
    """Return dataset `key` of HDF5 file `path` as an in-memory NumPy array.

    When `n_rows` is given, only the first `n_rows` rows are read from disk
    instead of loading the whole dataset and discarding the rest.
    """
    with h5py.File(path, 'r') as h5_file:
        return h5_file[key][:n_rows]


# Previously the files were only read when the login name was 'scgst', so any
# other user hit a NameError on X_train a few lines later. The paths are
# relative to the working directory, so they are read unconditionally now; a
# missing file surfaces as h5py's own "unable to open file" error.
X_train = _read_h5('images_training.h5', 'data')
y_train = _read_h5('labels_training.h5', 'label')
X_test = _read_h5('images_testing.h5', 'data', n_rows=n_test_labelled)
y_test = _read_h5('labels_testing_2000.h5', 'label')

# Fail early (rather than inside a metric call much later) if the labelled
# subset and the label file disagree in length.
if X_test.shape[0] != y_test.shape[0]:
    raise ValueError(
        "Test features and labels differ in length: "
        f"{X_test.shape[0]} images vs {y_test.shape[0]} labels"
    )

# Transforming the feature inputs: flatten every sample to a 784-long vector
# (presumably 28x28 images - TODO confirm against the data description).
X_train = X_train.reshape(-1, 784)
X_test = X_test.reshape(-1, 784)

# Normalisation. Note: sklearn's `normalize` defaults to norm='l2', axis=1,
# i.e. every *sample* (row) is rescaled to unit L2 norm; it is not a
# per-feature standardisation.
X_train = normalize(X_train.astype('float32'))
X_test = normalize(X_test.astype('float32'))

In [68]:
# Choose what the models are evaluated on: the labelled test set (default)
# or a validation set carved out of the training data.
from sklearn.model_selection import train_test_split

validate_switch = False

if validate_switch:
    # `test_size` is the held-out fraction, so 0.7 sends 70% of the training
    # data to validation and keeps 30% for fitting.
    validate_percentage = 0.7
    X_train_validate, X_validate, y_train_validate, y_validate = train_test_split(
        X_train,
        y_train,
        test_size=validate_percentage,
        random_state=109,
    )

    # From here on the "test" names refer to the validation split.
    # NOTE: this overwrites X_train / y_train in place, so the cell must not
    # be re-run without re-running the ETL cell first.
    X_train, y_train = X_train_validate, y_train_validate
    X_test, y_test = X_validate, y_validate

In [69]:
# PCA
# https://towardsdatascience.com/pca-using-python-scikit-learn-e653f8989e60
from sklearn.decomposition import PCA

PCA_switch = True

if PCA_switch:
    component_nbr = 100
    # The randomized SVD solver is stochastic: without a fixed random_state
    # the projection (and every downstream accuracy figure) changes from run
    # to run. Pin the seed so that Restart & Run All is reproducible.
    pca = PCA(
        n_components=component_nbr,
        svd_solver='randomized',
        whiten=True,
        random_state=109,
    ).fit(X_train)  # fitted on the training data only

    # Apply the same fitted projection to both sets.
    # NOTE: X_train / X_test are overwritten, so re-running this cell alone
    # would apply PCA to already-projected data; re-run from the ETL cell.
    X_train = pca.transform(X_train)
    X_test = pca.transform(X_test)

## Gaussian Naive Bayes - SKlearn

In [71]:
# Gaussian Naive Bayes: fit on the training set, predict the evaluation set,
# then report wall-clock time, accuracy and the confusion matrix.
# https://www.datacamp.com/community/tutorials/naive-bayes-scikit-learn
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB

# The timed region covers both fitting and prediction.
start_0 = time.time()
gnb = GaussianNB()
gnb_fit = gnb.fit(X_train, y_train)  # fit() hands back the estimator itself
y_pred = gnb_fit.predict(X_test)
end_0 = time.time()

print("Gaussian Naive Bayes Method takes", end_0 - start_0, "seconds")

gnb_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Gaussian Naive Bayes Method's accuracy is", gnb_accuracy)

# Rows are true classes, columns are predicted classes.
print("Gaussian Naive Bayes Method's confusion matrix:")
gnb_confusion = metrics.confusion_matrix(y_test, y_pred)
print(gnb_confusion)

Gaussian Naive Bayes Method takes 0.11070466041564941 seconds
Gaussian Naive Bayes Method's accuracy is 0.7575
Gaussian Naive Bayes Method's confusion matrix:
[[129   0   0  15   0  10  19   0   5   0]
 [  2 174   2   9   0   2   2   0   0   0]
 [  1   0 134   3  44  11  13   0   4   0]
 [  4   0   2 153  12  14   4   0   2   0]
 [  0   0  19  10 159  11  10   0   3   0]
 [  0   0   0   1   0 182   0  23   1   7]
 [ 34   0  19  10  53  16  62   0   6   0]
 [  0   0   0   0   0  23   0 159   0  16]
 [  0   0   3   2   2  11   4   1 196   0]
 [  0   0   0   0   0  12   0   6   2 167]]


## SVM - SKlearn

In [72]:
# SVM - SKlearn: RBF-kernel support vector classifier, timed end to end.
# https://www.datacamp.com/community/tutorials/svm-classification-scikit-learn-python
from sklearn import metrics
from sklearn import svm

# Hyper-parameters collected in one place; other kernels (e.g. 'linear')
# can be swapped in here.
svc_params = dict(
    kernel='rbf',
    C=1E6,
    gamma='scale',
    decision_function_shape='ovo',
)

# The timed region covers both fitting and prediction.
start_0 = time.time()
svc = svm.SVC(**svc_params)
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
end_0 = time.time()

print("SVM Method takes", end_0 - start_0, "seconds")

svc_accuracy = metrics.accuracy_score(y_test, y_pred)
print("SVM Method's accuracy is", svc_accuracy)

# Rows are true classes, columns are predicted classes.
print("SVM Method's confusion matrix:")
svc_confusion = metrics.confusion_matrix(y_test, y_pred)
print(svc_confusion)

SVM Method takes 116.88875722885132 seconds
SVM Method's accuracy is 0.877
SVM Method's confusion matrix:
[[149   0   4   4   0   1  19   0   1   0]
 [  0 187   0   4   0   0   0   0   0   0]
 [  2   0 178   1  15   1  13   0   0   0]
 [  4   1   1 169   6   4   6   0   0   0]
 [  0   1  22   9 159   0  21   0   0   0]
 [  0   0   0   0   0 208   0   2   0   4]
 [ 21   1  20   7  17   3 128   0   3   0]
 [  0   0   0   0   0   5   0 185   0   8]
 [  0   0   0   0   0   5   1   1 212   0]
 [  0   0   0   0   0   1   0   6   1 179]]


## Logistic Regression - SKlearn

In [73]:
# Logistic Regression - SKlearn: multinomial model fitted with L-BFGS,
# timed end to end.
# https://www.datacamp.com/community/tutorials/understanding-logistic-regression-python
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

# NOTE(review): `multi_class` appears to be deprecated in recent scikit-learn
# releases - confirm against the installed version before upgrading.
lgr_params = dict(
    random_state=0,
    solver='lbfgs',
    multi_class='multinomial',
)

# The timed region covers both fitting and prediction.
start_0 = time.time()
lgr = LogisticRegression(**lgr_params)
lgr.fit(X_train, y_train)
y_pred = lgr.predict(X_test)
end_0 = time.time()

print("Logistic Regression takes", end_0 - start_0, "seconds")

lgr_accuracy = metrics.accuracy_score(y_test, y_pred)
print("Logistic Regression's accuracy is", lgr_accuracy)

# Rows are true classes, columns are predicted classes.
print("Logistic Regression's confusion matrix:")
lgr_confusion = metrics.confusion_matrix(y_test, y_pred)
print(lgr_confusion)

Logistic Regression takes 3.2173678874969482 seconds
Logistic Regression's accuracy is 0.8435
Logistic Regression's confusion matrix:
[[144   0   1  13   0   0  18   0   2   0]
 [  0 185   1   5   0   0   0   0   0   0]
 [  5   1 152   1  32   0  16   0   3   0]
 [  6   4   1 173   5   0   1   0   1   0]
 [  0   0  21   9 160   0  20   0   2   0]
 [  0   1   0   0   0 202   0   3   0   8]
 [ 34   0  25  10  32   0  96   0   3   0]
 [  0   0   0   0   0   6   0 187   0   5]
 [  1   1   1   2   0   2   4   2 206   0]
 [  0   0   0   0   0   1   0   3   1 182]]




## KNN - SKlearn

In [74]:
# KNN - SKlearn: k-nearest-neighbours classifier (k = 10), timed end to end.
# https://www.datacamp.com/community/tutorials/k-nearest-neighbor-classification-scikit-learn
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

# The timed region covers both fitting and prediction.
start_0 = time.time()

KNN = KNeighborsClassifier(n_neighbors = 10)
KNN.fit(X_train, y_train)

y_pred = KNN.predict(X_test)

end_0 = time.time()

# The report labels used to read "Logistic Regression" (copied from the
# previous cell), which misattributed these numbers to the wrong model.
print("KNN takes", end_0 - start_0, "seconds")

print("KNN's accuracy is", metrics.accuracy_score(y_test, y_pred))
# Rows are true classes, columns are predicted classes.
print("KNN's confusion matrix:")
print(metrics.confusion_matrix(y_test, y_pred))

Logistic Regression takes 25.109833240509033 seconds
Logistic Regression's accuracy is 0.8195
Logistic Regression's confusion matrix:
[[164   1   1   1   2   0   6   0   3   0]
 [  1 185   0   3   1   0   1   0   0   0]
 [  4   0 161   0  29   0  15   0   1   0]
 [  8   2   2 168   8   0   2   0   1   0]
 [  0   0  18  10 171   0  13   0   0   0]
 [  1   0   0   0   1 121   0  37   3  51]
 [ 40   0  25   6  32   0  93   0   4   0]
 [  0   0   0   0   0   0   0 185   0  13]
 [  1   1   2   1   4   0   0   0 210   0]
 [  0   0   0   0   0   0   0   5   1 181]]
