In [None]:
# Importing Modules
import getpass    # For users
import h5py       # For data files input
import numpy as np
import time
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

# Setting up data input
# The HDF5 files are opened by relative path, so they are looked up in the
# current working directory.  Loading is gated on the login name.
user = getpass.getuser()
desktop_or_laptop = 'd' # 'l' # This is for Dan

if user == 'scgst':
    with h5py.File('images_training.h5','r') as H:
        X_train = np.copy(H['data'])
    with h5py.File('labels_training.h5','r') as H:
        y_train = np.copy(H['label'])
    with h5py.File('images_testing.h5','r') as H:
        # Only the first 2000 test images are kept (presumably because labels
        # exist for just those -- see the label file name).  Slicing the
        # dataset reads those rows directly instead of copying the whole
        # dataset into memory first.
        X_test = H['data'][:2000]
    with h5py.File('labels_testing_2000.h5','r') as H:
        y_test = np.copy(H['label'])
else:
    # Without this branch an unrecognised user loads nothing and the script
    # only fails later with a NameError on X_train.
    raise RuntimeError(
        "No data files are configured for user %r (expected 'scgst')" % user)
        
# Transforming and normalising the feature inputs
# Every image is flattened into a 784-element row, cast to float32 and then
# passed through sklearn's normalize(), which with its default arguments
# rescales each row (sample) to unit L2 norm.
X_train, X_test = (
    normalize(np.reshape(images, (-1, 784)).astype(np.float32))
    for images in (X_train, X_test)
)

# print(X_train.shape)
# print(X_test.shape)

# Hold out part of the training data as a validation set
# `i` is the held-out fraction; the commented loop lists other candidates.
# for i in [0.25, 0.2, 0.1, 0.05, 0.02, 0.01]:
i = 0.2
validate_percentage = i
print("--------------- train data proportion", (1-i), "---------------")

X_train_i, X_validate_i, y_train_i, y_validate_i = train_test_split(
    X_train, y_train, test_size = validate_percentage, random_state = 109)

# Dimensionality reduction: PCA is fitted on the training split only and the
# same projection is then applied to the validation split.
# The randomized SVD solver is stochastic, so it is seeded (like the split
# above) to make the reported scores reproducible from run to run.
component_nbr = 100
pca = PCA(n_components = component_nbr, svd_solver = 'randomized', whiten = True,
          random_state = 109).fit(X_train_i)
X_train_i = pca.transform(X_train_i)
X_validate_i = pca.transform(X_validate_i)
    
#     print(X_train_i.shape)
#     print(y_train_i.shape)
#     print(X_validate_i.shape)
#     print(y_validate_i.shape)

# Gaussian Naive Bayes
# Fit on the training split, predict on both splits, then report the scores.
# The timer covers fitting plus both predict calls.  The "Testing ..." lines
# are computed on the held-out split (X_validate_i / y_validate_i).
start_0 = time.time()

gnb = GaussianNB()
gnb_fit = gnb.fit(X_train_i, y_train_i)

y_train_pred_i, y_validate_pred_i = (
    gnb_fit.predict(features) for features in (X_train_i, X_validate_i)
)

end_0 = time.time()

# Collect the scores first, then print them in the established order.
gnb_report = [
    ("Training Accuracy:", metrics.accuracy_score(y_train_i, y_train_pred_i)),
    ("Testing Accuracy:", metrics.accuracy_score(y_validate_i, y_validate_pred_i)),
    ("Training Precision:", metrics.precision_score(y_train_i, y_train_pred_i, average = 'weighted')),
    ("Testing Precision:", metrics.precision_score(y_validate_i, y_validate_pred_i, average = 'weighted')),
    ("Training Recall:", metrics.recall_score(y_train_i, y_train_pred_i, average = 'weighted')),
    ("Testing Recall:", metrics.recall_score(y_validate_i, y_validate_pred_i, average = 'weighted')),
    ("Testing F1:", metrics.f1_score(y_validate_i, y_validate_pred_i, average = 'weighted')),
]

print("Gaussian Naive Bayes Method")
print("Time:", end_0 - start_0, "seconds")
for metric_label, metric_value in gnb_report:
    print(metric_label, metric_value)
print(" ")

# KNN - SKlearn
# 10-nearest-neighbour classifier fitted on the training split.  The timer
# covers fitting plus both predict calls.  The "Testing ..." lines are
# computed on the held-out split (X_validate_i / y_validate_i).
start_0 = time.time()

KNN = KNeighborsClassifier(n_neighbors = 10)
KNN.fit(X_train_i, y_train_i)

y_train_pred_i, y_validate_pred_i = (
    KNN.predict(features) for features in (X_train_i, X_validate_i)
)

end_0 = time.time()

# Collect the scores first, then print them in the established order.
knn_report = [
    ("Training Accuracy:", metrics.accuracy_score(y_train_i, y_train_pred_i)),
    ("Testing Accuracy:", metrics.accuracy_score(y_validate_i, y_validate_pred_i)),
    ("Training Precision:", metrics.precision_score(y_train_i, y_train_pred_i, average = 'weighted')),
    ("Testing Precision:", metrics.precision_score(y_validate_i, y_validate_pred_i, average = 'weighted')),
    ("Training Recall:", metrics.recall_score(y_train_i, y_train_pred_i, average = 'weighted')),
    ("Testing Recall:", metrics.recall_score(y_validate_i, y_validate_pred_i, average = 'weighted')),
    ("Testing F1:", metrics.f1_score(y_validate_i, y_validate_pred_i, average = 'weighted')),
]

print("KNN")
print("Time:", end_0 - start_0, "seconds")
for metric_label, metric_value in knn_report:
    print(metric_label, metric_value)
print(" ")


# Logistic Regression - SKlearn
# Multinomial logistic regression (lbfgs solver) fitted on the training
# split.  The timer covers fitting plus both predict calls.  The
# "Testing ..." lines are computed on the held-out split
# (X_validate_i / y_validate_i).
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases --
# confirm against the installed version.
start_0 = time.time()

lgr = LogisticRegression(random_state = 0, solver = 'lbfgs', multi_class = 'multinomial')
lgr.fit(X_train_i, y_train_i)

y_train_pred_i = lgr.predict(X_train_i)
y_validate_pred_i = lgr.predict(X_validate_i)

end_0 = time.time()
# Heading spelling fixed ("Logestic" -> "Logistic").
print("Logistic Regression")
print("Time:", end_0 - start_0, "seconds")
print("Training Accuracy:", metrics.accuracy_score(y_train_i, y_train_pred_i))
print("Testing Accuracy:", metrics.accuracy_score(y_validate_i, y_validate_pred_i))
print("Training Precision:", metrics.precision_score(y_train_i, y_train_pred_i, average = 'weighted'))
print("Testing Precision:", metrics.precision_score(y_validate_i, y_validate_pred_i, average = 'weighted'))
print("Training Recall:", metrics.recall_score(y_train_i, y_train_pred_i, average = 'weighted'))
print("Testing Recall:", metrics.recall_score(y_validate_i, y_validate_pred_i, average = 'weighted'))
print("Testing F1:", metrics.f1_score(y_validate_i, y_validate_pred_i, average = 'weighted'))
print(" ")


# SVM - SKlearn
# RBF-kernel support vector classifier fitted on the training split.  The
# timer covers fitting plus both predict calls.  The "Testing ..." lines are
# computed on the held-out split (X_validate_i / y_validate_i).
start_0 = time.time()

svc = svm.SVC(kernel = 'rbf', C = 1E6, gamma = 'scale', decision_function_shape = 'ovo') #kernel could also be linear, etc..
svc.fit(X_train_i, y_train_i)

y_train_pred_i, y_validate_pred_i = (
    svc.predict(features) for features in (X_train_i, X_validate_i)
)

end_0 = time.time()

# Collect the scores first, then print them in the established order.
svc_report = [
    ("Training Accuracy:", metrics.accuracy_score(y_train_i, y_train_pred_i)),
    ("Testing Accuracy:", metrics.accuracy_score(y_validate_i, y_validate_pred_i)),
    ("Training Precision:", metrics.precision_score(y_train_i, y_train_pred_i, average = 'weighted')),
    ("Testing Precision:", metrics.precision_score(y_validate_i, y_validate_pred_i, average = 'weighted')),
    ("Training Recall:", metrics.recall_score(y_train_i, y_train_pred_i, average = 'weighted')),
    ("Testing Recall:", metrics.recall_score(y_validate_i, y_validate_pred_i, average = 'weighted')),
    ("Testing F1:", metrics.f1_score(y_validate_i, y_validate_pred_i, average = 'weighted')),
]

print("SVM")
print("Time:", end_0 - start_0, "seconds")
for metric_label, metric_value in svc_report:
    print(metric_label, metric_value)
print(" ")


--------------- train data proportion 0.8 ---------------
Gaussian Naive Bayes Method
Time: 0.7220706939697266 seconds
Training Accuracy: 0.7692916666666667
Testing Accuracy: 0.7645
Training Precision: 0.77368319688817
Testing Precision: 0.7685281797846424
Training Recall: 0.7692916666666667
Testing Recall: 0.7645
Testing F1: 0.7603272702641615
 


In [None]:
# Importing Modules
import getpass    # For users
import h5py       # For data files input
import numpy as np
import time
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

# Setting up data input
# The HDF5 files are opened by relative path, so they are looked up in the
# current working directory.  Loading is gated on the login name.
user = getpass.getuser()
desktop_or_laptop = 'd' # 'l' # This is for Dan

if user == 'scgst':
    with h5py.File('images_training.h5','r') as H:
        X_train = np.copy(H['data'])
    with h5py.File('labels_training.h5','r') as H:
        y_train = np.copy(H['label'])
    with h5py.File('images_testing.h5','r') as H:
        # Only the first 2000 test images are kept (presumably because labels
        # exist for just those -- see the label file name).  Slicing the
        # dataset reads those rows directly instead of copying the whole
        # dataset into memory first.
        X_test = H['data'][:2000]
    with h5py.File('labels_testing_2000.h5','r') as H:
        y_test = np.copy(H['label'])
else:
    # Without this branch an unrecognised user loads nothing and the script
    # only fails later with a NameError on X_train.
    raise RuntimeError(
        "No data files are configured for user %r (expected 'scgst')" % user)
        
# Transforming and normalising the feature inputs
# Every image is flattened into a 784-element row, cast to float32 and then
# passed through sklearn's normalize(), which with its default arguments
# rescales each row (sample) to unit L2 norm.
X_train, X_test = (
    normalize(np.reshape(images, (-1, 784)).astype(np.float32))
    for images in (X_train, X_test)
)

# print(X_train.shape)
# print(X_test.shape)

# Use the test set instead of a validation split
# Here the models are fitted on the full training set and scored on the
# loaded test set.  The *_i / *_validate_i names are kept so the evaluation
# code below can stay the same as in the validation run.
X_train_i = X_train
X_validate_i = X_test
y_train_i = y_train
y_validate_i = y_test

# Dimensionality reduction: PCA is fitted on the training data only and the
# same projection is then applied to the test data.
# The randomized SVD solver is stochastic, so it is seeded to make the
# reported scores reproducible from run to run.
component_nbr = 100
pca = PCA(n_components = component_nbr, svd_solver = 'randomized', whiten = True,
          random_state = 109).fit(X_train_i)
X_train_i = pca.transform(X_train_i)
X_validate_i = pca.transform(X_validate_i)

#     print(X_train_i.shape)
#     print(y_train_i.shape)
#     print(X_validate_i.shape)
#     print(y_validate_i.shape)

# Gaussian Naive Bayes
# Fit on X_train_i, predict on both X_train_i and X_validate_i, then report
# the scores.  The timer covers fitting plus both predict calls.
start_0 = time.time()

gnb = GaussianNB()
gnb_fit = gnb.fit(X_train_i, y_train_i)

y_train_pred_i, y_validate_pred_i = (
    gnb_fit.predict(features) for features in (X_train_i, X_validate_i)
)

end_0 = time.time()

# Collect the scores first, then print them in the established order.
gnb_report = [
    ("Training Accuracy:", metrics.accuracy_score(y_train_i, y_train_pred_i)),
    ("Testing Accuracy:", metrics.accuracy_score(y_validate_i, y_validate_pred_i)),
    ("Testing Precision:", metrics.precision_score(y_validate_i, y_validate_pred_i, average = 'weighted')),
    ("Testing Recall:", metrics.recall_score(y_validate_i, y_validate_pred_i, average = 'weighted')),
    ("Testing F1:", metrics.f1_score(y_validate_i, y_validate_pred_i, average = 'weighted')),
]

print("Gaussian Naive Bayes Method")
print("Time:", end_0 - start_0, "seconds")
for metric_label, metric_value in gnb_report:
    print(metric_label, metric_value)
print(" ")


# KNN - SKlearn
# 10-nearest-neighbour classifier fitted on X_train_i and scored on both
# X_train_i and X_validate_i.  The timer covers fitting plus both predict
# calls.
start_0 = time.time()

KNN = KNeighborsClassifier(n_neighbors = 10)
KNN.fit(X_train_i, y_train_i)

y_train_pred_i, y_validate_pred_i = (
    KNN.predict(features) for features in (X_train_i, X_validate_i)
)

end_0 = time.time()

# Collect the scores first, then print them in the established order.
knn_report = [
    ("Training Accuracy:", metrics.accuracy_score(y_train_i, y_train_pred_i)),
    ("Testing Accuracy:", metrics.accuracy_score(y_validate_i, y_validate_pred_i)),
    ("Testing Precision:", metrics.precision_score(y_validate_i, y_validate_pred_i, average = 'weighted')),
    ("Testing Recall:", metrics.recall_score(y_validate_i, y_validate_pred_i, average = 'weighted')),
    ("Testing F1:", metrics.f1_score(y_validate_i, y_validate_pred_i, average = 'weighted')),
]

print("KNN")
print("Time:", end_0 - start_0, "seconds")
for metric_label, metric_value in knn_report:
    print(metric_label, metric_value)
print(" ")


# Logistic Regression - SKlearn
# Multinomial logistic regression (lbfgs solver) fitted on X_train_i and
# scored on both X_train_i and X_validate_i.  The timer covers fitting plus
# both predict calls.
start_0 = time.time()

lgr = LogisticRegression(random_state = 0, solver = 'lbfgs', multi_class = 'multinomial')
lgr.fit(X_train_i, y_train_i)

y_train_pred_i, y_validate_pred_i = (
    lgr.predict(features) for features in (X_train_i, X_validate_i)
)

end_0 = time.time()

# Collect the scores first, then print them in the established order.
lgr_report = [
    ("Training Accuracy:", metrics.accuracy_score(y_train_i, y_train_pred_i)),
    ("Testing Accuracy:", metrics.accuracy_score(y_validate_i, y_validate_pred_i)),
    ("Testing Precision:", metrics.precision_score(y_validate_i, y_validate_pred_i, average = 'weighted')),
    ("Testing Recall:", metrics.recall_score(y_validate_i, y_validate_pred_i, average = 'weighted')),
    ("Testing F1:", metrics.f1_score(y_validate_i, y_validate_pred_i, average = 'weighted')),
]

print("Logistic Regression")
print("Time:", end_0 - start_0, "seconds")
for metric_label, metric_value in lgr_report:
    print(metric_label, metric_value)
print(" ")


# SVM - SKlearn
# RBF-kernel support vector classifier fitted on X_train_i and scored on both
# X_train_i and X_validate_i.  The timer covers fitting plus both predict
# calls.
start_0 = time.time()

svc = svm.SVC(kernel = 'rbf', C = 1E6, gamma = 'scale', decision_function_shape = 'ovo') #kernel could also be linear, etc..
svc.fit(X_train_i, y_train_i)

y_train_pred_i, y_validate_pred_i = (
    svc.predict(features) for features in (X_train_i, X_validate_i)
)

end_0 = time.time()

# Collect the scores first, then print them in the established order.
svc_report = [
    ("Training Accuracy:", metrics.accuracy_score(y_train_i, y_train_pred_i)),
    ("Testing Accuracy:", metrics.accuracy_score(y_validate_i, y_validate_pred_i)),
    ("Testing Precision:", metrics.precision_score(y_validate_i, y_validate_pred_i, average = 'weighted')),
    ("Testing Recall:", metrics.recall_score(y_validate_i, y_validate_pred_i, average = 'weighted')),
    ("Testing F1:", metrics.f1_score(y_validate_i, y_validate_pred_i, average = 'weighted')),
]

print("SVM")
print("Time:", end_0 - start_0, "seconds")
for metric_label, metric_value in svc_report:
    print(metric_label, metric_value)
print(" ")