### Load Fashion MNIST dataset

In [146]:
from matplotlib import pyplot as plt
import numpy as np

# import mnist_reader
def load_mnist(path, kind):
    import os
    import gzip
    import numpy as np

    """Load MNIST data from `path`"""
    labels_path = os.path.join(path,
                               '%s-labels-idx1-ubyte.gz'
                               % kind)
    images_path = os.path.join(path,
                               '%s-images-idx3-ubyte.gz'
                               % kind)

    with gzip.open(labels_path, 'rb') as lbpath:
        labels = np.frombuffer(lbpath.read(), dtype=np.uint8,
                               offset=8)

    with gzip.open(images_path, 'rb') as imgpath:
        images = np.frombuffer(imgpath.read(), dtype=np.uint8,
                               offset=16).reshape(len(labels), 784)

    return images, labels

train_set, train_labels = load_mnist('/Users/macbookpro/Desktop/MLfinalproj/fashion-mnist-master/data/fashion', kind='train')
test_set, test_labels = load_mnist('/Users/macbookpro/Desktop/MLfinalproj/fashion-mnist-master/data/fashion', kind='t10k')

### Preprocess the dataset
#### I took the first 10,000 samples from the 60,000 sample dataset, transposed the matrix so the columns would contain each sample and the rows are the features and I then centered and normalized the data

In [147]:
# print(X_train[0:1000].shape)
# np.random.shuffle(np.transpose(train_set))
# print(train_set.shape)
X_new_train = train_set[0:10000]
new_labels = train_labels[0:10000]
# X_new_train = X_new_train.transpose()
# Arranging training data so that each sample is a column in the data matrix
X = X_new_train.T # Data matrix
# X = X_train[0:1000]
# Computation of the empirical mean of data, both as a vector and as a tiled matrix
mean_vec = X.mean(axis = 1)
mean_mat = np.tile(mean_vec.reshape(X.shape[0],1),[1,X.shape[1]])

# Centered data matrix
centered_X = X - mean_mat

# Arranging test data so that each sample is a column in the data matrix
test_set = test_set.T

# Appropriate shaping of the empirical mean as a tiled matrix whose dimensions match those of test data matrix
mean_mat = np.tile(mean_vec.reshape(test_set.shape[0],1),[1,test_set.shape[1]])

# Centered test data matrix
centered_X_test = test_set - mean_mat

## PCA
#### I decided to reduce the dimensions of my dataset for k-NN in order to be able to run the algorithm on my own computer, this reduced the features down to 184 when taking the features that accounted for 95% variation of the data

In [148]:
# Singular value decomposition of X
U, s, Vh = np.linalg.svd(centered_X)
# Computation of an appropriate 'r'
r = 0; cum_sum = 0
data_energy = np.linalg.norm(centered_X)**2
for i in range(len(s)):
    cum_sum = cum_sum + s[i]**2
    if cum_sum/data_energy >= 0.95:
        r = i+1
        break
        
# Print the value of r
# display(Latex(r'The calculated value of the integer $r$ is {}.'.format(r)))

# Matrix of the top-r principal components of the centered data matrix
U_r = U[:,0:r]

# Compute the PCA-based features of the centered data using the top-r principal components
X_tilde = U_r.T@centered_X

# # Compute the PCA-based features of the centered test data using the top-r principal components
X_tilde_test = U_r.T@centered_X_test

## k-NN

In [151]:
def kNN(train, train_labels, k, test_x):
#     train = np.delete(train, np.s_[testval:testval+1], 1)
#     print(train.shape)
    # Compute distances between all training samples and the test sample
#     dist = np.array([np.linalg.norm(train[:,testval] - train[:,i]) for i in range(train.shape[1])])
    
    dist = np.array([np.linalg.norm(test_x - train[:,i]) for i in range(train.shape[1])])
    
    # Sort the distances and find the indices of the k-NNs
    sort_dist = dist.argsort() # Sorting
#     print(sort_dist)
    kNN_labels = train_labels[sort_dist[0:k]] # Indices
#     print(kNN_labels)
    estlabels = np.zeros(10)
    for i in range(len(kNN_labels)):
        estlabels[kNN_labels[i]] += 1
#     print(estlabels)
    return np.argmax(estlabels)
#     est_labels[0] = 1;
    # Return the label that occurs most frequently within the k-NNs
#     if np.size(np.where(kNN_labels==0)) >= np.size(np.where(kNN_labels==1)):
#         return 0
#     else:
#         return 1
#     print(kNN_labels)

In [152]:
# kNN(X_tilde, new_labels, 141, 3)
est_labels = np.array([kNN(X_tilde,new_labels,100,X_tilde_test[:,i]) for i in range(1000)])

# print(est_labels)
# print(test_labels[0:1000])
acc_label = np.equal(est_labels,test_labels[0:1000])
# print(acc_label)
# Calculation of the average classification error
ave_clf_err = (np.size(np.where(acc_label==False)))/acc_label.size

### The average classification error of k-NN on this dataset came out to be about 0.19, if I could run the entire dataset on my computer I believe the error would be a lot smaller

In [153]:
print(ave_clf_err)

0.195


## Logistic Regression
### For logistic regression I chose to pick the first two classes of my dataset, class 0 and class 1.

In [159]:
X_new = []
labels = []
for i in range(train_set.shape[0]):
    if(train_labels[i] == 0):
        X_new.append(train_set[i])
        labels.append(0)
    elif(train_labels[i] == 1):
        X_new.append(train_set[i])
        labels.append(0)
X_new = np.array(X_new)
labels = np.array(labels)
print(X_new.shape)
print(labels.shape)
X_new = X_new.T

X_test = []
testlabels = []
for i in range(test_set.shape[0]):
    if(test_labels[i] == 0):
        X_test.append(test_set[i])
        testlabels.append(0)
    elif(test_labels[i] == 1):
        X_test.append(test_set[i])
        testlabels.append(0)
        
X_test = np.array(X_test)
testlabels = np.array(testlabels)
print(X_test.shape)
print(testlabels.shape)
X_test = X_test.T

(12000, 784)
(12000,)
(167, 10000)
(167,)


In [None]:
def sigmoid(z):
    
    s = 1.0 / (1.0 + np.exp(-z))
    
    return s

def initialize(dim):
    w = np.zeros((dim,1))
    b = 0
    
    assert (w.shape == (dim,1))
    assert (isinstance(b, float) or isinstance(b,int))
    
    return w,b


def propagate(w, b, X, Y):
    
    m = X.shape[1]
    
    z = np.dot(w.T,X)+b
    A = sigmoid(z)
    cost = -1.0/m*np.sum(Y*np.log(A)+(1.0-Y)*np.log(1.0-A))
    
    dw = 1.0/m*np.dot(X, (A-Y).T)
    db = 1.0/m*np.sum(A-Y)
    
    assert (dw.shape == w.shape)
    assert (db.dtype == float)
    
    cost = np.squeeze(cost)
    assert (cost.shape == ())

    return grads, cost

def optimize(w, b, X, Y, num_iterations, learning_rate, print_cost = False):
    
    for i in range(num_iterations):
        
        w = w - learning_rate*dw
        b = b - learning_rate*db
        
    return params, grads

def predict (w, b, X):
    m = X.shape[1]
    Y_prediction = np.zeros((1,m))
    w = w.reshape(X.shape[0],1)
    
    A = sigmoid (np.dot(w.T, X)+b)
    
    for i in range(A.shape[1]):
        if (A[:,i] > 0.5): 
            Y_prediction[:, i] = 1
        elif (A[:,i] <= 0.5):
            Y_prediction[:, i] = 0
            
    assert (Y_prediction.shape == (1,m))
    
    return Y_prediction

def model (X_train, Y_train, X_test, Y_test, num_iterations = 1000, learning_rate = 0.5, print_cost = False):
    
    w, b = initialize(X_train.shape[0])
    parameters, grads, costs = optimize(w, b, X_train, Y_train, num_iterations, learning_rate, print_cost)
    
    w = parameters["w"]
    b = parameters["b"]
    
    Y_prediction_test = predict (w, b, X_test)
    Y_prediction_train = predict (w, b, X_train)
    
    train_accuracy = 100.0 - np.mean(np.abs(Y_prediction_train-Y_train)*100.0)
    test_accuracy = 100.0 - np.mean(np.abs(Y_prediction_test-Y_test)*100.0)
    
    return d

In [None]:
d = model (X_new, labels, X_test, testlabels[0], num_iterations = 4000, learning_rate = 0.05, print_cost = True)

In [None]:
lr.predict(X_test)

## K-means clustering

In [180]:
cluster = np.random.randint(10,size=10000)
# mean = np.zeros((3,2))
# population = np.zeros((3,1))

print(X_tilde.shape)
X_t = X_tilde.T
print(X_t.shape)
change = True
num_of_iter = 0
while(change):
    population = np.zeros((10,1))
    mean = np.zeros((10,784))
# print(mean.shape)
    for i in range(10000):
        population[cluster[i]] += 1
        mean[cluster[i]] += X_t[i]
    for i in range(10):
        mean[i] = mean[i]/population[i]
    change = False
    for i in range(10000):
        for j in range(10):
            if j != cluster[i]:
                if np.linalg.norm(X_t[i]-mean[j]) < np.linalg.norm(X_t[i]-mean[cluster[i]]):
                    cluster[i] = j
                    change = True
    num_of_iter += 1
print(population)


(182, 10000)
(10000, 182)


ValueError: operands could not be broadcast together with shapes (784,) (182,) (784,) 

In [181]:
print("k-means converged after ", num_of_iter, " iterations")
print(cluster.shape)
acc_label = np.equal(cluster,new_labels)
ave_clf_err = (np.size(np.where(acc_label==False)))/acc_label.size
print(ave_clf_err)
# x0 = []
# x1 = []
# x2 = []
# for i in range(3000):
#     if cluster[i] == 0:
#         x0.append(X[i])
#     elif(cluster[i] == 1):
#         x1.append(X[i])
#     elif(cluster[i] == 1):
#         x1.append(X[i])
#     elif(cluster[i] == 1):
#         x1.append(X[i])
#     else:
#         x2.append(X[i])
        
# x0 = np.asarray(x0)
# x1 = np.asarray(x1)
# x2 = np.asarray(x2)

k-means converged after  0  iterations
(10000,)
0.8987


## SVM

In [182]:
w = np.zeros(X_tilde.shape)
epochs = 1
alpha = 0.0001 # learning rate

while(epochs < 10000): # one epoch is one iteration through the complete dataset
    y = np.sum(w * X_tilde, axis = 1)
    prod = y.reshape((-1,1)) * new_labels.reshape((-1,1))
    if(epochs % 1000 == 0):
        print(epochs)
    index = 0
    for val in prod:
        if(val >= 1):
            cost = 0
            w = w - alpha * (2 * 1/epochs * w) # updating the parameters
            
        else:
            cost = 1 - val
            w = w + alpha * (X_tilde[index] * new_labels[index] - 2 * 1/epochs * w) # iterating over all points
        index += 1
    epochs += 1

ValueError: operands could not be broadcast together with shapes (182,1) (10000,1) 