In [33]:
import numpy as np


class PCA:
    """
    Principal component analysis computed with a singular value decomposition.

    The model is x_i = mu + A.T dot lambda_i, where mu is the per-feature mean
    of the training data, the rows of A (``components``) are the k orthonormal
    directions of largest variance, and lambda_i holds the coordinates of
    sample x_i in that basis.
    """

    def __init__(self, k):
        """
        :param k: number of principal components to keep
        """
        self.k = k
        # Both are filled in by fit(): components has shape (k, n_features),
        # mean has shape (n_features,).
        self.components = None
        self.mean = None

    def fit(self, data):
        """
        finds best params for X = Mu + A * Lambda
        :param data: data of shape (number of samples, number of features)
        """
        data = np.asarray(data, dtype=float)
        # PCA directions are defined relative to the data mean (the Mu of the
        # model), so the data is centred before decomposing it.
        self.mean = data.mean(axis=0)
        # full_matrices=False avoids building the (n_samples, n_samples) U
        # matrix, which is never used.
        _, _, vt = np.linalg.svd(data - self.mean, full_matrices=False)
        # np.linalg.svd returns the singular values (a 1-D vector) sorted in
        # descending order, so the first k rows of vt already belong to the k
        # largest eigenvalues of the covariance matrix.
        self.components = vt[:self.k]

    def transform(self, data):
        """
        for given data returns Lambdas
        x_i = mu + A.T dot lambda_i
        where mu is ``self.mean``, A is ``self.components`` and lambdas are the
        projections of x_i on the linear space spanned by A's rows
        :param data: data of shape (number of samples, number of features)
        :return: array of shape (number of samples, k)
        :raises RuntimeError: if called before fit()
        """
        if self.components is None:
            raise RuntimeError("PCA.transform() called before PCA.fit()")
        # Lemma: if A dot A.T == I, the coordinates of a vector x in the basis
        # formed by A's rows are A dot x; applied row-wise this is X dot A.T.
        centered = np.asarray(data, dtype=float) - self.mean
        return np.dot(centered, self.components.T)

    def return_components(self):
        """
        :return: the fitted components, shape (k, number of features),
                 or None before fit()
        """
        return self.components


In [1]:
#!/usr/bin/env python3
"""
This is a boilerplate file to get you started with the MNIST dataset and run SVD.

This file has code to read labels and data from .gz files you can download from
http://yann.lecun.com/exdb/mnist/

By default the script looks for train-images-idx3-ubyte.gz and
train-labels-idx1-ubyte.gz in the current working directory; use
--mnist-train-data and --mnist-train-labels to point it at files
stored elsewhere.
"""
from __future__ import print_function
import argparse
import gzip
import struct
import numpy as np
import matplotlib.pyplot as plt
from PCA import PCA
from sklearn.cluster import KMeans

def parse_args(*argument_array):
    """
    Parse the command-line options of the script.

    :param argument_array: optional positional arguments forwarded unchanged
        to ``ArgumentParser.parse_args`` (e.g. a list of argument strings);
        when omitted, argparse reads ``sys.argv``
    :return: namespace with ``mnist_train_data`` and ``mnist_train_labels``
    """
    # (flag, default path, help text) for every supported option.
    option_specs = (
        ('--mnist-train-data',
         'train-images-idx3-ubyte.gz',
         'Path to train-images-idx3-ubyte.gz file '
         'downloaded from http://yann.lecun.com/exdb/mnist/'),
        ('--mnist-train-labels',
         'train-labels-idx1-ubyte.gz',
         'Path to train-labels-idx1-ubyte.gz file '
         'downloaded from http://yann.lecun.com/exdb/mnist/'),
    )
    cli = argparse.ArgumentParser()
    for flag, default_path, help_text in option_specs:
        cli.add_argument(flag, default=default_path, help=help_text)
    return cli.parse_args(*argument_array)


def _read_idx_images(path, limit):
    """Read at most `limit` images from a gzipped IDX3 file, one flat row per image."""
    with gzip.open(path, 'rb') as in_gzip:
        _magic, num, rows, columns = struct.unpack('>IIII', in_gzip.read(16))
        # Never ask for more images than the header says the file contains.
        count = min(num, limit)
        pixels = rows * columns
        raw = in_gzip.read(count * pixels)
    # A single buffer decode replaces one struct.unpack call per image.
    # reshape raises ValueError if the file is shorter than its header claims.
    return np.frombuffer(raw, dtype=np.uint8).reshape(count, pixels).astype(np.int64)


def _read_idx_labels(path, limit):
    """Read at most `limit` labels from a gzipped IDX1 file as a 1-D uint8 array."""
    with gzip.open(path, 'rb') as in_gzip:
        _magic, num = struct.unpack('>II', in_gzip.read(8))
        raw = in_gzip.read(min(num, limit))
    return np.frombuffer(raw, dtype=np.uint8)


def main(args, sample_limit=6000):
    """
    Compare the true MNIST digit labels with KMeans clusters.

    Reads the first `sample_limit` images and labels, projects the images on
    their first two principal components, shows one scatter plot per digit
    (top row) and per KMeans cluster (bottom row), then prints the 10x10
    label-vs-cluster coincidence matrix.

    :param args: namespace with ``mnist_train_data`` and ``mnist_train_labels``
        paths, as returned by parse_args()
    :param sample_limit: maximum number of samples to read from each file
    :raises ValueError: if the two files yield different numbers of samples
    """
    # Read data file into numpy matrices
    all_data = _read_idx_images(args.mnist_train_data, sample_limit)
    all_labels = _read_idx_labels(args.mnist_train_labels, sample_limit)
    if len(all_data) != len(all_labels):
        raise ValueError(
            'Read {} images but {} labels'.format(len(all_data), len(all_labels)))

    pca = PCA(5)
    pca.fit(all_data)
    # Only the first two components are plotted.
    projected = pca.transform(all_data)[:, :2]
    # Clusters are computed on the raw pixel data; the PCA projection is used
    # for plotting only.
    kmeans_labels = KMeans(n_clusters=10, random_state=0).fit_predict(all_data)

    _, axarr = plt.subplots(2, 10, figsize=(18, 4), sharey=True)
    # The projection is applied row by row, so selecting rows of `projected`
    # gives the same points as projecting each digit's samples again.
    for digit in range(10):
        points = projected[all_labels == digit]
        axarr[0][digit].scatter(points[:, 0], points[:, 1], s=1)
    for cluster in range(10):
        points = projected[kmeans_labels == cluster]
        axarr[1][cluster].scatter(points[:, 0], points[:, 1], s=1)
    # To keep a copy of the figure, call
    # plt.savefig("labels_vs_kmeans_clusters.jpg") here, before plt.show().
    plt.show()

    # coincidence_matrix[label, cluster] counts the samples with that true
    # label assigned to that cluster; np.add.at accumulates repeated index
    # pairs, which plain fancy-index assignment would not.
    coincidence_matrix = np.zeros((10, 10), dtype=int)
    np.add.at(coincidence_matrix, (all_labels, kmeans_labels), 1)
    print(coincidence_matrix)

if __name__ == '__main__':
    # Script entry point: parse the command line and run the analysis.
    main(parse_args())