# CS 51 Final Project Interface

By: Olivia Angiuli, Martin Reindl, Ty Rocca, Wilder Wohns

**Purpose:** This top-level IPython notebook is designed to be a place to see how our code works. It is also a good way for us to show off our results.

In [1]:
#!/usr/bin/env python

In [2]:
import numpy as np
import os, struct,random #,
# import sys
from numpy import append, array, int8, uint8, zeros
import Initialize
import Accuracy
import Distance
import Load
import Kmeans
import ClassifyClusters
import base64
import json
import timeit

######################################
# Load in training images and labels #
######################################

# Read the MNIST image/label pairs from the current working directory.
# NOTE(review): `prop = 5` presumably limits the training set to a fraction
# of the full data, so it is likely smaller than 60,000 images -- confirm
# against Load.load_mnist.
train_images, train_labels = Load.load_mnist(
    "training", path=os.getcwd(), prop=5)
test_images, test_labels = Load.load_mnist("testing", path=os.getcwd())

# Collapse every image into a single row vector so each data set becomes a
# 2-D array with one image per row (presumably 784 = 28 x 28 values each).
train_images_flat = np.array(list(map(np.ravel, train_images)))
test_images_flat = np.array(list(map(np.ravel, test_images)))

# Serializing numpy array - adapted from the source below
# http://stackoverflow.com/questions/3488934/simplejson-and-numpy-array
class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that additionally knows how to serialize numpy arrays."""

    def default(self, obj):
        """
        if input object is a ndarray it will be converted into a
        dict holding dtype, shape and the data base64 encoded
        """
        if isinstance(obj, np.ndarray):
            # base64 needs one contiguous buffer; views such as transposes
            # or strided slices are copied into C order first.
            contiguous = np.ascontiguousarray(obj)
            # Decode to text so the payload is a JSON string on Python 3 too.
            data_b64 = base64.b64encode(contiguous.data).decode("ascii")
            return dict(__ndarray__=data_b64,
                        dtype=str(obj.dtype),
                        shape=obj.shape)
        # Let the base class default method raise the TypeError
        return json.JSONEncoder.default(self, obj)


def json_numpy_obj_hook(dct):
    """
    Decodes a previously encoded numpy ndarray
    with proper shape and dtype
    :param dct: (dict) json encoded ndarray
    :return: (ndarray) if input was an encoded ndarray
    """
    if isinstance(dct, dict) and '__ndarray__' in dct:
        data = base64.b64decode(dct['__ndarray__'])
        return np.frombuffer(data, dct['dtype']).reshape(dct['shape'])
    return dct


def main(k, m="means", init_type="random"):
    """
    Cluster the training images, evaluate the clusters and save the results.

    The function picks initial cluster centers, runs Kmeans.kmeans on
    train_images_flat, saves representative images through Load.save_images,
    scores the clusters with Accuracy.final_accuracy, classifies the test
    images with ClassifyClusters.classify and finally writes a summary to
    "<m>_<init_type>_cluster<k>_results.json".

    :param k: number of clusters; must be at least 10
    :param m: clustering method, one of "means", "medoids" or "medians"
    :param init_type: "random" for random initial centers; any other value
        selects k-means++ initialization (recorded as "k_plus_plus")
    :raises ValueError: if k < 10 or m is not a recognised method
    :return: None; results are printed and written to disk
    """
    # Process arguments
    if k < 10:
        raise ValueError("Minimum cluster number is 10")

    # Process method of clustering
    if m not in ("means", "medoids", "medians"):
        # Adjacent literals instead of a backslash continuation, which used
        # to embed the source indentation inside the message.
        raise ValueError("Not a valid method specification; must be 'means', "
                         "'medoids', or 'medians'")

    # Starting clustering timer
    start_cluster = timeit.default_timer()

    # Method for choosing the initial cluster centers
    if init_type == "random":
        initial_clusters = Initialize.random_centers(k)
    else:
        init_type = "k_plus_plus"
        # NOTE(review): the bare name `kmeans_plusplus` was never imported in
        # this notebook and raised NameError; it is assumed to live next to
        # random_centers in the Initialize module -- confirm.
        initial_clusters = Initialize.kmeans_plusplus(
            k, train_images_flat, dist_fn=Distance.sumsq)

    # Run clustering algorithm
    final_responsibilities, final_clusters = Kmeans.kmeans(
        k, train_images_flat, initial_clusters,
        distfn=Distance.sumsq, method=m)

    # Output of results
    # Single-argument print calls behave the same on Python 2 and Python 3.
    print(final_responsibilities.sum(axis=0))

    # Time to cluster
    end_cluster = timeit.default_timer()
    clustering_time = end_cluster - start_cluster
    print("Time spent clustering :  %s" % (clustering_time,))

    # Save representative images to file.
    title = m + "_" + init_type + "_cluster" + str(k)
    Load.save_images(k, train_images, final_responsibilities,
                     final_clusters, title)

    # Calculate final accuracy for clusters
    final, cluster_set = Accuracy.final_accuracy(
        final_responsibilities, train_labels, train_images_flat,
        final_clusters)

    # Now see how well we can classify the dataset
    start_cluster_test = timeit.default_timer()
    predictions = ClassifyClusters.classify(
        cluster_set, test_images_flat, test_labels,
        distfn=Distance.sumsq, n=10)
    finish_cluster_test = timeit.default_timer()

    testing_time = finish_cluster_test - start_cluster_test
    print("Time spent testing :  %s" % (testing_time,))

    ###########
    # Outputs #
    ###########

    # k, prediction level, cluster_set, cluster statistics and timings
    results = {
        "k": k,
        "prediction_accuracy": predictions[1],
        "cluster_means": cluster_set,
        "cluster_stats": final,
        "clustering_time": clustering_time,
        "testing_time": testing_time,
    }

    # numpy arrays inside `results` are handled by NumpyEncoder; they can be
    # read back with json.load(..., object_hook=json_numpy_obj_hook).
    with open(title + '_results.json', 'w') as outfile:
        json.dump(results, outfile, cls=NumpyEncoder)


In [3]:
####################
# Call to function #
####################
# Run the pipeline with ten clusters, keeping the default clustering
# method ("means") and the default initialization ("random").
main(k=10)

0 [ 301.  437.   57.  188.  650.   97.  356.  262.  486.  166.]
1 [ 314.  407.  143.  221.  507.  157.  320.  301.  453.  177.]
2 [ 323.  371.  192.  254.  435.  206.  288.  332.  429.  170.]
3 [ 330.  348.  218.  277.  399.  225.  264.  349.  421.  169.]
4 [ 341.  337.  234.  279.  377.  230.  248.  356.  423.  175.]
5 [ 345.  335.  245.  278.  367.  228.  230.  368.  426.  178.]
6 [ 351.  337.  252.  270.  360.  233.  213.  370.  433.  181.]
7 [ 362.  336.  267.  260.  358.  232.  198.  368.  439.  180.]
8 [ 367.  336.  283.  251.  353.  229.  187.  367.  449.  178.]
9 [ 370.  334.  293.  241.  348.  227.  183.  368.  463.  173.]
10 [ 373.  334.  299.  235.  346.  222.  187.  369.  468.  167.]
11 [ 376.  331.  304.  229.  347.  221.  188.  369.  473.  162.]
12 [ 378.  332.  305.  220.  347.  221.  184.  373.  482.  158.]
13 [ 379.  331.  306.  209.  349.  221.  185.  375.  490.  155.]
14 [ 381.  330.  307.  205.  352.  221.  181.  377.  491.  155.]
15 [ 383.  330.  307.  204.  354.  