# CS 51 Final Project Interface

By: Olivia Angiuli, Martin Reindl, Ty Rocca, Wilder Wohns

**Purpose:** This top-level IPython notebook shows how our code works and gives us a place to present our results.

In [1]:
#!/usr/bin/env python

In [8]:
import numpy as np
import os, struct,random #,
# import sys
from numpy import append, array, int8, uint8, zeros
import Initialize
import Accuracy
import Distance
import Load
import Kmeans
import ClassifyClusters
import base64
import json
import timeit

######################################
# Load in training images and labels #
######################################

# Load the MNIST training and testing images together with their labels from
# the current working directory. The training call additionally passes
# prop = 5 to the loader.
# NOTE(review): the original comment describes the images as a
# 60,000 x 28 x 28 array -- presumably that is the full training set; confirm
# what shape comes back when prop = 5.
train_images, train_labels = Load.load_mnist(
    "training", path=os.getcwd(), prop = 5)
test_images, test_labels = Load.load_mnist("testing", path=os.getcwd())

# Flatten every image into a single row, giving one 2-D array per data set
# (784 columns per row if the images are 28 x 28 -- TODO confirm).
train_images_flat = np.array(list(map(np.ravel, train_images)))
test_images_flat = np.array(list(map(np.ravel, test_images)))

class _NumpyEncoder(json.JSONEncoder):
    """JSON encoder that can serialize numpy arrays.

    Adapted from
    http://stackoverflow.com/questions/3488934/simplejson-and-numpy-array
    The matching decoder is the ``json_numpy_obj_hook`` function defined
    further down in this notebook.
    """

    def default(self, obj):
        """
        If the input object is an ndarray it is converted into a dict
        holding the dtype, the shape and the base64-encoded data.
        Any other object is handed to the base class, which raises TypeError.
        """
        if isinstance(obj, np.ndarray):
            # .data is only available for contiguous memory, so make sure
            # sliced / transposed arrays are copied into a contiguous buffer.
            contiguous = np.ascontiguousarray(obj)
            data_b64 = base64.b64encode(contiguous.data)
            # Decode to text so the payload is JSON-serializable even when
            # b64encode returns bytes.
            return dict(__ndarray__=data_b64.decode('ascii'),
                        dtype=str(obj.dtype),
                        shape=obj.shape)
        # Let the base class default method raise the TypeError
        return json.JSONEncoder.default(self, obj)


def main(k, m="means", init_type="random"):
    """
    Cluster the training images, evaluate the clusters on the test images
    and write the results to ./<title>/<title>_results.json, where
    <title> is "<m>_<init_type>_cluster<k>".

    :param k: number of clusters; must be at least 10
    :param m: clustering method, one of 'means', 'medoids' or 'medians'
    :param init_type: "random" selects Initialize.random_centers; any other
        value selects Initialize.kmeans_plusplus and is recorded as
        "kplusplus" in the output title
    :return: None; results are printed, saved as images and dumped as JSON
    :raises ValueError: if k < 10 or m is not a supported method
    """
    # Validate the arguments first so invalid calls fail before any timing
    # or clustering work starts.
    if k < 10:
        raise ValueError("Minimum cluster number is 10")

    if m not in ("means", "medoids", "medians"):
        raise ValueError("Not a valid method specification; must be 'means', "
                         "'medoids', or 'medians'")

    # Starting clustering timer (covers initialization and clustering)
    start_cluster = timeit.default_timer()

    # Choose the initial cluster centers
    if init_type == "random":
        initial_clusters = Initialize.random_centers(k)
    else:
        # Every non-"random" value falls back to k-means++; normalize the
        # name so the output title is the same for all spellings.
        init_type = "kplusplus"
        initial_clusters = Initialize.kmeans_plusplus(k, train_images_flat,
                                                      dist_fn=Distance.sumsq)

    # Run clustering algorithm
    final_responsibilities, final_clusters = Kmeans.kmeans(
        k, train_images_flat, initial_clusters,
        distfn = Distance.sumsq, method=m)

    # Column sums of the responsibilities matrix
    print(final_responsibilities.sum(axis=0))

    # Time to cluster
    end_cluster = timeit.default_timer()
    clustering_time = end_cluster - start_cluster
    print("Time spent clustering :  %s" % clustering_time)

    # Save representative images to file.
    title = m + "_" + init_type + "_cluster" + str(k)
    Load.save_images(k, train_images, final_responsibilities,
                     final_clusters, title)

    # Calculate final accuracy for clusters
    final, cluster_set = Accuracy.final_accuracy(
        final_responsibilities, train_labels, train_images_flat,
        final_clusters)

    # Now see how well we can classify the dataset
    start_cluster_test = timeit.default_timer()
    predictions = ClassifyClusters.classify(
        cluster_set, test_images_flat, test_labels,
        distfn = Distance.sumsq, n=None)
    finish_cluster_test = timeit.default_timer()

    testing_time = finish_cluster_test - start_cluster_test
    print("Time spent testing :  %s" % testing_time)

    ###########
    # Outputs #
    ###########

    # "prediction_accuracy" is the second element of what
    # ClassifyClusters.classify returned.
    results = {
        "k": k,
        "prediction_accuracy": predictions[1],
        "cluster_means": cluster_set,
        "cluster_stats": final,
        "clustering_time": clustering_time,
        "testing_time": testing_time,
    }

    # Load.save_images presumably creates this directory already; make sure
    # it exists so that writing the results cannot fail on a missing folder.
    out_dir = os.path.join('.', title)
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    with open(os.path.join(out_dir, title + '_results.json'), 'w') as outfile:
        json.dump(results, outfile, cls=_NumpyEncoder)


In [9]:
####################
# Call to function #
####################
# Cluster into k = 10 groups using the "medians" method. Because init_type
# is not "random", main() takes its k-means++ initialization branch and
# records the run under the name "kplusplus".
main(10, m="medians", init_type="k_plus_plus")

0 [ 478.  228.  223.   93.  469.  432.  281.  297.  307.  192.]
1 [ 347.  252.  245.  113.  479.  402.  352.  307.  315.  188.]
2 [ 289.  270.  240.  121.  470.  410.  369.  292.  331.  208.]
3 [ 264.  292.  240.  119.  452.  409.  365.  292.  334.  233.]
4 [ 243.  310.  227.  121.  455.  407.  355.  283.  341.  258.]
5 [ 233.  317.  225.  113.  467.  410.  341.  280.  346.  268.]
6 [ 224.  325.  220.  108.  483.  409.  330.  272.  353.  276.]
7 [ 223.  338.  215.  103.  495.  410.  318.  260.  356.  282.]
8 [ 222.  351.  227.  103.  493.  406.  307.  257.  350.  284.]
9 [ 223.  367.  227.  105.  496.  404.  291.  256.  342.  289.]
10 [ 220.  395.  228.  105.  503.  398.  274.  254.  336.  287.]
11 [ 219.  408.  229.  107.  503.  395.  260.  255.  334.  290.]
12 [ 219.  407.  228.  108.  508.  391.  266.  249.  336.  288.]
13 [ 218.  401.  228.  109.  515.  392.  264.  254.  330.  289.]
14 [ 217.  399.  228.  110.  523.  395.  249.  257.  327.  295.]
15 [ 217.  404.  228.  111.  523.  

In [27]:
# How to load the data back in 
# json.loads(dumped, object_hook=json_numpy_obj_hook)

# source : http://stackoverflow.com/questions/3488934/simplejson-and-numpy-array/24375113#24375113
import base64
import json
import numpy as np

def json_numpy_obj_hook(dct):
    """
    Object hook for json.load / json.loads that rebuilds numpy arrays.

    A dict carrying an '__ndarray__' key is treated as an encoded array:
    its base64 payload is decoded and reinterpreted with the stored
    'dtype' and 'shape'. Everything else is passed through untouched.

    :param dct: (dict) object produced by the JSON decoder
    :return: (ndarray) if the input was an encoded ndarray, otherwise the
        input itself
    """
    # Pass through anything that is not an encoded array.
    if not isinstance(dct, dict) or '__ndarray__' not in dct:
        return dct

    raw_bytes = base64.b64decode(dct['__ndarray__'])
    flat = np.frombuffer(raw_bytes, dct['dtype'])
    return flat.reshape(dct['shape'])


In [67]:
# Experiment grid: cluster counts 10..19, then 20..50 in steps of 5,
# crossed with the available clustering methods and initializations.
testing_ks = list(range(10, 20)) + list(range(20, 55, 5))
methods = ["means", "medoids", "medians"]
init_types = ["random", "kplusplus"]

# for k in testing_ks:
# Only the first configuration of the grid is loaded here.
k = testing_ks[0]
m = methods[0]
init_type = init_types[0]

# Same naming scheme as the directory / file written by main().
title = "{0}_{1}_cluster{2}".format(m, init_type, k)

# Read the saved results back, turning encoded arrays into ndarrays again.
results_set = []
with open('./' + title + '/' + title + '_results.json') as data_file:
    results_set.append(
        json.load(data_file, object_hook=json_numpy_obj_hook))


In [59]:
# results_set

In [69]:
# k = 9
# Print one main_cluster.py command line per cluster count, using the
# method and initialization chosen above and a fixed prop of 5.
for k in testing_ks:
    print("python main_cluster.py {k} {method} {init_type} {prop};".format(
        k=k, method=m, init_type=init_type, prop=5))

python main_cluster.py 10 means random 5
python main_cluster.py 11 means random 5
python main_cluster.py 12 means random 5
python main_cluster.py 13 means random 5
python main_cluster.py 14 means random 5
python main_cluster.py 15 means random 5
python main_cluster.py 16 means random 5
python main_cluster.py 17 means random 5
python main_cluster.py 18 means random 5
python main_cluster.py 19 means random 5
python main_cluster.py 20 means random 5
python main_cluster.py 25 means random 5
python main_cluster.py 30 means random 5
python main_cluster.py 35 means random 5
python main_cluster.py 40 means random 5
python main_cluster.py 45 means random 5
python main_cluster.py 50 means random 5


In [65]:
testing_ks

[0]