In [1]:
"""
source: matplotlib tutorial
http://matplotlib.org/users/image_tutorial.html
"""

import matplotlib.pyplot as plt
from matplotlib.image import cm

import mnist_database

db = mnist_database.mnist_database('../data/mnist')

(images, labels) = db.get_training_data()

(images_training, labels_training) = (images, labels)
(images_testing, labels_testing) = db.get_testing_data()

In [2]:
# view the first image of the dataset and print the number of images and labels

imgplot = plt.imshow(images[0].reshape((28,28)), cmap=cm.gray)
plt.show()

print( len(images), len(labels) )

(60000, 60000)


In [8]:
from numpy import zeros, int32


class mnist_learntester:
    def __init__(self,
                 learning_implementation,
                 training_vectors=len(images_training),
                 testing_vectors=len(images_testing)
                ):
        self.learning_implementation = learning_implementation
        self.trv = training_vectors
        self.tsv = testing_vectors
        self.hits = zeros((10,10), dtype=int32)
    
    def train(self):
        self.learning_implementation.fit(images_training[:self.trv],
                                         labels_training[:self.trv])
        return
    
    def test(self):
        for (t, sol) in zip(images_testing[:self.tsv],
                            labels_testing[:self.tsv]):
            self.hits[sol][self.learning_implementation.predict(t)] += 1
        return
    
    def stats(self):
        return self.hits


In [18]:
import matplotlib.pyplot as plt
from matplotlib.image import cm
import numpy

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC

results = []


for j in range(15, 16):
    m = mnist_learntester(
        DecisionTreeClassifier(criterion='gini'),
        training_vectors=60000
    )
    m.train()
    m.test()

    stats = m.stats()
    # stats_percent = [ [i/sum(line) for i in line] for line in stats ]
    
    hit_rate = sum([stats[i][i] for i in range(len(stats))]) / float(len(images_testing))
    print('max-depth %d: %.3f' % (j, hit_rate*100) + '% classified correctly')
    results.append(hit_rate)


print(results)

max-depth 15: 87.820% classified correctly
[0.87819999999999998]


In [7]:
plt.imshow(stats)
plt.show()

In [9]:
# render the tree

from sklearn.tree import export_graphviz
from sklearn.externals.six import StringIO

try:
    import pydot
    
    dot_data = StringIO() 
    export_graphviz(m.svm, out_file=dot_data) 
    graph = pydot.graph_from_dot_data(dot_data.getvalue()) 
    graph.write_pdf("decision-tree.pdf")
    print('created PDF-file')
    
except ImportError:
    # pydot is not existing (probably working in python3)
    with open("/home/raphael/Dokumente/MLlab/data/one-feature-tree.dot", 'w') as f:
        f = export_graphviz(m.svm, out_file=f)
    print('created dot-file')


created PDF-file
