In [None]:
import csv

import numpy as np
import pandas as pd

from IPython.display import display

from bokeh.plotting import output_notebook, show, figure

from sklearn.metrics import classification_report, accuracy_score

from keras.models import Sequential
from keras.layers import Dense, Activation

# Configure bokeh so that subsequent show() calls render inside the notebook.
output_notebook()

In [None]:
# Column layout used for both CSV files: the label first, followed by
# 28 * 28 integer-named feature columns (reshaped to 28x28 images elsewhere).
label_column = 'number'
input_columns = [position for position in range(28 * 28)]
columns = [label_column] + input_columns

# Both files are read with the same explicit column names.
train, test = (
    pd.read_csv(csv_path, names=columns)
    for csv_path in ('mnist_train.csv', 'mnist_test.csv')
)

# The ten possible label values, in ascending order.
categories = [digit for digit in range(10)]

train.head()

In [None]:
def show_images(samples, titles):
    """Render every row of ``samples`` as a 28x28 greyscale bokeh image.

    Parameters
    ----------
    samples : DataFrame containing the ``input_columns`` feature columns;
        each row is reshaped to a 28x28 grid.
    titles : array-like supporting ``astype(str)``; one title per row,
        paired with the rows positionally.

    One figure is created and shown per row.
    """
    captions = titles.astype(str)

    # Reshape all rows at once and reverse the row order of each grid
    # (presumably so the digit is drawn upright -- confirm against bokeh's
    # image orientation).
    pixel_rows = samples[input_columns].values
    grids = [pixels.reshape(28, 28)[::-1, :] for pixels in pixel_rows]

    for caption, grid in zip(captions, grids):
        plot = figure(
            title=caption,
            plot_width=400,
            plot_height=400,
            x_range=(0, 28),
            y_range=(0, 28),
        )
        plot.image([grid], x=[0], y=[0], dw=[28], dh=[28], palette="Greys9")
        show(plot)

In [None]:
# Preview one randomly drawn training row, titled with its label.
samples = train.sample(n=1)
show_images(samples=samples, titles=samples[label_column])

In [None]:
# One plotting colour per label value: position i is the colour for label i.
label_colors = [
    'red',
    'green',
    'blue',
    'indigo',
    'chocolate',
    'crimson',  # was misspelled 'crimsom', which is not a valid colour name
    'darkgrey',
    'darkkhaki',
    'darkviolet',
    'gold',
]

# The two feature columns plotted against each other.
feature1 = 290
feature2 = 291

# Translate every training label into its colour name.
colors = train[label_column].map(lambda label: label_colors[label])

f = figure()

f.xaxis.axis_label = "feature " + str(feature1)
f.yaxis.axis_label = "feature " + str(feature2)

# One circle per training row, coloured by its label.
f.circle(train[feature1],
         train[feature2],
         color=colors)
show(f)

In [None]:
def get_binarized_labels(dataset):
    """One-hot encode the label column of ``dataset``.

    Returns a DataFrame sharing ``dataset``'s index, with one boolean column
    per entry of ``categories`` (in that order); a cell is True where the
    row's label equals the column's category.

    The columns are built by hand rather than with pd.get_dummies, whose
    column ordering (among other things) was found to be a problem here.
    """
    one_hot = dataset[[label_column]].copy()
    labels = one_hot[label_column]

    for category in categories:
        one_hot[category] = labels == category

    # Only the indicator columns are returned; the raw label is discarded.
    return one_hot.drop(label_column, axis=1)

def get_unbinarized_labels(binarized_labels):
    """Collapse per-category scores back into a single label per row.

    ``binarized_labels`` is wrapped in a DataFrame whose columns are
    ``categories``; the result is a Series holding, for every row, the
    column label with the largest value.
    """
    return pd.DataFrame(binarized_labels, columns=categories).idxmax(axis='columns')

In [None]:
# Fully connected classifier: two 800-unit tanh hidden layers feeding one
# output unit per category.
model = Sequential([
    Dense(800, input_dim=len(input_columns)),
    Activation('tanh'),
    Dense(800),
    Activation('tanh'),
    # Output width follows the label set instead of a hard-coded 10.
    Dense(len(categories)),
    # softmax rather than sigmoid: categorical_crossentropy expects the
    # outputs to form one probability distribution over mutually exclusive
    # classes, which independent sigmoid units do not provide.
    Activation('softmax'),
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy',],
)

# Fit on the raw feature columns against the one-hot encoded labels.
# NOTE(review): `nb_epoch` is the older Keras spelling of `epochs`; it is kept
# to stay compatible with the Keras version this notebook was written for.
model.fit(
    train[input_columns].values,
    get_binarized_labels(train).values,
    nb_epoch=5,
    batch_size=1024,
)

In [None]:
# Report accuracy and a per-class breakdown for each named dataset.
groups = (('train', train), ('test', test))

for name, dataset in groups:
    # Model scores per category, reduced to the highest-scoring label.
    class_scores = model.predict(dataset[input_columns].values)
    predictions = get_unbinarized_labels(class_scores).values
    labels = dataset[label_column].values

    accuracy = accuracy_score(labels, predictions)
    report = classification_report(labels, predictions)

    print('#' * 50)
    print(name)
    print('accuracy', accuracy)
    print(report)
    print()


In [None]:
# Copy of the test set with the model's predicted label in an extra column.
test_class_scores = model.predict(test[input_columns].values)
test_with_predictions = test.assign(
    predicted_number=get_unbinarized_labels(test_class_scores).values
)

In [None]:
# Boolean mask: True where the predicted label matches the true label.
good = test_with_predictions[label_column] == test_with_predictions['predicted_number']

examples_per_group = 5
correct_rows = test_with_predictions[good]
incorrect_rows = test_with_predictions[~good]

# Cap the sample size at the group size: DataFrame.sample raises ValueError
# when asked for more rows than exist (e.g. fewer than five misclassifications).
goods = correct_rows.sample(min(examples_per_group, len(correct_rows)))
bads = incorrect_rows.sample(min(examples_per_group, len(incorrect_rows)))

# Correct predictions are titled with the predicted label; incorrect ones
# show both the prediction and the true label.
show_images(goods, goods['predicted_number'])
show_images(bads, bads['predicted_number'].astype(str) + " but was " + bads[label_column].astype(str))