## Libraries

In [1]:
from __future__ import print_function
from __future__ import division

import tensorflow as tf

import numpy as np

import pandas as pd

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

import gc

## Prepare the data

In [2]:
# read in data
%time data = pd.read_csv("./data/train.csv")

# seperate out label
y = data[['label']]
X = data.drop('label', axis = 1).astype(np.float32).values / 255   # tensorflow prefer 32 bit floating point numbers

# split data into train, validation and test

X, X_test, y, y_test = train_test_split(X, y, test_size=0.20, stratify = y, random_state = 36)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.33, stratify = y, random_state = 2014)

del [X, y]
gc.collect()

print(X_train.shape, X_valid.shape, X_test.shape)

CPU times: user 4.16 s, sys: 224 ms, total: 4.39 s
Wall time: 6.46 s
(22512, 784) (11088, 784) (8400, 784)


In [3]:
# check the label balance: count how often each label occurs in every split
# and show the three count columns side by side (rows are aligned on the label value)
print(pd.DataFrame({
            'train_labels': y_train.label.value_counts(),
            'valid_labels': y_valid.label.value_counts(),
            'test_labels': y_test.label.value_counts()
        }))

   test_labels  train_labels  valid_lables
1          937          2510          1237
7          880          2359          1162
3          870          2332          1149
9          838          2245          1105
2          835          2239          1103
6          827          2218          1092
0          826          2215          1091
4          815          2182          1075
8          813          2178          1072
5          759          2034          1002


In [4]:
# one-hot encode the labels; the encoder learns its categories from the training split only
le = OneHotEncoder()
le.fit(y_train[['label']])

# apply the fitted encoder to every split: the sparse result is cast to float32
# and then densified into a regular numpy array
y_train, y_valid, y_test = (
    le.transform(split_labels).astype(np.float32).toarray()
    for split_labels in (y_train, y_valid, y_test)
)

print(y_train.shape, y_valid.shape, y_test.shape)

(22512, 10) (11088, 10) (8400, 10)


In [5]:
# peek at the first five rows of the one-hot encoded training labels
print(y_train[:5])

[[ 0.  0.  0.  0.  0.  0.  1.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  1.  0.  0.  0.  0.]
 [ 0.  0.  0.  1.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  1.]
 [ 0.  0.  1.  0.  0.  0.  0.  0.  0.  0.]]


## Define Model

In [6]:
# construct the computation graph with tensorflow:
# a single linear layer followed by softmax (multinomial logistic regression)

graph = tf.Graph()

with graph.as_default():
    # specify the datasets used in the computation; for now they are baked into the
    # graph as constants, so every training step uses the whole training set
    train_dataset = tf.constant(X_train)
    train_labels = tf.constant(y_train)
    valid_dataset = tf.constant(X_valid)
    test_dataset = tf.constant(X_test)

    # weights are initialized from a truncated normal distribution
    W = tf.Variable(tf.truncated_normal([784, 10]))
    # biases are initialized to zero
    b = tf.Variable(tf.zeros([10]))

    # the only layer is a linear mapping from inputs (784) to logits (10):
    # logits = XW + b
    logits = tf.matmul(train_dataset, W) + b

    # training loss: softmax cross-entropy (log-loss) averaged over the training rows.
    # The arguments are passed by keyword on purpose: TensorFlow >= 1.0 refuses
    # positional arguments for this op and declares `labels` before `logits`,
    # so keywords are the only spelling that works on both old and new releases.
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=train_labels))

    # minimize() adds the gradient computation and the update of W and b to the graph;
    # plain gradient descent with a learning rate of 0.5
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

    # use softmax to convert logits into proper probabilities (sum to 1 across each row)
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(tf.matmul(valid_dataset, W) + b)
    test_prediction = tf.nn.softmax(tf.matmul(test_dataset, W) + b)

## Evaluation Metric

In [7]:
def accuracy(predictions, labels):
    """Return the percentage of rows whose predicted class matches the label.

    Both arguments are 2-D arrays with one row per example and one column per
    class (e.g. softmax probabilities and one-hot labels). The class of a row
    is the index of its largest entry.
    """
    predicted_class = np.argmax(predictions, 1)
    true_class = np.argmax(labels, 1)
    n_correct = np.sum(predicted_class == true_class)
    return 100.0 * n_correct / predictions.shape[0]

## Training

In [8]:
num_steps = 500

# start a computation session; inside the `with` block `session` is also the
# default session, which the bare `.run()` / `.eval()` calls below rely on
with tf.Session(graph=graph) as session:
    # initialize weights and bias.
    # tf.initialize_all_variables has been deprecated since TensorFlow 0.12 in favour
    # of tf.global_variables_initializer; use the replacement when it exists and
    # fall back to the old name on earlier installs.
    make_init_op = getattr(tf, 'global_variables_initializer', None) or tf.initialize_all_variables
    make_init_op().run()

    # each step is one gradient-descent update on the training loss
    for step in range(num_steps):
        _, l, predictions = session.run([optimizer, loss, train_prediction])
        # report progress every 100 steps
        if (step % 100 == 0):
            print('Loss at step %d: %f' % (step, l))
            print('Training accuracy: %.1f%%' % accuracy(predictions, y_train))
            # validation / test predictions are evaluated with the current W and b
            print('Validation accuracy: %.1f%%' % accuracy(valid_prediction.eval(), y_valid))
            print('Test accuracy: %.1f%%' % accuracy(test_prediction.eval(), y_test))
            print("========================================================================")

Loss at step 0: 12.880156
Training accuracy: 6.3%
Validation accuracy: 6.9%
Test accuracy: 6.7%
Loss at step 100: 1.130349
Training accuracy: 75.2%
Validation accuracy: 75.2%
Test accuracy: 74.5%
Loss at step 200: 0.812949
Training accuracy: 81.4%
Validation accuracy: 81.3%
Test accuracy: 80.5%
Loss at step 300: 0.685550
Training accuracy: 83.8%
Validation accuracy: 83.5%
Test accuracy: 82.7%
Loss at step 400: 0.611366
Training accuracy: 85.1%
Validation accuracy: 84.8%
Test accuracy: 84.3%


## What's Next?

Next time, we are going to add hidden layers to the model.