In [89]:
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, Input, Dropout
from keras.utils import np_utils, to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

import numpy as np

### Load training data

Load `train.csv` from Kaggle into a pandas DataFrame.

In [90]:
# Read the Kaggle training file into a DataFrame (path is relative to the notebook).
train = pd.read_csv('./datasets/train.csv')

In [91]:
# (rows, columns) of the raw training frame, shown as a quick sanity check.
train.shape

(42000, 785)

### Set up X and y

NOTE: Keras expects `numpy` arrays, so convert the `pandas` objects with `.values` before passing them in.

In [92]:
# The `label` column is the target; every other column is a feature.
# `.values` hands back plain NumPy arrays instead of pandas objects.
y = train['label'].values
X = train.drop('label', axis=1).values

### Preprocessing

1. When dealing with image data, you need to normalize your `X` by dividing each value by the maximum pixel value (255).
2. Since this is a multiclass classification problem, keras needs `y` to be a one-hot encoded matrix

In [93]:
# Scale pixel intensities by 255. The features are rebuilt from `train` rather
# than rescaling `X` in place, so re-running this cell cannot divide by 255 a
# second time.
X = train.drop('label', axis=1).values / 255

In [94]:
# Shape of the label array before one-hot encoding.
y.shape

(42000,)

In [98]:
# One-hot encode the labels. Encoding straight from the raw label column rather
# than from `y` itself means re-running this cell cannot encode an
# already-encoded matrix.
y = to_categorical(train.label.values)

In [13]:
# Unused alternative to `to_categorical` (sklearn one-hot encoding), kept for reference:
# ohe = OneHotEncoder()

In [26]:
# y_ohe = ohe.fit_transform(y.reshape(-1, 1))

In [99]:
# y_ohe.shape

### Train/Test Split

We want to create a validation set that the model will never see to approximate how it's going to do with Kaggle's `test.csv`. Use `sklearn`'s `train_test_split` to do this.

In [100]:
# Hold out part of the labelled data for validation. `stratify=y` and the fixed
# `random_state` are passed through to sklearn unchanged.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=666
)

# Show the shape of each split on a single line.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(31500, 784) (10500, 784) (31500, 10) (10500, 10)


### Create your neural network

Create a neural network using the `Dense` and `Dropout` layers from `keras`. Your activation function for the final output layer needs to be `softmax` to accommodate the ten different classes.

In [101]:
# Create an empty Sequential model (no layers passed to the constructor).
model = Sequential()

In [102]:
# Number of feature columns per sample; the network's input width is taken from this.
X_train.shape[1]

784

In [103]:
# Build the whole network in one expression so that re-running this cell
# replaces the model instead of appending a second copy of every layer.
n_features = X_train.shape[1]
n_classes = y_train.shape[1]
model = Sequential([
    # Hidden layer with one ReLU unit per input feature.
    Dense(n_features, input_dim=n_features, activation='relu'),
    # 50% dropout; the seed is now actually passed so the dropout masks are
    # repeatable (it was previously commented out).
    Dropout(0.5, seed=666),
    # One softmax output per class column of the one-hot target.
    Dense(n_classes, activation='softmax'),
])

### Compile your model

Since this is a multiclass classification problem, your loss function is `categorical_crossentropy`.

In [104]:
# Configure training: categorical cross-entropy loss for the one-hot targets,
# the Adam optimizer, and accuracy reported alongside the loss.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

### Fit the model

Use your X_test, y_test from the `train_test_split` step for the `validation_data` parameter.

In [105]:
# Train for two epochs on the training split; the held-out split is passed as
# validation data so it is scored after each epoch.
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=2,
)

Train on 31500 samples, validate on 10500 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f039ad69780>

### Load in Kaggle's `test.csv`

Be sure to do the **same** preprocessing you did for your training `X`.

In [107]:
# Read Kaggle's test file into a DataFrame (path is relative to the notebook).
test = pd.read_csv('./datasets/test.csv')

In [109]:
# Apply the same divide-by-255 scaling that the training pixels received.
# NOTE: this rebinds `X`, so the training features are no longer held under that name.
X = test.values / 255

### Create your predictions

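Use `predict_classes` to get the actual numerical values (0-9). On Keras versions where `predict_classes` is no longer available, `np.argmax(model.predict(X), axis=1)` gives the same labels.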

In [111]:
# `Sequential.predict_classes` has been removed from newer Keras/TensorFlow
# releases. Taking the argmax over the per-class softmax probabilities yields
# the same integer labels and works on both old and new versions.
pred = np.argmax(model.predict(X), axis=1)



### Prepare your submission

1. Add your predictions to a column called `Label`
2. You'll need to manually create the `ImageId` column, which is just a list of 1..[NUMBER OF TEST SAMPLES]

In [120]:
# Image ids run from 1 up to the number of predictions, in prediction order.
image_ids = range(1, len(pred) + 1)
sub = pd.DataFrame({'Label': pred, 'ImageId': image_ids})
# Move `ImageId` into the index, leaving `Label` as the only column.
sub = sub.set_index('ImageId')

### Create your submission csv

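Remember to set `index=False`, unless `ImageId` was moved into the index with `.set_index('ImageId')` as above; in that case the index must be written out with `index=True`.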

In [127]:
# `ImageId` lives in the index of `sub` (moved there by `.set_index('ImageId')`),
# so the index has to be written out for that column to reach the file.
# `index=True` is already the pandas default; it is passed explicitly here to
# make the intent visible.
submission_path = './datasets/submission.csv'
sub.to_csv(submission_path, index=True)