In [None]:
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, Input, Dropout
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
import numpy as np
from keras.utils import to_categorical

### Load training data

Load `train.csv` from Kaggle into a pandas DataFrame.

In [2]:
# Read the Kaggle training CSV into a DataFrame (the path is machine-specific).
train_csv_path = 'C:/Users/Pernesso/Downloads/train.csv'
train = pd.read_csv(train_csv_path)

In [3]:
# Convert the DataFrame into NumPy arrays (see the X / y cells below)

### Set up X and y

NOTE: Keras requires a `numpy` matrix; it doesn't work with `pandas`.

In [4]:
# Keep every column after the first as the feature matrix; `.values` yields
# the NumPy array that is passed to Keras further down.
X = train.iloc[:, 1:].values
X.shape

(42000, 784)

In [5]:
# Target vector: the `label` column, still a pandas Series at this point.
y = train.loc[:, 'label']
y.shape

(42000,)

### Preprocessing

1. When dealing with image data, you need to normalize your `X` by dividing each value by the maximum pixel value (255).
2. Since this is a multiclass classification problem, keras needs `y` to be a one-hot encoded matrix.

In [6]:
# Rebuild both arrays from `train` instead of rescaling/encoding `X` and `y`
# in place: re-running this cell can then never divide by 255 twice or
# one-hot encode an already one-hot matrix.
X = train[train.columns[1:]].values / 255.  # divide every pixel value by 255

y = to_categorical(train['label'].values)  # integer labels -> one-hot matrix

In [7]:
# Confirm the one-hot encoding produced one column per class.
y.shape

(42000, 10)

### Train/Test Split

We want to create a validation set that the model will never see to approximate how it's going to do with Kaggle's `test.csv`. Use `sklearn`'s `train_test_split` to do this.

In [8]:
# Hold out a quarter of the rows (the scikit-learn default, spelled out here)
# as a validation set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=2000,
)

In [9]:
# Only the last expression of a cell is displayed, so show both shapes as one
# tuple instead of silently discarding the first.
X_train.shape, y_train.shape

(31500, 10)

### Create your neural network

Create a neural network using the `Dense` and `Dropout` layers from `keras`. Your activation function for the final output layer needs to be `softmax` to accommodate the ten different classes.

In [10]:
# Size the network from the data instead of hard-coding 784, so the input
# layer always agrees with the feature matrix it will be trained on.
n_features = X_train.shape[1]
n_classes = y_train.shape[1]

model = Sequential()
# Single hidden layer with as many units as there are input features.
model.add(Dense(n_features, input_shape=(n_features,), activation='relu'))
# Dropout with rate 0.5 between the hidden and output layers.
model.add(Dropout(.5))
# One softmax output per class column of the one-hot target.
model.add(Dense(n_classes, activation='softmax'))

### Compile your model

Since this is a multiclass classification problem, your loss function is `categorical_crossentropy`.

In [None]:
# Multiclass setup: categorical cross-entropy loss, Adam optimizer, and
# accuracy reported during training.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

### Fit the model

Use your X_test, y_test from the `train_test_split` step for the `validation_data` parameter.

In [None]:
# Train for 5 epochs in mini-batches of 100; the held-out split is passed as
# validation data.
model.fit(
    X_train,
    y_train,
    batch_size=100,
    epochs=5,
    validation_data=(X_test, y_test),
)

Train on 31500 samples, validate on 10500 samples
Epoch 1/5
 3000/31500 [=>............................] - ETA: 3:13:29 - loss: 0.9891 - acc: 0.7010

### Load in Kaggle's `test.csv`

Be sure to do the **same** preprocessing you did for your training `X`.

In [None]:
# Read Kaggle's unlabeled test CSV into a DataFrame (the path is machine-specific).
test_csv_path = 'C:/Users/Pernesso/Downloads/test.csv'
test = pd.read_csv(test_csv_path)

In [None]:
# Same scaling as the training features. NOTE: this rebinds `test`, so running
# the cell twice rescales the data again; reload `test` before re-running.
test = test.div(255.)

### Create your predictions

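Use `predict_classes` to get the actual numerical values (0-9). On newer Keras/TensorFlow releases, where `predict_classes` has been removed, take `np.argmax` of `model.predict(...)` along the last axis instead.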

In [None]:
# `Sequential.predict_classes` no longer exists in recent Keras/TensorFlow
# releases. Taking the argmax over the softmax outputs yields the same digit
# labels and works on both old and new versions.
pred = np.argmax(model.predict(test.values), axis=-1)

### Prepare your submission

1. Add your predictions to a column called `Label`
2. You'll need to manually create the `ImageId` column, which is just a list of 1..[NUMBER OF TEST SAMPLES]

In [None]:
# Attach the predicted digit and a 1-based row id to every test row.
n_test_rows = len(test)
test['Label'] = pred
test['ImageId'] = range(1, n_test_rows + 1)

### Create your submission csv

Remember to set `index=False`!

In [None]:
# Write only the two submission columns; `index=False` keeps the row index
# out of the file.
test.to_csv('submission.csv', columns=['ImageId', 'Label'], index=False)

# New Model 

In [None]:
from sklearn.datasets import load_breast_cancer, fetch_lfw_people
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline

from keras.models import Sequential
from keras.layers import Dense, Dropout, Conv2D, MaxPool2D, Flatten
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import to_categorical

import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
# Reload the Kaggle training CSV for the second model (the path is machine-specific).
train_csv_path = 'C:/Users/Pernesso/Downloads/train.csv'
train = pd.read_csv(train_csv_path)

In [None]:
# Expose the training frame under the name `df` used by the following cells.
df = pd.DataFrame(data=train)

In [None]:
# Peek at the first five rows.
df.head(n=5)

In [None]:
# One-hot encode the labels and take every column after the first as pixels.
y = to_categorical(df['label'].values)
pixels = df[df.columns[1:]].values

# Fixed seed (the same value as the earlier dense-model split) so the hold-out
# set, and therefore the validation scores, are reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(pixels, y, random_state=2000)

# Divide by 255, then reshape each flat row to 28 x 28 x 1, the input shape
# the Conv2D model in this notebook declares. `-1` lets NumPy infer the number
# of samples.
X_train = (X_train / 255.).reshape(-1, 28, 28, 1)
X_test = (X_test / 255.).reshape(-1, 28, 28, 1)

In [None]:
# Render the first training row (all columns after the first) as a 28 x 28 image.
first_digit = df.iloc[0, 1:].values.reshape(28, 28)
plt.imshow(first_digit)

In [None]:
# Three Conv2D -> MaxPool2D stages followed by a small dense classifier head.
# The layers are listed in the order they are stacked.
model = Sequential([
    Conv2D(15, kernel_size=(5, 5), input_shape=(28, 28, 1), activation='relu'),
    MaxPool2D((2, 2)),
    Conv2D(30, kernel_size=(4, 4), activation='relu'),
    MaxPool2D((2, 2)),
    Conv2D(45, kernel_size=(3, 3), activation='relu'),
    MaxPool2D((2, 2)),
    Flatten(),
    Dense(50, activation='relu'),
    Dense(10, activation='softmax'),  # one output per one-hot class column
])

# Same loss, optimizer, and metric as the dense model earlier in the notebook.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Five epochs with the held-out split passed as validation data.
model.fit(
    X_train,
    y_train,
    epochs=5,
    validation_data=(X_test, y_test),
)

In [None]:
# NOTE(review): this relative path differs from the absolute path used by the
# other CSV loads in this notebook; confirm `test.csv` exists in the working
# directory.
test = pd.read_csv('test.csv')

# Same preprocessing as the training images: divide by 255, then reshape each
# row to the 28 x 28 x 1 input the convolutional model declares.
test = (test / 255.).values.reshape(-1, 28, 28, 1)

# `Sequential.predict_classes` no longer exists in recent Keras/TensorFlow
# releases; the argmax over the softmax outputs gives the same digit labels.
pred = np.argmax(model.predict(test), axis=-1)

# ImageId first, then Label, matching the column order written by the earlier
# submission cell. Ids are 1-based.
submission = pd.DataFrame({
    'ImageId': range(1, test.shape[0] + 1),
    'Label': pred,
})

submission.to_csv('submission.csv', index=False)

model.summary()