# Digits Recognizer - Keras CNN 
# 数据分析方法解析：手写数字识别
### 原作者：**Yassine Ghouzam, PhD**
### 来源： **https://www.kaggle.com/yassineghouzam/digits-recognizer-keras-cnn-0-997-top-8?scriptVersionId=1353798**

* **1. 简介**
* **2. 数据预处理**
    * 2.1 载入数据
    * 2.2 检查数据是否缺失
    * 2.3 归一化
    * 2.4 reshape
    * 2.5 标签编码
    * 2.6 划分训练集和验证集
* **3. CNN**
    * 3.1 建立模型
    * 3.2 建立优化器和退火器
    * 3.3 数据扩增
* **4. 模型评估**
    * 4.1 训练和验证损失曲线
    * 4.2 混淆矩阵
* **5. 预测和提交**

# 1. 简介

本文使用 Keras 搭建了一个 5 层的卷积神经网络（CNN）模型，在单个 i5 CPU 上训练约 2 小时 30 分钟，得到了 99.67% 的准确率。使用的数据集是 MNIST。

<img src="http://img1.imagilive.com/0717/mnist-sample.png" ></img>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import itertools

from keras.utils.np_utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D
from keras.optimizers import RMSprop
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import ReduceLROnPlateau

# Apply a global seaborn theme (white style, notebook context, "deep" palette) to the plots below.
sns.set(style='white', context='notebook', palette='deep')

# 2. 数据预处理
## 2.1 载入数据

In [5]:
# Read the training and test sets from the CSV files in the input directory.
train_csv_path = "../input/train.csv"
test_csv_path = "../input/test.csv"
train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

In [6]:
# Separate the target column from the pixel features.
label_column = "label"
Y_train = train[label_column]
X_train = train.drop(columns=[label_column])

# The combined frame is no longer needed; drop the reference to free memory early.
del train

# Show how many samples there are for each label value.
Y_train.value_counts()

### 结果：数据分布较为均匀

## 2.2 检查缺失值

In [7]:
# Flag, per column, whether the training features contain any missing value,
# then summarise those flags.
train_column_has_null = X_train.isnull().any()
train_column_has_null.describe()

In [8]:
# Flag, per column, whether the test set contains any missing value,
# then summarise those flags.
test_column_has_null = test.isnull().any()
test_column_has_null.describe()

## 2.3 归一化处理

In [9]:
# Normalise the pixel values from the 0-255 range down to 0-1.
max_pixel_value = 255.0
X_train = X_train / max_pixel_value
test = test / max_pixel_value

## 2.4 Reshape

In [10]:
# Each sample is stored as a flat vector of 784 values; reshape it into a
# 28 x 28 single-channel image. The leading -1 lets NumPy infer the sample count.
image_shape = (-1, 28, 28, 1)
X_train = X_train.values.reshape(image_shape)
test = test.values.reshape(image_shape)

## 2.5 标签编码

In [11]:
# One-hot encode the labels over the 10 classes.
num_classes = 10
Y_train = to_categorical(Y_train, num_classes=num_classes)

## 2.6 划分训练集和验证集

In [12]:
# Seed passed as `random_state` to the train/validation split so the split is reproducible.
random_seed = 2

In [13]:
# Hold out 10% of the training data as the validation set.
# The call is written inside its own parentheses on the assignment line:
# a bare line break right after `=` is a SyntaxError in Python.
X_train, X_val, Y_train, Y_val = train_test_split(
    X_train, Y_train, test_size=0.1, random_state=random_seed
)

In [14]:
# Preview the first training image (its single channel as a 2-D array).
first_image = X_train[0][:, :, 0]
g = plt.imshow(first_image)

# 3. CNN
## 3.1 建立模型

In [15]:
# Architecture: two blocks of (Conv2D -> Conv2D -> MaxPool2D -> Dropout),
# followed by Flatten -> Dense(1024) -> Dropout -> Dense(10, softmax).
model = Sequential([
    # Block 1: 32 filters with 5x5 kernels; the first layer declares the input shape.
    Conv2D(filters=32, kernel_size=(5, 5), padding='Same',
           activation='relu', input_shape=(28, 28, 1)),
    Conv2D(filters=32, kernel_size=(5, 5), padding='Same',
           activation='relu'),
    MaxPool2D(pool_size=(2, 2)),
    Dropout(0.25),

    # Block 2: 64 filters with 3x3 kernels.
    Conv2D(filters=64, kernel_size=(3, 3), padding='Same',
           activation='relu'),
    Conv2D(filters=64, kernel_size=(3, 3), padding='Same',
           activation='relu'),
    MaxPool2D(pool_size=(2, 2), strides=(2, 2)),
    Dropout(0.25),

    # Classifier head: flatten the feature maps and map them to 10 class probabilities.
    Flatten(),
    Dense(1024, activation="relu"),
    Dropout(0.5),
    Dense(10, activation="softmax"),
])
model.summary()

In [18]:
# Build the RMSprop optimizer from an explicit settings mapping.
rmsprop_settings = {"lr": 0.001, "rho": 0.9, "epsilon": 1e-08, "decay": 0.0}
optimizer = RMSprop(**rmsprop_settings)

In [19]:
# Compile the model with categorical cross-entropy loss and accuracy as the reported metric.
model.compile(
    loss="categorical_crossentropy",
    optimizer=optimizer,
    metrics=["accuracy"],
)

In [20]:
# Learning-rate annealer: watches `val_acc` and multiplies the learning rate by
# `factor` once it has stopped improving for `patience` epochs, with `min_lr`
# as the lower bound.
learning_rate_reduction = ReduceLROnPlateau(
    monitor='val_acc',
    factor=0.5,
    patience=3,
    min_lr=0.00001,
    verbose=1,
)

In [21]:
# Number of training epochs passed to the fit call.
# NOTE(review): the accuracy quoted in the introduction was presumably obtained
# with more epochs than this -- confirm before comparing results.
epochs = 2 
# Batch size used by the augmented data generator and to derive steps per epoch.
batch_size = 86

## 3.3 数据扩增

为了防止过拟合（overfitting），可以对训练集中的图片做微小的变换（例如缩放、旋转和平移）来实现数据扩增。这样能够使训练出的模型的验证误差更小。

In [17]:
# Without data augmentation the accuracy was 0.98114
#history = model.fit(X_train, Y_train, batch_size = batch_size, epochs = epochs, 
#          validation_data = (X_val, Y_val), verbose = 2)

In [22]:
# With data augmentation the accuracy reaches 0.99286.
# Only the geometric transforms are enabled (rotation, zoom, shifts);
# centering, std-normalisation, ZCA whitening and flips are all switched off.
datagen = ImageDataGenerator(
    # --- enabled augmentations ---
    rotation_range=10,        # randomly rotate images, range given in degrees
    zoom_range=0.1,           # randomly zoom images
    width_shift_range=0.1,    # random horizontal shift (fraction of total width)
    height_shift_range=0.1,   # random vertical shift (fraction of total height)
    # --- disabled options ---
    featurewise_center=False,             # would set the input mean to 0 over the dataset
    samplewise_center=False,              # would set each sample mean to 0
    featurewise_std_normalization=False,  # would divide inputs by the dataset std
    samplewise_std_normalization=False,   # would divide each input by its own std
    zca_whitening=False,                  # would apply ZCA whitening
    horizontal_flip=False,                # would randomly flip images horizontally
    vertical_flip=False,                  # would randomly flip images vertically
)
# Fit the generator on the training images.
datagen.fit(X_train)

In [19]:
# Train on batches produced by the augmenting generator, validating on the
# held-out split; the learning-rate callback runs during training.
augmented_batches = datagen.flow(X_train, Y_train, batch_size=batch_size)
batches_per_epoch = X_train.shape[0] // batch_size
history = model.fit_generator(
    augmented_batches,
    steps_per_epoch=batches_per_epoch,
    epochs=epochs,
    validation_data=(X_val, Y_val),
    callbacks=[learning_rate_reduction],
    verbose=2,
)

# 4. 模型评估
## 4.1 训练和验证曲线

In [23]:
# Plot training vs. validation loss (top panel) and accuracy (bottom panel);
# comparing the two curves helps to spot over-fitting.
fig, ax = plt.subplots(2, 1)

# Each curve is drawn through its own Axes object, so no extra `axes=` keyword
# is needed on the plot call.
ax[0].plot(history.history['loss'], color='b', label="Training loss")
ax[0].plot(history.history['val_loss'], color='r', label="validation loss")
legend = ax[0].legend(loc='best', shadow=True)

ax[1].plot(history.history['acc'], color='b', label="Training accuracy")
ax[1].plot(history.history['val_acc'], color='r', label="Validation accuracy")
legend = ax[1].legend(loc='best', shadow=True)

## 4.2 混淆矩阵绘制

In [24]:

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    Args:
        cm: 2-D NumPy array; rows are drawn as the "True label" axis and
            columns as the "Predicted label" axis.
        classes: Sequence of tick labels, one per row/column of ``cm``.
        normalize: If True, every row is divided by its row sum before
            plotting, so both the colours and the cell annotations show
            per-row fractions (annotated with two decimals).
        title: Title of the plot.
        cmap: Matplotlib colormap used for the heat map.

    Returns:
        None. The plot is drawn with ``matplotlib.pyplot``.
    """
    if normalize:
        # Normalise BEFORE drawing so the heat-map colours, the text threshold
        # and the cell annotations are all based on the same values.
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cells darker than half the maximum get white text so they stay readable.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        # Raw values are shown as-is; normalised fractions are rounded for display.
        cell_text = format(cm[i, j], '.2f') if normalize else cm[i, j]
        plt.text(j, i, cell_text,
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

# Predict on the validation images and reduce each prediction vector to the
# index of its largest entry (the predicted class).
Y_pred = model.predict(X_val).argmax(axis=1)
# The validation labels are one-hot encoded; recover the class indices the same way.
Y_true = Y_val.argmax(axis=1)
# Build the confusion matrix from true vs. predicted classes and plot it
# with tick labels 0-9.
confusion_mtx = confusion_matrix(Y_true, Y_pred)
plot_confusion_matrix(confusion_mtx, classes=range(10))

In [22]:
# Predict on the test images and keep, for each image, the index of the
# largest output value as the predicted label.
test_predictions = model.predict(test)
predicted_labels = np.argmax(test_predictions, axis=1)
# Wrap the labels in a Series named "Label" (the column name used when concatenated).
results = pd.Series(predicted_labels, name="Label")

In [23]:
# ImageId is 1-based with one entry per prediction. Derive the range from
# `results` instead of hard-coding the test-set size (28000), so the two
# columns always line up even if the number of test rows changes.
image_ids = pd.Series(range(1, len(results) + 1), name="ImageId")
submission = pd.concat([image_ids, results], axis=1)

# Write the submission file without the DataFrame index column.
submission.to_csv("cnn_mnist_datagen.csv", index=False)