## 数据预处理

In [1]:
import os
import cv2
import h5py
import numpy as np
import pandas as pd
from keras.applications import Xception, xception
from keras.models import Model
from keras.layers import Dropout, Dense, Input, Lambda
from keras.callbacks import Callback
from sklearn.model_selection import train_test_split
from tqdm import tqdm

Using TensorFlow backend.


In [2]:
# Locations of the raw images and the file names found in each directory.
data_path_train = '../dogs-vs-cats-dataset/train'
data_path_train_extra = '../dogs-vs-cats-dataset/images-Oxford-IIIT'
data_path_test = '../dogs-vs-cats-dataset/test'
image_names_train = os.listdir(data_path_train)
image_names_train_extra = os.listdir(data_path_train_extra)
image_names_test = os.listdir(data_path_test)
# Shape of every image array handed to the network: (rows, cols, channels).
input_shape = (299, 299, 3)

# Breed prefixes that mark a file of the extra dataset as a cat; any other
# prefix is labeled as a dog.
cat_types = ['Abyssinian', 'Bengal', 'Birman', 'Bombay', 'British_Shorthair', 'Egyptian_Mau', 'Maine_Coon', 'Persian',
             'Ragdoll', 'Russian_Blue', 'Siamese', 'Sphynx']


def load_rgb_image(image_path, shape=input_shape):
    """Read an image, resize it to ``shape`` and return it in RGB channel order.

    Args:
        image_path: Path of the file to read.
        shape: Target ``(rows, cols, channels)``; only rows and cols are used.

    Returns:
        The resized image with its channel axis reversed (OpenCV loads BGR),
        or ``None`` when OpenCV cannot decode the file.
    """
    image = cv2.imread(image_path)
    if image is None:
        return None
    # cv2.resize expects dsize as (width, height), i.e. (cols, rows).
    image = cv2.resize(image, (shape[1], shape[0]))
    return image[:, :, ::-1]


labels = []
trains = []
tests = []

# Standard training data: the label is encoded in the file name.
for image_name in tqdm(image_names_train):
    image_path = os.path.join(data_path_train, image_name)
    image = load_rgb_image(image_path)
    if image is None:
        print('Read train image failed:', image_path)
        continue
    trains.append(image)
    # cat: 0, dog: 1
    labels.append(1 if 'dog' in image_name else 0)

# Extra training data: the label is derived from the breed prefix.
for image_name in tqdm(image_names_train_extra):
    image_path = os.path.join(data_path_train_extra, image_name)
    image = load_rgb_image(image_path)
    if image is None:
        print('Read extra train image failed:', image_path)
        continue
    trains.append(image)
    # Everything before the last underscore is the breed, e.g.
    # 'British_Shorthair_12.jpg' -> 'British_Shorthair'.
    breed = image_name.rpartition('_')[0]
    labels.append(0 if breed in cat_types else 1)

# Standard test data. Only the names of images that were actually loaded are
# kept, so that `image_names_test[i]` always describes `tests[i]`.
loaded_test_names = []
for image_name in tqdm(image_names_test):
    image_path = os.path.join(data_path_test, image_name)
    image = load_rgb_image(image_path)
    if image is None:
        print('Read test image failed:', image_path)
        continue
    tests.append(image)
    loaded_test_names.append(image_name)
image_names_test = loaded_test_names

trains = np.array(trains)
labels = np.array(labels)
tests = np.array(tests)

print('Training data size: %d' % len(trains))
print('Label size: %d' % len(labels))
print('Testing data size: %d' % len(tests))

100%|██████████| 25000/25000 [01:00<00:00, 413.37it/s]
 12%|█▏        | 895/7393 [00:04<00:29, 222.16it/s]

Read extra train image failed: ../dogs-vs-cats-dataset/images-Oxford-IIIT/Abyssinian_102.mat


 14%|█▍        | 1036/7393 [00:04<00:28, 222.06it/s]

Read extra train image failed: ../dogs-vs-cats-dataset/images-Oxford-IIIT/Abyssinian_100.mat


 19%|█▉        | 1438/7393 [00:06<00:26, 223.06it/s]

Read extra train image failed: ../dogs-vs-cats-dataset/images-Oxford-IIIT/Egyptian_Mau_139.jpg


 37%|███▋      | 2750/7393 [00:12<00:20, 222.65it/s]

Read extra train image failed: ../dogs-vs-cats-dataset/images-Oxford-IIIT/Egyptian_Mau_145.jpg


 57%|█████▋    | 4227/7393 [00:18<00:13, 226.47it/s]

Read extra train image failed: ../dogs-vs-cats-dataset/images-Oxford-IIIT/Egyptian_Mau_177.jpg


 74%|███████▍  | 5493/7393 [00:24<00:08, 227.01it/s]

Read extra train image failed: ../dogs-vs-cats-dataset/images-Oxford-IIIT/Abyssinian_101.mat


 76%|███████▌  | 5637/7393 [00:24<00:07, 226.88it/s]

Read extra train image failed: ../dogs-vs-cats-dataset/images-Oxford-IIIT/Egyptian_Mau_191.jpg


 78%|███████▊  | 5783/7393 [00:25<00:07, 226.98it/s]

Read extra train image failed: ../dogs-vs-cats-dataset/images-Oxford-IIIT/Abyssinian_34.jpg


 98%|█████████▊| 7274/7393 [00:32<00:00, 227.15it/s]

Read extra train image failed: ../dogs-vs-cats-dataset/images-Oxford-IIIT/Egyptian_Mau_167.jpg


100%|██████████| 7393/7393 [00:32<00:00, 227.15it/s]
100%|██████████| 12500/12500 [00:30<00:00, 414.67it/s]


Training data size: 32384
Label size: 32384
Testing data size: 12500


## 特征提取

In [None]:
# Build the feature extractor: Xception without its classification head, with
# the Xception preprocessing applied inside the graph by a Lambda layer and
# global average pooling on the output (pooling='avg').
x = Input(shape=input_shape)
x = Lambda(xception.preprocess_input)(x)
model = Xception(input_tensor=x, input_shape=input_shape, weights='imagenet', include_top=False, pooling='avg')
bottleneck_features_train = model.predict(trains, batch_size=128)
bottleneck_features_test = model.predict(tests, batch_size=128)

# The test file names are stored next to the test features and are later used
# to place each prediction in the submission file, so both must line up.
if len(image_names_test) != len(bottleneck_features_test):
    raise ValueError(
        'Got %d test image names but %d test feature rows; names and features are misaligned'
        % (len(image_names_test), len(bottleneck_features_test))
    )

with h5py.File("bottleneck_features_with_extra_data.h5", 'w') as h:
    h.create_dataset('trains', data=bottleneck_features_train)
    h.create_dataset('labels', data=labels)
    h.create_dataset('tests', data=bottleneck_features_test)
    # A plain list of Python str becomes a NumPy unicode array, which HDF5
    # cannot store; write the names with an explicit variable-length string
    # dtype instead.
    h.create_dataset(
        'test_imgs',
        data=np.array(image_names_test, dtype=object),
        dtype=h5py.special_dtype(vlen=str),
    )

print('bottleneck features have been wrote to bottleneck_features_with_extra_data.h5')

## 构建模型

In [42]:
# Load the cached bottleneck features, labels and test file names.
# To train on the features that include the extra data, open
# 'bottleneck_features_with_extra_data.h5' here instead.
with h5py.File('bottleneck_features.h5', 'r') as h:
    X_train, y_train, X_test, test_imgs = (
        np.array(h[key]) for key in ('trains', 'labels', 'tests', 'test_imgs')
    )

# Hold out 20% of the training features for validation, with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    shuffle=True,
    random_state=2018,
)

# Classifier head on top of the bottleneck features: dropout followed by a
# single sigmoid unit.
x = Input(shape=(X_train.shape[1],))
dropped = Dropout(0.2)(x)
y = Dense(1, activation='sigmoid')(dropped)
model = Model(x, y)

model.compile(
    optimizer='adadelta',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

print('Model ready!')

Model ready!


## 训练

### 预测函数

In [43]:
def predict_func(mod, file_name):
    """Predict on the test features and write the result as a submission CSV.

    Reads the notebook-level ``X_test`` (test features) and ``test_imgs``
    (test file names, in the same order as ``X_test``).

    Args:
        mod: Model whose ``predict`` method is called on ``X_test``.
        file_name: Name of the CSV file to create inside ``./predict-csv``.
    """
    y_pred = mod.predict(X_test, verbose=1)
    # Flatten to one value per image and clip the predictions into
    # [0.005, 0.995].
    y_pred = np.asarray(y_pred).reshape(-1).clip(min=0.005, max=0.995)

    df = pd.read_csv("sample_submission.csv")

    for i, image_name in enumerate(test_imgs):
        # Depending on the h5py version, stored strings come back as bytes.
        if isinstance(image_name, bytes):
            image_name = image_name.decode('utf-8')
        # The number before the first '.' is the 1-based image id; it is used
        # as the (0-based) row position in the sample submission.
        index = int(image_name.split('.')[0]) - 1
        df.iat[index, 1] = float(y_pred[i])

    output_dir = './predict-csv'
    # Create the output directory on first use instead of failing in to_csv.
    os.makedirs(output_dir, exist_ok=True)
    df.to_csv(os.path.join(output_dir, file_name), index=None)
    print('The prediction result has been wrote to: ', file_name)

### 回调函数

In [44]:
class LossCallback(Callback):
    """Keras callback that writes a test-set prediction CSV after every epoch."""

    def on_epoch_end(self, epoch, logs=None):
        """Write predictions of the current model to ``predict_epoch<N>.csv``.

        Args:
            epoch: Index of the epoch that just finished; 1 is added so the
                file for the first epoch is ``predict_epoch1.csv``.
            logs: Metrics passed in by Keras; not used here.
        """
        predict_func(self.model, 'predict_epoch%d.csv' % (epoch + 1))

### 模型训练与优化

In [45]:

# Train the classifier on the training split for 10 epochs, evaluating on the
# held-out validation split after each epoch.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=128,
    epochs=10,
    validation_data=(X_val, y_val),
    # LossCallback writes a prediction CSV for the test set after every epoch.
    callbacks=[LossCallback()]
)


Train on 20000 samples, validate on 5000 samples
Epoch 1/10
The prediction result has been wrote to:  predict_epoch1.csv
Epoch 2/10
The prediction result has been wrote to:  predict_epoch2.csv
Epoch 3/10
The prediction result has been wrote to:  predict_epoch3.csv
Epoch 4/10
The prediction result has been wrote to:  predict_epoch4.csv
Epoch 5/10
The prediction result has been wrote to:  predict_epoch5.csv
Epoch 6/10
The prediction result has been wrote to:  predict_epoch6.csv
Epoch 7/10
The prediction result has been wrote to:  predict_epoch7.csv
Epoch 8/10
The prediction result has been wrote to:  predict_epoch8.csv
Epoch 9/10
The prediction result has been wrote to:  predict_epoch9.csv
Epoch 10/10
The prediction result has been wrote to:  predict_epoch10.csv


<keras.callbacks.History at 0x7ff6d591ca90>

## 预测

In [58]:
# Write the predictions of the trained model to 'predict.csv'.
predict_func(model, 'predict.csv')

The prediction result has been wrote to predict.csv
