# Use Transfer Learning to build InceptionV3 model to classify DogsCats 
## By Cheng Yi Peng

## DL-TF_Docker Environment Setup
`sudo nvidia-docker run -ti -p 8888:8888 -v ~/tfv_data:/notebooks/tfv_data --name TFV tensorflow/tensorflow:lastest-gpu`
### So that the docker storage position is ~/notebooks/tfv_data
### Then we added some needed library like: 
* `pip install bcolz`  
* `pip install graphviz`



In [2]:
%matplotlib inline

In [3]:
import os, sys
import numpy as np
import pandas as pd

from keras.layers import *
from keras.applications import *
from keras.models import *
from keras.optimizers import *

from keras.preprocessing import image

Using TensorFlow backend.


In [4]:
HOME_DIR = "/notebooks"
print HOME_DIR

/notebooks


## Build Dataset for DogsCats 

In [5]:
path = HOME_DIR + "/tfv_data"
dogcat_path = path + '/dogcat_set'
model_path = dogcat_path + '/models'
train_path = dogcat_path + '/train'
test_path = dogcat_path + '/test'
valid_path = dogcat_path + '/valid'
model_path = "~/.keras/models/"
inv3_weights_path = path + "/keras_models/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5"
#bn_inv3_path = dogcat_path + "/bottleneck/bn_inv3.h5"
#label_inv3_path = dogcat_path + "/bottleneck/label_inv3.h5"

In [6]:
if os.path.exists(dogcat_path):
    print "Dogcat-Dataset has built already."
else:
    os.mkdir(dogcat_path)
    os.mkdir(model_path)
    os.mkdir(train_path)
    os.mkdir(test_path)
    os.mkdir(valid_path)
    os.mkdir(bottleneck)

Dogcat-Dataset has built already.


In [7]:
#Create sub folders(dog,cat) of train,test, valid
for p in (train_path, valid_path):
    subpath_dog = p + '/dog'
    subpath_cat = p + '/cat'
    if os.path.exists(subpath_dog):
        print subpath_dog, "has existed."
    else:
        os.mkdir(subpath_dog)
        
    if os.path.exists(subpath_cat):
        print subpath_cat, "has existed."
    else:
        os.mkdir(subpath_cat)

/notebooks/tfv_data/dogcat_set/train/dog has existed.
/notebooks/tfv_data/dogcat_set/train/cat has existed.
/notebooks/tfv_data/dogcat_set/valid/dog has existed.
/notebooks/tfv_data/dogcat_set/valid/cat has existed.


In [8]:
from shutil import copyfile
from glob import glob

In [16]:
%cd /notebooks/tfv_data/dogcat_set/train/dog
g = glob('*.jpg')
shuffle_dog = np.random.permutation(g)
for i in range(2000):
    copyfile(shuffle_dog[i], valid_path + "/dog/" + shuffle_dog[i])

/notebooks/tfv_data/dogcat_set/train/dog


In [17]:
%cd /notebooks/tfv_data/dogcat_set/train/cat
gb = glob('*.jpg')
shuffle_cat = np.random.permutation(gb)
for i in range(2000):
    copyfile(shuffle_cat[i], valid_path + "/cat/" + shuffle_cat[i])

/notebooks/tfv_data/dogcat_set/train/cat


# Confirm whether the valid data imported successfully

```
def confirm_random_images_in_valid(v_path):
    for i in ["/dog/", "/cat/"]:
        valid_path = v_path + i
        %cd valid_path 
        
        path = os.getcwd()
        count = 0
        for files in os.walk(path):
            for each in files:
                count += 1
        print "valid + %s + has %d random images"% (i, count)
```


In [9]:
%cd /notebooks/tfv_data/dogcat_set/valid/cat

/notebooks/tfv_data/dogcat_set/valid/cat


In [10]:
path = "/notebooks/tfv_data/dogcat_set/valid/cat"
count = 0
for root,dirs,files in os.walk(path):   
      for each in files:
             count += 1  

print '%s have random images number:'% os.getcwd(),count   

/notebooks/tfv_data/dogcat_set/valid/cat have random images number: 2000


In [11]:
%cd /notebooks/tfv_data/dogcat_set/valid/dog

/notebooks/tfv_data/dogcat_set/valid/dog


In [12]:
path = "/notebooks/tfv_data/dogcat_set/valid/dog"
count = 0
for root,dirs,files in os.walk(path):   
      for each in files:
             count += 1   

print '%s have random images number:'% os.getcwd(),count  

/notebooks/tfv_data/dogcat_set/valid/dog have random images number: 2005


## Then download  train.zip and test.zip from https://www.kaggle.com/c/dogs-vs-cats-redux-kernels-edition/data

### Move "dog.jpg", "cat.jpg" to  corresponding folders
``
kaggle_dogcat_path = "~/notebooks/tfv_data/kaggle_dogcat_dataset"
os.path.mkdir(kaggle_dogcat_path) 
unzip train.zip kaggle_dogcat_path 
unzip test.zip kaggle_dogcat_path 
%cd kaggle_dogcat_path + "/train"
mv dog*.jpg train_path + "/dog/"
mv cat*.jpg train_path + "/cat/"
%cd kaggle_dogcat_path + "/test"
mv dog*.jpg test_path + "/dog/"
mv cat*.jpg test_path + "/cat/"
``

## Build Base_model

In [13]:
base_model = InceptionV3(input_shape=(299,299,3), weights = "imagenet", include_top=False)
x = base_model.output
x = GlobalAveragePooling2D()(x)
model = Model(base_model.input, x)

print model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 299, 299, 3)   0                                            
____________________________________________________________________________________________________
convolution2d_1 (Convolution2D)  (None, 149, 149, 32)  896         input_1[0][0]                    
____________________________________________________________________________________________________
batchnormalization_1 (BatchNorma (None, 149, 149, 32)  128         convolution2d_1[0][0]            
____________________________________________________________________________________________________
convolution2d_2 (Convolution2D)  (None, 147, 147, 32)  9248        batchnormalization_1[0][0]       
___________________________________________________________________________________________

In [15]:
model.input_shape[1:3], model.output_shape

((299, 299), (None, 2048))

## Get bootlenecks of InceptionV3

In [19]:
gen = image.ImageDataGenerator(rotation_range=90, 
                              width_shift_range=0.2, 
                              height_shift_range=0.2, 
                              rescale=1./255, 
                              shear_range=0.1,
                              zoom_range=0.2, 
                              horizontal_flip=True,
                              fill_mode='nearest')

In [20]:
print "Train Batches:"
train_batches = gen.flow_from_directory(train_path, model.input_shape[1:3], batch_size=64, shuffle=False )
print "\nValid Batches:"
valid_batches = gen.flow_from_directory(valid_path, model.input_shape[1:3], batch_size=64, shuffle=False )
print "\nTest Batches:"
test_batches = gen.flow_from_directory(test_path, model.input_shape[1:3], batch_size=64, shuffle=False, class_mode=None)

Train Batches:
Found 25000 images belonging to 2 classes.

Valid Batches:
Found 4000 images belonging to 2 classes.

Test Batches:
Found 12500 images belonging to 1 classes.


In [21]:
%%time
train_bn = model.predict_generator(train_batches, train_batches.nb_sample)

CPU times: user 6min 50s, sys: 45.5 s, total: 7min 36s
Wall time: 3min 55s


In [22]:
%%time
valid_bn = model.predict_generator(valid_batches, valid_batches.nb_sample)

CPU times: user 1min 6s, sys: 7.64 s, total: 1min 14s
Wall time: 38.6 s


In [23]:
%%time
test_bn = model.predict_generator(test_batches, test_batches.nb_sample)

CPU times: user 3min 27s, sys: 22.7 s, total: 3min 50s
Wall time: 1min 58s


## Save InceptionV3 bottlenecks

In [24]:
import h5py
from keras.utils.np_utils import to_categorical

In [25]:
with h5py.File("bn_inv3.h5") as hf:
    hf.create_dataset("train", data=train_bn)
    hf.create_dataset("valid", data=valid_bn)
    hf.create_dataset("test", data=test_bn)

with h5py.File("label_inv3.h5") as hfl:
    hfl.create_dataset("train", data=to_categorical(train_batches.classes))
    hfl.create_dataset("valid", data=to_categorical(valid_batches.classes))    

RuntimeError: Unable to create link (Name already exists)

## Load bottlenecks & labels

In [32]:
with h5py.File("bn_inv3.h5") as hf:
    X_train = hf["train"][:]
    X_valid = hf['valid'][:]

with h5py.File("label_inv3.h5") as hfl:
    y_train = hfl['train'][:]
    y_valid = hfl['valid'][:]

In [33]:
from keras.callbacks import ModelCheckpoint, EarlyStopping

callback_list = [ModelCheckpoint("top_model.h5", monitor='val_acc', verbose=1, save_best_only=True)]

#callback_list = [ModelCheckpoint("top_model.h5", monitor='val_acc', verbose=1, save_best_only=True),
#                 EarlyStopping(monitor='val_acc', patience=5, verbose=1)]

In [34]:
model = Sequential([Dropout(0.5, input_shape=X_train.shape[1:]),
                    Dense(1024, activation='relu'),
                    Dense(2, activation='softmax')])

In [35]:
model.compile(optimizer=Nadam(), loss='categorical_crossentropy', metrics=['accuracy'])

In [55]:
#model.fit(X_train, y_train, callbacks=callback_list, nb_epoch=10, validation_data=(X_valid, y_valid))
from sklearn.utils import shuffle
X_train, y_train = shuffle(X_train, y_train)

model.fit(X_train, y_train, callbacks=callback_list, batch_size=64, nb_epoch=10, validation_split=0.3)

Train on 17500 samples, validate on 7500 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fd49f851a90>

In [56]:
from keras.models import load_model
model = load_model('top_model.h5')

In [57]:
test_batches = gen.flow_from_directory(test_path, model.input_shape[1:3], batch_size=32,  shuffle=False, class_mode=None)

Found 12500 images belonging to 1 classes.


In [58]:
with h5py.File('bn_inv3.h5') as h:
    X_test = h['test'][:]
y_test = model.predict(X_test)

In [59]:
sub = pd.read_csv('/notebooks/tfv_data/sample_submission.csv')

In [60]:
sub.head()

Unnamed: 0,id,label
0,1,0.5
1,2,0.5
2,3,0.5
3,4,0.5
4,5,0.5


In [61]:
test_batches.filenames[0]

'test/5818.jpg'

In [62]:
test_batches.filenames[0].split("/")[1].split(".")[0]

'5818'

In [63]:
ids = [int(x.split("/")[1].split(".")[0]) for x in test_batches.filenames]

In [64]:
print y_test[:,1][0]

4.50713e-16


In [65]:
for i in range(len(ids)):
    # y_test.column1 data copy to sub.column["label"] while their index is the same value..
    sub.loc[sub.id == ids[i], "label"] = y_test[:,1][i]

In [66]:
for i, filename in enumerate(test_batches.filenames):
    print i, filename

0 test/5818.jpg
1 test/2417.jpg
2 test/7184.jpg
3 test/10971.jpg
4 test/5686.jpg
5 test/6913.jpg
6 test/7389.jpg
7 test/9296.jpg
8 test/1917.jpg
9 test/4350.jpg
10 test/6302.jpg
11 test/516.jpg
12 test/4674.jpg
13 test/618.jpg
14 test/3250.jpg
15 test/10155.jpg
16 test/4797.jpg
17 test/7351.jpg
18 test/10338.jpg
19 test/5561.jpg
20 test/4664.jpg
21 test/6342.jpg
22 test/10141.jpg
23 test/3069.jpg
24 test/860.jpg
25 test/11993.jpg
26 test/6100.jpg
27 test/6313.jpg
28 test/4461.jpg
29 test/10432.jpg
30 test/6991.jpg
31 test/6646.jpg
32 test/7965.jpg
33 test/10009.jpg
34 test/8287.jpg
35 test/6083.jpg
36 test/2416.jpg
37 test/8602.jpg
38 test/1975.jpg
39 test/11177.jpg
40 test/9441.jpg
41 test/9165.jpg
42 test/10285.jpg
43 test/1315.jpg
44 test/5131.jpg
45 test/820.jpg
46 test/8992.jpg
47 test/11736.jpg
48 test/7885.jpg
49 test/1655.jpg
50 test/2990.jpg
51 test/6641.jpg
52 test/10022.jpg
53 test/8946.jpg
54 test/2965.jpg
55 test/7061.jpg
56 test/9484.jpg
57 test/94.jpg
58 test/4166.jpg
59

for i, filename in enumerate(test_batches.filenames):
    index = int(filename[filename.rfind('/')+1:filename.rfind('.')])
    sub.set_value(index-1, 'label', y_test[i])

In [67]:
clipped = y_test.clip(min=0.002, max=0.998)

for i in range(len(ids)):
    sub.loc[sub.id == ids[i], "label"] = clipped[:,1][i]

sub.to_csv("/notebooks/tfv_data/0408pred01.csv", index=False)


In [68]:
sub.head()

Unnamed: 0,id,label
0,1,0.975215
1,2,0.998
2,3,0.998
3,4,0.998
4,5,0.002
