# Use bottleneck features and cross validation

In [14]:
import math
import pandas as pd
import cv2
import numpy as np
from concurrent.futures import ProcessPoolExecutor
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split, KFold
from keras.applications import ResNet50, InceptionResNetV2
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.callbacks import Callback, LearningRateScheduler
from keras.optimizers import SGD
from matplotlib import pyplot as plt
from IPython.display import clear_output
from sklearn.linear_model import LogisticRegression

## Get top labels
Keep only the breeds that have the most training images.

In [2]:
num_classes = 10
df = pd.read_csv('data2/labels.csv')

# Count the rows of each breed, then keep the names of the `num_classes`
# breeds with the highest counts.
breed_counts = df.groupby(['breed'])['breed'].count()
breed = list(breed_counts.nlargest(num_classes).index)

# Restrict the label table to the rows whose breed was selected above.
is_top_breed = df['breed'].isin(breed)
labels = df[is_top_breed]

## Preparing data and labels

In [3]:
# Height and width (in pixels) every image is resized to.
ROWS, COLS = 250, 250
# Number of colour channels stored per image.
CHANNELS = 3
# Number of worker processes used by the image-loading pool.
CORE = 4


def resize_img(file_path):
    """Read an image in colour mode and resize it to ROWS x COLS pixels.

    Args:
        file_path: Path of the image file to load.

    Returns:
        The resized image produced by ``cv2.resize`` with cubic
        interpolation, having ROWS rows and COLS columns.

    Raises:
        OSError: If OpenCV could not read the file (``cv2.imread``
            returned ``None``).
    """
    img = cv2.imread(file_path, cv2.IMREAD_COLOR)
    if img is None:
        # Fail with the offending path instead of an opaque error from
        # cv2.resize being handed None.
        raise OSError('could not read image: %s' % file_path)

    # cv2.resize takes dsize as (width, height), i.e. (COLS, ROWS); this keeps
    # the result consistent with arrays shaped (ROWS, COLS, CHANNELS).
    return cv2.resize(img, (COLS, ROWS), interpolation=cv2.INTER_CUBIC)


def prep_data(images):
    """Load and resize a collection of images into one uint8 array.

    Args:
        images: Sized sequence of image file paths; each one is passed to
            ``resize_img`` in a worker process.

    Returns:
        Array of shape ``(len(images), ROWS, COLS, CHANNELS)`` and dtype
        ``uint8`` holding the resized images in input order. An empty input
        yields an empty array of that shape.
    """
    count = len(images)
    data = np.empty((count, ROWS, COLS, CHANNELS), dtype=np.uint8)

    if count == 0:
        # Nothing to load; avoid starting a process pool for no work.
        return data

    with ProcessPoolExecutor(max_workers=CORE) as executor:
        # executor.map yields results in input order; copying each image as
        # it arrives avoids holding a second full copy of the data in a list.
        for idx, img in enumerate(executor.map(resize_img, images)):
            data[idx] = img

    return data


lb = LabelBinarizer()

# One image path per selected row, in row order.
files = ['data2/train/%s.jpg' % image_id for image_id in labels['id']]
# Breed name of each selected row, aligned with `files`.
text_labels = list(labels['breed'])

# Image array and binarized label matrix used by the following cells.
x = prep_data(files)
y = lb.fit_transform(text_labels)

## Feature extraction

In [4]:
# Load ResNet50 with ImageNet weights, leaving out its top layers
# (include_top=False) so the network is used as a feature extractor.
inc = ResNet50(include_top=False, weights='imagenet')

# Run every image in `x` through the network; the outputs are the bottleneck
# features the classifier below is trained on.
# NOTE(review): `x` is passed exactly as produced by prep_data (uint8 arrays
# loaded through OpenCV) -- confirm whether the Keras ResNet50
# `preprocess_input` step should be applied first.
features = inc.predict(x, verbose=1)



## Model, training and evaluation function

In [69]:
def create_model(x_train, y_train, x_valid, y_valid):
    """Build, compile and train a softmax classifier on extracted features.

    The model flattens each feature sample and feeds it to a single dense
    softmax layer with ``num_classes`` outputs. It is trained for 10 epochs
    with SGD (learning rate 0.01, momentum 0.9) and categorical
    cross-entropy loss.

    Args:
        x_train: Training features; the per-sample shape ``x_train.shape[1:]``
            defines the model input shape.
        y_train: Training targets passed to ``model.fit``.
        x_valid: Validation features, passed to ``fit`` as validation data.
        y_valid: Validation targets, passed to ``fit`` as validation data.

    Returns:
        The trained Keras ``Sequential`` model.
    """
    model = Sequential()
    # Take the input shape from the data actually passed in, not from the
    # global `features` array, so the function works for any feature set.
    model.add(Flatten(input_shape=x_train.shape[1:]))
    model.add(Dense(num_classes, activation='softmax'))

    sgd = SGD(lr=0.01, momentum=0.9)
    model.compile(loss='categorical_crossentropy',
                  optimizer=sgd,
                  metrics=['categorical_accuracy'])

    model.fit(
        x_train, y_train,
        epochs=10,
        validation_data=(x_valid, y_valid),
        verbose=0
    )
    return model

## Perform Cross Validation

In [70]:
# 10-fold cross validation over the extracted features. Folds are taken in
# order (shuffle=False) and are not stratified by class.
skf = KFold(n_splits=10, shuffle=False)

# Track the fold with the lowest validation loss together with its accuracy,
# its index arrays and its trained model.
smallest_loss = None
most_acc = None
best_train_index = None
best_valid_index = None
best_model = None

# Split the same array that is indexed below; the fold indices depend only
# on the number of samples.
for train_index, valid_index in skf.split(features, y):
    x_train, x_valid = features[train_index], features[valid_index]
    y_train, y_valid = y[train_index], y[valid_index]

    model = create_model(x_train, y_train, x_valid, y_valid)

    loss, acc = model.evaluate(x_valid, y_valid)

    # Compare against None explicitly so that a loss of exactly 0.0 is not
    # mistaken for "no best fold recorded yet".
    if smallest_loss is None or loss < smallest_loss:
        smallest_loss = loss
        most_acc = acc
        best_train_index = train_index
        best_valid_index = valid_index
        # Keep the trained model of this fold (the instance, not a class).
        best_model = model

    print(loss, acc)

# The reported numbers are those of the best single fold, not an average.
print('--------------------------------')
print('final loss and accuracy: %r %r' % (smallest_loss, most_acc))

0.184110214399 0.947826087993
0.0865268224805 0.964912280702
1.6804974121 0.868421049494
0.228603020049 0.938596487045
0.180674313192 0.929824557221
0.226556450651 0.929824557221
0.167000009695 0.956140346694
0.157356351614 0.929824557221
0.289829286828 0.912280697572
0.173595475 0.938596488091
--------------------------------
final loss and accuracy: 0.086526822480536475 0.96491228070175439


## Predict on test data