In [None]:
import dill
dill.load_session('notebook_env.db')

# Loading/Preprocessing the Data

In [1]:
import matplotlib.pyplot as plt
import matplotlib.image as img
import numpy as np
from scipy.misc import imresize

%matplotlib inline

import os
from os import listdir
from os.path import isfile, join
import shutil
import stat
import collections
from collections import defaultdict

from ipywidgets import interact, interactive, fixed
import ipywidgets as widgets

import h5py
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.applications.inception_v3 import preprocess_input
from keras.models import load_model

Using TensorFlow backend.


In [5]:
root_dir = 'food-101/images/'
rows = 17
cols = 6
fig, ax = plt.subplots(rows, cols, frameon=False, figsize=(15, 25))
fig.suptitle('Random Image from Each Food Class', fontsize=20)
sorted_food_dirs = sorted(os.listdir(root_dir))
for i in range(rows):
    for j in range(cols):
        try:
            food_dir = sorted_food_dirs[i*cols + j]
        except:
            break
        all_files = os.listdir(os.path.join(root_dir, food_dir))
        rand_img = np.random.choice(all_files)
        img = plt.imread(os.path.join(root_dir, food_dir, rand_img))
        ax[i][j].imshow(img)
        ec = (0, .6, .1)
        fc = (0, .7, .2)
        ax[i][j].text(0, -20, food_dir, size=10, rotation=0,
                ha="left", va="top", 
                bbox=dict(boxstyle="round", ec=ec, fc=fc))
plt.setp(ax, xticks=[], yticks=[])
plt.tight_layout(rect=[0, 0.03, 1, 0.95])

Error in callback <function flush_figures at 0x7fa516dbfea0> (for post_execute):


KeyboardInterrupt: 

In [2]:
# Setup multiprocessing pool
# Do this early, as once images are loaded into memory there will be Errno 12
# http://stackoverflow.com/questions/14749897/python-multiprocessing-memory-usage
import multiprocessing as mp

num_processes = 6
pool = mp.Pool(processes=num_processes)

Process ForkPoolWorker-4:
Process ForkPoolWorker-6:
Process ForkPoolWorker-3:
Process ForkPoolWorker-2:
Process ForkPoolWorker-5:
Traceback (most recent call last):
Traceback (most recent call last):
Process ForkPoolWorker-1:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/usr/lib/python3.5/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/usr/lib/python3.5/multiprocessing/queues.py", lin

In [3]:
class_to_ix = {}
ix_to_class = {}
with open('food-101/meta/classes.txt', 'r') as txt:
    classes = [l.strip() for l in txt.readlines()]
    class_to_ix = dict(zip(classes, range(len(classes))))
    ix_to_class = dict(zip(range(len(classes)), classes))
    class_to_ix = {v: k for k, v in ix_to_class.items()}
sorted_class_to_ix = collections.OrderedDict(sorted(class_to_ix.items()))

In [3]:
# Only split files if haven't already
if not os.path.isdir('./food-101/test') and not os.path.isdir('./food-101/train'):

    def copytree(src, dst, symlinks = False, ignore = None):
        if not os.path.exists(dst):
            os.makedirs(dst)
            shutil.copystat(src, dst)
        lst = os.listdir(src)
        if ignore:
            excl = ignore(src, lst)
            lst = [x for x in lst if x not in excl]
        for item in lst:
            s = os.path.join(src, item)
            d = os.path.join(dst, item)
            if symlinks and os.path.islink(s):
                if os.path.lexists(d):
                    os.remove(d)
                os.symlink(os.readlink(s), d)
                try:
                    st = os.lstat(s)
                    mode = stat.S_IMODE(st.st_mode)
                    os.lchmod(d, mode)
                except:
                    pass # lchmod not available
            elif os.path.isdir(s):
                copytree(s, d, symlinks, ignore)
            else:
                shutil.copy2(s, d)

    def generate_dir_file_map(path):
        dir_files = defaultdict(list)
        with open(path, 'r') as txt:
            files = [l.strip() for l in txt.readlines()]
            for f in files:
                dir_name, id = f.split('/')
                dir_files[dir_name].append(id + '.jpg')
        return dir_files

    train_dir_files = generate_dir_file_map('food-101/meta/train.txt')
    test_dir_files = generate_dir_file_map('food-101/meta/test.txt')


    def ignore_train(d, filenames):
        print(d)
        subdir = d.split('/')[-1]
        to_ignore = train_dir_files[subdir]
        return to_ignore

    def ignore_test(d, filenames):
        print(d)
        subdir = d.split('/')[-1]
        to_ignore = test_dir_files[subdir]
        return to_ignore

    copytree('food-101/images', 'food-101/test', ignore=ignore_train)
    copytree('food-101/images', 'food-101/train', ignore=ignore_test)
    
else:
    print('Train/Test files already copied into separate folders.')

Train/Test files already copied into separate folders.


In [4]:
%%time
import imageio
# Load dataset images and resize to meet minimum width and height pixel size
def load_images(root, min_side=299):
    all_imgs = []
    all_classes = []
    resize_count = 0
    invalid_count = 0
    for i, subdir in enumerate(listdir(root)):
        if subdir in ["spaghetti_carbonara", "spaghetti_bolognese"]:
            imgs = listdir(join(root, subdir))
            class_ix = class_to_ix[subdir]
            print(i, class_ix, subdir)
            for img_name in imgs:
                img_arr = imageio.imread(join(root, subdir, img_name))
                img_arr_rs = img_arr
                try:
                    w, h, _ = img_arr.shape
                    if w < min_side:
                        wpercent = (min_side/float(w))
                        hsize = int((float(h)*float(wpercent)))
                        #print('new dims:', min_side, hsize)
                        img_arr_rs = skimage.transform.resize(img_arr, (min_side, hsize))
                        resize_count += 1
                    if h < min_side:
                        hpercent = (min_side/float(h))
                        wsize = int((float(w)*float(hpercent)))
                        #print('new dims:', wsize, min_side)
                        img_arr_rs = skimage.transform.resize(img_arr, (wsize, min_side))
                        resize_count += 1
                    all_imgs.append(img_arr_rs)
                    all_classes.append(class_ix)
                except:
                    print('Skipping bad image: ', subdir, img_name)
                    invalid_count += 1
    print(len(all_imgs), 'images loaded')
    print(resize_count, 'images resized')
    print(invalid_count, 'images skipped')
    return np.array(all_imgs), np.array(all_classes)
    
X_test, y_test = load_images('food-101/test', min_side=299)

9 91 spaghetti_carbonara
Skipping bad image:  spaghetti_carbonara 2333067.jpg
Skipping bad image:  spaghetti_carbonara 2882129.jpg


  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))


Skipping bad image:  spaghetti_carbonara 2957999.jpg
Skipping bad image:  spaghetti_carbonara 906675.jpg
Skipping bad image:  spaghetti_carbonara 1900436.jpg
Skipping bad image:  spaghetti_carbonara 1566710.jpg
Skipping bad image:  spaghetti_carbonara 3910421.jpg
Skipping bad image:  spaghetti_carbonara 1522638.jpg
46 90 spaghetti_bolognese
Skipping bad image:  spaghetti_bolognese 501894.jpg
Skipping bad image:  spaghetti_bolognese 2002026.jpg
Skipping bad image:  spaghetti_bolognese 3510210.jpg
Skipping bad image:  spaghetti_bolognese 1660402.jpg
Skipping bad image:  spaghetti_bolognese 2068959.jpg


  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))


Skipping bad image:  spaghetti_bolognese 1906824.jpg
486 images loaded
0 images resized
14 images skipped
CPU times: user 1.96 s, sys: 532 ms, total: 2.49 s
Wall time: 4.29 s


In [5]:
%%time
X_train, y_train = load_images('food-101/train', min_side=299)

9 91 spaghetti_carbonara
Skipping bad image:  spaghetti_carbonara 3210760.jpg


  " Skipping tag %s" % (size, len(data), tag))


Skipping bad image:  spaghetti_carbonara 59935.jpg


  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))


Skipping bad image:  spaghetti_carbonara 2811977.jpg


  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))


Skipping bad image:  spaghetti_carbonara 1124356.jpg
Skipping bad image:  spaghetti_carbonara 2829676.jpg
Skipping bad image:  spaghetti_carbonara 2978646.jpg
Skipping bad image:  spaghetti_carbonara 2930419.jpg


  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))


46 90 spaghetti_bolognese


  " Skipping tag %s" % (size, len(data), tag))


Skipping bad image:  spaghetti_bolognese 2142734.jpg


  " Skipping tag %s" % (size, len(data), tag))


Skipping bad image:  spaghetti_bolognese 2162105.jpg
Skipping bad image:  spaghetti_bolognese 1816344.jpg
Skipping bad image:  spaghetti_bolognese 3353469.jpg
Skipping bad image:  spaghetti_bolognese 2028065.jpg
Skipping bad image:  spaghetti_bolognese 3888024.jpg
Skipping bad image:  spaghetti_bolognese 385901.jpg
Skipping bad image:  spaghetti_bolognese 3646945.jpg
Skipping bad image:  spaghetti_bolognese 3669898.jpg
Skipping bad image:  spaghetti_bolognese 2568882.jpg
Skipping bad image:  spaghetti_bolognese 2508819.jpg
Skipping bad image:  spaghetti_bolognese 1680506.jpg


  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))


Skipping bad image:  spaghetti_bolognese 3078001.jpg
Skipping bad image:  spaghetti_bolognese 936987.jpg
Skipping bad image:  spaghetti_bolognese 3867365.jpg
Skipping bad image:  spaghetti_bolognese 3336806.jpg
Skipping bad image:  spaghetti_bolognese 2549112.jpg
Skipping bad image:  spaghetti_bolognese 1924410.jpg
Skipping bad image:  spaghetti_bolognese 3906894.jpg
Skipping bad image:  spaghetti_bolognese 2605748.jpg
Skipping bad image:  spaghetti_bolognese 3203671.jpg
Skipping bad image:  spaghetti_bolognese 1819458.jpg


  " Skipping tag %s" % (size, len(data), tag))
  " Skipping tag %s" % (size, len(data), tag))


Skipping bad image:  spaghetti_bolognese 1637424.jpg
Skipping bad image:  spaghetti_bolognese 2039819.jpg
Skipping bad image:  spaghetti_bolognese 3906899.jpg
1468 images loaded
0 images resized
32 images skipped
CPU times: user 4.79 s, sys: 1.2 s, total: 5.99 s
Wall time: 7.65 s


In [6]:
print('X_train shape', X_train.shape)
print('y_train shape', y_train.shape)
print('X_test shape', X_test.shape)
print('y_test shape', y_test.shape)

X_train shape (1468,)
y_train shape (1468,)
X_test shape (486,)
y_test shape (486,)


# Random Forest Classifier

In [7]:
# extracting feature: average color
import cv2

def get_avg_colors(data):
    avg_colors = []

    for img in data:
        avg_color_per_row = np.average(img, axis=0)
        avg_color = np.average(avg_color_per_row, axis=0)
        avg_colors.append(avg_color)

    f_avg_colors = np.array(avg_colors)
    f_avg_colors.shape
    return f_avg_colors

In [12]:
# extracting feature: dominant color
import cv2

def get_dom_colors(data):
    dom_colors = []
    
    k = 0 
    for img in data:
        pixels = np.float32(img.reshape(-1, 3))

        n_colors = 5
        criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 200, .1)
        flags = cv2.KMEANS_RANDOM_CENTERS

        _, labels, palette = cv2.kmeans(pixels, n_colors, None, criteria, 10, flags)
        _, counts = np.unique(labels, return_counts=True)
        dominant = palette[np.argmax(counts)]

        dom_colors.append(dominant)
        
        if k % 50 == 0:
            print(k, " getting dom colors")
        k+=1
    
    f_dom_colors = np.array(dom_colors)
    f_dom_colors.shape
    return f_dom_colors

In [9]:
# extracting feature: mean pixel value
%matplotlib inline

def get_img_mpv(image):
    feature_matrix = []
    for i in range(229):
        a = []
        for j in range(229):
            a.append(((int(image[i,j,0]) + int(image[i,j,1]) + int(image[i,j,2]))/3))
        feature_matrix.append(a)
    return feature_matrix


In [17]:
# putting all features into an array of features

def get_features(data, data_size):
    all_feat = []
    avg_col = get_avg_colors(data)
    dom_col = get_dom_colors(data)
    
    i = 0
    for img in data:
        all_feat.append([avg_col[i], dom_col[i], get_img_mpv(img)])
        i += 1
    
    features = np.array(all_feat)
    features.shape
    return features


In [21]:
X_train_feat = get_features(X_train, 1468)
print(X_train_feat.shape)

(1468, 3)


In [3]:
X_test_feat = get_features(X_test, 486)
print(X_test_feat.shape)

NameError: name 'X_test_feat' is not defined

In [None]:
from sklearn.ensemble import RandomForestClassifier
for i in xrange(5):
    Xtrain=X_train_feat[i][0]
    Ytrain=[]
    clf=RandomForestClassifier(n_estimators=100, max_depth=2,
                             random_state=0)
    clf.fit(Xtrain,Ytrain)

In [2]:
# Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=100, max_depth=2,
                             random_state=0)

clf.fit(X_train_feat, y_train)  


print(clf.feature_importances_)

print(clf.predict(X_test_feat))


  from numpy.core.umath_tests import inner1d


NameError: name 'X_train_feat' is not defined

In [27]:
result = clf.predict(X_test_feat)

def accuracy(result):
    correct = 0
    
    for i in result:
        if i == y_train[i]:
            correct += 1

    accuracy = correct/(len(result))
    print(accuracy)

accuracy(result)

0.588477366255144


In [None]:
import dill
dill.dump_session('notebook_env.db')