## Deep Learning for Image Classification: Boys vs. Girls Shirts

We try to classify an unseen shirt image as being a "boys" shirt or a "girls" shirt. To do so, we're going to use Convolutional Neural Networks.

But first, we need to download the image files from the eBay website:

In [None]:
from bs4 import BeautifulSoup
import requests
from ipywidgets import IntProgress
from IPython.display import display
import sys
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from skimage import transform, color, img_as_ubyte
from os import listdir

In [None]:
def download_image(url, title, file_name):
    """Download ``url`` and save the response body to ``file_name``.

    Args:
        url: Address of the image to fetch.
        title: Title associated with the image; returned unchanged on success.
        file_name: Path of the file the downloaded bytes are written to.

    Returns:
        ``(title, file_name)`` on success, or ``('', '')`` when the request
        fails or the server answers with an HTTP error status.
    """
    try:
        # A timeout keeps a single stalled connection from hanging the whole scrape.
        response = requests.get(url, timeout=30)
        # Treat 4xx/5xx answers as failures so an error page is never saved as a .jpg.
        response.raise_for_status()
    except requests.RequestException:
        return '', ''
    with open(file_name, "wb") as file:
        file.write(response.content)
    return title, file_name

Create the folders for the images:

In [None]:
import os

# Create the output folders for the downloaded images. Unlike the shell
# command, this is platform-independent and does not fail when the cell is
# re-run and the folders already exist.
os.makedirs('boys', exist_ok=True)
os.makedirs('girls', exist_ok=True)

Download the boys' shirt images:

In [None]:
# Scrape the boys' shirts listing pages and save every image found to
# ./boys/<id>.jpg, recording each saved image's title and numeric id.
boys_url = 'https://il.ebay.com/b/Boys-Short-Sleeve-Sleeve-Tops-T-Shirts-Sizes-4-Up/175521/bn_4278610?rt=nc&LH_ItemCondition=1000&LH_BIN=1&LH_PrefLoc=3&_pgn='
max_pages = 40
boys_items_data = {'title': {}, 'file_id': {}}
f = IntProgress(min = 0, max = max_pages)
display(f)
all_items_counter = 0

# NOTE(review): pages are requested as _pgn=0..max_pages-1; confirm whether the
# site's page numbering is 1-based (page 0 may duplicate page 1).
for page_num in range(max_pages):
    url = boys_url + str(page_num)
    try:
        # The parser name belongs to BeautifulSoup, not to requests: as the
        # second positional argument of requests.get it was sent as query params.
        r = requests.get(url, timeout=30)
    except requests.RequestException:
        # str() is required here: concatenating the int page number raised TypeError.
        print('Stopped at page: ' + str(page_num))
        break
    soup = BeautifulSoup(r.content, "lxml")
    # The first <img> of the page is skipped (presumably not a listing image -- TODO confirm).
    images = soup.find_all('img')[1:]

    for i, img in enumerate(images):
        item_id = all_items_counter + i
        # Use 'data-src' when the tag has it, otherwise fall back to 'src'.
        image_url = img.get('data-src')
        if image_url is None:
            image_url = img.get('src')
        if not image_url:
            # Tag carries no usable image address; nothing to download.
            continue
        title, file_name = download_image(image_url, img.get('alt', ''), './boys/' + str(item_id) + '.jpg')
        if not file_name:
            # Download failed: do not record an id that has no file on disk,
            # otherwise reading the images later fails on the missing file.
            continue
        boys_items_data['title'][item_id] = title
        boys_items_data['file_id'][item_id] = item_id
    all_items_counter += len(images)
    f.value += 1

Download the girls' shirt images:

In [None]:
# Scrape the girls' shirts listing pages and save every image found to
# ./girls/<id>.jpg, recording each saved image's title and numeric id.
girls_url = 'https://il.ebay.com/b/Girls-Short-Sleeve-Sleeve-Tops-T-Shirts-Sizes-4-Up/175529/bn_4741026?rt=nc&LH_ItemCondition=1000&LH_BIN=1&LH_PrefLoc=3&_pgn='
max_pages = 40
girls_items_data = {'title': {}, 'file_id': {}}
f = IntProgress(min = 0, max = max_pages)
display(f)
all_items_counter = 0

# NOTE(review): pages are requested as _pgn=0..max_pages-1; confirm whether the
# site's page numbering is 1-based (page 0 may duplicate page 1).
for page_num in range(max_pages):
    url = girls_url + str(page_num)
    try:
        # The parser name belongs to BeautifulSoup, not to requests: as the
        # second positional argument of requests.get it was sent as query params.
        r = requests.get(url, timeout=30)
    except requests.RequestException:
        # str() is required here: concatenating the int page number raised TypeError.
        print('Stopped at page: ' + str(page_num))
        break
    soup = BeautifulSoup(r.content, "lxml")
    # The first <img> of the page is skipped (presumably not a listing image -- TODO confirm).
    images = soup.find_all('img')[1:]

    for i, img in enumerate(images):
        item_id = all_items_counter + i
        # Use 'data-src' when the tag has it, otherwise fall back to 'src'.
        image_url = img.get('data-src')
        if image_url is None:
            image_url = img.get('src')
        if not image_url:
            # Tag carries no usable image address; nothing to download.
            continue
        title, file_name = download_image(image_url, img.get('alt', ''), './girls/' + str(item_id) + '.jpg')
        if not file_name:
            # Download failed: do not record an id that has no file on disk,
            # otherwise reading the images later fails on the missing file.
            continue
        girls_items_data['title'][item_id] = title
        girls_items_data['file_id'][item_id] = item_id
    all_items_counter += len(images)
    f.value += 1

In [None]:
# Turn the scraped {column: {id: value}} dictionaries into DataFrames
girls_df = pd.DataFrame(girls_items_data)
boys_df = pd.DataFrame(boys_items_data)

from sklearn.model_selection import train_test_split
# Hold out 20% of each class for testing. A fixed random_state makes the split
# identical on every run, so results can be reproduced and compared.
boys_train, boys_test = train_test_split(boys_df, test_size=0.2, random_state=1976)
girls_train, girls_test = train_test_split(girls_df, test_size=0.2, random_state=1976)

## Let's start working on the images

In [None]:
%matplotlib inline


def get_file_list(df, folder, n_sample = None, seed = None):
    """Build the list of '<folder>/<file_id>.jpg' paths for the rows of ``df``.

    When ``n_sample`` is given, only a random sample of that many rows
    (drawn with ``random_state=seed``) is used; otherwise every row is used.
    """
    rows = df if n_sample is None else df.sample(n = n_sample, random_state = seed)
    return ['/'.join((folder, str(file_id) + '.jpg')) for file_id in rows.file_id.values]

def read_image_and_resize(f, w = 100, h = 100):
    """Read an image file and return it as a (1, w, h, 3) unsigned-byte array.

    The image is resized with ``(w, h)`` as the output shape given to
    ``transform.resize``, converted with ``img_as_ubyte``, expanded to three
    channels when it is grayscale, and stripped of any channel past the third.

    Args:
        f: Path of the image file.
        w: First dimension of the resized image.
        h: Second dimension of the resized image.

    Raises:
        ValueError: If the resulting array is not of shape (1, w, h, 3).
    """
    img = plt.imread(f)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        img = transform.resize(img, (w, h), mode='constant')
        img = img_as_ubyte(img)
    # Only 2-D (grayscale) images need expanding; current scikit-image stacks a
    # new channel axis onto whatever it is given, which would corrupt RGB input.
    if img.ndim == 2:
        img = color.gray2rgb(img)
    # Add the leading "sample" axis and drop any channel past the third (e.g. alpha).
    img = img[np.newaxis, :, :, :3]
    # Validate against the requested size rather than a hard-coded 100 x 100.
    if img.shape != (1, w, h, 3):
        raise ValueError(f + str(img.shape))
    return img

def read_images_4d_array(files_list):
    """Read every file in ``files_list`` and join the results along the first axis."""
    per_file_arrays = []
    for path in files_list:
        per_file_arrays.append(read_image_and_resize(path))
    return np.concatenate(per_file_arrays)

def get_images_matrix(df, folder, n = None, seed = 1976):
    """Return ``(images, files_list)`` for the rows of ``df`` found under ``folder``.

    ``files_list`` holds the image paths (optionally a seeded sample of ``n``
    rows) and ``images`` is the array produced by reading those paths.
    """
    paths = get_file_list(df, folder, n, seed)
    return read_images_4d_array(paths), paths

def get_all_pixels(x):
    """Flatten every sample of ``x`` into a single row.

    Args:
        x: Array whose first axis indexes the samples.

    Returns:
        A 2-D array of shape ``(n_samples, values_per_sample)``; a 1-D input
        becomes a single column.
    """
    # np.prod of an empty shape tuple is the float 1.0, which reshape rejects,
    # so the per-sample size is converted to int explicitly.
    values_per_sample = int(np.prod(x.shape[1:]))
    return x.reshape(-1, values_per_sample)

def numpy_array_size_in_bytes(a):
    """Return the number of elements of ``a`` times the size of one element."""
    element_count = a.size
    bytes_per_element = a.itemsize
    return element_count * bytes_per_element

def shape_and_size(x, name):
    """Print the row count, column count and byte size of array ``x``.

    A 1-D array is reported with one column. For arrays with more than two
    dimensions a warning is issued and the second dimension is reported as
    the column count.
    """
    dims = x.shape
    n_rows = dims[0]
    if len(dims) == 1:
        n_cols = 1
    else:
        if len(dims) != 2:
            warnings.warn('Function is meaningful for 1 or 2-D numpy arrays, taking 2nd dimension as n_cols')
        n_cols = dims[1]
    report = (name, n_rows, n_cols, numpy_array_size_in_bytes(x))
    print('%s Shape: %d X %d, Size (bytes): %d' % report)

def conf_matrix(y_true, y_pred):
    """Cross-tabulate true labels (rows) against predicted labels (columns), with totals."""
    axis_titles = {'rownames': ['True'], 'colnames': ['Predicted']}
    table = pd.crosstab(y_true, y_pred, margins=True, **axis_titles)
    return table

def get_final_matrices(n_train = None, n_test = None,
                       folder = 'C:/Users/chen/Downloads/boys girls shirts/'):
    """Build the flattened train/test pixel matrices and their label vectors.

    Images listed in the module-level ``boys_train``, ``boys_test``,
    ``girls_train`` and ``girls_test`` DataFrames are read from the ``boys``
    and ``girls`` sub-folders of ``folder``. Boys are labelled 0 and girls 1;
    within each matrix the boys' rows come first.

    Args:
        n_train: Number of images to sample per class for training
            (``None`` uses all of them).
        n_test: Number of images to sample per class for testing
            (``None`` uses all of them).
        folder: Directory containing the ``boys`` and ``girls`` sub-folders.
            It is concatenated directly with the sub-folder name, so it must
            end with a path separator. The default keeps the previously
            hard-coded location.

    Returns:
        ``(x_train, x_test, y_train, y_test)``.
    """
    def _pixels_and_labels(df, subfolder, n, label):
        # Read one class/split, flatten each image to a row and build its labels.
        images, _ = get_images_matrix(df, folder + subfolder, n)
        labels = np.full(images.shape[0], label, dtype=np.uint8)
        return get_all_pixels(images), labels

    x_boys_train, y_boys_train = _pixels_and_labels(boys_train, 'boys', n_train, 0)
    x_boys_test, y_boys_test = _pixels_and_labels(boys_test, 'boys', n_test, 0)
    x_girls_train, y_girls_train = _pixels_and_labels(girls_train, 'girls', n_train, 1)
    x_girls_test, y_girls_test = _pixels_and_labels(girls_test, 'girls', n_test, 1)

    x_train = np.vstack([x_boys_train, x_girls_train])
    x_test = np.vstack([x_boys_test, x_girls_test])
    y_train = np.concatenate([y_boys_train, y_girls_train])
    y_test = np.concatenate([y_boys_test, y_girls_test])

    return x_train, x_test, y_train, y_test

In [None]:
x_train, x_test, y_train, y_test = get_final_matrices()

# Report the dimensions and byte size of each matrix
for _name, _matrix in [('x_train', x_train), ('x_test', x_test),
                       ('y_train', y_train), ('y_test', y_test)]:
    shape_and_size(_matrix, _name)

Our platform of choice, [Keras](https://keras.io/), accepts `x_train` of type `float`. It's best to convert the pixel values to floats in the 0-1 range.

In [None]:
# Convert the pixel matrices to float32 and divide by 255
x_train = x_train.astype('float32') / 255
x_test = x_test.astype('float32') / 255

Here Keras needs the original 4D shape of the images array, so we `reshape` them to be of dimensions [N images X Height X Width X N channels].
We're using a `Conv2D` layer with 32 filters and a 3x3 kernel, then a second `Conv2D` layer with 64 filters, also with a 3x3 kernel, followed by a `MaxPooling2D` layer with a 2x2 pool size and a 25% `Dropout`. The output is then `Flatten`ed and connected to a `Dense` layer of 128 neurons, followed by a 50% `Dropout` and finally a single neuron with a `sigmoid` activation function.

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout

# Training hyper-parameters
batch_size = 128
epochs = 10
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D

img_rows, img_cols, channels = 100, 100, 3
input_shape = (img_rows, img_cols, channels)

# Restore the 4-D [images x rows x cols x channels] layout of the flattened matrices
x_train = x_train.reshape((x_train.shape[0],) + input_shape)
x_test = x_test.reshape((x_test.shape[0],) + input_shape)

# Two convolutions, pooling and dropout, then a dense head ending in one sigmoid unit
model = Sequential([
    Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=input_shape),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])

model.summary()

In [None]:
# Binary cross-entropy loss, Adam optimizer, accuracy reported as the metric
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# The test matrices double as the per-epoch validation data
held_out = (x_test, y_test)
history = model.fit(x_train, y_train,
                    epochs=epochs,
                    batch_size=batch_size,
                    shuffle=True,
                    verbose=1,
                    validation_data=held_out)

# evaluate() returns the loss followed by the compiled metric
score = model.evaluate(held_out[0], held_out[1], verbose=1)
test_loss, test_accuracy = score[0], score[1]

print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)

We got a test accuracy of ~84%

In [None]:
# Plot training vs. held-out accuracy per epoch.
# Keras releases differ in the history key used for the accuracy metric
# ('acc' in older ones, 'accuracy' in newer ones), so use whichever is present
# instead of failing with a KeyError.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'
plt.plot(history.history[acc_key])
plt.plot(history.history['val_' + acc_key])
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc = 'upper left')
plt.show()