## Simpsons characters classification 

In this notebook we try to classify images of different Simpsons characters. The characters are 'abraham_grampa_simpson', 'apu_nahasapeemapetilon', 'bart_simpson', 'charles_montgomery_burns', 'chief_wiggum', 'homer_simpson', 'krusty_the_clown', 'lisa_simpson', 'marge_simpson', 'milhouse_van_houten', 'moe_szyslak', 'ned_flanders', 'principal_skinner' and 'sideshow_bob'.
This dataset was preprocessed in another notebook: it is split into a training, validation and test set, the images are resized to 80x80 pixels, and every character has more than 600 images in total. The whole dataset in its original size can be found here: https://www.kaggle.com/alexattia/the-simpsons-characters-dataset. 

#### Imports

In [None]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
from tqdm.notebook import tqdm
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

In [None]:
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten
from tensorflow.keras.layers import Conv2D, MaxPooling2D,GlobalAveragePooling2D,GlobalMaxPooling2D, BatchNormalization
from tensorflow.keras.utils import to_categorical


####  Google Colab: Import Data from Google Drive to Colab



Open this link in your browser: https://drive.google.com/drive/folders/1ymYg4DUj1ft-kUbuU9rW0bNHAttW44Zv?usp=sharing, then:

Option 1:

- download data.zip from the shared folder above to your local hard disk
- then copy it into your google drive

Option 2:

- in the shared folder above, open the drop-down menu next to the folder name "CAS_MAIN_DL"
- select "Add shortcut to Drive"

Depending on whether you chose option 1 or option 2, adapt the `ddir` variable in the code cell below.

In [None]:
from google.colab import drive
import os,zipfile,io

# Mount your Google Drive at /content/drive.
drive.mount('/content/drive')

# Directory inside your Google Drive that contains data.zip.
# Option 1 (your own copy of the file): point ddir at the folder you copied it to, e.g.
#ddir="/content/drive/MyDrive/Shared/ZHAW/CAS_MAIN_DL"

# Option 2 (shared folder added via "add shortcut to drive"):
ddir="/content/drive/MyDrive/CAS_MAIN_DL"

# Extract the archive into the current working directory. The context manager
# closes the archive handle even if extraction raises.
with zipfile.ZipFile(os.path.join(ddir,"data.zip"), "r") as zf:
    zf.extractall()

#### Load Data

In [None]:
# Folder holding the preprocessed arrays, relative to the working directory.
path = "./data/simpson_data"


def _load_split(split_name):
    """Load the image array and the label array saved for one split under `path`."""
    images = np.load(os.path.join(path, "X_" + split_name + ".npy"))
    targets = np.load(os.path.join(path, "Y_" + split_name + ".npy"))
    return images, targets


Data = pd.read_csv(os.path.join(path, "Data.csv"))
X_train, Y_train = _load_split("train")
X_val, Y_val = _load_split("val")
X_test, Y_test = _load_split("test")

# Distinct values of the "label" column.
# NOTE(review): later cells index `labels` by class id, which assumes this order
# matches the column order of the Y_* arrays - confirm against the preprocessing notebook.
labels = Data["label"].unique()

# Shapes of the image arrays first, then of the label arrays.
for split_array in (X_train, X_val, X_test, Y_train, Y_val, Y_test):
    print(split_array.shape)

Let's use the training set to plot a random image of each character. You can see that the characters are easily recognizable and that all images have the same size.

In [None]:
# Draw one randomly chosen training image for every class on a 4x4 grid.
train_class_ids = np.argmax(Y_train, axis=1)  # class index of each one-hot row

plt.figure(figsize=(15, 15))
for class_id in range(len(np.unique(train_class_ids))):
    candidates = np.where(train_class_ids == class_id)[0]
    picked = np.random.choice(candidates, 1)  # length-1 index array
    plt.subplot(4, 4, class_id + 1)
    plt.imshow(X_train[picked][0, :, :, :])
    plt.title(labels[class_id])

In this cell we plot the label distribution of all three sets. You can clearly see that the label distribution is very similar across the sets. The biggest class in the training set is Homer and the smallest class is Apu.

In [None]:
# Bar chart of the number of images per class, one panel per split.
plt.figure(figsize=(14, 4))
split_targets = (("train", Y_train), ("val", Y_val), ("test", Y_test))
for panel, (split_name, one_hot) in enumerate(split_targets, start=1):
    class_ids, class_counts = np.unique(np.argmax(one_hot, axis=1), return_counts=True)
    plt.subplot(1, 3, panel)
    plt.bar(class_ids, class_counts, tick_label=labels)
    plt.xticks(rotation=90)
    plt.title(split_name + " distribution")
plt.show()

### CNN

Now we rescale the pixel values to the range [-1, 1] and use a CNN to assign each image to the correct Simpsons character.

In [None]:
def _scale_to_pm_one(images):
    """Return `images` as float32 with pixel values mapped linearly from 0..255 to -1..1.

    An array whose maximum is already <= 1 is treated as previously scaled and is
    returned unchanged (as float32). This keeps the cell idempotent: re-running it
    no longer scales the same data a second time.

    NOTE(review): assumes the raw arrays hold 0..255 pixel intensities - confirm
    against the preprocessing notebook.
    """
    images = np.array(images, dtype="float32")
    if images.size == 0 or images.max() <= 1.0:
        return images
    return ((images / 255) - 0.5) * 2


X_train = _scale_to_pm_one(X_train)
X_val = _scale_to_pm_one(X_val)
X_test = _scale_to_pm_one(X_test)

In [None]:
# Four convolutional stages (two 3x3 ReLU convolutions followed by 2x2 max pooling),
# then three fully connected ReLU layers and a 14-way softmax output.
conv_options = dict(activation="relu", padding="same")

cnn_layers = []
for stage, n_filters in enumerate((32, 64, 64, 128)):
    # Only the very first layer declares the 80x80 RGB input shape.
    first_layer_options = {"input_shape": (80, 80, 3)} if stage == 0 else {}
    cnn_layers.append(Conv2D(n_filters, (3, 3), **conv_options, **first_layer_options))
    cnn_layers.append(Conv2D(n_filters, (3, 3), **conv_options))
    cnn_layers.append(MaxPooling2D((2, 2)))

cnn_layers.append(Flatten())
for n_units in (500, 300, 100):
    cnn_layers.append(Dense(n_units))
    cnn_layers.append(Activation('relu'))

# One output unit per class.
cnn_layers.append(Dense(14))
cnn_layers.append(Activation('softmax'))

model = Sequential(cnn_layers)

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()

In [None]:
# Random augmentation applied to every batch: rotation, shifts, shear, zoom and
# horizontal flips. Pixels exposed by a transform are filled with a constant.
# The images passed to this generator have been rescaled to [-1, 1] in the
# normalization cell above, so a white border is 1.0 on that scale. The previous
# value of 255 was far outside the input range seen at validation/test time.
datagen = tf.keras.preprocessing.image.ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    fill_mode="constant",
    cval=1.0,
    horizontal_flip=True)

In [None]:
def _to_displayable(image):
    """Map an image from the [-1, 1] training range to [0, 1] for plt.imshow.

    imshow clips float RGB data to [0, 1], so showing the normalized arrays
    directly turns every negative value black. Values outside [-1, 1] (for
    example a constant fill value) are clipped after rescaling.
    """
    return np.clip((image + 1) / 2, 0, 1)


# Show one training image and 25 random augmentations of it.
i=22
data_aug = datagen.flow(x=X_train[i:(i+1)], y=Y_train[i:(i+1)], batch_size=1)
print(np.min(X_train[i]),np.max(X_train[i]))
plt.imshow(_to_displayable(X_train[i]))
plt.show()
plt.figure(figsize=(15,15))
# Separate loop variable so the selected image index `i` is not overwritten.
for k in range(0,25):
  plt.subplot(5,5,k+1)
  x_aug,y_aug=next(data_aug)
  plt.imshow(_to_displayable(x_aug[0,:,:,:]))

In [None]:
# Train on augmented batches and validate on the un-augmented validation set.
batch_size = 32
# steps_per_epoch must be a whole number of batches; round up so the last,
# partially filled batch is still used in every epoch.
steps_per_epoch = int(np.ceil(len(X_train) / batch_size))

history = model.fit(datagen.flow(X_train, Y_train, batch_size=batch_size),
                    steps_per_epoch=steps_per_epoch,
                    epochs=150,
                    validation_data=(X_val, Y_val),
                    verbose=2)


In [None]:
# Training vs. validation curves, one figure per metric:
# (history key, figure title, legend position).
curve_specs = (
    ("accuracy", "model accuracy", "lower right"),
    ("loss", "model loss", "upper right"),
)
for metric, title, legend_loc in curve_specs:
    plt.plot(history.history[metric])
    plt.plot(history.history["val_" + metric])
    plt.title(title)
    plt.ylabel(metric)
    plt.xlabel("epoch")
    plt.legend(["train", "valid"], loc=legend_loc)
    plt.show()

In [None]:
# Persist the trained network next to the dataset files.
model_file = "./data/simpson_data/simpsons_cnn.h5"
model.save(model_file)

In [None]:
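# Optional: uncomment the next line to load a saved reference model
# (expects simpsons_cnn_ref.h5 in ./data/simpson_data) instead of using the model trained above.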
# model=load_model("./data/simpson_data/simpsons_cnn_ref.h5")

### Evaluate
Let's check the overall accuracy and the accuracy per class.

In [None]:
# Overall test accuracy: share of images whose most probable class equals the true class.
test_pred_ids = np.argmax(model.predict(X_test), axis=1)
test_true_ids = np.argmax(Y_test, axis=1)
acc = np.average(test_pred_ids == test_true_ids)
res = pd.DataFrame({'Acc' : acc}, index=['CNN'])
res

In [None]:
# Per-class test accuracy. The network is evaluated once and the predictions
# are reused for every class, instead of re-running inference in each iteration.
cnn_pred_ids = np.argmax(model.predict(X_test), axis=1)
cnn_true_ids = np.argmax(Y_test, axis=1)
for i in range(0, len(labels)):
  # Fraction of the images that truly belong to class i and were predicted as class i.
  print(labels[i], np.average(cnn_pred_ids[np.where(cnn_true_ids == i)] == i))

Now let us apply a method called test-time augmentation (TTA): each test image is predicted several times, each time under a different random augmentation. The final prediction for that image is the average over all these runs.

In [None]:
# Test-time augmentation: predict `aug_size` randomly augmented copies of each
# test image and average the class probabilities.
aug_size = 40
# Take the sizes from the model and the data instead of hard-coding 14 classes
# and 80x80x3 images.
n_classes = model.output_shape[-1]
image_shape = X_test.shape[1:]

tta_pred = np.zeros((len(X_test), n_classes))
for i in tqdm(range(0, len(X_test))):
  # float32 buffer to match the dtype of the normalized inputs.
  tmp = np.zeros((aug_size,) + image_shape, dtype="float32")
  data_aug = datagen.flow(x=X_test[i:(i+1)], y=Y_test[i:(i+1)], batch_size=1)
  for j in range(0, aug_size):
    # Each draw is a batch of one augmented image; the label is not needed.
    tmp[j], _ = next(data_aug)
  # verbose=0: the tqdm bar already reports progress, no per-image output needed.
  tta_pred[i] = np.average(model.predict(tmp, verbose=0), axis=0)

In [None]:
# Overall test accuracy of the averaged test-time-augmentation predictions.
tta_hits = np.argmax(tta_pred, axis=1) == np.argmax(Y_test, axis=1)
acc = np.average(tta_hits)
res = pd.DataFrame({'Acc' : acc}, index=['CNN_tta'])
res


In [None]:
# Per-class accuracy of the test-time-augmentation predictions.
tta_pred_ids = np.argmax(tta_pred, axis=1)
tta_true_ids = np.argmax(Y_test, axis=1)
for class_id, class_name in enumerate(labels):
    in_class = np.where(tta_true_ids == class_id)
    print(class_name, np.average(tta_pred_ids[in_class] == class_id))