## Simpsons characters classification 

In this notebook we try to classify images of different Simpsons characters. The characters are 'abraham_grampa_simpson', 'apu_nahasapeemapetilon', 'bart_simpson', 'charles_montgomery_burns', 'chief_wiggum', 'homer_simpson', 'krusty_the_clown', 'lisa_simpson', 'marge_simpson', 'milhouse_van_houten', 'moe_szyslak', 'ned_flanders', 'principal_skinner' and 'sideshow_bob'.
The dataset was preprocessed in another notebook: it is split into a training, a validation and a test set, the images are resized to 80x80 pixels, and every character has more than 600 images in total. The whole dataset in its original size can be found here: https://www.kaggle.com/alexattia/the-simpsons-characters-dataset.

#### Imports

In [None]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
from tqdm.notebook import tqdm
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

In [None]:
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten
from tensorflow.keras.layers import Conv2D, MaxPooling2D,GlobalAveragePooling2D,GlobalMaxPooling2D, BatchNormalization
from tensorflow.keras.utils import to_categorical


####  Setup

In [None]:
import os,sys

if "google.colab" in sys.modules:
    %pip install wget
    
import wget,zipfile

if "labsetup_run" not in locals() or labsetup_run:

    print("running setup ...")

    #if "google.colab" in sys.modules:
    #    print("colab")
    #else:
    #    print("local")

    # download data.zip from shared google drive
    if not(os.path.isfile("data.zip")): 
        filename=wget.download("https://drive.google.com/uc?export=download&confirm=yes&id=1dkSV2oL8Ua1SDmzVvtGkyQ0LGQ6VpUIy")
    # unpack it
    if not(os.path.isdir("./data")):
        zf = zipfile.ZipFile(os.path.join(".","data.zip"), "r")
        zf.extractall()
                          
    # allow "hot-reloading" of modules
    %load_ext autoreload
    %autoreload 2
    # needed for inline plots in some contexts
    %matplotlib inline

    print("done.")
    labsetup_run = True  # change to True re-run setup
else:
    print("setup already run.")
    

#### Load Data

In [None]:
# Directory holding the preprocessed splits and the label table.
path = "./data/simpson_data"

Data = pd.read_csv(os.path.join(path, "Data.csv"))

# Each split is stored as a pair of .npy files: images (X_*) and labels (Y_*).
X_train, Y_train = (np.load(os.path.join(path, fname)) for fname in ("X_train.npy", "Y_train.npy"))
X_val, Y_val = (np.load(os.path.join(path, fname)) for fname in ("X_val.npy", "Y_val.npy"))
X_test, Y_test = (np.load(os.path.join(path, fname)) for fname in ("X_test.npy", "Y_test.npy"))

# Character names in order of first appearance in the "label" column.
# NOTE(review): later cells index `labels` with the class index - confirm the orders agree.
labels = Data["label"].unique()

# Quick sanity check of the array shapes: first all image arrays, then all label arrays.
for split in (X_train, X_val, X_test, Y_train, Y_val, Y_test):
    print(split.shape)

Let's use the training set to plot a random image of each character. You can see that the characters are easily recognizable and that all images have the same size.

In [None]:
# Class index of every training image, computed once instead of once per loop
# iteration (Y_train is reduced with argmax over axis 1).
train_class_idx = np.argmax(Y_train, axis=1)
# Iterate over the class ids that actually occur, so a class without training
# images cannot lead to sampling from an empty index array.
present_classes = np.unique(train_class_idx)

# Grid with 4 columns and as many rows as needed for the classes present.
n_cols = 4
n_rows = (len(present_classes) + n_cols - 1) // n_cols

plt.figure(figsize=(15,15))
for pos, class_id in enumerate(present_classes):
    # pick one random training image of this class
    sample_idx = np.random.choice(np.where(train_class_idx == class_id)[0])
    plt.subplot(n_rows, n_cols, pos + 1)
    plt.imshow(X_train[sample_idx])
    plt.title(labels[class_id])

In this cell we plot the label distribution of all sets. You can clearly see that the label distribution is very similar in all sets. The biggest class in the training set is Homer and the smallest class is Apu.

In [None]:
# One bar chart per split, side by side: number of images per class.
plt.figure(figsize=(14,4))
panels = (
    (Y_train, "train distribution"),
    (Y_val, "val distribution"),
    (Y_test, "test distribution"),
)
for panel_no, (onehot, panel_title) in enumerate(panels, start=1):
    # class ids present in this split and how often each one occurs
    class_ids, class_counts = np.unique(np.argmax(onehot, axis=1), return_counts=True)
    plt.subplot(1, 3, panel_no)
    plt.bar(class_ids, class_counts, tick_label=labels)
    plt.xticks(rotation=90)
    plt.title(panel_title)
plt.show()

### CNN

Now we normalize the data and use a CNN to classify each image as the right Simpsons character.

In [None]:
def _rescale(pixels):
    """Cast to float32 and apply the linear map x -> (x / 255 - 0.5) * 2.

    NOTE(review): this maps values in 0..255 onto -1..1; presumably the arrays
    hold 8-bit pixel values - confirm against the preprocessing notebook.
    """
    as_float = np.array(pixels, dtype="float32")
    return ((as_float / 255) - 0.5) * 2


# Apply the same scaling to every split.
X_train = _rescale(X_train)
X_val = _rescale(X_val)
X_test = _rescale(X_test)

In [None]:
# Baseline CNN: two 3x3 convolutions with 16 filters each, one 2x2 max-pooling
# step, then two dense layers (500 and 100 units) and a 14-way softmax output.
model = Sequential([
    Conv2D(16, (3,3), activation="relu", padding="same", input_shape=(80,80,3)),
    Conv2D(16, (3,3), activation="relu", padding="same"),
    MaxPooling2D((2,2)),

    Flatten(),
    Dense(500),
    Activation('relu'),
    Dense(100),
    Activation('relu'),

    Dense(14),
    Activation('softmax'),
])

model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

model.summary()

In [None]:
# Train for 10 epochs with mini-batches of 128 images; the validation split is
# evaluated after every epoch and the per-epoch metrics are kept in `history`.
history = model.fit(
    X_train,
    Y_train,
    batch_size=128,
    epochs=10,
    verbose=2,
    validation_data=(X_val, Y_val),
)


In [None]:
# One figure per metric, each showing the training and the validation curve
# over the epochs: (train key, validation key, title, legend position).
curves = (
    ('accuracy', 'val_accuracy', 'model accuracy', 'lower right'),
    ('loss', 'val_loss', 'model loss', 'upper right'),
)
for train_key, val_key, fig_title, legend_loc in curves:
    plt.plot(history.history[train_key])
    plt.plot(history.history[val_key])
    plt.title(fig_title)
    plt.ylabel(train_key)
    plt.xlabel('epoch')
    plt.legend(['train', 'valid'], loc=legend_loc)
    plt.show()

### Evaluate
Let's check the overall accuracy and the accuracy per class.


In [None]:
# Overall test accuracy: fraction of test images whose predicted class
# (argmax of the model output) equals the true class (argmax of Y_test).
test_pred = np.argmax(model.predict(X_test), axis=1)
test_true = np.argmax(Y_test, axis=1)
acc = np.average(test_pred == test_true)

# Result table with one row per model, so further models can be compared later.
res = pd.DataFrame({'Acc' : acc}, index=['CNN'])
res

In [None]:
# Per-class accuracy: for every character, the fraction of its test images
# that the model assigns to the correct class.
pred = np.argmax(model.predict(X_test), axis=1)
true_class = np.argmax(Y_test, axis=1)
for i, name in enumerate(labels):
    in_class = true_class == i  # test images whose true class is i
    print(name, np.average(pred[in_class] == i))

### Now it's your turn



*   Try to fight the overfitting.
*   Try to improve the performance on the test set with a different model.  
*   *Hints: You may want to use a deeper CNN or transfer learning. Data augmentation might improve the performance, and dropout could help to fight the overfitting.*


*   Try to beat 95% overall accuracy ;-)
