## Movie Poster Neural Network Classification

**Import libraries**

In [None]:
import pandas as pd
import os

In [None]:
import numpy as np

In [None]:
import time
from tqdm import tqdm

In [None]:
import tensorflow as tf
import cv2
from keras.preprocessing import image
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.models import Model
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Activation, Dropout, Flatten, Dense
from keras import backend as K

In [None]:
import matplotlib.pyplot as plt

**Read in cleaned dataset as pandas dataframe**

In [None]:
# Load the cleaned movie table (a pickled pandas DataFrame) from a hard-coded
# local path; the cells below read its Title column and use the remaining
# columns as labels.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
movie = pd.read_pickle('/Users/michael/Documents/UCSB_2017-2018/PSTAT 134/134MHCP/Top6.pkl')

**Only read in image files that match the titles in the cleaned dataset**

In [None]:
# Plain Python list of the movie titles, in the row order of `movie`.
titles = movie["Title"].tolist()

In [None]:
# Number of titles in the cleaned dataset (displayed as the cell output).
len(titles)

We only want to save the image files that are part of our cleaned, processed data set. 

In [None]:
# Load the poster image for every title in the cleaned dataset.
#
# Images are collected in the order of `titles` (the row order of `movie`),
# not in the arbitrary order returned by os.listdir, so that the i-th image
# lines up with the i-th row of any label array built from the same table.
image_dir = "/Users/michael/Documents/UCSB_2017-2018/PSTAT 134/134MHCP/Images/"

# x_titles: the stem of every file in the image directory, i.e. the file name
# minus its last four characters (a dot plus a three-letter extension).
x_titles = []
file_by_title = {}  # stem -> file name, for O(1) lookup per title
for name in os.listdir(image_dir):
    s = name[:-4]
    x_titles.append(s)
    file_by_title[s] = name

x = []
missing_titles = []  # titles with no image file, or a file cv2 could not decode
for title in tqdm(titles):
    name = file_by_title.get(title)
    # cv2.imread returns None (it does not raise) when a file cannot be read.
    img = cv2.imread(image_dir + name) if name is not None else None
    if img is None:
        missing_titles.append(title)
        continue
    x.append(img)

if missing_titles:
    print("Warning: no readable image for %d title(s); the images in x will not "
          "line up with the rows of `movie` until those rows are dropped."
          % len(missing_titles))

In [None]:
## Sanity check: the number of loaded images, len(x), should equal the
## number of titles shown above; otherwise images and labels cannot be paired.
len(x)

**Drop Title column and keep only One-Hot encoding to use as labels**

We do this to save computation time and to keep our multi-label classification simpler.

In [None]:
# Replace the existing index with a fresh 0..n-1 index; drop=True discards the
# old index instead of keeping it as a column.
movie = movie.reset_index(drop = True)

In [None]:
# Preview the first five rows of the table.
movie.head(5)

In [None]:
# Drop the first column of `movie` and keep the remaining columns as the label table.
# NOTE(review): assumes the first column is Title — confirm with the preview above.
df = movie.drop(movie.columns[0], axis=1)

In [None]:
# Preview the first five rows of the label table.
df.head(5)

**Set up x and y to be split up and used for model**

Stack the list of images into a single array: the model needs a 4D array with one entry per image, where each image is itself a 3D array.

In [None]:
# Label matrix as a NumPy array: one row per row of df, one column per column of df.
y = df.values

In [None]:
# Join the list of per-image arrays along a new leading axis.
# np.stack requires every image in the list to have the same shape.
x = np.stack(x)

In [None]:
# Show the shapes of the image array and the label matrix; their first
# dimensions (number of samples) must agree for the split below to work.
print(x.shape)
print(y.shape)

We have 17,444 images and their labels. 268 is the height of the image and 182 is the width of the image in pixels. Each pixel is represented by an array that denotes its BGR values. The 6 is the number of classes we want to classify the data into.

**Split dataset into test and train sets**

In [None]:
# train_test_split lives in sklearn.model_selection (available since
# scikit-learn 0.18). The old sklearn.cross_validation module was removed in
# 0.20, so it is used only as a fallback on very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

In [None]:
# Fixed random_state passed to train_test_split so the split is reproducible.
seed = 9

In [None]:
# Hold out 30% of the samples as the test set; images and labels are split
# with the same shuffling, so each image stays paired with its label row.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3,random_state = seed)

In [None]:
# Sizes the network needs: the shape of a single training image, and the
# number of label columns (one output unit per class).
input_shape, num_class = x_train[0].shape, y_train.shape[1]
print(input_shape, num_class, sep="\n")

**Create 7-layer convolutional neural network**

Binary cross-entropy (`binary_crossentropy`) is our loss function. The model works as follows: a NumPy array containing the image is fed into the model and passed through the layers; the final layer outputs, for each genre, a value between 0 and 1 indicating how likely the model thinks the image belongs to that genre. We used Adam as the optimizer because it is efficient and saves us processing time.

In [None]:
# The network definition below is left commented out: instead of rebuilding and
# retraining it here, a trained copy is loaded from 'model_one_hundred_epochs.h5'
# in a later cell. It is kept as a record of the architecture.
# model = Sequential()
# # layer 1
# model.add(Conv2D(32, (3, 3), input_shape=input_shape))
# model.add(Activation('relu'))
# model.add(MaxPooling2D(pool_size=(2, 2)))
# # layer 2
# model.add(Conv2D(32, (3, 3)))
# model.add(Activation('relu'))
# model.add(MaxPooling2D(pool_size=(2, 2)))
# # layer 3
# model.add(Conv2D(32, (3, 3)))
# model.add(Activation('relu'))
# model.add(MaxPooling2D(pool_size=(2, 2)))
# # layer 4
# model.add(Conv2D(64, (3, 3)))
# model.add(Activation('relu'))
# model.add(MaxPooling2D(pool_size=(2, 2)))
# # layer 5
# model.add(Flatten())
# model.add(Dense(64))
# model.add(Activation('relu'))
# model.add(Dropout(0.5))
# # layer 6
# model.add(Dense(32))
# model.add(Activation('relu'))
# model.add(Dropout(0.5))
# # layer 7
# # One sigmoid unit per class, so each class is scored independently
# # (multi-label), which is why the loss below is binary cross-entropy.
# model.add(Dense(num_class))
# model.add(Activation('sigmoid'))

# model.compile(loss='binary_crossentropy',
#               optimizer='adam',
#               metrics=['accuracy'])

**Run the Model**

In [None]:
# Training call (100 epochs, batches of 32), left commented out because the
# trained model is loaded from disk in a later cell instead.
# model.fit(x_train, y_train,
#           batch_size=32, epochs=100, verbose=1)

**Load saved model**

In [None]:
from keras.models import load_model

In [None]:
# Load the previously trained model from an HDF5 file; the path is relative,
# so the file must be in the notebook's current working directory.
model = load_model('model_one_hundred_epochs.h5')

**Predictions on the test set**

In [None]:
# Print the layer-by-layer summary of the loaded model.
model.summary()

In [None]:
# Run the model on every test image.
# NOTE(review): presumably one row per test image and one score in [0, 1] per
# class (sigmoid outputs) — confirm against model.summary() above.
prediction = model.predict(x_test)

**Example prediction**

In [None]:
# The images were read with cv2.imread, which stores channels in BGR order,
# while matplotlib expects RGB. Reverse the channel axis before displaying;
# otherwise red and blue are swapped in the picture.
plt.imshow(x_test[900][..., ::-1])

In [None]:
# Predicted per-class scores for test sample 900.
print(prediction[900])

In [None]:
# Ground-truth labels for the same test sample (index 900) whose image and
# prediction are shown in this example, so the three outputs are comparable.
print(y_test[900])

**Test Prediction**

In [None]:
# Score the loaded model on the held-out test set, then print the resulting
# values followed by the metric names they correspond to.
evaluate = model.evaluate(x_test, y_test)
print(evaluate, model.metrics_names, sep="\n")

**Confusion Matrix**

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Will hold one list of 0/1 class decisions per test sample.
pred_values = []

In [None]:
# Turn the predicted scores into 0/1 class decisions: a class is marked 1 when
# its score is at least `threshhold`, otherwise 0.
threshhold = .5
is_positive = np.asarray(prediction) >= threshhold

# Rebuild pred_values from scratch (instead of appending to an existing list)
# so that re-running this cell does not duplicate rows.
pred_values = is_positive.astype(int).tolist()

# Number of samples for which no class reached the threshold.
unclassified = int((~is_positive.any(axis=1)).sum())

In [None]:
# Collapse each multi-label 0/1 vector to a single class index so a standard
# confusion matrix can be built. np.argmax returns the index of the first
# maximum, so a row with several 1s maps to its first positive class, and an
# all-zero row (nothing above the threshold) maps to index 0.
y_test_non_cat = list(map(np.argmax, y_test))
y_pred_non_cat = list(map(np.argmax, pred_values))

In [None]:
# Confusion matrix of true vs. predicted class indices (rows = true class,
# columns = predicted class). The class indices are passed explicitly so the
# matrix always has one row and column per label column of y_test, even when
# some class never occurs in either list.
conf_mat = confusion_matrix(y_test_non_cat, y_pred_non_cat,
                            labels=list(range(np.shape(y_test)[1])))

In [None]:
import seaborn

In [None]:
# Tabular view of the confusion matrix. Reuse conf_mat rather than computing
# the matrix a second time, so the table and the heatmap drawn from conf_mat
# always show the same numbers.
df_confusion = pd.DataFrame(conf_mat)

In [None]:
# Display names for the six classes, used to label the confusion matrix.
# NOTE(review): presumably in the same order as the label columns of df — verify.
labels = ['Action','Comedy','Crime','Horror','Romance','Thriller']

In [None]:
# Name the columns (predicted class) after the class names.
df_confusion.columns = labels

In [None]:
# Add a column with an empty-string header holding the class name of each row.
df_confusion[''] = labels

In [None]:
# Reorder the column names so the last column (the row-name column added
# above) comes first.
cols = df_confusion.columns.tolist()
cols = cols[-1:] + cols[:-1]

In [None]:
# Apply the new column order.
df_confusion = df_confusion[cols] 

In [None]:
# Display the labelled confusion-matrix table.
df_confusion

In [None]:
# Display the class names.
labels

In [None]:
# Draw the confusion matrix as a heatmap. confusion_matrix was called with the
# true classes first, so rows (y axis) are true classes and columns (x axis)
# are predicted classes.
conf_heatmap = seaborn.heatmap(conf_mat,xticklabels=True, yticklabels=True)
# Replace the default numeric tick labels with the class names.
conf_heatmap.set_xticklabels(labels, rotation=90)
# A rotation of 360 degrees is a full turn, i.e. the labels are drawn horizontally.
conf_heatmap.set_yticklabels(labels,rotation = 360)
plt.show()