In [24]:
import numpy as np
import pandas as pd
import glob
import scipy.misc
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt

In [25]:
import cv2
import os

# Importing movie data

In [26]:
data = pd.read_csv("MovieGenre.csv", encoding="ISO-8859-1")
data.head()

Unnamed: 0,imdbId,Imdb Link,Title,IMDB Score,Genre,Poster
0,114709,http://www.imdb.com/title/tt114709,Toy Story (1995),8.3,Animation|Adventure|Comedy,https://images-na.ssl-images-amazon.com/images...
1,113497,http://www.imdb.com/title/tt113497,Jumanji (1995),6.9,Action|Adventure|Family,https://images-na.ssl-images-amazon.com/images...
2,113228,http://www.imdb.com/title/tt113228,Grumpier Old Men (1995),6.6,Comedy|Romance,https://images-na.ssl-images-amazon.com/images...
3,114885,http://www.imdb.com/title/tt114885,Waiting to Exhale (1995),5.7,Comedy|Drama|Romance,https://images-na.ssl-images-amazon.com/images...
4,113041,http://www.imdb.com/title/tt113041,Father of the Bride Part II (1995),5.9,Comedy|Family|Romance,https://images-na.ssl-images-amazon.com/images...


In [27]:
data.isnull().any()

imdbId        False
Imdb Link     False
Title         False
IMDB Score     True
Genre          True
Poster         True
dtype: bool

In [28]:
data.shape

(40108, 6)

In [30]:
data = data[pd.notnull(data['Genre'])]
data.shape

(39963, 6)

In [31]:
data = data[pd.notnull(data['IMDB Score'])]
data.shape

(39917, 6)

In [32]:
data = data[pd.notnull(data['Poster'])]
data.shape

(39246, 6)

In [33]:
data.isnull().any()

imdbId        False
Imdb Link     False
Title         False
IMDB Score    False
Genre         False
Poster        False
dtype: bool

In [34]:
import re
def convert(k):
    numbers = re.findall('\d+',k)
    results = list(map(int, numbers))
    return(results[0])

# Loading the Posters

In [35]:
path = 'posters/'
image_glob = glob.glob(path + "/" + "*.jpg")
img_dict = {}


def get_id(filename):
    index_s = filename.rfind("/") + 1
    #print(index_s)
    index_f = filename.rfind(".jpg")
    #print(index_f)
    return filename[index_s:index_f]


In [35]:
get_id(image_glob[1])
#cv2.imread('posters/100014.jpg')

'posters\\100014'

In [22]:
for fn in image_glob:
    try:
        img_dict[get_id(fn)] = cv2.imread(fn)
    except:
        pass

In [31]:
def show_img(id):
    title = data[data["imdbId"] == convert(id)]["Title"].values[0]
    genre = data[data["imdbId"] == convert(id)]["Genre"].values[0]
    plt.imshow(img_dict[id])
    plt.title("{} \n {}".format(title, genre))

In [79]:
for k in img_dict:
    print(img_dict[k].shape)
    break

(268, 182, 3)


# Show Examples

In [None]:
show_img('posters\\114709')

# Model

In [72]:
def preprocess(img, size=(150, 101)):
    img = cv2.resize(img, size)
    img = img.astype(np.float32)
    img = (img / 127.5) - 1.
    return img

In [77]:
for k in img_dict:
    size=(150, 101)
    img = preprocess(img_dict[k], size)
    print(img.shape)
    break

(101, 150, 3)


# Function to generate dataset

In [108]:
def prepare_data(data, img_dict, size=(150, 101)):
    #print("Generation dataset...")
    dataset = []
    y = []
    ids = []
    label_dict = {"word2idx": {}, "idx2word": []}
    idx = 0
    genre_per_movie = data["Genre"].apply(lambda x: str(x).split("|"))
    for l in [g for d in genre_per_movie for g in d]:
        if l in label_dict["idx2word"]:
            pass
        else:
            label_dict["idx2word"].append(l)
            label_dict["word2idx"][l] = idx
            idx += 1
    n_classes = len(label_dict["idx2word"])
    #print("identified {} classes".format(n_classes))
    n_samples = len(img_dict)
    #print("got {} samples".format(n_samples))
    for k in img_dict:
        g = data[data["imdbId"] == convert(k)]["Genre"].values[0].split("|")
        img = preprocess(img_dict[k], size)
        #print(img.shape)
        
        #if img.shape != (101, 150, 3):
        #    continue
        #g = data[data["imdbId"] == convert(k)]["Genre"].values[0].split("|")
        l = np.sum([np.eye(n_classes, dtype="uint8")[label_dict["word2idx"][s]] for s in g], axis=0)
        #print(g)
        #print(l)
        
        y.append(l)
        #print(y)
        
        dataset.append(img)
        #break
        ids.append(k)
        #break
    print("DONE")
    return dataset, y, label_dict, ids

SIZE = (150, 101)
prepare_data(data, img_dict, size=SIZE)

# Scale our movie posters to 96×96.

In [None]:
SIZE = (150, 101)
dataset, y, label_dict, ids =  prepare_data(data, img_dict, size=SIZE)

In [11]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D, BatchNormalization

Using TensorFlow backend.


In [12]:
model = Sequential()
model.add(Conv2D(32, kernel_size=(3, 3), activation='relu',
                 input_shape=(SIZE[0], SIZE[1], 3)))
model.add(Conv2D(32, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))

model.add(Conv2D(64, kernel_size=(3, 3), activation='relu'))
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))

model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(29, activation='sigmoid'))

Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [13]:
model.compile(loss='binary_crossentropy',
              optimizer=keras.optimizers.Adam(),
              metrics=['accuracy'])

Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [None]:
n = 10000
model.fit(np.array(dataset[: n]), np.array(y[: n]), batch_size=16, epochs=5,
          verbose=1, validation_split=0.1)