In [1]:
import pandas as pd
from ast import literal_eval
from collections import Counter

# 0. Data preparation

## 0.0 Download and upload

In [2]:
input_folder_path='../input_data/'

In [3]:
output_folder_path = '../trained_model/'

In [7]:
input_file_name = 'movies_metadata.csv'

In [8]:
# Load only the four columns used in this notebook. `genres` is parsed from
# its string form into Python objects with literal_eval; `id` is read as str
# here and only cast to int further down, after the title-less rows are dropped.
movies = pd.read_csv(input_folder_path+input_file_name, 
                     usecols= ['id','title', 'overview', 'genres'],
                     dtype={"id": str, "title": str, "overview": str},
                     converters={"genres": literal_eval})

In [9]:
movies.shape

(45466, 4)

## 0.1 Filtering

In [10]:
movies[movies.title.isna()]

Unnamed: 0,genres,id,overview,title
19729,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",82663,British soldiers force a recently captured IRA...,
19730,"[{'name': 'Carousel Productions', 'id': 11176}...",1997-08-20,Released,
29502,"[{'id': 16, 'name': 'Animation'}, {'id': 878, ...",122662,Third film of the Mardock Scramble series.,
29503,"[{'name': 'Aniplex', 'id': 2883}, {'name': 'Go...",2012-09-29,Released,
35586,"[{'id': 10770, 'name': 'TV Movie'}, {'id': 28,...",249260,A group of skiers are terrorized during spring...,
35587,"[{'name': 'Odyssey Media', 'id': 17161}, {'nam...",2014-01-01,Released,


In [11]:
movies = movies[movies.title.notna()]

In [12]:
# Keep only movies whose overview is longer than 50 characters. Rows with a
# missing overview are dropped too, since a NaN length compares as False.
movies = movies[movies.overview.str.len()>50]

In [13]:
# Cast the id column from str to int. `movies` is the result of boolean
# filtering, so build a new frame with .assign(...) instead of writing a
# column into it in place, which can raise SettingWithCopyWarning.
movies = movies.assign(id=movies.id.astype(int))

In [14]:
movies.shape

(43971, 4)

In [15]:
sum(movies.genres.apply(len)==0)

2061

In [16]:
# Drop the movies whose genre list is empty.
has_genre = movies.genres.map(len) > 0
movies = movies[has_genre]

In [17]:
movies.shape

(41910, 4)

In [18]:
sum(movies.genres.apply(len)==0)

0

## 0.2 Choosing a unique genre

In [19]:
# Flatten the per-movie genre lists into one list of genre entries.
all_genres = []
for movie_genres in movies.genres:
    all_genres.extend(movie_genres)

In [20]:
# Count the occurrences of each genre name, ordered from most to least frequent.
genre_name_counts = Counter(genre["name"] for genre in all_genres)
genres_freq = dict(genre_name_counts.most_common())

In [21]:
len(genres_freq)

20

In [22]:
genres_freq

{'Drama': 19889,
 'Comedy': 12652,
 'Thriller': 7543,
 'Romance': 6638,
 'Action': 6526,
 'Horror': 4625,
 'Crime': 4249,
 'Documentary': 3824,
 'Adventure': 3458,
 'Science Fiction': 3007,
 'Family': 2712,
 'Mystery': 2445,
 'Fantasy': 2275,
 'Animation': 1896,
 'Foreign': 1579,
 'Music': 1566,
 'History': 1375,
 'War': 1306,
 'Western': 1027,
 'TV Movie': 744}

In [23]:
genres_list = list(genres_freq.keys())

In [24]:
# Number of most frequent genres kept as target classes; the same value is
# used as the width of the softmax output layer in the model section.
num_of_genres=10

In [25]:
genres_subset = genres_list[:num_of_genres]

In [26]:
genres_subset

['Drama',
 'Comedy',
 'Thriller',
 'Romance',
 'Action',
 'Horror',
 'Crime',
 'Documentary',
 'Adventure',
 'Science Fiction']

In [27]:
def _accepted_genres(movie_genres):
    """Return the genres of one movie that belong to genres_subset.

    The result follows the order of genres_subset (most frequent genre
    first). The previous set-intersection returned the names in arbitrary
    order, so taking element [0] as the movie's single label later on was
    not reproducible from one kernel session to the next.
    """
    names = {elem_["name"] for elem_ in movie_genres}
    return [genre for genre in genres_subset if genre in names]


# .assign builds a new frame, avoiding an in-place column write on a frame
# that came out of boolean filtering (SettingWithCopyWarning).
movies = movies.assign(accepted_genres=movies.genres.apply(_accepted_genres))

In [28]:
movies[movies.accepted_genres.apply(len)==0].head(3)

Unnamed: 0,genres,id,overview,title,accepted_genres
124,"[{'id': 14, 'name': 'Fantasy'}]",27793,A young boy must restore order when a group of...,The Neverending Story III: Escape from Fantasia,[]
309,"[{'id': 16, 'name': 'Animation'}]",22586,The beautiful princess Odette is transformed i...,The Swan Princess,[]
377,"[{'id': 14, 'name': 'Fantasy'}]",10395,Publisher Will Randall becomes a werewolf and ...,Wolf,[]


In [29]:
movies[movies.accepted_genres.apply(len)==0].shape, movies.shape

((1429, 5), (41910, 5))

In [30]:
movies_reduced = movies[movies.accepted_genres.apply(len)>0].copy()

In [31]:
sum(movies_reduced.accepted_genres.apply(len)==0)

0

In [32]:
# The first accepted genre of each movie becomes its single training label.
movies_reduced['unique_genre'] = [
    accepted[0] for accepted in movies_reduced.accepted_genres
]

In [33]:
movies_reduced.head(3)

Unnamed: 0,genres,id,overview,title,accepted_genres,unique_genre
0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,"Led by Woody, Andy's toys live happily in his ...",Toy Story,[Comedy],Comedy
1,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,When siblings Judy and Peter discover an encha...,Jumanji,[Adventure],Adventure
2,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,A family wedding reignites the ancient feud be...,Grumpier Old Men,"[Comedy, Romance]",Comedy


In [34]:
movies_reduced.shape

(40481, 6)

In [35]:
movies.overview[0]

"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences."

## 0.3 Vectorization

In [None]:
num_words=10000

In [None]:
from keras.preprocessing.text import Tokenizer
# Word tokenizer whose vocabulary size is bounded by num_words; `filters`
# is the set of punctuation/whitespace characters handed to Keras to strip
# from the text before it is split into words.
tokenizer = Tokenizer(num_words=num_words, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',)

In [None]:
#all_overview = movies_reduced.overview.str.replace('[^a-zA-Z0-9\s]', '', regex=True).str.lower()

In [None]:
tokenizer.fit_on_texts(movies_reduced.overview)

In [None]:
len(tokenizer.word_index)

In [None]:
# Encode the raw overview strings, exactly as they were passed to
# fit_on_texts. Passing pre-split lists (overview.str.split()) makes Keras
# treat every list element as an already-final token and skip the `filters`
# step, so words with attached punctuation ("scene.", "heart,") never match
# the fitted vocabulary and are silently dropped.
tokenized_overviews = tokenizer.texts_to_sequences(movies_reduced.overview)

In [None]:
movies_reduced.overview.str.split()

In [None]:
# Integer class label of each movie = position of its genre in genres_subset.
genre_labels = [
    genres_subset.index(genre) for genre in movies_reduced.unique_genre
]

## 0.4 Saving

In [None]:
import pickle

In [None]:
# Persist the ordered genre list so predicted class indices can be mapped
# back to genre names later. Despite the .txt extension this is a binary
# pickle file (opened in "wb" mode).
with open(output_folder_path+'genres_subset.txt', "wb") as fp:
    pickle.dump(genres_subset, fp)

In [None]:
# Persist the fitted tokenizer next to the genre list. Despite the .txt
# extension this is a binary pickle file (opened in "wb" mode).
with open(output_folder_path+'tokenizer.txt', "wb") as fp:
    pickle.dump(tokenizer, fp)

In [None]:
# Read the genre list back from the path it was written to above. The
# previous version used `folder_path`, which is not defined anywhere in the
# notebook, and a 'vect/' sub-folder that the saving cells never write to.
# NOTE: only unpickle files you created yourself; pickle.load can run arbitrary code.
with open(output_folder_path+'genres_subset.txt', "rb") as fp2:   # Unpickling
    genres_subset_2= pickle.load(fp2)

In [None]:
# Read the tokenizer back from the path it was written to above. The
# previous version used `folder_path`, which is not defined anywhere in the
# notebook, and a 'vect/' sub-folder that the saving cells never write to.
# NOTE: only unpickle files you created yourself; pickle.load can run arbitrary code.
with open(output_folder_path+'tokenizer.txt', "rb") as fp2:   # Unpickling
    tokenizer2= pickle.load(fp2)

# 2. The model

## 2.1 Data encoding

In [None]:
import numpy as np

In [None]:
# Boolean matrix with one row per overview and one column per token index,
# initially all False.
n_overviews = len(tokenized_overviews)
vectorized_overviews = np.zeros(shape=(n_overviews, num_words), dtype=bool)

In [None]:
# For every overview, set to True the columns of the token indices it
# contains (presence only: a token repeated in the text is marked once).
for i, tokenized_ov in enumerate(tokenized_overviews):
    vectorized_overviews[i, tokenized_ov]=True

In [None]:
vectorized_overviews

In [None]:
from keras.utils import to_categorical

In [None]:
# One-hot encode the integer labels. num_classes is given explicitly so the
# encoded width always equals the model's output layer (num_of_genres),
# rather than being inferred from the largest label present in the data.
genre_one_hot = to_categorical(genre_labels, num_classes=num_of_genres)

In [None]:
genre_one_hot

## 2.2 Split train test

In [None]:
# Number of rows held out for testing. The cells below slice them from the
# end of the arrays, so the split is positional (no shuffling).
test_size= 10000

In [None]:
len(vectorized_overviews)

In [None]:
vectorized_overviews_train = vectorized_overviews[:-test_size]

In [None]:
vectorized_overviews_test = vectorized_overviews[-test_size:]

In [None]:
len(vectorized_overviews_train),len(vectorized_overviews_test)

In [None]:
genre_one_hot_train = genre_one_hot[:-test_size]
genre_one_hot_test = genre_one_hot[-test_size:]

In [None]:
len(genre_one_hot_train),len(genre_one_hot_test)

## 2.3 Architecture

In [None]:
from keras import models
from keras import layers

# Feed-forward classifier over the multi-hot overview vectors.
# The input width is tied to num_words (the width of vectorized_overviews)
# instead of a hard-coded 10000, so changing the vocabulary size in the
# vectorization section cannot leave the model with a mismatched input.
model = models.Sequential()
model.add(layers.Dense(128, activation='relu', input_shape=(num_words,)))
model.add(layers.Dense(128, activation='relu'))
# One softmax output per retained genre.
model.add(layers.Dense(num_of_genres, activation='softmax'))

In [None]:
# Categorical cross-entropy matches the one-hot labels and softmax output;
# accuracy is tracked so it can be plotted per epoch afterwards.
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
# Train on the training slice only. validation_split=0.1 asks Keras to hold
# out a fraction of that slice for validation (per the Keras fit docs); the
# returned history is what the plotting cells below read.
history = model.fit(vectorized_overviews_train,
                    genre_one_hot_train,
                    epochs=10,
                    batch_size=512,
                    validation_split=0.1)

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Per-epoch training and validation loss recorded by model.fit.
loss = history.history['loss']
val_loss = history.history['val_loss']

# 1-based epoch numbers for the x axis; the accuracy plot cell further down
# reuses this `epochs` variable.
epochs = range(1, len(loss) + 1)

plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()

In [None]:
history.history.keys()

In [None]:
plt.clf()   # clear figure

# Per-epoch training and validation accuracy recorded by model.fit.
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']

# Recompute the x axis here so this cell does not depend on the loss-plot
# cell having been run first.
epochs = range(1, len(acc) + 1)

plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')  # was 'Loss', but this figure plots accuracy
plt.legend()

plt.show()

In [None]:
model.evaluate(vectorized_overviews_test, genre_one_hot_test)

In [None]:
model.save(output_folder_path+'nn/')

In [None]:
# Shape check of one test vector as a column; uses num_words rather than a
# hard-coded 10000 so it keeps working if the vocabulary size is changed.
vectorized_overviews_test[0].reshape(num_words,1).shape

In [None]:
# Predict the class probabilities of the first test overview. Slicing with
# [:1] keeps the batch dimension, giving shape (1, num_words) without
# hard-coding the vocabulary size in a reshape.
probs = model.predict(vectorized_overviews_test[:1])

In [None]:
genres_subset[np.argmax(probs)]

In [None]:
probs

In [None]:
np.argmax(probs)

In [None]:
genre_one_hot_test[1]

In [None]:
predictions= model.predict(vectorized_overviews_test)

In [None]:
predictions[1]

In [None]:
probs

In [None]:
genres_subset[np.argmax(predictions[1])]

In [None]:
# Show the movie behind predictions[1]. The test arrays are the last
# test_size rows, so test sample 1 is positional row -test_size + 1.
# (.loc[test_size] looked up the index *label* 10000, which is not a test
# row and may not even exist after the filtering steps.)
movies_reduced.iloc[-test_size + 1]

In [None]:
# Print the full overview of the movie behind predictions[1]. The test
# arrays are the last test_size rows, so test sample 1 is positional row
# -test_size + 1 (.loc[test_size] looked up an index label instead).
print(movies_reduced.iloc[-test_size + 1].overview)

# 3. Loading

In [None]:
from keras.models import load_model

In [None]:
model2 = load_model(output_folder_path+'nn/')

In [None]:
model2.evaluate(vectorized_overviews_test, genre_one_hot_test)