## Importing libraries

In [1]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten
from tensorflow.keras.preprocessing.text import Tokenizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle

In [3]:
# Read the training features and the training targets from the dataset folder.
X_train = pd.read_csv(
    '/content/drive/MyDrive/movie_rating/dataset/X_train.csv'
)
y_train = pd.read_csv(
    '/content/drive/MyDrive/movie_rating/dataset/y_train.csv'
)

In [4]:
X_train.head()

Unnamed: 0,title,genres,original_language,overview,popularity,production_companies,release_date,budget,revenue,runtime,status,tagline,vote_count,credits,keywords
0,Alev Alev,Drama-Thriller,tr,,0.664,Erler Film,01-01-1984,0,0,118.0,Released,,2,Tarık Akan-Gülşen Bubikoğlu-Cüneyt Arkın-Çiğde...,pregnancy-model-sea captain-businessman-illega...
1,Those Who Work,Drama,fr,Frank a man of action who worked his way up al...,4.174,Box Productions-Novak Prod-Office Fédéral de l...,04-10-2018,0,0,102.0,Released,,67,Olivier Gourmet-Adèle Bochatay-Delphine Bibet-...,
2,Driven,,en,In a world of adrenaline and speed a quadriple...,0.6,,13-12-2019,0,0,17.0,Released,,0,,
3,Netherlands Documentary,,en,Sex Drugs & Other Taboo Topics The World Is To...,0.6,,27-02-2020,0,0,60.0,Released,,0,,
4,Utta Danella - Der Verlobte meiner besten Freu...,Drama,de,Katharina and Elena are best friends. After a ...,1.152,,03-04-2009,0,0,88.0,Released,,2,Henriette Richter-Röhl-Ina Paule Klink-Robert ...,


In [5]:
# Remove fully duplicated rows; surviving rows keep their original index labels.
X_train = X_train.drop_duplicates()

In [6]:
X_train.shape

(752677, 15)

In [7]:
# Keep only the free-text `keywords` column for this model.
# `.copy()` makes the one-column frame independent of the full table, so the
# later in-place clean-up (dropna / column assignment) operates on its own
# data and does not trigger pandas' SettingWithCopyWarning.
X_train = X_train[['keywords']].copy()

#### Here we work on the text, which needs to be tokenized and then converted to an integer-encoded input sequence

In [8]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 752677 entries, 0 to 752677
Data columns (total 1 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   keywords  218469 non-null  object
dtypes: object(1)
memory usage: 11.5+ MB


In [9]:
X_train.isnull().sum()

keywords    534208
dtype: int64

In [10]:
# Discard rows whose `keywords` value is missing.
X_train = X_train.dropna()

In [11]:
def _normalize_keywords(raw_keywords):
    """Return the keyword string lower-cased, with every '-' turned into a space."""
    return str(raw_keywords).lower().replace('-', ' ')


X_train['keywords'] = X_train['keywords'].map(_normalize_keywords)

In [12]:
X_train.head()

Unnamed: 0,keywords
0,pregnancy model sea captain businessman illega...
8,artists' life educational pablo picasso art ex...
15,horror
16,texas dam disaster flood disaster movie
26,mail order bride


In [13]:
# Second null sweep, kept from the original flow. Every entry has already been
# passed through str() during normalisation, so this is not expected to drop rows.
X_train = X_train.dropna()

In [14]:
X_train.shape

(218469, 1)

### Building the neural network for the keywords (text) part of the data

In [15]:
# Model inputs and targets for the keyword network.
X = X_train['keywords']
# Rows were removed from X_train above (duplicates, missing keywords) but not
# from y_train, so select the targets by the surviving index labels to keep
# X and y row-aligned.
# NOTE(review): assumes y_train.csv has one row per X_train.csv row, in the
# same order — confirm against how the CSVs were produced.
y = y_train.loc[X.index]

In [16]:
X_train['keywords']


0         pregnancy model sea captain businessman illega...
8         artists' life educational pablo picasso art ex...
15                                                   horror
16                  texas dam disaster flood disaster movie
26                                         mail order bride
                                ...                        
752669                        softcore exhibitionism voyeur
752671    casino exploitation gambling debt softcore sex...
752673    wrestling pro wrestling female wrestler wrestl...
752675                                               parody
752677    lake finland nature documentary finnish mythology
Name: keywords, Length: 218469, dtype: object

### Let's tokenize the sequence 

In [17]:
# Passed to the Tokenizer as num_words.
vocab_size = 40_000
# NOTE(review): not referenced by the cells shown here (padding uses maxlen=52).
max_length = 120

In [18]:
# Build the word -> integer-id vocabulary from the keyword strings.
# '<0VV>' is the out-of-vocabulary placeholder: a word that was not seen while
# fitting is replaced by this token instead of being dropped, so the length of
# the encoded sentence is preserved.
keyword_texts = X_train['keywords'].astype(str)  # astype(str): guards against numeric entries
tokenizer = Tokenizer(oov_token='<0VV>', num_words=vocab_size)
tokenizer.fit_on_texts(keyword_texts)

# Dictionary mapping each word to its index.
word_index_ = tokenizer.word_index

# Each text becomes a list of integers: the dictionary indexes of its words.
sequence_train = tokenizer.texts_to_sequences(X_train['keywords'])

In [None]:
# Preview only the first few encoded sequences; echoing the whole list would
# print one entry per training row and bloat the notebook.
sequence_train[:5]

In [20]:
print(len(word_index_))

26528


In [21]:
# Bring every id sequence to a fixed width of 52. The padded array replaces
# X_train as the model input and also stays available as padded_train.
X_train = padded_train = pad_sequences(sequence_train, maxlen=52)

In [22]:
print(X_train)

[[   0    0    0 ... 3357  538  834]
 [   0    0    0 ... 4107   57 2421]
 [   0    0    0 ...    0    0   39]
 ...
 [   0    0    0 ...  770   44 1117]
 [   0    0    0 ...    0    0  274]
 [   0    0    0 ...   52 2436  466]]


In [23]:
# Persist the fitted tokenizer so the same vocabulary can be reloaded later.
with open('tokenizer.pickle', mode='wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

### LSTM

In [24]:
# Regression network over the padded keyword-id sequences:
# embedding -> two stacked LSTMs -> small dense head -> one linear output.
# The embedding's input_dim follows the tokenizer's num_words (vocab_size) and
# its input_length follows the padded width, so the model stays in step with
# the preprocessing settings instead of repeating them as literals.
sequence_length = X_train.shape[1]  # X_train is the padded 2-D array at this point
model1 = tf.keras.models.Sequential()
model1.add(tf.keras.layers.Embedding(vocab_size, 52, input_length=sequence_length))
# The first LSTM returns the full sequence so the second LSTM can consume it.
model1.add(tf.keras.layers.LSTM(100, return_sequences=True))
model1.add(tf.keras.layers.LSTM(100))
model1.add(tf.keras.layers.Dense(30, activation="relu"))
model1.add(tf.keras.layers.Dense(10, activation="relu"))
model1.add(tf.keras.layers.Dense(1, activation="linear"))

In [25]:
# Print the layer table, then attach the optimizer and the MAE loss/metric.
model1.summary()
model1.compile(
    optimizer='adam',
    loss='mean_absolute_error',
    metrics=['mean_absolute_error'],
)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 52, 52)            2080000   
                                                                 
 lstm (LSTM)                 (None, 52, 100)           61200     
                                                                 
 lstm_1 (LSTM)               (None, 100)               80400     
                                                                 
 dense (Dense)               (None, 30)                3030      
                                                                 
 dense_1 (Dense)             (None, 10)                310       
                                                                 
 dense_2 (Dense)             (None, 1)                 11        
                                                                 
Total params: 2,224,951
Trainable params: 2,224,951
Non-

In [26]:
# Save a checkpoint whenever the validation loss improves; the file name
# records the epoch number and the val_loss reached.
# The extension used to read '.hX_train5', a mangled '.hdf5'. It is restored
# so that each checkpoint is written as a single HDF5 file.
# NOTE(review): newer Keras releases expect a '.keras' extension here —
# confirm against the installed version.
checkpoint_name = 'Weights-{epoch:03d}--{val_loss:.5f}.hdf5'
checkpoint = ModelCheckpoint(
    checkpoint_name,
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    mode='auto',
)
callbacks_list = [checkpoint]

In [None]:
# y_train still has one row per row of the original CSV, while X_train now
# holds only the rows that survived de-duplication and the missing-keyword
# filter. `X` (the cleaned keywords Series) carries those surviving index
# labels, so it is used to pick the matching targets; without this the inputs
# and targets have different lengths.
# NOTE(review): assumes y_train rows are in the same order as the rows of
# X_train.csv — confirm against how the CSVs were produced.
model1.fit(
    X_train,
    y_train.loc[X.index],
    epochs=500,
    batch_size=32,
    validation_split=0.1,
    callbacks=callbacks_list,
)