## Importing libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
X_train = pd.read_csv('dataset/X_train.csv')
y_train = pd.read_csv('dataset/y_train.csv')

In [3]:
X_train.head()

Unnamed: 0,title,genres,original_language,overview,popularity,production_companies,release_date,budget,revenue,runtime,status,tagline,vote_count,credits,keywords
0,Alev Alev,Drama-Thriller,tr,,0.664,Erler Film,01-01-1984,0,0,118.0,Released,,2,Tarık Akan-Gülşen Bubikoğlu-Cüneyt Arkın-Çiğde...,pregnancy-model-sea captain-businessman-illega...
1,Those Who Work,Drama,fr,Frank a man of action who worked his way up al...,4.174,Box Productions-Novak Prod-Office Fédéral de l...,04-10-2018,0,0,102.0,Released,,67,Olivier Gourmet-Adèle Bochatay-Delphine Bibet-...,
2,Driven,,en,In a world of adrenaline and speed a quadriple...,0.6,,13-12-2019,0,0,17.0,Released,,0,,
3,Netherlands Documentary,,en,Sex Drugs & Other Taboo Topics The World Is To...,0.6,,27-02-2020,0,0,60.0,Released,,0,,
4,Utta Danella - Der Verlobte meiner besten Freu...,Drama,de,Katharina and Elena are best friends. After a ...,1.152,,03-04-2009,0,0,88.0,Released,,2,Henriette Richter-Röhl-Ina Paule Klink-Robert ...,


In [4]:
X_train.drop_duplicates(inplace=True)

In [5]:
X_train.shape

(752677, 15)

In [6]:
X_train =  X_train[['genres','credits']]

#### Here we shall work on the text which need to be tokenized and then converted to label binary endode input sequence

In [7]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 752677 entries, 0 to 752677
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   genres   529280 non-null  object
 1   credits  521461 non-null  object
dtypes: object(2)
memory usage: 17.2+ MB


In [8]:
X_train.isnull().sum()

genres     223397
credits    231216
dtype: int64

In [9]:
X_train.dropna(inplace=True)

In [10]:
X_train['credits'] = X_train['credits'].apply(lambda x: str(x))
X_train['credits'] = X_train['credits'].apply(lambda x: x.replace(' ', '_'))
X_train['credits'] = X_train['credits'].apply(lambda x: x.replace('-', ' '))

In [11]:
X_train['genres'] = X_train['genres'].apply(lambda x: str(x))
X_train['genres'] = X_train['genres'].apply(lambda x: x.replace(' ', '_'))
X_train['genres'] = X_train['genres'].apply(lambda x: x.replace('-', ' '))

In [12]:
X_train['input'] = X_train['genres'] + ' ' + X_train['credits']

In [13]:
X_train.head()

Unnamed: 0,genres,credits,input
0,Drama Thriller,Tarık_Akan Gülşen_Bubikoğlu Cüneyt_Arkın Çiğde...,Drama Thriller Tarık_Akan Gülşen_Bubikoğlu Cün...
1,Drama,Olivier_Gourmet Adèle_Bochatay Delphine_Bibet ...,Drama Olivier_Gourmet Adèle_Bochatay Delphine_...
4,Drama,Henriette_Richter Röhl Ina_Paule_Klink Robert_...,Drama Henriette_Richter Röhl Ina_Paule_Klink R...
6,Animation Comedy,Iva_Janžurová,Animation Comedy Iva_Janžurová
7,Drama,Robert_Cawdron Peter_Dyneley Lee_Patterson She...,Drama Robert_Cawdron Peter_Dyneley Lee_Patters...


In [14]:
X_train.dropna(inplace=True)

In [15]:
X_train.shape

(431211, 3)

### Building the neural network for numeric part of the code

In [16]:
# Neural Network Model for label binarizrer
X = X_train['input']
y = y_train

### Let's tokenize the sequence 

In [17]:
# this is for tokenizing the words, then each word of the tokenized list shall be given a specific index 
tokenizer = Tokenizer(oov_token= '<0VV>') # <0VV> is the token that the model shall replace the unseen words with. So if some word of the test case is not there in tokenized list then that word shall be replaces with 0VV this is done to maintain the size of the sentence. 
tokenizer.fit_on_texts(X.astype(str)) # we have to use astype(str) as there are some numbers in X_train[!]
word_index_ = tokenizer.word_index # gives index to each word and then makes a dictionary of word and their indexes.

# # since computer does not understand letters so we shall make list of sequence of numbers that is replaces sentences with sequences of indexes corresponding to those words that shall be feeded to our neural network model.
# # the numbers that would constitue to this list are those numbers which are index of the corresponding words in the dictionary.
sequence_train = tokenizer.texts_to_sequences(X)

In [18]:
print(word_index_)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [19]:
maxi = 0
for i in sequence_train:
    maxi = max(len(i), maxi)

In [20]:
print(maxi)

653


In [21]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
padded_train = pad_sequences(sequence_train, maxlen= maxi)

In [22]:
print(padded_train.shape)

(431211, 653)


In [23]:
X_train = padded_train

### Creating model architecture

In [24]:
NN_model = Sequential()

# The Input Layer :
NN_model.add(Dense(256, kernel_initializer='normal',input_dim = X_train.shape[1], activation='relu'))

# The Hidden Layers :
NN_model.add(Dense(64, kernel_initializer='normal',activation='relu'))

# The Output Layer :
NN_model.add(Dense(1, kernel_initializer='normal',activation='linear'))

# Compile the network :
NN_model.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mean_absolute_error'])
NN_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 256)               167424    
                                                                 
 dense_1 (Dense)             (None, 64)                16448     
                                                                 
 dense_2 (Dense)             (None, 1)                 65        
                                                                 
Total params: 183,937
Trainable params: 183,937
Non-trainable params: 0
_________________________________________________________________


In [25]:
checkpoint_name = 'Weights_2_-{epoch:03d}--{val_loss:.5f}.hX_train5' 
checkpoint = ModelCheckpoint(checkpoint_name, monitor='val_loss', verbose = 1, save_best_only = True, mode ='auto')
callbacks_list = [checkpoint]

In [None]:
NN_model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split = 0.1, callbacks=callbacks_list)

In [28]:
NN_model = tf.keras.models.load_model('/content/Weights_2_-013--2.50026.hX_train5')

In [29]:
NN_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 256)               167424    
                                                                 
 dense_1 (Dense)             (None, 256)               65792     
                                                                 
 dense_2 (Dense)             (None, 64)                16448     
                                                                 
 dense_3 (Dense)             (None, 1)                 65        
                                                                 
Total params: 249,729
Trainable params: 249,729
Non-trainable params: 0
_________________________________________________________________
