In [33]:
import pandas as pd
import numpy as np
import json
import nltk
import re
import csv
import matplotlib.pyplot as plt 
import seaborn as sns
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

%matplotlib inline
pd.set_option('display.max_colwidth', 300)

In [37]:
data = pd.read_csv('movie_data.csv')
data['movie_id']=data.index
data.head(5)

Unnamed: 0,title,description,image,genre,movie_id
0,Enola Holmes 2,"Now a detective-for-hire, Enola Holmes takes on her first official case to find a missing girl as the sparks of a dangerous conspiracy ignite a mystery that requires the help of friends - and Sherlock himself - to unravel",https://m.media-amazon.com/images/M/MV5BMDI1NWM1ZDItNDFhMi00YWRhLTg1YzItNTNhY2M2N2QzY2FkXkEyXkFqcGdeQXVyMTEyMjM2NDc2.jpg,action,0
1,Bullet Train,Five assassins aboard a swiftly-moving bullet train find out that their missions have something in common.,https://m.media-amazon.com/images/M/MV5BMDU2ZmM2OTYtNzIxYy00NjM5LTliNGQtN2JmOWQzYTBmZWUzXkEyXkFqcGdeQXVyMTkxNjUyNQ@@.jpg,action,1
2,Everything Everywhere All at Once,"An aging Chinese immigrant is swept up in an insane adventure, in which she alone can save the world by exploring other universes connecting with the lives she could have led.",https://m.media-amazon.com/images/M/MV5BYTdiOTIyZTQtNmQ1OS00NjZlLWIyMTgtYzk5Y2M3ZDVmMDk1XkEyXkFqcGdeQXVyMTAzMDg4NzU0.jpg,action,2
3,Kantara,It involves culture of Kambla and Bhootha Kola. A human and nature conflict where Shiva is a rebel who defends his village and nature. A death leads to war between villagers and evil forces. Will he able to regain peace in the village?,https://m.media-amazon.com/images/M/MV5BNjQyNGI5OWEtZjI1Yy00NDVjLWE4MTAtMzRlNzU1NzM2OGVkXkEyXkFqcGdeQXVyMTA1NzEzOTU1.jpg,action,3
4,Thor: Love and Thunder,"Thor enlists the help of Valkyrie, Korg and ex-girlfriend Jane Foster to fight Gorr the God Butcher, who intends to make the gods extinct.",https://m.media-amazon.com/images/M/MV5BYmMxZWRiMTgtZjM0Ny00NDQxLWIxYWQtZDdlNDNkOTEzYTdlXkEyXkFqcGdeQXVyMTkxNjUyNQ@@.jpg,action,4


In [38]:
# function for text cleaning 
def clean_text(text):
    # remove backslash-apostrophe 
    text = re.sub("\'", "", text) 
    # remove everything except alphabets 
    text = re.sub("[^a-zA-Z]"," ",text) 
    # remove whitespaces 
    text = ' '.join(text.split()) 
    # convert text to lowercase 
    text = text.lower() 
    
    return text

from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

# function to remove stopwords
def remove_stopwords(text):
    no_stopword_text = [w for w in text.split() if not w in stop_words]
    return ' '.join(no_stopword_text)

In [39]:
data['description'] = data['description'].apply(lambda x: clean_text(x))
data['description'] = data['description'].apply(lambda x: remove_stopwords(x))

In [41]:
df = data[['movie_id','title','description','genre']]
df.head()

Unnamed: 0,movie_id,title,description,genre
0,0,Enola Holmes 2,detective hire enola holmes takes first official case find missing girl sparks dangerous conspiracy ignite mystery requires help friends sherlock unravel,action
1,1,Bullet Train,five assassins aboard swiftly moving bullet train find missions something common,action
2,2,Everything Everywhere All at Once,aging chinese immigrant swept insane adventure alone save world exploring universes connecting lives could led,action
3,3,Kantara,involves culture kambla bhootha kola human nature conflict shiva rebel defends village nature death leads war villagers evil forces able regain peace village,action
4,4,Thor: Love and Thunder,thor enlists help valkyrie korg ex girlfriend jane foster fight gorr god butcher intends make gods extinct,action


In [42]:
movie_genres = df.groupby('title')['genre'].apply(list).to_frame().reset_index()

df = df[['title','description']].drop_duplicates().merge(movie_genres, how='right', on='title')
#Drop the duplicate columns
df.head(5)

Unnamed: 0,title,description,genre
0,'83,june lords cricket ground witnessed men beat two times world champions west indies putting india back onto cricket world stage,"[biography, history]"
1,'G' Men,james cagney helped jump start gangster genre public enemy outcries movies glorified underworld criminals put cagney side law g men,[film noir]
2,...All the Marbles,small timer female wrestling team california dolls manager must face hardship sport life suceed,[sport]
3,10 Cloverfield Lane,young woman held underground bunker man insists hostile event left surface earth uninhabitable,[thriller]
4,10 Things I Hate About You,pretty popular teenager cant go date ill tempered older sister,[romance]


In [43]:
from sklearn.preprocessing import MultiLabelBinarizer

multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(df['genre'])

# transform target variable
y = multilabel_binarizer.transform(df['genre'])

In [46]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=10000)
# split dataset into training and validation set
xtrain, xval, ytrain, yval = train_test_split(df['description'], y, test_size=0.2, random_state=9)

In [47]:
# create TF-IDF features
xtrain_tfidf = tfidf_vectorizer.fit_transform(xtrain)
xval_tfidf = tfidf_vectorizer.transform(xval)

In [48]:
from sklearn.linear_model import LogisticRegression

# Binary Relevance
from sklearn.multiclass import OneVsRestClassifier

# Performance metric
from sklearn.metrics import f1_score

In [49]:
lr = LogisticRegression()
clf = OneVsRestClassifier(lr)
# fit model on train data
clf.fit(xtrain_tfidf, ytrain)
# make predictions for validation set
y_pred = clf.predict(xval_tfidf)

In [50]:
y_pred[3]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0])

In [51]:
multilabel_binarizer.inverse_transform(y_pred)[3]

('war',)

In [52]:
# evaluate performance
f1_score(yval, y_pred, average="micro")

0.0021008403361344537

In [57]:
# predict probabilities
y_pred_prob = clf.predict_proba(xval_tfidf)
t = 0.1 # threshold value
y_pred_new = (y_pred_prob >= t).astype(int)

In [58]:
# evaluate performance
f1_score(yval, y_pred_new, average="micro")

0.2787550744248985

# image classification

In [81]:
import os
import numpy as np
import shutil

# # Creating Train / Val / Test folders (One time use)
root_dir = 'image_data'


os.makedirs(root_dir +'/train')
os.makedirs(root_dir +'/val')
src = "images" # Folder to copy images from

allFileNames = os.listdir(src)
np.random.shuffle(allFileNames)
train_FileNames, val_FileNames = np.split(np.array(allFileNames), [int(len(allFileNames)*0.9)])

train_FileNames = [src+'/'+ name for name in train_FileNames.tolist()]
val_FileNames = [src+'/' + name for name in val_FileNames.tolist()]

# Copy-pasting images
for name in train_FileNames:
    shutil.copy(name, "image_data/train")

for name in val_FileNames:
    shutil.copy(name, "image_data/val")


In [82]:
train_FileNames.shape

(3102,)

In [83]:
val_FileNames.shape

(345,)

In [88]:
labels = pd.read_csv('movie_df_final.csv')
labels.head(5)

Unnamed: 0,title,description,image,genre,image_name,action,adventure,animation,biography,comedy,...,musical,mystery,romance,sci-fi,short film,sport,superhero,thriller,war,western
0,'83,on june 25 1983 the lords cricket ground witnessed 14 men beat the two times world champions west indies putting india back onto the cricket world stage,https://m.media-amazon.com/images/M/MV5BNTc0MWIzMjYtMDg4Mi00YmM3LWIxNGUtYjQ5NzBlZTM3YWI2XkEyXkFqcGdeQXVyODE5NzE3OTE@.jpg,"['biography', 'history']",biography history_0.jpg,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,'G' Men,james cagney helped jumpstart the gangster genre as the public enemy outcries against movies that glorified underworld criminals put cagney on the side of the law in g men,https://m.media-amazon.com/images/M/MV5BMjA1ODEyNzIzMl5BMl5BanBnXkFtZTgwNTg0OTMyMjE@.jpg,['film noir'],film noir_1.jpg,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,...All the Marbles,small timer female wrestling team the california dolls and their manager must face the hardship of their sport and life to suceed,https://m.media-amazon.com/images/M/MV5BNWVmNTc5NGMtMDIzMC00NTczLWFhOWQtNDYzOTAwMjU1ODc0XkEyXkFqcGdeQXVyMjI4MjA5MzA@.jpg,['sport'],sport_2.jpg,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,10 Cloverfield Lane,a young woman is held in an underground bunker by a man who insists that a hostile event has left the surface of the earth uninhabitable,https://m.media-amazon.com/images/M/MV5BMjEzMjczOTIxMV5BMl5BanBnXkFtZTgwOTUwMjI3NzE@.jpg,['thriller'],thriller_3.jpg,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,10 Things I Hate About You,a pretty popular teenager cant go out on a date until her illtempered older sister does,https://m.media-amazon.com/images/M/MV5BMmVhZjhlZDYtMDAwZi00MDcyLTgzOTItOWNiZjY0YmE0MGE0XkEyXkFqcGdeQXVyMTQxNzMzNDI@.jpg,['romance'],romance_4.jpg,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [94]:
labels = labels.drop(['genre','image','description','title'], axis=1)


In [86]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tqdm.auto import tqdm

sns.set_style('darkgrid')

In [102]:
batch_size=128
image_data_generator = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1./255, validation_split=0.1)

train_generator = image_data_generator.flow_from_dataframe(
    dataframe=labels,
    directory='./image_data/train',
    x_col='image_name',
    y_col=labels.columns.tolist()[1:],
    class_mode='raw',
    color_mode="rgb",
    target_size=(224, 224),
    batch_size=batch_size,
    subset='training'
)

valid_generator = image_data_generator.flow_from_dataframe(
    dataframe=labels,
    directory='./image_data/val',
    x_col='image_name',
    y_col=labels.columns.tolist()[1:],
    class_mode='raw',
    color_mode="rgb",
    target_size=(224, 224),
    batch_size=batch_size,
    subset='validation'
)

Found 2792 validated image filenames.
Found 34 validated image filenames.


In [103]:
inputs = tf.keras.Input(shape=(224, 224, 3))
x = tf.keras.applications.MobileNetV2(include_top=False)(inputs)
x = tf.keras.layers.GlobalAveragePooling2D()(x)
outputs = tf.keras.layers.Dense(5, activation='sigmoid')(x)

model = tf.keras.models.Model(inputs, outputs)
model.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.Adam(lr=1e-4))

model.summary()
tf.keras.utils.plot_model(model, show_shapes=True)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_224_no_top.h5
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 mobilenetv2_1.00_224 (Funct  (None, None, None, 1280)  2257984  
 ional)                                                          
                                                                 
 global_average_pooling2d (G  (None, 1280)             0         
 lobalAveragePooling2D)                                          
                                                                 
 dense (Dense)               (None, 5)                 6405      
                                                                 
Total params: 2,264,389
Trainab

  super().__init__(name, **kwargs)


In [104]:
rlp = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', patience=1, verbose=1, factor=0.01)
es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, verbose=1, restore_best_weights=True)

history = model.fit(train_generator, validation_data=valid_generator, epochs=10, callbacks=[rlp, es])

Epoch 1/10




InvalidArgumentError: Graph execution error:

Detected at node 'gradient_tape/binary_crossentropy/logistic_loss/mul/BroadcastGradientArgs' defined at (most recent call last):
    File "C:\Users\Elisaveta\anaconda3\lib\runpy.py", line 194, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "C:\Users\Elisaveta\anaconda3\lib\runpy.py", line 87, in _run_code
      exec(code, run_globals)
    File "C:\Users\Elisaveta\anaconda3\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
      app.launch_new_instance()
    File "C:\Users\Elisaveta\anaconda3\lib\site-packages\traitlets\config\application.py", line 845, in launch_instance
      app.start()
    File "C:\Users\Elisaveta\anaconda3\lib\site-packages\ipykernel\kernelapp.py", line 612, in start
      self.io_loop.start()
    File "C:\Users\Elisaveta\anaconda3\lib\site-packages\tornado\platform\asyncio.py", line 199, in start
      self.asyncio_loop.run_forever()
    File "C:\Users\Elisaveta\anaconda3\lib\asyncio\base_events.py", line 570, in run_forever
      self._run_once()
    File "C:\Users\Elisaveta\anaconda3\lib\asyncio\base_events.py", line 1859, in _run_once
      handle._run()
    File "C:\Users\Elisaveta\anaconda3\lib\asyncio\events.py", line 81, in _run
      self._context.run(self._callback, *self._args)
    File "C:\Users\Elisaveta\anaconda3\lib\site-packages\tornado\ioloop.py", line 688, in <lambda>
      lambda f: self._run_callback(functools.partial(callback, future))
    File "C:\Users\Elisaveta\anaconda3\lib\site-packages\tornado\ioloop.py", line 741, in _run_callback
      ret = callback()
    File "C:\Users\Elisaveta\anaconda3\lib\site-packages\tornado\gen.py", line 814, in inner
      self.ctx_run(self.run)
    File "C:\Users\Elisaveta\anaconda3\lib\site-packages\tornado\gen.py", line 775, in run
      yielded = self.gen.send(value)
    File "C:\Users\Elisaveta\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 365, in process_one
      yield gen.maybe_future(dispatch(*args))
    File "C:\Users\Elisaveta\anaconda3\lib\site-packages\tornado\gen.py", line 234, in wrapper
      yielded = ctx_run(next, result)
    File "C:\Users\Elisaveta\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 268, in dispatch_shell
      yield gen.maybe_future(handler(stream, idents, msg))
    File "C:\Users\Elisaveta\anaconda3\lib\site-packages\tornado\gen.py", line 234, in wrapper
      yielded = ctx_run(next, result)
    File "C:\Users\Elisaveta\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 543, in execute_request
      self.do_execute(
    File "C:\Users\Elisaveta\anaconda3\lib\site-packages\tornado\gen.py", line 234, in wrapper
      yielded = ctx_run(next, result)
    File "C:\Users\Elisaveta\anaconda3\lib\site-packages\ipykernel\ipkernel.py", line 306, in do_execute
      res = shell.run_cell(code, store_history=store_history, silent=silent)
    File "C:\Users\Elisaveta\anaconda3\lib\site-packages\ipykernel\zmqshell.py", line 536, in run_cell
      return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
    File "C:\Users\Elisaveta\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2894, in run_cell
      result = self._run_cell(
    File "C:\Users\Elisaveta\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2940, in _run_cell
      return runner(coro)
    File "C:\Users\Elisaveta\anaconda3\lib\site-packages\IPython\core\async_helpers.py", line 68, in _pseudo_sync_runner
      coro.send(None)
    File "C:\Users\Elisaveta\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3165, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "C:\Users\Elisaveta\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3357, in run_ast_nodes
      if (await self.run_code(code, result,  async_=asy)):
    File "C:\Users\Elisaveta\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3437, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "<ipython-input-104-20f9081aeb49>", line 4, in <module>
      history = model.fit(train_generator, validation_data=valid_generator, epochs=10, callbacks=[rlp, es])
    File "C:\Users\Elisaveta\anaconda3\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "C:\Users\Elisaveta\anaconda3\lib\site-packages\keras\engine\training.py", line 1564, in fit
      tmp_logs = self.train_function(iterator)
    File "C:\Users\Elisaveta\anaconda3\lib\site-packages\keras\engine\training.py", line 1160, in train_function
      return step_function(self, iterator)
    File "C:\Users\Elisaveta\anaconda3\lib\site-packages\keras\engine\training.py", line 1146, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\Elisaveta\anaconda3\lib\site-packages\keras\engine\training.py", line 1135, in run_step
      outputs = model.train_step(data)
    File "C:\Users\Elisaveta\anaconda3\lib\site-packages\keras\engine\training.py", line 997, in train_step
      self.optimizer.minimize(loss, self.trainable_variables, tape=tape)
    File "C:\Users\Elisaveta\anaconda3\lib\site-packages\keras\optimizers\optimizer_v2\optimizer_v2.py", line 576, in minimize
      grads_and_vars = self._compute_gradients(
    File "C:\Users\Elisaveta\anaconda3\lib\site-packages\keras\optimizers\optimizer_v2\optimizer_v2.py", line 634, in _compute_gradients
      grads_and_vars = self._get_gradients(
    File "C:\Users\Elisaveta\anaconda3\lib\site-packages\keras\optimizers\optimizer_v2\optimizer_v2.py", line 510, in _get_gradients
      grads = tape.gradient(loss, var_list, grad_loss)
Node: 'gradient_tape/binary_crossentropy/logistic_loss/mul/BroadcastGradientArgs'
Incompatible shapes: [128,5] vs. [128,24]
	 [[{{node gradient_tape/binary_crossentropy/logistic_loss/mul/BroadcastGradientArgs}}]] [Op:__inference_train_function_11987]