In [33]:
import pandas as pd
import numpy as np
import json
import nltk
import re
import csv
import matplotlib.pyplot as plt 
import seaborn as sns
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

%matplotlib inline
pd.set_option('display.max_colwidth', 300)

In [37]:
# Load the movie table and store each row's positional index as an explicit
# `movie_id` column so the identifier survives later reshaping.
data = pd.read_csv('movie_data.csv').assign(movie_id=lambda frame: frame.index)
data.head()

Unnamed: 0,title,description,image,genre,movie_id
0,Enola Holmes 2,"Now a detective-for-hire, Enola Holmes takes on her first official case to find a missing girl as the sparks of a dangerous conspiracy ignite a mystery that requires the help of friends - and Sherlock himself - to unravel",https://m.media-amazon.com/images/M/MV5BMDI1NWM1ZDItNDFhMi00YWRhLTg1YzItNTNhY2M2N2QzY2FkXkEyXkFqcGdeQXVyMTEyMjM2NDc2.jpg,action,0
1,Bullet Train,Five assassins aboard a swiftly-moving bullet train find out that their missions have something in common.,https://m.media-amazon.com/images/M/MV5BMDU2ZmM2OTYtNzIxYy00NjM5LTliNGQtN2JmOWQzYTBmZWUzXkEyXkFqcGdeQXVyMTkxNjUyNQ@@.jpg,action,1
2,Everything Everywhere All at Once,"An aging Chinese immigrant is swept up in an insane adventure, in which she alone can save the world by exploring other universes connecting with the lives she could have led.",https://m.media-amazon.com/images/M/MV5BYTdiOTIyZTQtNmQ1OS00NjZlLWIyMTgtYzk5Y2M3ZDVmMDk1XkEyXkFqcGdeQXVyMTAzMDg4NzU0.jpg,action,2
3,Kantara,It involves culture of Kambla and Bhootha Kola. A human and nature conflict where Shiva is a rebel who defends his village and nature. A death leads to war between villagers and evil forces. Will he able to regain peace in the village?,https://m.media-amazon.com/images/M/MV5BNjQyNGI5OWEtZjI1Yy00NDVjLWE4MTAtMzRlNzU1NzM2OGVkXkEyXkFqcGdeQXVyMTA1NzEzOTU1.jpg,action,3
4,Thor: Love and Thunder,"Thor enlists the help of Valkyrie, Korg and ex-girlfriend Jane Foster to fight Gorr the God Butcher, who intends to make the gods extinct.",https://m.media-amazon.com/images/M/MV5BYmMxZWRiMTgtZjM0Ny00NDQxLWIxYWQtZDdlNDNkOTEzYTdlXkEyXkFqcGdeQXVyMTkxNjUyNQ@@.jpg,action,4


In [38]:
# function for text cleaning 
def clean_text(text):
    """Normalise a free-text description to lowercase, letters-only words.

    Steps, in order: drop apostrophes (so "can't" becomes "cant" instead of
    being split into "can t"), replace every character that is not an ASCII
    letter with a space, collapse runs of whitespace, and lowercase.

    Parameters
    ----------
    text : str or any
        Raw text. Anything that is not a ``str`` (for example ``None`` or the
        float NaN pandas uses for a missing cell) is treated as empty text.

    Returns
    -------
    str
        The cleaned text; ``""`` for empty or non-string input.
    """
    # Missing values would otherwise make re.sub raise TypeError and abort
    # the whole DataFrame.apply call.
    if not isinstance(text, str):
        return ""
    # remove apostrophes
    text = text.replace("'", "")
    # replace everything except ASCII letters with a space
    text = re.sub(r"[^a-zA-Z]", " ", text)
    # collapse whitespace and convert to lowercase
    return ' '.join(text.split()).lower()

from nltk.corpus import stopwords
# English stopword set used by remove_stopwords. On a fresh environment the
# NLTK corpus is not installed and the lookup raises LookupError, so fetch it
# once and retry instead of failing the notebook run.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

# function to remove stopwords
def remove_stopwords(text):
    """Return `text` with every whitespace-separated token found in the
    module-level `stop_words` set removed.

    Matching is exact (case-sensitive); the remaining tokens are re-joined
    with single spaces.
    """
    kept_tokens = (token for token in text.split() if token not in stop_words)
    return ' '.join(kept_tokens)

In [39]:
# Normalise every description, then strip stopwords from the cleaned text.
# NOTE: this overwrites the raw 'description' column in place.
data['description'] = (
    data['description']
    .apply(clean_text)
    .apply(remove_stopwords)
)

In [41]:
# Keep only the identifier, title, cleaned description and genre label;
# the other columns of `data` are not carried into `df`.
df = data.loc[:, ['movie_id', 'title', 'description', 'genre']]
df.head()

Unnamed: 0,movie_id,title,description,genre
0,0,Enola Holmes 2,detective hire enola holmes takes first official case find missing girl sparks dangerous conspiracy ignite mystery requires help friends sherlock unravel,action
1,1,Bullet Train,five assassins aboard swiftly moving bullet train find missions something common,action
2,2,Everything Everywhere All at Once,aging chinese immigrant swept insane adventure alone save world exploring universes connecting lives could led,action
3,3,Kantara,involves culture kambla bhootha kola human nature conflict shiva rebel defends village nature death leads war villagers evil forces able regain peace village,action
4,4,Thor: Love and Thunder,thor enlists help valkyrie korg ex girlfriend jane foster fight gorr god butcher intends make gods extinct,action


In [42]:
# Gather every genre recorded for a title into one list (one row per title).
movie_genres = df.groupby('title')['genre'].apply(list).reset_index()

# Reduce to distinct (title, description) pairs and attach each title's genre list.
# NOTE: `df` is overwritten here, so re-running this cell on its own will not
# reproduce the result; re-run from the cell that builds `df` first.
df = (
    df[['title', 'description']]
    .drop_duplicates()
    .merge(movie_genres, how='right', on='title')
)
df.head()

Unnamed: 0,title,description,genre
0,'83,june lords cricket ground witnessed men beat two times world champions west indies putting india back onto cricket world stage,"[biography, history]"
1,'G' Men,james cagney helped jump start gangster genre public enemy outcries movies glorified underworld criminals put cagney side law g men,[film noir]
2,...All the Marbles,small timer female wrestling team california dolls manager must face hardship sport life suceed,[sport]
3,10 Cloverfield Lane,young woman held underground bunker man insists hostile event left surface earth uninhabitable,[thriller]
4,10 Things I Hate About You,pretty popular teenager cant go date ill tempered older sister,[romance]


In [43]:
from sklearn.preprocessing import MultiLabelBinarizer

# Learn the genre vocabulary and encode each movie's genre list as a 0/1
# indicator row in a single pass (one column per distinct genre).
multilabel_binarizer = MultiLabelBinarizer()
y = multilabel_binarizer.fit_transform(df['genre'])

In [46]:
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
xtrain, xval, ytrain, yval = train_test_split(
    df['description'],
    y,
    test_size=0.2,
    random_state=9,
)
# TF-IDF extractor: skip terms present in more than 80% of documents and cap
# the vocabulary at 10,000 terms.
tfidf_vectorizer = TfidfVectorizer(max_features=10000, max_df=0.8)

In [47]:
# Create TF-IDF features.
# The vectorizer is fitted on the training descriptions only; the validation
# descriptions are transformed with that already-fitted vectorizer.
xtrain_tfidf = tfidf_vectorizer.fit_transform(xtrain)
xval_tfidf = tfidf_vectorizer.transform(xval)

In [48]:
from sklearn.linear_model import LogisticRegression

# Binary Relevance
from sklearn.multiclass import OneVsRestClassifier

# Performance metric
from sklearn.metrics import f1_score

In [49]:
# Binary relevance: the one-vs-rest wrapper trains a separate logistic
# regression for each genre column of ytrain.
lr = LogisticRegression()
clf = OneVsRestClassifier(lr).fit(xtrain_tfidf, ytrain)
# Hard 0/1 genre predictions for the validation split
y_pred = clf.predict(xval_tfidf)

In [50]:
# Show the predicted 0/1 genre indicator vector for the validation row at position 3.
y_pred[3]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0])

In [51]:
# Decode the indicator rows back into genre-name tuples and show the one for
# the validation row at position 3.
multilabel_binarizer.inverse_transform(y_pred)[3]

('war',)

In [52]:
# Evaluate performance: micro-averaged F1 of the clf.predict output (y_pred)
# against the validation labels.
f1_score(yval, y_pred, average="micro")

0.0021008403361344537

In [57]:
# Per-genre probabilities for every validation row
y_pred_prob = clf.predict_proba(xval_tfidf)
# Decision threshold: a genre is assigned when its probability is at least t
t = 0.1
y_pred_new = np.greater_equal(y_pred_prob, t).astype(int)

In [58]:
# Evaluate performance: micro-averaged F1 of the thresholded predictions
# (y_pred_new) against the validation labels.
f1_score(yval, y_pred_new, average="micro")

0.2787550744248985