In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import string
import os
import requests

from requests import get
from bs4 import BeautifulSoup
from warnings import warn
from time import sleep
from random import randint
from xgboost import XGBClassifier
from tqdm import tqdm
from time import sleep

from pathlib import Path
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split, GridSearchCV
from nltk.stem.porter import *
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from tensorflow.keras import layers, Sequential, callbacks, optimizers, utils, models, applications
from tensorflow.keras.applications.inception_v3 import InceptionV3
from tensorflow.keras.regularizers import l2, l1
from tensorflow.keras.preprocessing import image
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
GENRE_NAMES = ["action", "adventure", "animation", "biography", "comedy", "crime", "drama", "family", "fantasy", "film-noir", "history", "horror", "music", "musical", "mystery", "romance", "scifi", "sport", "thriller", "war", "western"]

In [3]:
path_ls = [f'../raw_data/500_points/{genre_name}.csv' for genre_name in GENRE_NAMES]
df = pd.concat(
    map(pd.read_csv, path_ls), ignore_index=True)
df = df.drop_duplicates().reset_index(drop=True)

In [7]:
df['RN'] = df.sort_values(['imdb_id']).groupby(['imdb_id']).cumcount() + 1
merged_df = df[df['RN'] == 1].drop(columns=['RN', 'Unnamed: 0'])

In [8]:
merged_df.shape

(4237, 5)

In [9]:
merged_df.head()

Unnamed: 0,movie,imdb_id,genre,plot,image_url
0,Ramayana: The Legend of Prince Rama,tt0259534,"['Animation', ' Action', ' Adventure']","The fantastic story of Rama, a young prince wh...",http://img.omdbapi.com/?i=tt0259534&h=600&apik...
11,The Legend of Maula Jatt,tt4139928,"['Action', ' Drama', ' Fantasy']",From times untold where legends are written in...,http://img.omdbapi.com/?i=tt4139928&h=600&apik...
12,Terminator 2: Judgment Day,tt0103064,"['Action', ' Sci-Fi']",Over 10 years have passed since the first mach...,http://img.omdbapi.com/?i=tt0103064&h=600&apik...
14,Gladiator,tt0172495,"['Action', ' Adventure', ' Drama']","Maximus is a powerful Roman general, loved by ...",http://img.omdbapi.com/?i=tt0172495&h=600&apik...
16,The Dark Knight Rises,tt1345836,"['Action', ' Drama', ' Thriller']",Despite his tarnished reputation after the eve...,http://img.omdbapi.com/?i=tt1345836&h=600&apik...


In [47]:
merged_df = pd.read_csv('../raw_data/long_sypnosis/long_sypnosis.csv')
merged_df["plot"] = merged_df["sypnosis"]

In [48]:
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer

In [49]:
def cleaning(sentence):
    
    # Basic cleaning
    sentence = sentence.strip() ## remove whitespaces
    sentence = sentence.lower() ## lowercase 
    sentence = ''.join(char for char in sentence if not char.isdigit()) ## remove numbers
    
    # Advanced cleaning
    for punctuation in string.punctuation:
        sentence = sentence.replace(punctuation, '') ## remove punctuation
    
    tokenized_sentence = word_tokenize(sentence) ## tokenize 
    stop_words = set(stopwords.words('english')) ## define stopwords
    
    tokenized_sentence_cleaned = [ ## remove stopwords
        w for w in tokenized_sentence if not w in stop_words
    ]

    lemmatized = [
        WordNetLemmatizer().lemmatize(word, pos = "v") 
        for word in tokenized_sentence_cleaned
    ]
    
    cleaned_sentence = ' '.join(word for word in lemmatized)
    
    return cleaned_sentence

In [50]:
merged_df["clean_plot"] = merged_df["plot"].apply(cleaning)

In [79]:
merged_df["genre"] = (
    merged_df["genre"]
    .apply(eval)
    .apply(lambda x: [genre.strip() for genre in x])
    )

TypeError: eval() arg 1 must be a string, bytes or code object

In [None]:
#check the genre we have
merged_df["genre"].explode().value_counts()

genre
Drama        2824
Comedy       1495
Adventure    1045
Action        994
Crime         806
Romance       647
Biography     591
Thriller      544
Horror        511
Mystery       504
Animation     405
Fantasy       387
Sci-Fi        360
Family        312
History       298
Music         261
War           201
Sport         187
Musical       106
Western       104
Film-Noir      59
Name: count, dtype: int64

In [76]:
from sklearn.preprocessing import MultiLabelBinarizer

multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(merged_df["genre"])

# transform target variable
y = multilabel_binarizer.transform(merged_df['genre'])

In [77]:
y.shape

(4624, 21)

In [78]:
genre_names = multilabel_binarizer.classes_

# Adding 
for i in range(len(genre_names)):
    merged_df[f"{genre_names[i]}"] = y[:,i]

merged_df.shape

(4624, 29)

In [56]:
X_train, X_test, y_train, y_test = train_test_split(merged_df['clean_plot'], y, test_size=0.3)

In [57]:
# Instantiating the TfidfVectorizer
tf_idf_vectorizer = TfidfVectorizer(max_df=0.8, min_df=0.05, max_features=10000)

# Training it on the texts
X_train_vec = pd.DataFrame(tf_idf_vectorizer.fit_transform(X_train).toarray(),
                 columns = tf_idf_vectorizer.get_feature_names_out())
X_test_vec =  pd.DataFrame(tf_idf_vectorizer.transform(X_test).toarray(),
                 columns = tf_idf_vectorizer.get_feature_names_out())

In [58]:
X_train_vec.head()

Unnamed: 0,abandon,ability,able,aboard,abuse,accept,accident,accidentally,accompany,account,...,writer,wrong,year,yearold,years,yell,yet,york,young,younger
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.038444,0.0,0.0,0.0,0.0,0.266858
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010517,0.0,0.013621,...,0.0,0.0,0.0,0.0,0.038424,0.0,0.009824,0.0,0.017009,0.0
3,0.012096,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03448,0.0
4,0.009029,0.0,0.042081,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.017441,0.0,0.0,0.0,0.020589,0.0


In [59]:
X_test_vec.head()

Unnamed: 0,abandon,ability,able,aboard,abuse,accept,accident,accidentally,accompany,account,...,writer,wrong,year,yearold,years,yell,yet,york,young,younger
0,0.060737,0.0,0.0,0.0,0.082595,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.034626,0.0
1,0.0,0.0,0.0,0.0,0.0,0.061568,0.0,0.14979,0.0,0.0,...,0.0,0.0,0.0,0.0,0.045606,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.039505,0.0,0.0,0.0,0.182483,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.032793,0.0
3,0.0,0.0,0.0,0.0,0.0,0.070866,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.045905,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.109184,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02671,0.0


In [60]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score, accuracy_score

## OneVsRest logistic regression

### Simple OneVsRest logistic regression

In [61]:
log = LogisticRegression()
model_test_1 = OneVsRestClassifier(log)

model_test_1.fit(X_train_vec, y_train)

In [62]:
y_pred = model_test_1.predict(X_test_vec)

In [63]:
accuracy_score(y_test, y_pred)

0.06628242074927954

In [66]:
multilabel_binarizer.inverse_transform(y_pred)[1]

('Drama',)

In [67]:
merged_df[merged_df.clean_plot == X_test.iat[1]]

Unnamed: 0.1,Unnamed: 0,movie,imdb_id,genre,image_url,sypnosis,plot,clean_plot,Action,Adventure,...,Horror,Music,Musical,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western
1539,1539,Puncture,tt1582248,"[Biography, Drama]",http://img.omdbapi.com/?i=tt1582248&h=600&apik...,A David and Goliath law drama about a drug-add...,A David and Goliath law drama about a drug-add...,david goliath law drama drugaddicted lawyer ta...,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Grouping the topic

In [68]:
from sklearn.decomposition import LatentDirichletAllocation

# Instantiate the LDA 
n_components = 15
lda_model = LatentDirichletAllocation(n_components=n_components, max_iter = 1000)

# Fit the LDA on the vectorized documents
lda_model.fit(X_train_vec)

In [69]:
X_train_topic_mixture = lda_model.transform(X_train_vec)
X_train_topic_df = pd.DataFrame(X_train_topic_mixture)
X_train_topic_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0.880753,0.008518,0.008518,0.008518,0.008518,0.008518,0.008518,0.008518,0.008518,0.008518,0.008518,0.008518,0.008518,0.008518,0.008518
1,0.877313,0.008763,0.008763,0.008763,0.008763,0.008763,0.008763,0.008763,0.008763,0.008763,0.008763,0.008763,0.008763,0.008763,0.008763
2,0.922893,0.005508,0.005508,0.005508,0.005508,0.005508,0.005508,0.005508,0.005508,0.005508,0.005508,0.005508,0.005508,0.005508,0.005508
3,0.871125,0.009205,0.009205,0.009205,0.009205,0.009205,0.009205,0.009205,0.009205,0.009205,0.009205,0.009205,0.009205,0.009205,0.009205
4,0.902430,0.006969,0.006969,0.006969,0.006969,0.006969,0.006969,0.006969,0.006969,0.006969,0.006969,0.006969,0.006969,0.006969,0.006969
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3231,0.908023,0.006570,0.006570,0.006570,0.006570,0.006570,0.006570,0.006570,0.006570,0.006570,0.006570,0.006570,0.006570,0.006570,0.006570
3232,0.908247,0.006554,0.006554,0.006554,0.006554,0.006554,0.006554,0.006554,0.006554,0.006554,0.006554,0.006554,0.006554,0.006554,0.006554
3233,0.924147,0.005418,0.005418,0.005418,0.005418,0.005418,0.005418,0.005418,0.005418,0.005418,0.005418,0.005418,0.005418,0.005418,0.005418
3234,0.761994,0.017000,0.017000,0.017000,0.017000,0.017000,0.017000,0.017000,0.017000,0.017000,0.017000,0.017000,0.017000,0.017000,0.017000


In [70]:
X_test_topic_mixture = lda_model.transform(X_test_vec)
X_test_topic_df = pd.DataFrame(X_test_topic_mixture)
X_test_topic_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0.886877,0.008080,0.008080,0.008080,0.008080,0.008080,0.008080,0.008080,0.008080,0.008080,0.008080,0.008080,0.008080,0.008080,0.008080
1,0.899865,0.007152,0.007152,0.007152,0.007152,0.007152,0.007152,0.007152,0.007152,0.007152,0.007152,0.007152,0.007152,0.007152,0.007152
2,0.921653,0.005596,0.005596,0.005596,0.005596,0.005596,0.005596,0.005596,0.005596,0.005596,0.005596,0.005596,0.005596,0.005596,0.005596
3,0.919883,0.005723,0.005723,0.005723,0.005723,0.005723,0.005723,0.005723,0.005723,0.005723,0.005723,0.005723,0.005723,0.005723,0.005723
4,0.864347,0.009689,0.009689,0.009689,0.009689,0.009689,0.009689,0.009689,0.009689,0.009689,0.009689,0.009689,0.009689,0.009689,0.009689
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1383,0.920492,0.005679,0.005679,0.005679,0.005679,0.005679,0.005679,0.005679,0.005679,0.005679,0.005679,0.005679,0.005679,0.005679,0.005679
1384,0.881270,0.008481,0.008481,0.008481,0.008481,0.008481,0.008481,0.008481,0.008481,0.008481,0.008481,0.008481,0.008481,0.008481,0.008481
1385,0.928230,0.005126,0.005126,0.005126,0.005126,0.005126,0.005126,0.005126,0.005126,0.005126,0.005126,0.005126,0.005126,0.005126,0.005126
1386,0.819829,0.012869,0.012869,0.012869,0.012869,0.012869,0.012869,0.012869,0.012869,0.012869,0.012869,0.012869,0.012869,0.012869,0.012869


### Training OneVsRest using topic grouping as input

In [71]:
model_test_2 = OneVsRestClassifier(log)
model_test_2.fit(X_train_topic_df, y_train)

In [72]:
y_pred_2 = model_test_2.predict(X_test_topic_df)
accuracy_score(y_test, y_pred)

0.06628242074927954

In [73]:
multilabel_binarizer.inverse_transform(y_pred_2)[0:10]

[('Drama',),
 ('Drama',),
 ('Drama',),
 ('Drama',),
 ('Drama',),
 ('Drama',),
 ('Drama',),
 ('Drama',),
 ('Drama',),
 ('Drama',)]

In [74]:
multilabel_binarizer.inverse_transform(y_test)[0:10]

[('Drama', 'Fantasy', 'Horror'),
 ('Biography', 'Drama'),
 ('Comedy', 'Family'),
 ('Comedy', 'Romance', 'Sci-Fi'),
 ('Comedy', 'Drama', 'Music'),
 ('Action', 'Drama', 'War'),
 ('Drama', 'Thriller'),
 ('Animation', 'Comedy', 'Drama'),
 ('Action', 'Adventure', 'Comedy'),
 ('Crime', 'Drama', 'Thriller')]

### Using both topic grouping as input + OneVsRest 

In [35]:
model_test_3 = OneVsRestClassifier(log)
model_test_3.fit(X_train_topic_df, y_train)
model_test_3.fit(X_train_vec, y_train)

In [36]:
y_pred_3 = model_test_2.predict(X_test_topic_df)
accuracy_score(y_test, y_pred)

0.025157232704402517

## SGD

In [37]:
sgd = SGDClassifier(random_state = 42)
sgd_model = OneVsRestClassifier(sgd)
sgd_model.fit(X_train_vec,y_train)

In [38]:
y_pred_sgd1 = sgd_model.predict(X_test_vec)

In [39]:
multilabel_binarizer.inverse_transform(y_pred_sgd1)[0:10]

[('Drama',),
 ('Drama',),
 ('Drama',),
 ('Drama',),
 (),
 ('Drama',),
 ('Drama',),
 ('Drama',),
 ('Drama',),
 ('Drama',)]

In [40]:
accuracy_score(y_test, y_pred_sgd1)

0.019654088050314465