In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OneVsOneClassifier

from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split, KFold, cross_val_score, cross_validate

from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)

In [2]:
df = pd.read_csv("./movies5.csv") # dataset with summary
df = df.drop(['Unnamed: 0'], axis=1)

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
def show_metrics(model, meth, fset, x, y):

    if meth == "ovr":
        clf = OneVsRestClassifier(model)
        print("Logistic Regression on {} feature set with OneVsRest\n".format(fset))
    elif meth == "ovo":
        clf = OneVsOneClassifier(model)
        print("Logistic Regression on {} feature set with OneVsOne\n".format(fset))
    else:
        clf = model
        print("Logistic Regression on {} feature set\n".format(fset))
        
    scoring = { 'accuracy' : make_scorer(accuracy_score), 
                'precision' : make_scorer(precision_score, average='macro', zero_division=0),
                'recall' : make_scorer(recall_score, average='macro', zero_division=0), 
                'f1_score' : make_scorer(f1_score, average='macro', zero_division=0)}    
    kfold = KFold(n_splits=10)

    results = cross_validate(   estimator=clf,
                                X=x,
                                y=y,
                                cv=kfold,
                                scoring=scoring)
    print(f"Accuracy :{np.mean(results['test_accuracy'])}")
    print('-'*70)
    print(f"Precision :{np.mean(results['test_precision'])}")
    print('-'*70)
    print(f"Recall :{np.mean(results['test_recall'])}")
    print('-'*70)
    print(f"F1 score :{np.mean(results['test_f1_score'])}")
    print('-'*70)
    return results

In [5]:
from sklearn.preprocessing import LabelEncoder

# Encode label
LE = LabelEncoder()
df['label'] = LE.fit_transform(df.Genre)
df = df.drop(['Genre'], axis=1)
df.head(1)

Unnamed: 0,Name,Date,Certificate,Duration,Votes,IMDB,Metascore,Director,Star1,Star2,Star3,Star4,Summary,label
0,Avatar: The Way of Water,2022,PG-13,192,181876,7.9,67.0,James Cameron,Sam Worthington,Zoe Saldana,Sigourney Weaver,Stephen Lang,Jake Sully lives with his newfound family form...,0


In [6]:
# split to explanitory and response var

X = df.iloc[:, :-1] # explanitory or features
y = df.iloc[:,-1] # response

df_train, df_val, y_train, y_val = train_test_split(df, y ,test_size=0.33, random_state=42)

In [7]:
#!pip install transformers 

In [8]:
import torch
from transformers import AutoTokenizer, AutoModel

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# tokenized summary sentence

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModel.from_pretrained("distilbert-base-uncased").to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
tokenized_train = tokenizer(df_train["Summary"].values.tolist(),\
                            padding = True,truncation = True, return_tensors="pt")
tokenized_val = tokenizer(df_val["Summary"].values.tolist() , padding = True, truncation = True,  return_tensors="pt")


# move on device (GPU)
tokenized_train = {k:torch.tensor(v).to(device) for k,v in tokenized_train.items()}
tokenized_val = {k:torch.tensor(v).to(device) for k,v in tokenized_val.items()}

In [10]:
with torch.no_grad():
    hidden_train = model(**tokenized_train) #dim : [batch_size(nr_sentences), tokens, emb_dim]
    hidden_val = model(**tokenized_val)

#get only the [CLS] hidden states
cls_train = hidden_train.last_hidden_state[:,0,:]
cls_val = hidden_val.last_hidden_state[:,0,:]

In [11]:
# export features

bert_x_train = cls_train.to("cpu")
bert_y_train = y_train

bert_x_val = cls_val.to("cpu")
bert_y_val = y_val

print(bert_x_train.shape, bert_y_train.shape, bert_x_val.shape, bert_y_val.shape)

torch.Size([665, 768]) (665,) torch.Size([328, 768]) (328,)


In [12]:
# add exported features to old feature space

df_train_new = pd.concat([df_train, pd.DataFrame(bert_x_train, index = df_train.index)],axis=1)
df_val_new = pd.concat([df_val, pd.DataFrame(bert_x_val,index = df_val.index)],axis=1)
df_train_new.head()

Unnamed: 0,Name,Date,Certificate,Duration,Votes,IMDB,Metascore,Director,Star1,Star2,...,758,759,760,761,762,763,764,765,766,767
316,Finding Nemo,2003,G,100,1045550,8.2,90.0,Andrew Stanton,Lee Unkrich,Albert Brooks,...,-0.118598,-0.115944,-0.021202,-0.238341,0.214603,-0.028188,-0.041471,-0.039205,0.303629,0.287858
887,Masaan,2015,R,109,28578,8.1,80.0,Neeraj Ghaywan,Richa Chadha,Sanjay Mishra,...,-0.095614,-0.09965,-0.192176,-0.293057,0.030225,0.198788,-0.096362,-0.227162,0.235923,0.002556
544,The Remains of the Day,1993,PG,134,75711,7.8,86.0,James Ivory,Anthony Hopkins,Emma Thompson,...,0.010668,-0.256897,0.169733,-0.190286,0.117481,0.21304,-0.061148,0.034189,0.43881,0.193805
465,Who's Afraid of Virginia Woolf?,1966,Not Rated,131,76192,8.0,75.0,Mike Nichols,Elizabeth Taylor,Richard Burton,...,0.053901,-0.391932,-0.007374,-0.253204,-0.174501,0.012404,-0.028599,-0.332241,0.244898,0.155917
529,Before Sunset,2004,R,80,266238,8.1,90.0,Richard Linklater,Ethan Hawke,Julie Delpy,...,0.11569,-0.572898,0.255928,-0.370734,0.114187,-0.148663,-0.196128,-0.242217,0.403006,-0.026208


In [13]:
# concat train and validation to use in final model.
df = pd.concat([df_train_new, df_val_new])
df = df.sort_index()
df

Unnamed: 0,Name,Date,Certificate,Duration,Votes,IMDB,Metascore,Director,Star1,Star2,...,758,759,760,761,762,763,764,765,766,767
0,Avatar: The Way of Water,2022,PG-13,192,181876,7.9,67.0,James Cameron,Sam Worthington,Zoe Saldana,...,-0.227578,-0.085557,0.086981,-0.287374,0.144450,-0.204477,-0.124813,-0.016928,0.301979,0.212887
1,Knives Out,2019,PG-13,130,663687,7.9,82.0,Rian Johnson,Daniel Craig,Chris Evans,...,-0.255177,-0.200549,-0.009086,-0.248939,-0.250488,0.184530,-0.200523,-0.201389,0.448424,0.227310
2,Avatar,2009,PG-13,162,1289325,7.9,83.0,James Cameron,Sam Worthington,Zoe Saldana,...,-0.217375,-0.056055,0.148919,-0.410105,0.038562,-0.014082,-0.023151,0.033419,0.424240,0.311013
3,The Banshees of Inisherin,2022,R,114,60799,7.9,87.0,Martin McDonagh,Colin Farrell,Brendan Gleeson,...,0.076645,-0.450983,0.170847,-0.261458,-0.054667,-0.045585,-0.148260,-0.167307,0.325794,0.253769
4,A Christmas Story,1983,PG,93,159045,7.9,77.0,Bob Clark,Peter Billingsley,Melinda Dillon,...,-0.616443,-0.361742,0.003473,-0.238807,0.085032,0.041604,-0.076567,0.226465,0.348203,0.645684
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
988,Baby,2015,Not Rated,159,58251,7.9,80.0,Neeraj Pandey,Akshay Kumar,Danny Denzongpa,...,-0.391109,-0.133587,0.206538,-0.352747,-0.148902,0.475171,0.052318,-0.234301,0.407315,0.228957
989,Udaan,2010,Not Rated,134,45435,8.1,80.0,Vikramaditya Motwane,Rajat Barmecha,Ronit Roy,...,0.154508,-0.228614,0.087903,-0.193755,0.096263,-0.091006,-0.181661,-0.106398,0.194895,0.362645
990,Vizontele,2001,R,110,37114,8.0,80.0,Yilmaz Erdogan,Ömer Faruk Sorak,Yilmaz Erdogan,...,-0.349503,-0.507420,-0.037407,-0.209447,-0.129228,0.147713,-0.041216,-0.591865,0.432748,0.239804
991,Dev.D,2009,Not Rated,144,31023,7.9,80.0,Anurag Kashyap,Abhay Deol,Mahie Gill,...,-0.095965,-0.396708,0.066606,-0.229216,-0.236185,0.131616,-0.135899,-0.141555,0.369930,0.204268


In [14]:
# preprocessing 

directors = pd.get_dummies(df.Director, prefix='Director')
star1 = pd.get_dummies(df.Star1, prefix='star1')
star2 = pd.get_dummies(df.Star2, prefix='star2')
star3 = pd.get_dummies(df.Star3, prefix='star3')
star4 = pd.get_dummies(df.Star4, prefix='star4')
certificates = pd.get_dummies(df.Certificate, prefix='Certificate')

X_train_new = pd.concat([df, directors, star1,star2, star3, star4, certificates], axis=1)#, star1, star2, star3, star4

y = X_train_new['label']
x = X_train_new.drop(columns=['Name', 'Certificate', 'Director', 'Star1', 'Star2', 'Star3', 'Star4','Summary','label'])


In [15]:
#number of features for this dataset :)

X_train_new.shape

(993, 4653)

In [16]:
# normalized

scaler = StandardScaler()
scaler.fit(x)
x = scaler.transform(x)

# Logistic Regression without PCA

In [17]:
lr = LogisticRegression()
lr_results = show_metrics(lr, "ovr", "first", x, y)

Logistic Regression on first feature set with OneVsRest

Accuracy :0.5176767676767676
----------------------------------------------------------------------
Precision :0.46676095441436366
----------------------------------------------------------------------
Recall :0.3948888358646194
----------------------------------------------------------------------
F1 score :0.39629213845948175
----------------------------------------------------------------------


### PCA on these feature space

In [18]:
# Import necessary libraries
from sklearn import datasets  # to retrieve the iris Dataset
import pandas as pd  # to load the dataframe
from sklearn.preprocessing import StandardScaler  # to standardize the features
from sklearn.decomposition import PCA  # to apply PCA
import seaborn as sns  # to plot the heat maps

In [19]:
pca = PCA(n_components = 27)
pca.fit(x)
data_pca = pca.transform(x)
data_pca = pd.DataFrame(data_pca)
data_pca.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17,18,19,20,21,22,23,24,25,26
0,2.634529,20.589591,-8.1715,-3.002239,-1.426935,0.306979,-6.127013,-1.620841,-7.8436,-0.243018,...,-3.140005,-3.048527,-2.581707,0.463745,2.076623,0.63454,5.259766,-1.770236,0.845774,3.851709
1,-4.068545,-5.605605,0.744136,2.278124,-8.920463,-1.643336,4.537599,0.026564,-6.015525,-1.772549,...,3.475207,3.31182,1.700058,-0.608288,-1.263046,-2.836304,1.673864,0.841492,3.747204,-0.99626
2,-5.20556,9.00647,-4.05544,3.278979,-1.482623,3.8453,-5.010238,-1.348046,-6.882019,3.943844,...,-0.446759,-0.856791,-0.089582,-0.483442,1.966354,2.752832,3.031396,0.695761,1.482466,0.455214
3,5.313344,-3.985216,-9.076124,2.151214,-3.938088,-0.27347,-11.164812,-1.699629,-4.155167,-3.417642,...,-3.499996,3.379049,0.184675,-1.911955,-0.204951,-1.335879,-1.817948,2.878983,-3.229726,-4.031365
4,-5.351436,3.741959,0.764805,-0.698165,12.449117,4.021508,9.481889,-9.510388,17.481556,-1.871191,...,0.210752,0.391615,-8.288984,0.240334,-6.616544,-2.476837,-7.096863,3.028086,-0.897073,-0.152173


# Logistic Regression

In [20]:
lr = LogisticRegression()
lr_results = show_metrics(lr, "ovr", "first", data_pca, y)

Logistic Regression on first feature set with OneVsRest

Accuracy :0.4823232323232324
----------------------------------------------------------------------
Precision :0.46249250082884963
----------------------------------------------------------------------
Recall :0.4160643049853241
----------------------------------------------------------------------
F1 score :0.415948563394372
----------------------------------------------------------------------


# SVM

In [21]:
svm = SVC()
svm_results_ovo = show_metrics(svm, "ovo", "first", data_pca, y)

Logistic Regression on first feature set with OneVsOne

Accuracy :0.47630303030303034
----------------------------------------------------------------------
Precision :0.4452782403220419
----------------------------------------------------------------------
Recall :0.3544310793281767
----------------------------------------------------------------------
F1 score :0.35471821572519235
----------------------------------------------------------------------


# MLP

In [22]:
mlp = MLPClassifier(hidden_layer_sizes=(10, 15), activation='logistic', solver='adam',random_state=12, max_iter=100,
                    learning_rate='constant', learning_rate_init=0.001)
mlp_results = show_metrics(mlp, "ovr", "first", data_pca, y)

Logistic Regression on first feature set with OneVsRest

Accuracy :0.4199090909090909
----------------------------------------------------------------------
Precision :0.35082389999447366
----------------------------------------------------------------------
Recall :0.2590310413763488
----------------------------------------------------------------------
F1 score :0.23263242972347564
----------------------------------------------------------------------


# Gaussian Naive Bayes

In [23]:
gnb = GaussianNB()
gnb_results = show_metrics(gnb, "ovr", "first", data_pca, y)

Logistic Regression on first feature set with OneVsRest

Accuracy :0.442959595959596
----------------------------------------------------------------------
Precision :0.4125343371858953
----------------------------------------------------------------------
Recall :0.38704818044691536
----------------------------------------------------------------------
F1 score :0.38065563757671717
----------------------------------------------------------------------


# Decision Tree

In [24]:
ds = DecisionTreeClassifier(random_state=0, max_depth=5)
ds_results = show_metrics(ds, "ovr", "first", data_pca, y)

Logistic Regression on first feature set with OneVsRest

Accuracy :0.3282222222222222
----------------------------------------------------------------------
Precision :0.2692935103597405
----------------------------------------------------------------------
Recall :0.25126044282881044
----------------------------------------------------------------------
F1 score :0.2363881814823178
----------------------------------------------------------------------


### Just use CLF as a feature of summary

In [25]:
bert_train = bert_x_train[:,0].tolist()
bert_val = bert_x_val[:,0].tolist()

df_train["bert"] = bert_train
df_val["bert"] = bert_val

In [26]:
X_train_new = pd.concat([df_train, pd.DataFrame(bert_x_train, index = df_train.index)],axis=1)
X_val_new = pd.concat([df_val, pd.DataFrame(bert_x_val,index = df_val.index)],axis=1)
X_train_new.head(1)

Unnamed: 0,Name,Date,Certificate,Duration,Votes,IMDB,Metascore,Director,Star1,Star2,...,758,759,760,761,762,763,764,765,766,767
316,Finding Nemo,2003,G,100,1045550,8.2,90.0,Andrew Stanton,Lee Unkrich,Albert Brooks,...,-0.118598,-0.115944,-0.021202,-0.238341,0.214603,-0.028188,-0.041471,-0.039205,0.303629,0.287858


In [27]:
df2 = pd.concat([df_train, df_val])
df2 = df2.sort_index()
df2.head(1)

Unnamed: 0,Name,Date,Certificate,Duration,Votes,IMDB,Metascore,Director,Star1,Star2,Star3,Star4,Summary,label,bert
0,Avatar: The Way of Water,2022,PG-13,192,181876,7.9,67.0,James Cameron,Sam Worthington,Zoe Saldana,Sigourney Weaver,Stephen Lang,Jake Sully lives with his newfound family form...,0,-0.243486


In [28]:
y = df2.label
x = df2.drop(columns=['label','Name'])

In [29]:
# preprocessing 

directors = pd.get_dummies(df2.Director, prefix='Director')
star1 = pd.get_dummies(df2.Star1, prefix='star1')
star2 = pd.get_dummies(df2.Star2, prefix='star2')
star3 = pd.get_dummies(df2.Star3, prefix='star3')
star4 = pd.get_dummies(df2.Star4, prefix='star4')
certificates = pd.get_dummies(df2.Certificate, prefix='Certificate')

X_train_new = pd.concat([df2, directors, star1, star2, star3, star4, certificates], axis=1)# star1, star2, star3, star4

y = df2['label']
x = df2.drop(columns=['Name', 'Certificate', 'Director', 'Star1', 'Star2', 'Star3', 'Star4','Summary','label'])


In [30]:
scaler = StandardScaler()
scaler.fit(x)
x = scaler.transform(x)

In [31]:
pca = PCA(n_components = 6)
pca.fit(x)
data_pca = pca.transform(x)
data_pca = pd.DataFrame(data_pca)
data_pca.head()

Unnamed: 0,0,1,2,3,4,5
0,1.053369,1.728011,1.556716,1.48413,0.244727,-0.015578
1,1.172228,0.614195,-1.196662,0.373024,-1.023536,-0.340888
2,2.400077,0.338469,-0.37223,0.073424,-0.597409,-1.857804
3,-0.513291,0.352151,0.82956,-0.225289,-1.437643,0.494016
4,-0.776918,-0.021102,-1.214323,-0.170955,0.108318,0.36675


In [32]:
lr = LogisticRegression()
lr_results = show_metrics(lr, "ovr", "first", x, y)

Logistic Regression on first feature set with OneVsRest

Accuracy :0.3625555555555555
----------------------------------------------------------------------
Precision :0.1900025306860988
----------------------------------------------------------------------
Recall :0.22309595466604418
----------------------------------------------------------------------
F1 score :0.1797924147793663
----------------------------------------------------------------------
