In [1]:
import os
import random

import pandas as pd
import numpy as np
import text_normalizer as tn
import nltk
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import tree
from sklearn.svm import LinearSVC

import lightgbm as lgbm

import shap

from sklearn.model_selection import KFold 

pd.set_option('display.max_rows',1000)
pd.set_option('display.max_columns',1000)
pd.set_option('display.max_colwidth', None)

%matplotlib inline

os.environ['PYTHONHASHSEED'] = '0'
np.random.seed(37)
random.seed(17)

stopwords = nltk.corpus.stopwords.words('english')

### Preprocessing

In [None]:
# MovieSummaries dataset. Source: http://www.cs.cmu.edu/~ark/personas/
df_meta = pd.read_csv('movie_genre_prediction/movie.metadata.tsv', sep='\t')
df_meta.head()

In [None]:
df_plot = pd.read_csv('movie_genre_prediction/plot_summaries.txt', sep='\t')
#df_plot.head()

In [None]:
df = pd.merge(df_plot, df_meta,on='movie_id',how='left')
#df.head()

In [None]:
df.drop(['x1','title','date','x2','x3','lang','country'],axis=1,inplace=True)
df.shape

In [None]:
df['plot'] = df['plot'].astype(str)
df['tags'] = df['tags'].astype(str)
df['tags'] = df['tags'].str.lower()

In [None]:
df['sci'] = ''
df['sci'] = df['tags'].apply(lambda x : 1 if 'science fiction' in x else 0)

In [None]:
df_sci = df[df['sci'] == 1]
df_sci.shape

In [None]:
df_non_sci = df[df['sci'] == 0][:2500]
df_non_sci.shape

In [None]:
df_train = pd.concat([df_sci,df_non_sci])

In [None]:
df_train['sci'].value_counts()

In [None]:
df_train['cleaned_plot'] = tn.normalize_corpus(corpus=df_train['plot'],stopwords=stopwords)
df_train.head()

In [None]:
df_train.drop(['plot','tags'],axis=1,inplace=True)

In [None]:
df_train.to_csv('cleaned_plots.csv',index=False)

### TF-IDF features

In [None]:
# Reload the cleaned corpus from the path the preprocessing section writes it to
# ('cleaned_plots.csv'); the previous literal had a doubled opening quote (syntax error)
# and pointed at a different directory than the one the file is saved in.
df = pd.read_csv('cleaned_plots.csv')

In [None]:
stopwords =  stopwords + ['los', 'must', 'may', 'could','jim','would','without','also','thus','however','ben']
max_features = 700
min_df = 10
max_df = 0.5

In [None]:
df_train, df_test = train_test_split(df, test_size=0.2, random_state=31, shuffle=True, stratify=df['sci'])

In [None]:
# Word-level TF-IDF over uni-, bi- and tri-grams.
# token_pattern is a raw string: '\W' and '\d' are regex classes, not Python escapes,
# and in a normal string literal they are invalid escape sequences.
# The pattern keeps tokens of at least two characters that are neither non-word
# characters, digits nor underscores.
tfidf = TfidfVectorizer(
    max_features=max_features,
    min_df=min_df,
    max_df=max_df,
    stop_words=stopwords,
    analyzer='word',
    token_pattern=r'[^\W\d_]{2,}',
    ngram_range=(1, 3),
    strip_accents='unicode',
)

In [None]:
train_texts = df_train['cleaned_plot'].tolist()

# Learn the vocabulary and IDF weights from the training split only.
tfidf.fit(train_texts)

# Dense float16 matrix with one row per training document and one column per term.
tfidf_train = np.array(tfidf.transform(df_train['cleaned_plot']).todense(), dtype=np.float16)

# Column position -> term, inverted from the fitted vocabulary.
tfidf_feature_names = { v:k for k,v in tfidf.vocabulary_.items() }

# Size the column list by the matrix, not by `max_features`: after min_df/max_df pruning
# the vocabulary can hold fewer terms, and indexing up to max_features would raise KeyError.
tfidf_train_columns = ['tfidf_' + tfidf_feature_names[i] for i in range(tfidf_train.shape[1])]
tfidf_train_df = pd.DataFrame(tfidf_train, index=df_train.index, columns=tfidf_train_columns)

# Attach all feature columns in one concat instead of inserting them one by one
# (avoids SettingWithCopy / fragmentation warnings); dropping any previous copies
# keeps the cell safe to re-run.
df_train = pd.concat(
    [df_train.drop(columns=tfidf_train_columns, errors='ignore'), tfidf_train_df],
    axis=1,
)

In [None]:
# Transform the test split with the vectoriser fitted on the training split.
tfidf_test = np.array(tfidf.transform(df_test['cleaned_plot']).todense(), dtype=np.float16)

# Name the columns from the fitted vocabulary itself rather than range(max_features):
# the vocabulary can be smaller than max_features, which previously raised KeyError.
tfidf_test_columns = ['tfidf_' + term for term in tfidf.get_feature_names_out()]
tfidf_test_df = pd.DataFrame(tfidf_test, index=df_test.index, columns=tfidf_test_columns)

# Single concat instead of per-column insertion; previous copies are dropped so the
# cell can be re-run.
df_test = pd.concat(
    [df_test.drop(columns=tfidf_test_columns, errors='ignore'), tfidf_test_df],
    axis=1,
)

In [None]:
X_train = df_train.drop(['movie_id','sci','cleaned_plot'], axis=1,errors='ignore')
y_train = df_train['sci']
X_test = df_test.drop(['movie_id','sci','cleaned_plot'], axis=1,errors='ignore')
y_test = df_test['sci']

In [None]:
lr = LogisticRegression()
knn = KNeighborsClassifier()
d_tree = DecisionTreeClassifier()
forest = RandomForestClassifier()
svm = LinearSVC()
lgm = lgbm.LGBMClassifier()

for model in [svm, knn,d_tree,lr,lgm,forest]:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print ('%s accuracy score: %f' % (model.__class__.__name__, model.score(X_test, y_test)))
    #print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

In [None]:
df_out = X_test.copy()
df_out['truth'] = y_test
df_out.reset_index(inplace=True)
df_out['predicted'] = y_pred
df_misclassified = df_out[df_out['truth'] != df_out['predicted']][['index','truth','predicted']]

In [None]:
df_misclassified.head(10)

In [None]:
# Row label (in `df` / X_test.index) of the test instance to inspect.
index = 1166

# Position of that row inside X_test, used to index the SHAP result arrays.
# Looked up on X_test directly so the cell also works when the row is not in
# df_misclassified (the old lookup failed with an opaque IndexError in that case);
# a label missing from the test split raises KeyError.
orig_index = X_test.index.get_loc(index)
if index not in df_misclassified['index'].values:
    print('Note: row %d is not in df_misclassified' % index)

movie_id = df.loc[index, 'movie_id']
print(df.loc[index])
print(X_test.loc[index])
print(orig_index)
print(movie_id)

In [None]:
# Show the metadata row(s) of the selected movie.
df_meta[df_meta['movie_id'] == movie_id]

In [None]:
# Model explained by the SHAP cells that follow.
# NOTE(review): `y_pred` / `df_misclassified` were produced by whichever model the
# training loop fitted last, which need not be this one - confirm they match.
model = lr

In [None]:
# Terms of the fitted TF-IDF vocabulary (without the 'tfidf_' column prefix),
# passed to SHAP as display names; X_train is given to the explainer as its data.
feature_names = tfidf.get_feature_names_out()
explainer = shap.Explainer(model, X_train, feature_names=feature_names)

In [None]:
# presumably loads the JavaScript needed by SHAP's interactive plots - confirm
shap.initjs()

In [None]:
#shapexplainer = shap.Explainer(forest, X_train, feature_names=feature_names)
# SHAP values for the test set via the `shap_values` method of the explainer.
shap_values = explainer.shap_values(X_test)

In [None]:
# Same test set, obtained by calling the explainer; this result is what the
# shap.plots.* calls below consume.
shap_values_exp = explainer(X_test)

#### Global Explanations

In [None]:
# Global bar plot limited to 20 features (max_display=20).
try:
    # explainer for tree-based models
    # presumably those return one set of values per class (3-D result);
    # index 1 on the last axis would then pick the second output - confirm
    shap.plots.bar(shap_values_exp[:,:,1], max_display=20)
except IndexError:
    # falling back to standard explainer
    # (the 3-axis indexing above raised IndexError, so the result is plotted as is)
    print('Falling back to standard explainer')
    shap.plots.bar(shap_values_exp, max_display=20)

In [None]:
# Bar-type summary plot built from the `shap_values` array instead of the Explanation object.
shap.summary_plot(shap_values, X_test, plot_type='bar',feature_names=feature_names, max_display=20)

In [None]:
# change the index into shap_values from 0 to 1 to see the chart from other angle
# NOTE(review): this assumes shap_values holds one entry per class - confirm for the chosen model

try:
    shap.summary_plot(shap_values[0], X_test, plot_type='violin',feature_names=feature_names, max_display=20)
except AssertionError:
    # The per-class slice was rejected with an AssertionError; plot the full array instead.
    print('Falling back to standard explainer')
    shap.summary_plot(shap_values, X_test, plot_type='violin',feature_names=feature_names, max_display=20)

In [None]:
# change the index into shap_values from 1 to 0 to see the chart from other angle

try:
    shap.summary_plot(shap_values[1], X_test, plot_type='dot',feature_names=feature_names, max_display=20)
except AssertionError:
    # The per-class slice was rejected; draw the same 'dot' plot from the full array
    # (the fallback previously drew a 'violin' plot, duplicating the cell above).
    print('Falling back to standard explainer')
    shap.summary_plot(shap_values, X_test, plot_type='dot',feature_names=feature_names, max_display=20)

#### Local Explanations

In [None]:
# Waterfall plot for the single test row at position `orig_index`.
try:
    # 3-axis indexing: row, all features, output 1
    # presumably valid only when the explainer returns one output per class - confirm
    shap.plots.waterfall(shap_values_exp[orig_index,:,1], max_display=20)
except IndexError:
    # The result has no third axis; use the row as is.
    print('Falling back')
    shap.plots.waterfall(shap_values_exp[orig_index], max_display=20)

In [None]:
# Bar plot for the same row, using the same try/fallback scheme as the waterfall cell.
try:
    shap.plots.bar(shap_values_exp[orig_index,:,1], max_display=20)
except IndexError:
    print('Falling back')
    shap.plots.bar(shap_values_exp[orig_index], max_display=20)

In [None]:
# TODO: to understand this plot
# Dependence plot for the term 'like'; `.index('like')` raises ValueError when that
# term is not in the fitted vocabulary.
try:
    shap.dependence_plot(feature_names.tolist().index('like'), shap_values[1], X_test)
except TypeError:
    # Retry with the full shap_values array when the sliced call raised TypeError.
    shap.dependence_plot(feature_names.tolist().index('like'), shap_values, X_test)

In [None]:
def predict(x):
    """Return class probabilities of the global `model` for raw text input.

    `x` is wrapped in a pandas Series, vectorised with the fitted global `tfidf`
    and passed to `model.predict_proba`; the result of that call is returned.
    """
    tfidf_matrix = tfidf.transform(pd.Series(x))
    return model.predict_proba(tfidf_matrix)

# Text masker built from the regex r"\W" (non-word characters).
masker = shap.maskers.Text(r"\W")
# One-document corpus: the cleaned plot of the selected row.
corpus = [df.loc[index]['cleaned_plot']]
# Explain `predict` on raw text; the two outputs are labelled in the order given here.
single_explainer = shap.Explainer(predict, masker, output_names=['Non Sci-Fi','Sci-Fi'])
single_shap_values = single_explainer(corpus)
shap.plots.text(single_shap_values)

In [None]:
# Feature row of the selected instance, addressed by its row label.
choosen_instance = X_test.loc[index]
shap_values2 = explainer.shap_values(choosen_instance)
try:
    # presumably the per-class form (expected_value and SHAP values indexed by class) - confirm
    plot = shap.force_plot(explainer.expected_value[1], shap_values2[1], choosen_instance)
except IndexError:
    # Indexing with [1] failed; use the expected value and SHAP values as returned.
    plot = shap.force_plot(explainer.expected_value, shap_values2, choosen_instance)
# The plot object is created inside try/except, so it is evaluated here as the cell's
# last expression to make the notebook render it.
plot

In [None]:
from sklearn.metrics import classification_report

# Confusion matrix followed by per-class precision/recall/F1 for the current y_pred.
for report in (confusion_matrix(y_test, y_pred), classification_report(y_test, y_pred)):
    print(report)

### Word2vec

In [None]:
df = pd.read_csv('cleaned_plots.csv')

In [None]:
import gensim
from gensim import utils as gensim_utils

sentences=[gensim_utils.simple_preprocess(x) for x in df['cleaned_plot'].tolist()]

vector_size = 300
window_size = 10
min_count = 10

w2v_model = gensim.models.Word2Vec(sentences,
                                   vector_size=vector_size,
                                   window=window_size,
                                   min_count=min_count)

In [None]:
# Sanity check: nearest neighbours of 'deep' in the embedding space.
w2v_model.wv.most_similar('deep')

In [None]:
# NOTE(review): the model was already constructed from `sentences`; building the
# vocabulary again here presumably re-initialises the weights - confirm against gensim docs.
w2v_model.build_vocab(sentences)

In [None]:
# Train for 30 epochs over the same sentences.
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

In [None]:
# tokenize the documents into words

from nltk.tokenize import word_tokenize
df['tok_plot'] = df['cleaned_plot'].str.lower().apply(word_tokenize)
#df['tok_plot_bi'] = df['tok_plot'].apply(lambda x: [x[0] + ' ' + x[1] for x in list(nltk.bigrams(x))])
#df['tok_plot_sum'] = df['tok_plot'] + df['tok_plot_bi']

In [None]:
df.head(5)

In [None]:
# convert the tokenized words into list of word vectors

# Tokens the Word2vec model has a vector for.
words = set(w2v_model.wv.index_to_key)

# One array per document, stacking the vectors of its known tokens (empty when none
# are known). The column is assigned as a plain list: the arrays have different
# lengths, and wrapping such a ragged collection in np.array raises ValueError on
# recent NumPy versions.
df['vect_plot'] = [np.array([w2v_model.wv[token] for token in tokens if token in words])
                   for tokens in df['tok_plot']]

In [None]:
df.head()

In [None]:
# as the length of the document vary so does the length of word vector list
# for machine learning we need same size word vector list

for i, v in enumerate(df['vect_plot']):
    print(len(df['tok_plot'].iloc[i]), len(v))

In [None]:
# to have the same size vectors for all document we are generating the averaged document vectors
# the result is a constant size word vector for all documents

text_vect_avg = []
for v in df['vect_plot']:
    if v.size:
        text_vect_avg.append(v.mean(axis=0))
    else:
        text_vect_avg.append(np.zeros(vector_size, dtype=float)) # the same vector size must be used here as for model training
        
        
df['vect_plot_avg'] = text_vect_avg
df.head()

In [None]:
# now we can see that the vector lengths are constant

for i, v in enumerate(df['vect_plot_avg']):
    print(len(df['tok_plot'].iloc[i]), len(v))

In [None]:
df_train = pd.DataFrame(text_vect_avg)
df_train.columns = ['vec_avg_' + str(i+1) for i in range(0, df_train.shape[1])]
df_train

In [None]:
final_df = pd.concat([df[['sci']], df_train], axis=1, sort=False)
final_df

In [None]:
X = final_df.drop(['sci'], axis=1,errors='ignore')
y = final_df['sci']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, shuffle=True)

In [None]:
from sklearn import metrics

lr = LogisticRegression(max_iter=1000)
knn = KNeighborsClassifier()
d_tree = DecisionTreeClassifier()
forest = RandomForestClassifier()
svm = LinearSVC()

for model in [lr,knn,d_tree,forest,svm]:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print ('%s accuracy score: %f' % (model.__class__.__name__, model.score(X_test, y_test)))
    #print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

In [None]:
print(confusion_matrix(y_test, y_pred))