In [None]:
# text 
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *

# features
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer as VS

from collections import Counter
from wordcloud import WordCloud

# Word2Vec
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

# metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from skmultilearn.adapt import MLkNN
from scipy.sparse import csr_matrix, lil_matrix

# other
import numpy as np 
import re
import nltk.data
import warnings
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
from scipy import stats     
import csv

In [None]:
# Load the raw MCU dialogue table and preview its first rows.
MCU_CSV_PATH = '/data/mcu.csv'
data = pd.read_csv(MCU_CSV_PATH)
data.head()

In [None]:
# Number of rows per character, largest first (value_counts sorts descending).
data.character.value_counts()

From above we see that there are 652 different characters. Since not all of them are well known or important, we restrict this analysis to the 15 most important characters (importance being measured by the number of dialogue lines).

In [None]:
# Names of the 15 characters with the most rows in the dataset.
character_counts = data['character'].value_counts()
important_characters = np.array(character_counts.iloc[:15].index)

In [None]:
# Keep only the rows whose character is one of the selected ones.
is_important = data.character.isin(important_characters)
df = data[is_important]

# Trim the columns the analysis does not use, in three steps: everything from
# the seventh column onward, then 'year', then whatever column is first.
df = df.drop(columns=df.columns[6:])
df = df.drop(columns=['year'])
df = df.drop(columns=df.columns[0])
df.head()

In [None]:
# Character importance: number of rows (dialogue lines) per character.
top_by_lines = df.character.value_counts().iloc[:15].index

sns.set_style('whitegrid')
plt.figure(figsize=(10, 10))
sns.countplot(
    y='character',
    data=df,
    order=top_by_lines,
    palette="flare",
)
plt.xlabel('Number of lines of dialogue', fontsize=15)
plt.ylabel('Character', fontsize=15)
plt.title('Character Importance by Number of Lines of Dialogue', fontsize=20)
plt.show()

Of course, this measure of character importance depends heavily on which movies are included in this dataset!

In [None]:
# Total of the 'words' column per character. groupby(..., as_index=False)
# already yields a DataFrame, so no extra pd.DataFrame(...) wrapper is needed.
total_char_words = df.groupby('character', as_index=False).words.sum()

# Bar order: characters sorted by total words, largest first (at most 15).
word_order = total_char_words.sort_values('words', ascending=False).character[:15]

sns.set_style('whitegrid')
plt.figure(figsize=(10,10))
sns.barplot(x='words', y='character', data=total_char_words, palette="flare", order=word_order, orient='h')
plt.xlabel('Number of words of dialogue', fontsize=15)
plt.ylabel('Character', fontsize=15)
plt.title('Character Importance by Number of Words in Dialogues', fontsize=20)
# Render explicitly, like the other plotting cells, so the cell output is the
# figure rather than the repr of the title's Text object.
plt.show()

In [None]:
# Number of rows (dialogue lines) per movie, for up to 11 movies.
movies_by_lines = df.movie.value_counts().iloc[:11].index

sns.set_style('whitegrid')
plt.figure(figsize=(10, 10))
sns.countplot(
    y='movie',
    data=df,
    order=movies_by_lines,
    palette="flare",
)
plt.xlabel('Number of lines of dialogue', fontsize=15)
plt.ylabel('Movie', fontsize=15)
plt.title('Distribution of dialogue lines in movies', fontsize=20)
plt.show()

In [None]:
# Summary statistics of the 'words' column for the selected characters.
df.words.describe()

In [None]:
# Preview the first five rows of the filtered frame.
df.head(n=5)

# Preparing training dataset 

Creating csv with multi-target columns

In [None]:
# Build the multi-label training table: one row per annotated sentence, with a
# 0/1 indicator column for each emotion.
column_names = ['text', 'anger', 'anticipation', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'trust']

records = []
# `with` guarantees the file is closed, even if a row fails to parse.
with open("/data/en-annotated.tsv", newline="") as tsv_file:
    for row in csv.reader(tsv_file, delimiter="\t"):
        if not row:
            # csv.reader yields an empty list for blank lines; nothing to record.
            continue
        record = dict.fromkeys(column_names[1:], 0)
        record['text'] = row[0]
        # The second field is a comma-separated list of integers, each used as
        # a position in column_names (position 0 is 'text').
        for num in row[1].split(","):
            record[column_names[int(num)]] = 1
        records.append(record)

# Built in one go instead of concatenating a one-row frame per sentence
# (which is quadratic). The frame gets its own name so that the Marvel
# dialogue frame `df` built earlier is not overwritten.
sentiment_df = pd.DataFrame(records, columns=column_names)

# save to csv so that you don't have to run it every time
sentiment_df.to_csv('/data/sentiment.csv', index=False)

In [None]:
# Reload the multi-label training table written by the previous cell.
SENTIMENT_CSV_PATH = '/data/sentiment.csv'
mov_data = pd.read_csv(SENTIMENT_CSV_PATH)

# Every column after the first one is treated as an emotion category.
categories = list(mov_data.columns[1:])

In [None]:
# Number of training sentences tagged with each category. Selecting by
# `categories` (rather than "every column after the first") keeps this cell
# correct when it is re-run after extra columns such as 'words' were added.
category_totals = mov_data[categories].sum().values

sns.set(font_scale = 2)
plt.figure(figsize=(15,11))
# x and y must be passed by keyword: seaborn >= 0.12 no longer accepts them
# as positional arguments.
ax = sns.barplot(x=categories, y=category_totals)

plt.title("Lines in each category", fontsize=24)
plt.ylabel('Number of lines', fontsize=18)
plt.xlabel('Line Type ', fontsize=18)

# Write each bar's total just above the bar.
for rect, label in zip(ax.patches, category_totals):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width()/2, height + 5, label, ha='center', va='bottom', fontsize=18)
plt.show()

In [None]:
def num_word(line):
    """Return how many whitespace-separated tokens `line` contains."""
    tokens = line.split()
    return len(tokens)

# Token count of every training sentence, stored in a new 'words' column.
sentence_lengths = mov_data['text'].apply(num_word)
mov_data['words'] = sentence_lengths

In [None]:
# Summary statistics of the sentence lengths in the training data.
mov_data.words.describe()

# Preprocessing

In [None]:
import spacy

# The spaCy pipeline is loaded lazily on the first call and then reused.
# Loading it inside every call (once per text) dominated the runtime.
_nlp_pipeline = None


def lemmatize(text):
    """Return `text` with every token replaced by its spaCy lemma.

    Parameters
    ----------
    text : str
        Raw text to lemmatize.

    Returns
    -------
    str
        The lemmas of all tokens, joined by single spaces.
    """
    global _nlp_pipeline
    if _nlp_pipeline is None:
        # Parser and NER are disabled because only the lemmas are used here.
        _nlp_pipeline = spacy.load('en', disable=['parser', 'ner'])
    doc = _nlp_pipeline(text)
    return " ".join([token.lemma_ for token in doc])

In [None]:
# Stop-word sets keyed by language, filled on first use so that the NLTK
# corpus is not re-read for every single text.
_STOPWORD_CACHE = {}


def preprocess(text):
    """Lemmatize `text`, keep only letters, lowercase it and drop stop words.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        The remaining words joined by single spaces; an empty string when
        nothing is left.
    """
    # lemmatize sentences
    lemmatized = lemmatize(text)

    # Remove the "-PRON-" placeholder wherever it occurs. The previous pattern
    # " -PRON-" required a leading space, so a placeholder at the very start
    # of the text survived and later showed up as the bogus word "pron".
    lemmatized = re.sub("-PRON-", " ", lemmatized)

    # leave only letters
    letters_only = re.sub("[^a-zA-Z]", " ", lemmatized)

    # remove stop words
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words("english"))
    stops = _STOPWORD_CACHE["english"]
    meaningful_words = [w for w in letters_only.lower().split() if w not in stops]

    return " ".join(meaningful_words)

Preprocessing MARVEL dataset

In [None]:
# This cell works on the Marvel dialogue frame; fail early with a clear
# message if `df` currently holds something else.
assert 'line' in df.columns, "df must be the Marvel dialogue frame with a 'line' column"

df['clean_line'] = df['line'].apply(preprocess)
# Drop rows whose cleaned text is empty. (The original referenced an undefined
# name `a` here, which raised NameError.)
df = df.drop(df[df.clean_line.isin([""])].index)
df.to_csv('/data/marvel_dataset.csv', index=False)

Preprocessing training dataset

In [None]:
## The preprocessed data is written to a file so this slow step does not have to be repeated!

mov_data['clean_text'] = mov_data['text'].apply(preprocess)

# Keep only the rows whose cleaned text is not the empty string.
became_empty = mov_data.clean_text.isin([""])
mov_data = mov_data[~became_empty]

mov_data.to_csv('/data/preprocessed.csv', index=False)

# FEATURES EXTRACTION

In [None]:
def word_grams(cleans):
    """Return dense TF-IDF features for word 1- to 3-grams of `cleans`.

    A new vectorizer is fitted on `cleans` in every call, so the meaning of
    the output columns is specific to the texts passed in. At most 300
    features are kept, and n-grams occurring in more than 90% of the
    documents are ignored.
    """
    vectorizer_options = dict(
        analyzer="word",
        ngram_range=(1, 3),
        tokenizer=None,
        preprocessor=None,
        stop_words=None,
        max_features=300,
        max_df=0.90,
    )
    tfidf = TfidfVectorizer(**vectorizer_options)
    return tfidf.fit_transform(cleans).toarray()

In [None]:
def char_grams(cleans):
    """Return dense TF-IDF features for character 1- to 3-grams of `cleans`.

    A new vectorizer is fitted on `cleans` in every call, so the meaning of
    the output columns is specific to the texts passed in. At most 200
    features are kept, and n-grams occurring in more than 85% of the
    documents are ignored.
    """
    vectorizer_options = dict(
        analyzer="char",
        ngram_range=(1, 3),
        tokenizer=None,
        preprocessor=None,
        stop_words=None,
        max_features=200,
        max_df=0.85,
    )
    tfidf = TfidfVectorizer(**vectorizer_options)
    return tfidf.fit_transform(cleans).toarray()

Number of syllables in a word (stackoverflow.com/questions/14541303/count-the-number-of-syllables-in-a-word)

In [None]:
def syllable_count(word):
    """Estimate the number of syllables in `word` with a vowel-group heuristic.

    Every maximal run of vowels (a, e, i, o, u, y) counts as one syllable, a
    trailing "e" is treated as silent, and any non-empty input counts as at
    least one syllable.

    Parameters
    ----------
    word : str
        Text to analyse; case is ignored.

    Returns
    -------
    int
        Estimated syllable count; 0 for an empty string (which previously
        raised IndexError).
    """
    word = word.lower()
    if not word:
        return 0

    vowels = "aeiouy"
    count = 0
    previous_is_vowel = False
    for char in word:
        is_vowel = char in vowels
        # A vowel that does not follow another vowel starts a new group.
        if is_vowel and not previous_is_vowel:
            count += 1
        previous_is_vowel = is_vowel

    if word.endswith("e"):
        count -= 1
    # Non-empty text always counts as at least one syllable (e.g. "the", "hmm").
    return max(count, 1)

Function for extraction of linguistic features:

In [None]:
def ling_features(line, clean):
    """Compute length and readability features for one dialogue line.

    Parameters
    ----------
    line : str
        The raw text.
    clean : str
        The preprocessed version of `line` (space-separated words).

    Returns
    -------
    list
        [FKRA, FRE, syllables_all, avg_syl, num_chars, num_chars_total,
        num_terms, num_words, num_unique_terms]
    """
    # Words of the cleaned text.
    words = clean.split()

    # syllable_count works on a single word, so sum it over the words rather
    # than passing the whole sentence at once (which applied the silent-"e"
    # rule to the last word only and failed on an empty string).
    syllables_all = sum(syllable_count(w) for w in words)
    num_chars = sum(len(w) for w in words)
    num_chars_total = len(line)
    num_terms = len(line.split())
    num_words = len(words)
    num_unique_terms = len(set(words))

    # Average syllables per word; the 0.001 terms avoid division by zero.
    avg_syl = round((syllables_all + 0.001) / (num_words + 0.001), 4)

    # Modified FK grade, where avg words per sentence is just num words/1
    FKRA = round(0.39 * num_words / 1.0 + 11.8 * avg_syl - 15.59, 1)

    # Modified FRE score, where sentence fixed to 1
    FRE = round(206.835 - 1.015 * (num_words / 1.0) - (84.6 * avg_syl), 2)

    return [FKRA, FRE, syllables_all, avg_syl, num_chars, num_chars_total, num_terms, num_words, num_unique_terms]


def get_ling_feature_array(lines, cleans):
    """Return an array with one row of `ling_features` per (line, clean) pair."""
    return np.array([ling_features(line, clean) for line, clean in zip(lines, cleans)])

In [None]:
import string

from nltk.corpus import opinion_lexicon

# Position reported when a text contains no word of the given polarity.
_NO_MATCH_POSITION = 1000

# Positive / negative word sets, filled on first use so the lexicon is not
# re-read for every single line.
_OPINION_SETS = {}


def _opinion_sets():
    """Return the cached (positive, negative) word sets of the opinion lexicon."""
    if not _OPINION_SETS:
        _OPINION_SETS['pos'] = frozenset(opinion_lexicon.positive())
        _OPINION_SETS['neg'] = frozenset(opinion_lexicon.negative())
    return _OPINION_SETS['pos'], _OPINION_SETS['neg']


def sentiment_features(text):
    """Count opinion-lexicon hits in `text`.

    Parameters
    ----------
    text : str
        Text whose whitespace-separated tokens are looked up in the lexicon.

    Returns
    -------
    list
        [pos_words, neg_words, ratio, pos_start, neg_start] where `ratio` is
        pos_words / neg_words (or pos_words when there are no negative words)
        and the *_start values are the token position of the first hit, or
        1000 when there is none.
    """
    pos_set, neg_set = _opinion_sets()
    pos_words = 0
    neg_words = 0
    pos_start = None
    neg_start = None

    for i, token in enumerate(text.split()):
        # Callers pass raw text, so "Good!" would never match as-is. Try the
        # lowercased token first, then the same with surrounding punctuation
        # removed. NOTE(review): assumes lexicon entries are lowercase - confirm.
        lowered = token.lower()
        forms = (lowered, lowered.strip(string.punctuation))
        if any(form in pos_set for form in forms):
            pos_words += 1
            if pos_start is None:
                pos_start = i
        elif any(form in neg_set for form in forms):
            neg_words += 1
            if neg_start is None:
                neg_start = i

    if neg_words != 0:
        ratio = pos_words/neg_words
    else:
        ratio = pos_words

    if pos_start is None:
        pos_start = _NO_MATCH_POSITION
    if neg_start is None:
        neg_start = _NO_MATCH_POSITION

    return [pos_words, neg_words, ratio, pos_start, neg_start]


def get_sentiment_feature_array(lines):
    """Return an array with one row of `sentiment_features` per line."""
    return np.array([sentiment_features(line) for line in lines])

In [None]:
def pos_features(cleans):
    """Return counts of part-of-speech tag n-grams (1 to 3 tags) per text.

    Each text is tagged with `nltk.pos_tag`, its tags are joined into a
    string, and a vectorizer fitted on these tag strings produces the dense
    output. With use_idf=False and norm=None the values are plain counts.
    The vectorizer is fitted anew on every call.

    Parameters
    ----------
    cleans : iterable of str
        Preprocessed texts (space-separated words).

    Returns
    -------
    numpy.ndarray
        One row per text, at most 50 columns.
    """
    pos_vectorizer = TfidfVectorizer(tokenizer=None, lowercase=False,preprocessor=None,ngram_range=(1, 3), stop_words=None,
                                     use_idf=False, smooth_idf=False, norm=None, decode_error='replace', max_features=50,
                                     min_df=0.1,max_df=0.80)
    text_tags = []
    for c in cleans:
        # pos_tag expects a list of tokens; passing the string itself made it
        # tag every single character instead of every word.
        tokens = c.split()
        tags = nltk.pos_tag(tokens)
        text_tags.append(" ".join(tag for _, tag in tags))

    return pos_vectorizer.fit_transform(pd.Series(text_tags)).toarray()

In [None]:
def read_data(file_name):
    """Load whitespace-separated word vectors from a text file.

    Each non-blank line is expected to hold a word followed by its vector
    components.

    Parameters
    ----------
    file_name : str
        Path of the vector file.

    Returns
    -------
    (set, dict)
        The set of words and a mapping from each word to its vector as a
        float numpy array.
    """
    word2vector = {}
    # Explicit UTF-8 so that loading does not depend on the platform's
    # default encoding. NOTE(review): assumes the vector file is UTF-8.
    with open(file_name, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank line (e.g. at the end of the file): nothing to read.
                continue
            word2vector[fields[0]] = np.array(fields[1:], dtype=float)
    # The vocabulary is exactly the set of keys of the mapping.
    word_vocab = set(word2vector)
    print("Total Words in DataSet:",len(word_vocab))
    return word_vocab,word2vector

def average_word_vectors(words, model, vocabulary, num_features):
    """Return the mean vector of the words of one sentence.

    Parameters
    ----------
    words : iterable of str
        The tokens of the sentence.
    model : mapping
        Maps a word to its vector.
    vocabulary : container
        Words that have a vector; other words are skipped.
    num_features : int
        Length of the vectors.

    Returns
    -------
    numpy.ndarray
        The mean of the vectors of the known words, or a zero vector when no
        word is known.
    """
    feature_vector = np.zeros((num_features,),dtype="float64")
    nwords = 0.
    for word in words:
        if word in vocabulary:
            nwords = nwords + 1.
            feature_vector = np.add(feature_vector, model[word])
    if nwords:
        feature_vector = np.divide(feature_vector, nwords)
    return feature_vector

def averaged_word_vectorizer(corpus, model, vocab, num_features):
    """Return one averaged word vector per sentence of `corpus`.

    Parameters
    ----------
    corpus : iterable
        Sentences, each either a string (split on whitespace here) or an
        already tokenized sequence of words.
    model, vocab, num_features
        Passed on to `average_word_vectors`.

    Returns
    -------
    numpy.ndarray
        Array with one row per sentence.
    """
    vocabulary = set(vocab)
    features = []
    for sentence in corpus:
        # Iterating over a plain string yields characters, so an untokenized
        # sentence used to be averaged over its letters instead of its words.
        tokens = sentence.split() if isinstance(sentence, str) else sentence
        features.append(average_word_vectors(tokens, model, vocabulary, num_features))
    return np.array(features)

Features extraction of training dataset

In [None]:
# Reload the preprocessed training data and keep only the rows that still
# have a cleaned text after the CSV round trip.

PREPROCESSED_CSV_PATH = '/data/preprocessed.csv'
prep_data = pd.read_csv(PREPROCESSED_CSV_PATH)
has_clean_text = ~prep_data.clean_text.isna()
prep_data = prep_data[has_clean_text]

In [None]:
# Feature groups of the training set. The lexicon-based sentiment features and
# the raw-length parts of the linguistic features read the original text; the
# other groups use the cleaned text.
raw_text = prep_data.text
clean_text = prep_data.clean_text

wordgram_features = word_grams(clean_text)
char_gram_features = char_grams(clean_text)
ling_feats = get_ling_feature_array(raw_text, clean_text)
sentiment_feats = get_sentiment_feature_array(raw_text)
pos_feats = pos_features(clean_text)

In [None]:
vocab, w2v = read_data("/data/glove.6B.100d.txt")

# Pass token lists, not raw strings: iterating over a string yields single
# characters, which would average the vectors of letters instead of words.
tokenized_clean_text = prep_data.clean_text.str.split()
embeddings = averaged_word_vectorizer(corpus=tokenized_clean_text, model=w2v, vocab=vocab, num_features=100)
emb_df = pd.DataFrame(embeddings)

In [None]:
# Stack the hand-crafted feature groups side by side (one row per sentence)
# and store them; the embeddings are not part of this file.
feature_blocks = [
    wordgram_features,
    char_gram_features,
    ling_feats,
    sentiment_feats,
    pos_feats,
]
all_features = np.concatenate(feature_blocks, axis=1)
X = pd.DataFrame(all_features)
X.to_csv("/data/features.csv")

Features extraction from MARVEL dataset

In [None]:
marvel_prep = pd.read_csv('/data/marvel_dataset.csv')
# read_csv turns empty or NA-like cleaned text into NaN, which the feature
# functions cannot handle. Drop such rows (as the training cell does) and
# renumber the index so predictions can later be joined back row by row.
marvel_prep = marvel_prep.drop(marvel_prep[marvel_prep.clean_line.isna()].index).reset_index(drop=True)

# `vocab` and `w2v` are the word vectors loaded in the training-features cell.
# Pass token lists, not raw strings: iterating over a string yields single
# characters, which would average the vectors of letters instead of words.
mar_embeddings = averaged_word_vectorizer(corpus=marvel_prep.clean_line.str.split(), model=w2v, vocab=vocab, num_features=100)
mar_emb_df = pd.DataFrame(mar_embeddings)

# NOTE(review): the n-gram and POS vectorizers are fitted again on the Marvel
# text inside these helpers, so their columns need not correspond to the
# columns produced for the training set - confirm before comparing features.
mar_wordgram_features = word_grams(marvel_prep.clean_line)
mar_char_gram_features = char_grams(marvel_prep.clean_line)
mar_ling_feats = get_ling_feature_array(marvel_prep.line, marvel_prep.clean_line)
mar_sentiment_feats = get_sentiment_feature_array(marvel_prep.line)
mar_pos_feats = pos_features(marvel_prep.clean_line)

mar_all_features = np.concatenate([mar_wordgram_features, mar_char_gram_features, mar_ling_feats, mar_sentiment_feats, mar_pos_feats],axis=1)
mar_X = pd.DataFrame(mar_all_features)
mar_X = pd.concat([mar_X, mar_emb_df], axis=1)
mar_X.to_csv("/data/marvel_features.csv")

# MODEL TRAINING

Loading features from file (training dataset)

In [None]:
X = pd.read_csv('/data/features.csv')
X = X.drop(labels = ['Unnamed: 0'], axis = 1)
X = pd.concat([X, emb_df], axis=1)
# After the concat the column labels are a mix of strings (from the CSV) and
# integers (from emb_df), with repeated values. Recent scikit-learn versions
# reject non-string or mixed feature names, so name the columns by position.
X.columns = ['f' + str(position) for position in range(X.shape[1])]
# Targets: everything except the raw and the cleaned text.
y = prep_data.drop(labels = ['text','clean_text'], axis=1)

Loading features from file (MARVEL dataset)

In [None]:
mar_X = pd.read_csv('/data/marvel_features.csv')
mar_X = mar_X.drop(labels = ['Unnamed: 0'], axis = 1)
# The file was written with repeated column labels, which read_csv renames
# (e.g. "0.1"). Name the columns purely by position so the feature names do
# not depend on that renaming.
mar_X.columns = ['f' + str(position) for position in range(mar_X.shape[1])]

In [None]:
# Column names of the per-category result tables built by the model cells.
columns = ['category', 'test_accuracy', 'train_accuracy']

# Hold out 20% of the training sentences for testing (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=3,
)

OneVsRestClassifier with Logistic Regression

In [None]:
LogReg_pipeline = Pipeline([
                ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=-1)),
            ])

# Counts summed over all categories (micro averages) and sums of the
# per-category precision / recall (macro averages).
tps = 0
fps = 0
fns = 0
prc = 0
rec = 0
result_rows = []
marvel_predictions = {}

# One binary classifier is trained and evaluated per category.
for category in categories:
    print('**Processing ' + category + '**')

    LogReg_pipeline.fit(X_train, y_train[category])

    test_prediction = LogReg_pipeline.predict(X_test)
    train_prediction = LogReg_pipeline.predict(X_train)
    marvel_predictions[category] = LogReg_pipeline.predict(mar_X)

    test_accuracy = accuracy_score(y_test[category], test_prediction)
    train_accuracy = accuracy_score(y_train[category], train_prediction)

    # For labels [0, 1] the matrix is [[tn, fp], [fn, tp]] (rows: true class,
    # columns: predicted class). The old code read cell [0][0] as "tp", which
    # is the true-negative count, so precision and recall were inflated.
    tn, fp, fn, tp = confusion_matrix(y_test[category], test_prediction, labels=[0, 1]).ravel()
    tps += tp
    fps += fp
    fns += fn
    # A category without positive predictions (or positive examples) scores 0
    # instead of dividing by zero.
    prc += tp/(tp+fp) if (tp+fp) else 0.0
    rec += tp/(tp+fn) if (tp+fn) else 0.0

    #save results
    result_rows.append([category, test_accuracy, train_accuracy])

# Built once after the loop instead of concatenating inside it.
results_logreg = pd.DataFrame(result_rows, columns = columns)
character_sentiment1 = pd.concat(
    [marvel_prep.drop(labels = ['clean_line', 'words'], axis=1), pd.DataFrame(marvel_predictions)],
    axis=1)

print("Micro averaging precision: ", tps/(tps+fps) if (tps+fps) else 0.0)
print("Micro averaging recall: ", tps/(tps+fns) if (tps+fns) else 0.0)
print("Macro averaging precision: ", prc/len(categories))
print("Macro averaging recall: ", rec/len(categories))

results_logreg

OneVsRestClassifier with SVM

In [None]:
SVC_pipeline = Pipeline([
                ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=-1)),
            ])

# Counts summed over all categories (micro averages) and sums of the
# per-category precision / recall (macro averages).
tps = 0
fps = 0
fns = 0
prc = 0
rec = 0
result_rows = []
marvel_predictions = {}

# One binary classifier is trained and evaluated per category.
for category in categories:
    print('**Processing ' + category + '**')

    SVC_pipeline.fit(X_train, y_train[category])

    test_prediction = SVC_pipeline.predict(X_test)
    train_prediction = SVC_pipeline.predict(X_train)
    marvel_predictions[category] = SVC_pipeline.predict(mar_X)

    test_accuracy = accuracy_score(y_test[category], test_prediction)
    train_accuracy = accuracy_score(y_train[category], train_prediction)

    # For labels [0, 1] the matrix is [[tn, fp], [fn, tp]] (rows: true class,
    # columns: predicted class). The old code read cell [0][0] as "tp", which
    # is the true-negative count, so precision and recall were inflated.
    tn, fp, fn, tp = confusion_matrix(y_test[category], test_prediction, labels=[0, 1]).ravel()
    tps += tp
    fps += fp
    fns += fn
    # A category without positive predictions (or positive examples) scores 0
    # instead of dividing by zero.
    prc += tp/(tp+fp) if (tp+fp) else 0.0
    rec += tp/(tp+fn) if (tp+fn) else 0.0

    #save results
    result_rows.append([category, test_accuracy, train_accuracy])

# Built once after the loop instead of concatenating inside it.
results_svc = pd.DataFrame(result_rows, columns = columns)
character_sentiment2 = pd.concat(
    [marvel_prep.drop(labels = ['clean_line','words'], axis=1), pd.DataFrame(marvel_predictions)],
    axis=1)

# results for SVM
print("Micro averaging precision: ", tps/(tps+fps) if (tps+fps) else 0.0)
print("Micro averaging recall: ", tps/(tps+fns) if (tps+fns) else 0.0)
print("Macro averaging precision: ", prc/len(categories))
print("Macro averaging recall: ", rec/len(categories))
results_svc

In [None]:
# Tony Stark's rows from the logistic-regression results, without the
# identifying columns; the totals of the remaining columns are displayed.
is_tony = character_sentiment1.character.isin(['TONY STARK'])
df_tony1 = character_sentiment1[is_tony].drop(labels=['character', 'line', 'movie'], axis=1)
df_tony1.sum(axis=0)

In [None]:
# Bar chart of the column totals for Tony Stark (logistic regression).
tony1_totals = df_tony1.sum(axis=0)
tony1_totals.plot.bar()

In [None]:
# Steve Rogers' rows from the logistic-regression results, without the
# identifying columns; the totals of the remaining columns are displayed.
is_steve = character_sentiment1.character.isin(['STEVE ROGERS'])
df_steve1 = character_sentiment1[is_steve].drop(labels=['character', 'line', 'movie'], axis=1)
df_steve1.sum(axis=0)

In [None]:
# Bar chart of the column totals for Steve Rogers (logistic regression).
steve1_totals = df_steve1.sum(axis=0)
steve1_totals.plot.bar()

In [None]:
# Tony Stark's rows from the SVM results, without the identifying columns;
# the totals of the remaining columns are displayed.
is_tony = character_sentiment2.character.isin(['TONY STARK'])
df_tony2 = character_sentiment2[is_tony].drop(labels=['character', 'line', 'movie'], axis=1)
df_tony2.sum(axis=0)

In [None]:
# Bar chart of the column totals for Tony Stark (SVM).
tony2_totals = df_tony2.sum(axis=0)
tony2_totals.plot.bar()

In [None]:
# Steve Rogers' rows from the SVM results, without the identifying columns;
# the totals of the remaining columns are displayed.
is_steve = character_sentiment2.character.isin(['STEVE ROGERS'])
df_steve2 = character_sentiment2[is_steve].drop(labels=['character', 'line', 'movie'], axis=1)
df_steve2.sum(axis=0)

In [None]:
# Bar chart of the column totals for Steve Rogers (SVM).
steve2_totals = df_steve2.sum(axis=0)
steve2_totals.plot.bar()