In [112]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import statsmodels.api as sm

import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.linear_model import LinearRegression
import xml.etree.ElementTree as ET
import time
import json
import tarfile
import gzip
import os

from data_preprocessing import * 
from imdb_ratings_scraper import *
import load_functions as lf

import spacy, nltk, gensim, sklearn
import pyLDAvis.gensim_models
import pickle


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
data_folder = './data/MovieSummaries/'
(movie_metadata, character_metadata, name_clusters, plot_summaries, test_data) = \
    lf.load_movie_summaries(data_folder)
plot_summaries = pd.merge(plot_summaries, movie_metadata[['wiki_movie_id','movie_name','release_date']], on="wiki_movie_id", how='left')


In [4]:
#number of nan in movie_name
print("Number of nan's in movie_name {}".format(plot_summaries.movie_name.isna().sum()))
#number of nan in release_date
print("Number of nan's in movie_name {}".format(plot_summaries.release_date.isna().sum()))

Number of nan's in movie_name 99
Number of nan's in movie_name 2717


In [5]:
#amount of plots
print(len(plot_summaries))

42303


In [6]:
plots = plot_summaries["plot"]

In [138]:
plots_list = list(plots)
plots_list_test = plots_list[:500]

At the moment, don't filter out short plots yet

In [8]:
nlp = spacy.load('en_core_web_sm')

In [147]:
STOPWORDS = spacy.lang.en.stop_words.STOP_WORDS
STOPWORDS.add("film")
STOPWORDS.add("tell")

processed_plots_1 = list()
for doc in nlp.pipe(plots_list, n_process=5, batch_size=10):

    # Process document using Spacy NLP pipeline.
    ents = doc.ents  # Named entities
    
    # All people's names
    people = [ent.text for ent in ents if ent.label_=="PERSON"]
    
    # Keep only words (no numbers, no punctuation).
    # Lemmatize tokens, remove punctuation, remove stopwords and remove one-word people's names
    doc = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop and token.text not in people]
    
    # Remove common words from a stopword list, keep only words of length 3 or more and casefolding
    doc = [token.casefold() for token in doc if token not in STOPWORDS and len(token) > 2]

    # Add named entities, but only if they are a compound of more than word.
    doc.extend([str(entity).casefold() for entity in ents if len(entity) > 1])
    
    processed_plots_1.append(doc)


In [150]:
with open('processed_plots_1_full.pickle', 'wb') as f:
    pickle.dump(processed_plots_1, f)

In [140]:
#print(plots_list[3])
test_plot = nlp(plots_list[3])
people = [ent.text for ent in test_plot.ents if ent.label_=="PERSON"]
test_proc = [token.lemma_ for token in test_plot if token.is_alpha and not token.is_stop and token.text not in people]
#print(test_proc)

In [146]:
#print(processed_plots_1[1])

In [151]:
# Add bigrams too
from gensim.models.phrases import Phrases
import copy

#with open('processed_plots_1_full.pickle', 'rb' as f:
#         processed_plots_1_full = pickle.load(f)

processed_plots_2 = copy.deepcopy(processed_plots_1)

# Add bigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(processed_plots_2, min_count=20)

for idx in range(len(processed_plots_2)):
    for token in bigram[processed_plots_2[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            processed_plots_2[idx].append(token)

# also add trigrams
#trigram = Phrases(bigram[processed_plots], min_count=15, threshold=100) 
#for idx in range(len(processed_plots)):
#    for token in trigram[bigram[processed_plots[idx]]]:
#        if token.count('_') == 2:
#            # Token is a trigram, add to document.
#            processed_plots[idx].append(token)
#            #print("trigram {} added".format(token))

with open('processed_plots_2_full.pickle', 'wb') as f:
    pickle.dump(processed_plots_2, f)


In [152]:
# Create a dictionary representation of the documents, and filter out frequent and rare words.
from gensim.corpora import Dictionary
dictionary = Dictionary(processed_plots_2)

# Remove rare and common tokens.
# Filter out words that occur too frequently or too rarely.
max_freq = 0.33
min_wordcount = 5
dictionary.filter_extremes(no_below=min_wordcount, no_above=max_freq)

# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in processed_plots_2]
#MmCorpus.serialize("models/corpus.mm", corpus)


The words "tell", "film" and "kill" occur in about 30 % of the plots. However, I would think tell and film are not relevant to charactarize the plot, whereas kill is relevent. So should maybe not try to put max_freq parameter in filter_extremes too low to filter out "film" and "tell", but filter those out manually.

In [153]:
# models
seed = 42
from gensim.models import LdaMulticore

params = {'passes': 20, 'random_state': seed}
base_models = dict()
model1 = LdaMulticore(corpus=corpus, num_topics=6, id2word=dictionary, workers=6, eta='auto',
                passes=params['passes'], random_state=params['random_state'])

In [154]:
model1.show_topics(num_words=8)

[(0,
  '0.015*"kill" + 0.013*"police" + 0.008*"murder" + 0.006*"money" + 0.006*"shoot" + 0.005*"gang" + 0.004*"car" + 0.004*"escape"'),
 (1,
  '0.010*"war" + 0.006*"kill" + 0.006*"soldier" + 0.005*"army" + 0.004*"force" + 0.004*"order" + 0.004*"return" + 0.004*"american"'),
 (2,
  '0.007*"house" + 0.007*"try" + 0.006*"run" + 0.006*"kill" + 0.005*"come" + 0.005*"room" + 0.005*"night" + 0.005*"away"'),
 (3,
  '0.010*"love" + 0.008*"father" + 0.008*"life" + 0.007*"family" + 0.007*"friend" + 0.006*"mother" + 0.005*"day" + 0.005*"meet"'),
 (4,
  '0.007*"team" + 0.006*"ship" + 0.004*"world" + 0.004*"new" + 0.004*"destroy" + 0.004*"crew" + 0.004*"earth" + 0.004*"time"'),
 (5,
  '0.007*"king" + 0.005*"fight" + 0.004*"kill" + 0.004*"return" + 0.004*"story" + 0.004*"father" + 0.004*"help" + 0.003*"village"')]

Using six topics seems quite interpretable!

In [155]:
base_models = dict()
model2 = LdaMulticore(corpus=corpus, num_topics=8, id2word=dictionary, workers=6, eta='auto',
                passes=params['passes'], random_state=params['random_state'])

In [156]:
model2.show_topics(num_words=10)

[(0,
  '0.012*"kill" + 0.010*"police" + 0.008*"money" + 0.007*"shoot" + 0.006*"gang" + 0.005*"car" + 0.005*"steal" + 0.005*"gun" + 0.005*"escape" + 0.004*"town"'),
 (1,
  '0.010*"kill" + 0.009*"police" + 0.007*"war" + 0.006*"prison" + 0.005*"officer" + 0.005*"murder" + 0.004*"arrest" + 0.004*"german" + 0.004*"escape" + 0.004*"brother"'),
 (2,
  '0.008*"try" + 0.007*"run" + 0.006*"house" + 0.006*"come" + 0.005*"car" + 0.005*"away" + 0.005*"head" + 0.005*"dog" + 0.004*"room" + 0.004*"look"'),
 (3,
  '0.012*"love" + 0.009*"father" + 0.008*"friend" + 0.008*"family" + 0.007*"life" + 0.007*"mother" + 0.006*"marry" + 0.006*"meet" + 0.006*"day" + 0.006*"home"'),
 (4,
  '0.007*"team" + 0.005*"ship" + 0.005*"attack" + 0.005*"destroy" + 0.004*"kill" + 0.004*"earth" + 0.004*"world" + 0.004*"force" + 0.004*"crew" + 0.004*"time"'),
 (5,
  '0.008*"new" + 0.008*"play" + 0.006*"story" + 0.005*"movie" + 0.005*"life" + 0.005*"character" + 0.005*"star" + 0.004*"band" + 0.004*"world" + 0.004*"york"'),
 (6,

also 8 topics..

In [157]:
base_models = dict()
model3 = LdaMulticore(corpus=corpus, num_topics=10, id2word=dictionary, workers=6, eta='auto',
                passes=params['passes'], random_state=params['random_state'])

In [158]:
model3.show_topics(num_words=8)

[(0,
  '0.013*"kill" + 0.010*"money" + 0.008*"police" + 0.008*"shoot" + 0.008*"gang" + 0.006*"town" + 0.006*"steal" + 0.005*"gun"'),
 (1,
  '0.011*"police" + 0.011*"kill" + 0.007*"murder" + 0.007*"officer" + 0.006*"agent" + 0.006*"prison" + 0.005*"escape" + 0.005*"drug"'),
 (2,
  '0.008*"run" + 0.008*"try" + 0.007*"car" + 0.007*"come" + 0.006*"house" + 0.006*"dog" + 0.006*"away" + 0.006*"head"'),
 (3,
  '0.008*"friend" + 0.007*"love" + 0.007*"school" + 0.006*"life" + 0.006*"new" + 0.006*"day" + 0.006*"father" + 0.005*"meet"'),
 (4,
  '0.012*"team" + 0.008*"game" + 0.005*"world" + 0.005*"win" + 0.005*"time" + 0.004*"plane" + 0.004*"united" + 0.004*"play"'),
 (5,
  '0.010*"new" + 0.008*"story" + 0.006*"york" + 0.006*"new_york" + 0.006*"play" + 0.006*"band" + 0.005*"life" + 0.005*"movie"'),
 (6,
  '0.009*"king" + 0.005*"return" + 0.004*"father" + 0.004*"prince" + 0.004*"young" + 0.004*"fall" + 0.004*"help" + 0.004*"princess"'),
 (7,
  '0.012*"kill" + 0.008*"house" + 0.007*"body" + 0.007*"

Maybe 10 topics starts to become a bit less interpretable?

# Some evaluation

# The avengers

In [186]:
plot_summaries[plot_summaries['movie_name'].str.contains('avengers', case=False, na=False)]

Unnamed: 0,wiki_movie_id,plot,movie_name,release_date
10574,8619888,A battle with Ultron leaves The Avengers defea...,Next Avengers: Heroes of Tomorrow,2008-09-02
10611,10603679,The insidious Masked Gang of hired killers hav...,Masked Avengers,1981-05-15
13596,12752698,"The Elusive Avengers, a posse of young Red Par...",The Crown of the Russian Empire/Once again the...,1971
18135,633411,"The film opens with John Steed , agent of The ...",The Avengers,1998-08-14
18937,12752223,The movie continues the story of the Elusive A...,The New Adventures of the Elusive Avengers,1968
21233,1624537,The film' is a comedy version of a story about...,The Elusive Avengers,1966
27482,2952825,"In the last days of World War II in Europe, Ca...",Ultimate Avengers,2006-02-21
34940,4644216,"T'Challa, the Prince of Wakanda, returns home ...",Ultimate Avengers 2,2006-08-08
37608,22114132,"The Asgardian Loki encounters the Other, the ...",The Avengers,2012-04-11


Model with 6 topics

In [176]:
print(sorted(model1[corpus[37608]],key=lambda x:x[1],reverse=True))

[(4, 0.9973279)]


Topic with "team", "ship", "attack" is most important, very interpretable

Model with 8 topics

In [187]:
print(sorted(model2[corpus[37608]],key=lambda x:x[1],reverse=True))

[(4, 0.99720234)]


Still same topic,...

Model with 10 topics

In [188]:
print(sorted(model3[corpus[37608]],key=lambda x:x[1],reverse=True))

[(9, 0.87842613), (5, 0.119019724)]


Becomes more dispersed,...

# Lord of the rings

In [177]:
plot_summaries[plot_summaries['movie_name'].str.contains('Lord of the rings', case=False, na=False)]

Unnamed: 0,wiki_movie_id,plot,movie_name,release_date
14529,173941,"In the Second Age, the Dark Lord Sauron attem...",The Lord of the Rings: The Fellowship of the Ring,2001-12-10
16411,396607,"Early in the Second Age of Middle-earth, elven...",The Lord of the Rings,1978-11-15
40429,173944,Gandalf the Grey gives his life in battle agai...,The Lord of the Rings: The Two Towers,2002-12-05
41926,174251,"Gandalf, Aragorn, Legolas, Gimli, Théoden, Ga...",The Lord of the Rings: The Return of the King,2003-12-17


6 topics

In [181]:
print(sorted(model1[corpus[41926]],key=lambda x:x[1],reverse=True))

[(1, 0.56592417), (5, 0.32989925), (2, 0.081065625), (4, 0.02189293)]


8 topics

In [185]:
print(sorted(model2[corpus[41926]],key=lambda x:x[1],reverse=True))

[(6, 0.9476347), (7, 0.049650464)]


10 topics

In [189]:
print(sorted(model3[corpus[41926]],key=lambda x:x[1],reverse=True))

[(9, 0.6285251), (6, 0.22024019), (2, 0.06361215), (8, 0.056242917), (7, 0.029562589)]


Saving private ryan

In [182]:
plot_summaries[plot_summaries['movie_name'].str.contains('Private ryan', case=False, na=False)]

Unnamed: 0,wiki_movie_id,plot,movie_name,release_date
5103,28269,"On the morning of June 6, 1944, the beginning ...",Saving Private Ryan,1998-07-24


In [183]:
print(sorted(model1[corpus[5103]],key=lambda x:x[1],reverse=True))

[(1, 0.8320996), (2, 0.090072624), (4, 0.06963664)]


In [184]:
print(sorted(model2[corpus[5103]],key=lambda x:x[1],reverse=True))

[(4, 0.42438954), (1, 0.3370785), (0, 0.17673382), (3, 0.053320218)]


## Adding all BoW and distribution over topics to each movie entry in df

In [191]:
plot_summaries["BoW"] = corpus
plot_summaries["6-topic model distibution"] = model1[corpus]
plot_summaries["8-topic model distibution"] = model2[corpus]
plot_summaries["10-topic model distibution"] = model3[corpus]

In [194]:
plot_summaries.head()

Unnamed: 0,wiki_movie_id,plot,movie_name,release_date,BoW,6-topic model distibution,8-topic model distibution,10-topic model distibution
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha...",Taxi Blues,1990-09-07,"[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1...","[(3, 0.95065206)]","[(3, 0.91675663), (5, 0.038946092)]","[(3, 0.5361889), (8, 0.41666898)]"
1,31186339,The nation of Panem consists of a wealthy Capi...,The Hunger Games,2012-03-12,"[(8, 2), (15, 1), (16, 1), (17, 1), (18, 1), (...","[(0, 0.0617054), (1, 0.193731), (2, 0.26329622...","[(0, 0.042706113), (1, 0.10394004), (2, 0.1520...","[(0, 0.036004033), (1, 0.13272709), (2, 0.0634..."
2,20663735,Poovalli Induchoodan is sentenced for six yea...,Narasimham,2000,"[(8, 1), (40, 1), (45, 1), (69, 1), (70, 2), (...","[(0, 0.3633028), (1, 0.16953622), (3, 0.4649112)]","[(1, 0.5624627), (3, 0.24363874), (7, 0.186327...","[(1, 0.3228383), (8, 0.67359704)]"
3,2231378,"The Lemon Drop Kid , a New York City swindler,...",The Lemon Drop Kid,1951-03-08,"[(19, 1), (26, 1), (37, 1), (39, 1), (40, 1), ...","[(0, 0.50796485), (2, 0.18307324), (3, 0.27190...","[(0, 0.4947431), (2, 0.21770631), (3, 0.244807...","[(0, 0.4541384), (2, 0.10857002), (3, 0.330554..."
4,595909,Seventh-day Adventist Church pastor Michael Ch...,A Cry in the Dark,1988-11-03,"[(0, 1), (5, 1), (22, 1), (44, 1), (45, 1), (4...","[(0, 0.349879), (1, 0.22577511), (2, 0.2690980...","[(1, 0.42650908), (2, 0.1583739), (3, 0.026417...","[(1, 0.40470022), (2, 0.04439504), (3, 0.09506..."


Find most representative movies for each topic (in 6-topic model)

In [201]:
def get_topic_value(distr, query_topic):
    
    for (topic, prob) in distr:
        if query_topic == topic:
            return prob
    return 0

most_repr_movies_per_topic = dict()
for topic in range(0,6):
    top_10_movies_indices = \
        plot_summaries["6-topic model distibution"].apply(lambda x: get_topic_value(x, topic)).sort_values(ascending=False)[:10].index
    most_repr_movies_per_topic[topic] = \
        plot_summaries.iloc[top_10_movies_indices]["movie_name"]
for topic in range(0,6):
    print("Topic with distribution of words: \n")
    print(model1.show_topics(num_words=8)[topic][1])
    print("\n")
    print("Has most representative movies:")
    print(most_repr_movies_per_topic[topic])
    

Topic with distribution of words: 

0.015*"kill" + 0.013*"police" + 0.008*"murder" + 0.006*"money" + 0.006*"shoot" + 0.005*"gang" + 0.004*"car" + 0.004*"escape"


Has most representative movies:
4927             The Lincoln Lawyer
11080         Chura Liyaa Hai Tumne
38522                Public Enemies
36597                           McQ
38066                  Gang Related
7576                    The Killers
38024                     Appaloosa
31242                   Blue Streak
37376    Slaughter in San Francisco
4258                    Bulletproof
Name: movie_name, dtype: object
Topic with distribution of words: 

0.010*"war" + 0.006*"kill" + 0.006*"soldier" + 0.005*"army" + 0.004*"force" + 0.004*"order" + 0.004*"return" + 0.004*"american"


Has most representative movies:
19918                             Taegukgi
39838      The Wind That Shakes the Barley
9643                        Ernst Thälmann
38807                    The Four Feathers
2186     Chetniks! The Fighting Guerrillas
