In [18]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import statsmodels.api as sm

import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.linear_model import LinearRegression
import xml.etree.ElementTree as ET
import time
import json
import tarfile
import gzip
import os

from data_preprocessing import * 
from imdb_ratings_scraper import *
import load_functions as lf

import spacy, nltk, gensim, sklearn
import pyLDAvis.gensim_models


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
# Folder holding the MovieSummaries data files.
data_folder = './data/MovieSummaries/'

# Load every table returned by the project loader in one call.
movie_metadata, character_metadata, name_clusters, plot_summaries, test_data = (
    lf.load_movie_summaries(data_folder)
)

# Attach title and release date to each plot. A left join keeps every plot,
# including those without a matching metadata row (their new columns are NaN).
metadata_columns = ['wiki_movie_id', 'movie_name', 'release_date']
plot_summaries = plot_summaries.merge(
    movie_metadata[metadata_columns], on="wiki_movie_id", how='left'
)


In [7]:
# Number of plot summaries whose movie_name is missing after the merge.
print(f"Number of nan's in movie_name {plot_summaries['movie_name'].isna().sum()}")
# Number of plot summaries whose release_date is missing after the merge.
# The label names the column that is actually being counted.
print(f"Number of nan's in release_date {plot_summaries['release_date'].isna().sum()}")

Number of nan's in movie_name 99
Number of nan's in movie_name 2717


In [8]:
# Total number of rows (plot summaries) in the merged frame.
n_plots = len(plot_summaries)
print(n_plots)

42303


In [9]:
# Keep only the "plot" column of the merged frame; the NLP steps below work on it.
plots = plot_summaries["plot"]

In [28]:
# Materialise the plots as a plain Python list, then keep the first 4000
# entries as a smaller working subset for the steps that follow.
plots_list = [*plots]
plots_list_test = plots_list[:4000]

For now, short plots are not filtered out.

In [29]:
# Name of the spaCy pipeline package to load (must be installed beforehand).
SPACY_MODEL = 'en_core_web_sm'
nlp = spacy.load(SPACY_MODEL)

In [30]:
# Stop-word list shipped with spaCy's English language data.
STOPWORDS = spacy.lang.en.stop_words.STOP_WORDS

processed_plots = []
# Stream the plots through the spaCy pipeline using 5 worker processes,
# handing the texts over in batches of 10.
for parsed in nlp.pipe(plots_list_test, n_process=5, batch_size=10):

    # Lemmas of purely alphabetic tokens (drops numbers and punctuation),
    # skipping every token spaCy flags as a stop word.
    lemmas = [tok.lemma_ for tok in parsed if tok.is_alpha and not tok.is_stop]

    # Second pass on the lemmas themselves: keep those that are at least
    # 3 characters long and are not in the stop-word list.
    kept = [lemma for lemma in lemmas if len(lemma) > 2 and lemma not in STOPWORDS]

    # Named entities spanning more than one token are added as whole strings.
    multiword_entities = [str(ent) for ent in parsed.ents if len(ent) > 1]

    # One token list per plot: filtered lemmas followed by multi-word entities.
    processed_plots.append(kept + multiword_entities)


In [31]:
# Detect frequent two-word phrases and add them to the documents as extra tokens.
from gensim.models.phrases import Phrases

# min_count=25: words and bigrams seen fewer than 25 times are ignored.
bigram = Phrases(processed_plots, min_count=25)

for plot_tokens in processed_plots:
    # Tokens containing '_' are the joined bigrams produced by the phrase model.
    # They are appended to the document in place, alongside the original unigrams.
    detected = [tok for tok in bigram[plot_tokens] if '_' in tok]
    plot_tokens.extend(detected)

In [37]:
# Build the token <-> id mapping over all processed plots, then prune it.
from gensim.corpora import Dictionary

dictionary = Dictionary(processed_plots)

# Pruning thresholds:
#   max_freq      - drop tokens present in more than this fraction of documents
#   min_wordcount - drop tokens present in fewer than this many documents
max_freq = 0.5
min_wordcount = 5
dictionary.filter_extremes(no_above=max_freq, no_below=min_wordcount)

# Convert every document to its bag-of-words form: a list of (token_id, count).
corpus = list(map(dictionary.doc2bow, processed_plots))
# Optional persistence step, currently disabled:
#MmCorpus.serialize("models/corpus.mm", corpus)


In [43]:
# Train a 10-topic LDA model on the bag-of-words corpus.
from gensim.models import LdaMulticore

# Fixed seed so that repeated training runs start from the same random state.
seed = 42
params = {'passes': 30, 'random_state': seed}
base_models = {}

# `params` supplies the `passes` and `random_state` keyword arguments.
model = LdaMulticore(
    corpus=corpus,
    id2word=dictionary,
    num_topics=10,
    workers=6,
    **params,
)

In [45]:
# Display the learned topics, listing the 8 highest-weighted words of each one.
model.show_topics(num_words=8)

[(0,
  '0.007*"film" + 0.007*"school" + 0.006*"play" + 0.005*"friend" + 0.005*"tell" + 0.005*"leave" + 0.005*"year" + 0.004*"New"'),
 (1,
  '0.013*"kill" + 0.010*"man" + 0.009*"find" + 0.007*"tell" + 0.007*"leave" + 0.006*"car" + 0.006*"try" + 0.005*"police"'),
 (2,
  '0.008*"find" + 0.006*"kill" + 0.005*"try" + 0.004*"leave" + 0.004*"attack" + 0.004*"Nick" + 0.004*"escape" + 0.004*"man"'),
 (3,
  '0.011*"love" + 0.010*"father" + 0.009*"life" + 0.008*"family" + 0.008*"find" + 0.007*"man" + 0.007*"son" + 0.006*"marry"'),
 (4,
  '0.010*"kill" + 0.010*"house" + 0.010*"find" + 0.008*"Sam" + 0.007*"police" + 0.006*"tell" + 0.006*"man" + 0.006*"murder"'),
 (5,
  '0.008*"find" + 0.007*"Ben" + 0.006*"Jenny" + 0.006*"car" + 0.005*"time" + 0.005*"Max" + 0.004*"try" + 0.004*"leave"'),
 (6,
  '0.007*"kill" + 0.005*"find" + 0.005*"ship" + 0.005*"leave" + 0.005*"Jack" + 0.005*"attack" + 0.004*"destroy" + 0.004*"Jason"'),
 (7,
  '0.010*"film" + 0.006*"find" + 0.006*"tell" + 0.006*"leave" + 0.006*"gir

In [46]:
# Topic mixture the model infers for the document at index 1 of the corpus,
# shown as (topic_id, probability) pairs.
model[corpus[1]]

[(0, 0.014542841),
 (1, 0.1894034),
 (3, 0.11207339),
 (5, 0.5149776),
 (6, 0.16745244)]

[(7, 2), (12, 1), (13, 1), (14, 1), (15, 3), (16, 7), (17, 5), (18, 2), (19, 8), (20, 2), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 2), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 2), (36, 1), (37, 2), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 2), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 2), (53, 1), (54, 1), (55, 1), (56, 2), (57, 1), (58, 3), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1), (67, 1), (68, 1), (69, 2), (70, 1), (71, 2), (72, 1), (73, 2), (74, 1), (75, 1), (76, 1), (77, 2), (78, 1), (79, 1), (80, 1), (81, 3), (82, 1), (83, 2), (84, 1), (85, 1), (86, 1), (87, 1), (88, 1), (89, 1), (90, 1), (91, 1), (92, 1), (93, 2), (94, 1), (95, 3), (96, 1), (97, 1), (98, 1), (99, 1), (100, 1), (101, 1), (102, 1), (103, 2), (104, 2), (105, 2), (106, 2), (107, 1), (108, 1), (109, 2), (110, 1), (111, 1), (112, 1), (113, 2), (114, 1), (115, 1), (116, 1), (117, 1), (118, 1), (119, 1),