## NLP exploration

In [1]:
import string 
import nltk
import numpy as np
from nlp_helper import *
from nltk import pos_tag
from gensim import models
from nltk.corpus import stopwords
from gensim.models import Phrases
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
# Fetch the Open Multilingual WordNet data (omw-1.4).
# Presumably required by the WordNetLemmatizer used further down -- TODO confirm
# for the installed NLTK version.
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/douglasbouchet/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

### Data Loading and cleaning

In [2]:
df = read_moviedb_data()
# Keep only the overview and providers columns, as the other columns are not used for NLP.
# Selecting first and copying afterwards avoids duplicating the unused columns and
# guarantees that df_plots is an independent frame: the assignments below never touch df.
df_plots = df[["overview", "providers"]].copy()
# Replace missing providers by the string "{}" and missing overviews by ""
df_plots["providers"] = df_plots["providers"].fillna("{}")
df_plots["overview"] = df_plots["overview"].fillna("")
# Number of movies available for the analysis
print(len(df_plots))

59794


### Plot preparation

We will transform the plots in order to make them interpretable by an LDA model. This includes:
- Tokenization
- Lemmatization
- Removing of stopwords

This is useful because we want to find resemblances between words, so words with the same meaning should be replaced by one common word.  
We also want to remove the most common words. These carry little information, and dropping them lets our 
model focus on the important words.


#### Tokenization

In [3]:
# Split every overview into a list of tokens (words and punctuation marks)
overview_column = df_plots['overview']
df_plots['tokenized_plots'] = overview_column.apply(word_tokenize)
df_plots.head()

Unnamed: 0,overview,providers,tokenized_plots
0,The adventures of a female reporter in the 1890s.,{},"[The, adventures, of, a, female, reporter, in,..."
1,Just as Galeen and Wegener's Der Golem (1915) ...,{},"[Just, as, Galeen, and, Wegener, 's, Der, Gole..."
2,The first feature-length motion picture produc...,{},"[The, first, feature-length, motion, picture, ..."
3,Australian bushranger movie. The first filmed...,{},"[Australian, bushranger, movie, ., The, first,..."
4,L. Frank Baum would appear in a white suit and...,{},"[L., Frank, Baum, would, appear, in, a, white,..."


#### Lemmatization

We start by associating a POS tag with each word (i.e. whether the word is a noun, verb, adjective, etc.).

In [4]:
# Attach a part-of-speech tag to every token: each plot becomes a list of (token, tag) pairs
tagged_plots = df_plots['tokenized_plots'].apply(pos_tag)
df_plots['plots_with_POS_tag'] = tagged_plots
df_plots['plots_with_POS_tag'].head()

0    [(The, DT), (adventures, NNS), (of, IN), (a, D...
1    [(Just, RB), (as, IN), (Galeen, NNP), (and, CC...
2    [(The, DT), (first, JJ), (feature-length, JJ),...
3    [(Australian, JJ), (bushranger, NN), (movie, N...
4    [(L., NNP), (Frank, NNP), (Baum, NNP), (would,...
Name: plots_with_POS_tag, dtype: object

If a word's POS tag has no WordNet equivalent (`get_wordnet_pos` returns an empty string), we leave the word unchanged; otherwise we lemmatize the word according to its tag.

In [6]:
lemmatizer = WordNetLemmatizer()


def lemmatize_tagged_plot(tagged_plot):
    """Lemmatize one POS-tagged plot.

    Args:
        tagged_plot: iterable of (token, POS tag) pairs, as stored in the
            'plots_with_POS_tag' column.

    Returns:
        A list with one entry per input token: the token itself when
        `get_wordnet_pos` returns '' for its tag, otherwise the lemma returned
        by `lemmatizer.lemmatize` for that token and mapped tag.
    """
    lemmas = []
    for token, tag in tagged_plot:
        # Map the tag once per token instead of once for the test and once for the call
        # (assumes get_wordnet_pos has no side effects -- TODO confirm in nlp_helper)
        wordnet_pos = get_wordnet_pos(tag)
        if wordnet_pos == '':
            lemmas.append(token)
        else:
            lemmas.append(lemmatizer.lemmatize(token, wordnet_pos))
    return lemmas


# Now we can lemmatize each word, given its POS tag
df_plots['lemmatized_plots'] = df_plots['plots_with_POS_tag'].apply(lemmatize_tagged_plot)
# Show the first plot as a sanity check; .iloc is positional, so it does not depend on the index labels
df_plots['lemmatized_plots'].iloc[0]

['The',
 'adventure',
 'of',
 'a',
 'female',
 'reporter',
 'in',
 'the',
 '1890s',
 '.']

#### Stop words removal

In [7]:
# TODO list of stop words may be improved
# Extra tokens to drop on top of the NLTK list and the punctuation marks,
# e.g. the possessive marker that the tokenizer emits as a separate token.
stop_words = ['\'s']
# The NLTK stop-word file id is the lowercase 'english'. Passing 'English' only resolves
# on case-insensitive filesystems and fails on case-sensitive ones.
all_stopwords = stopwords.words('english') + list(string.punctuation) + stop_words

In [10]:

# Membership tests on a set are constant time, whereas the list would be scanned for every token
stopword_set = set(all_stopwords)


def clean_plot(lemmatized_plot):
    """Normalize and filter the tokens of one lemmatized plot in a single pass.

    Each token is stripped of surrounding whitespace and lowercased. It is then
    kept only if it is not a stop word, consists solely of alphabetic
    characters (`str.isalpha`) and is longer than one character.

    Args:
        lemmatized_plot: iterable of string tokens.

    Returns:
        The list of cleaned tokens that passed every filter, in original order.
    """
    cleaned = []
    for raw_word in lemmatized_plot:
        word = raw_word.strip().lower()
        if word in stopword_set:
            continue
        if word.isalpha() and len(word) > 1:
            cleaned.append(word)
    return cleaned


df_plots['plots_without_stopwords'] = df_plots['lemmatized_plots'].apply(clean_plot)
df_plots['plots_without_stopwords'].head(2)

0                        [adventure, female, reporter]
1    [galeen, wegener, der, golem, see, testament, ...
Name: plots_without_stopwords, dtype: object

In [14]:
# Token counts before and after cleaning; summing the per-plot lengths avoids
# materializing one flat list holding every token of the corpus.
before_stop_words_total_number_of_words = sum(
    len(plot) for plot in df_plots['lemmatized_plots'])
after_stop_words_total_number_of_words = sum(
    len(plot) for plot in df_plots['plots_without_stopwords'])

if before_stop_words_total_number_of_words:
    kept_percentage = (100 * after_stop_words_total_number_of_words
                       / before_stop_words_total_number_of_words)
else:
    # Empty corpus: nothing could be kept, and the division would fail
    kept_percentage = 0.0

# Format the already-scaled value: rounding the ratio first and multiplying by 100
# afterwards can print float artifacts such as 56.99999999999999.
print("We kept {:.1f}% of the words in the corpus".format(kept_percentage))

We kept 48.0% of the words in the corpus


### Latent Dirichlet Allocation

We need to create a list of tokens, i.e. the words that will be used inside our dictionary (depending on their frequency).  
We can start by creating bigrams, i.e. representing a pair of words that frequently appear together by one single composed word (e.g. `female_reporter`).  
It can also be interesting to see whether creating trigrams allows us to extract more information from the plots.

In [18]:
# One list of cleaned tokens per movie plot
plot_token_lists = df_plots['plots_without_stopwords'].tolist()
# Fit the phrase model on the plots, then rewrite every plot through it so that
# detected word pairs are merged into a single token (e.g. 'female_reporter')
bigram_model = Phrases(plot_token_lists)
tokens = list(bigram_model[plot_token_lists])
print(tokens[:2])

[['adventure', 'female_reporter'], ['galeen', 'wegener', 'der', 'golem', 'see', 'testament', 'early', 'german', 'film', 'artistry', 'story', 'kelly', 'gang', 'symbolize', 'birth', 'australian', 'film_industry', 'emergence', 'australian', 'cinema', 'identity', 'even', 'significantly', 'herald', 'emergence', 'feature_film', 'format', 'however', 'fragment', 'original', 'production', 'one', 'hour', 'know_exist', 'preserve', 'national', 'film', 'sound', 'archive', 'canberra', 'efforts', 'reconstruction', 'make', 'film', 'available', 'modern', 'audience']]


#### Hyperparameters

In [29]:
# --- vocabulary filtering ---
no_below = 60   # a word must appear in at least this many documents to be kept
no_above = 0.5  # a word must appear in at most this proportion of the documents to be kept
# --- LDA training ---
n_topics = 10   # how many topics to extract
n_passes = 10   # how many passes over the corpus are made during training


#### Dictionary & Corpus

The dictionary will be the list of unique words, and the corpus a list of bag-of-words representations, one per movie plot.

In [30]:
# Build the dictionary, which maps a unique integer id to each word, together with the corpus.
# In the corpus each movie plot is encoded as a bag of words, i.e. we count
# the number of times each word appears in the movie plot.
dictionary, corpus = build_dictionnary_and_corpus(tokens, no_below=no_below, no_above=no_above)

# (message template, value) pairs, printed in order as a quick sanity check
summary_lines = [
    ("Dictionary size: {}", len(dictionary)),
    ("Dictionary first 10 elements: {}", list(dictionary.items())[:10]),
    ("Corpus size: {}", len(corpus)),
    ("Corpus first 2 elements: {}", corpus[:2]),
]
for template, value in summary_lines:
    print(template.format(value))

Dictionary size: 13475
Dictionary first 10 elements: [(0, 'adventure'), (1, 'female_reporter'), (2, 'archive'), (3, 'audience'), (4, 'australian'), (5, 'available'), (6, 'birth'), (7, 'cinema'), (8, 'der'), (9, 'early')]
Corpus size: 59794
Corpus first 2 elements: [[(0, 1), (1, 1)], [(2, 1), (3, 1), (4, 2), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 2), (11, 1), (12, 1), (13, 3), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1)]]


#### LDA Model

In [31]:
# Seed NumPy's global random state before training.
# Presumably this makes create_lda_model repeatable -- TODO confirm that the helper
# draws from NumPy's global state rather than its own random_state.
lda_seed = 9999
np.random.seed(lda_seed)
lda_model = create_lda_model(
    corpus,
    dictionary,
    num_topics=n_topics,
    passes=n_passes,
)

In [38]:
# Ask the helper for the words describing each topic.
# NOTE(review): the saved output lists 13 words per topic although num_words=10
# is requested -- confirm how get_topics uses num_words.
topics = get_topics(lda_model, num_topics=n_topics, num_words=10)
# One topic per line, prefixed with its index
topic_line = "Topic {}: {}"
for topic_id, topic_words in enumerate(topics):
    print(topic_line.format(topic_id, topic_words))

Topic 0: keep include french past government several actor appear plot world_war course best share
Topic 1: meet decide cop evil happen body affair high_school earth sell living camp hero
Topic 2: play learn save team visit director little different four future late game hide
Topic 3: life get woman man family father become work new live time son leave
Topic 4: find take go friend try girl help child way return however force must
Topic 5: one two young wife love kill old world daughter brother also people set
Topic 6: use dream bring village another whose company boyfriend great survive hand music without
Topic 7: film make story want first back look movie show star well long much
Topic 8: turn group murder three know escape local police order hong_kong gang student give
Topic 9: fight name still soldier island battle street experience teacher documentary land human free


In [40]:
# For each movie plot, get its topic distribution (i.e. the probability of each topic).
# NOTE(review): the saved output lists the (topic id, probability) pairs by ascending
# topic id, not by descending probability -- confirm which ordering
# get_topic_distribution actually returns.
topic_distributions = get_topic_distribution(lda_model, corpus)

[[(0, 0.04638879), (1, 0.06420414), (2, 0.07004631), (3, 0.19013405), (4, 0.14304884), (5, 0.15949595), (6, 0.06626958), (7, 0.08734115), (8, 0.120726734), (9, 0.0523445)], [(0, 0.030123878), (1, 0.04168074), (2, 0.056213398), (3, 0.12342403), (4, 0.12507316), (5, 0.11432252), (6, 0.22635376), (7, 0.14898047), (8, 0.08915698), (9, 0.04467106)], [(0, 0.036190297), (1, 0.05009002), (2, 0.093504086), (3, 0.17425533), (4, 0.111605875), (5, 0.11148201), (6, 0.14201573), (7, 0.13288711), (8, 0.09418971), (9, 0.05377984)]]
