## NLP exploration

In [64]:
import string 
import numpy as np
from nlp_helper import *
from nltk import pos_tag
from gensim import models
from nltk.corpus import stopwords
from gensim.models import Phrases
from nltk.tokenize import word_tokenize
# NOTE(review): `nltk` is never imported by name in this cell; presumably it is
# re-exported by `from nlp_helper import *` — TODO confirm, or add `import nltk`.
# Fetch the 'omw-1.4' NLTK data package (the saved output shows it was already up to date).
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/douglasbouchet/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

### Data Loading and cleaning

In [51]:
# Load the movie dataset through the nlp_helper reader (its schema is not visible here).
df = read_moviedb_data()
# Only the plot text ("overview") and "providers" are used for the NLP steps.
# Column selection already yields a new frame and fillna returns another one, so
# `df` stays untouched without an explicit full-frame copy, and no column is
# assigned on a slice (which could raise SettingWithCopyWarning).
# Missing overviews become "" and missing providers become the string "{}".
df_plots = df[["overview", "providers"]].fillna({"overview": "", "providers": "{}"})
df_plots.head()

Unnamed: 0,overview,providers
0,The adventures of a female reporter in the 1890s.,{}
1,Just as Galeen and Wegener's Der Golem (1915) ...,{}
2,The first feature-length motion picture produc...,{}
3,Australian bushranger movie. The first filmed...,{}
4,L. Frank Baum would appear in a white suit and...,{}


### Plot preparation

We will transform the plots to make them interpretable by an LDA model. This includes:
- Tokenization
- Lemmatization
- Removing of stopwords

#### Tokenization

In [53]:
# Split every overview into a list of tokens with NLTK's word tokenizer.
overview_texts = df_plots['overview']
df_plots['tokenized_plots'] = overview_texts.apply(word_tokenize)
df_plots.head()

Unnamed: 0,overview,providers,tokenized_plots
0,The adventures of a female reporter in the 1890s.,{},"[The, adventures, of, a, female, reporter, in,..."
1,Just as Galeen and Wegener's Der Golem (1915) ...,{},"[Just, as, Galeen, and, Wegener, 's, Der, Gole..."
2,The first feature-length motion picture produc...,{},"[The, first, feature-length, motion, picture, ..."
3,Australian bushranger movie. The first filmed...,{},"[Australian, bushranger, movie, ., The, first,..."
4,L. Frank Baum would appear in a white suit and...,{},"[L., Frank, Baum, would, appear, in, a, white,..."


#### Lemmatization

We start by associating a POS tag with each word (i.e., whether a word is a noun, verb, adjective, etc.).

In [55]:
# Attach a part-of-speech tag to every token: each plot becomes a list of
# (token, tag) pairs, as shown in the output below.
tokenized = df_plots['tokenized_plots']
df_plots['plots_with_POS_tag'] = tokenized.apply(pos_tag)
df_plots['plots_with_POS_tag'].head()

0    [(The, DT), (adventures, NNS), (of, IN), (a, D...
1    [(Just, RB), (as, IN), (Galeen, NNP), (and, CC...
2    [(The, DT), (first, JJ), (feature-length, JJ),...
3    [(Australian, JJ), (bushranger, NN), (movie, N...
4    [(L., NNP), (Frank, NNP), (Baum, NNP), (would,...
Name: plots_with_POS_tag, dtype: object

In [56]:
lemmatizer = WordNetLemmatizer()


def _lemmatize_tagged_plot(tagged_plot):
    """Lemmatize one plot given as a list of (token, POS tag) pairs.

    The POS tag is mapped through `get_wordnet_pos` once per token; when that
    mapping returns '' the token is kept unchanged, otherwise it is lemmatized
    with the mapped part of speech.
    """
    lemmas = []
    for token, pos in tagged_plot:
        # Resolve the tag a single time instead of once for the test and once for the call.
        # NOTE(review): assumes get_wordnet_pos has no side effects — confirm in nlp_helper.
        wordnet_pos = get_wordnet_pos(pos)
        if wordnet_pos == '':
            lemmas.append(token)
        else:
            lemmas.append(lemmatizer.lemmatize(token, wordnet_pos))
    return lemmas


# Now we can lemmatize each word, given its POS tag
df_plots['lemmatized_plots'] = df_plots['plots_with_POS_tag'].apply(_lemmatize_tagged_plot)
df_plots['lemmatized_plots'].head()

0    [The, adventure, of, a, female, reporter, in, ...
1    [Just, as, Galeen, and, Wegener, 's, Der, Gole...
2    [The, first, feature-length, motion, picture, ...
3    [Australian, bushranger, movie, ., The, first,...
4    [L., Frank, Baum, would, appear, in, a, white,...
Name: lemmatized_plots, dtype: object

#### Stop words removal

In [90]:
# Count every token across all lemmatized plots and report the corpus size.
corpus_word_count = sum(len(plot) for plot in df_plots['lemmatized_plots'])
print("Total number of words in the corpus: {}".format(corpus_word_count))
    

Total number of words in the corpus: 652633


In [86]:
# TODO list of stop words may be improved
# Build the stop-word list: NLTK's English stop words, every ASCII punctuation
# character, and a few extra tokens.
stop_words = ['\'s']
# The NLTK file id is lowercase 'english'; 'English' only resolves on
# case-insensitive filesystems and fails elsewhere.
all_stopwords = stopwords.words('english') + list(string.punctuation) + stop_words

In [91]:

# Set lookup makes the stop-word test O(1) per token instead of scanning the list.
stopword_lookup = set(all_stopwords)


def _clean_plot(plot):
    """Normalize one lemmatized plot and drop the tokens we do not want.

    Each token is stripped of surrounding whitespace and lowercased; it is kept
    only if it is not a stop word, is purely alphabetic according to
    `str.isalpha()` (any Unicode letter, not just a-z) and is longer than one
    character.
    """
    kept = []
    for raw_word in plot:
        word = raw_word.strip().lower()
        if word not in stopword_lookup and word.isalpha() and len(word) > 1:
            kept.append(word)
    return kept


# One pass per plot: strip, lowercase, remove stop words, remove non-alphabetic
# and single-character tokens.
df_plots['plots_without_stopwords'] = df_plots['lemmatized_plots'].apply(_clean_plot)
df_plots['plots_without_stopwords'].head()

0                        [adventure, female, reporter]
1    [galeen, wegener, der, golem, see, testament, ...
2    [first, motion, picture, produce, europe, run,...
3    [australian, bushranger, movie, first, filmed,...
4    [frank, baum, would, appear, white, suit, pres...
Name: plots_without_stopwords, dtype: object

In [95]:
# Report which share of the tokens survived the stop-word / cleaning step.
before_stop_words_total_number_of_words = sum(
    len(plot) for plot in df_plots['lemmatized_plots'])
after_stop_words_total_number_of_words = sum(
    len(plot) for plot in df_plots['plots_without_stopwords'])
# Guard against an empty corpus, which would otherwise raise ZeroDivisionError.
if before_stop_words_total_number_of_words:
    kept_percentage = (100 * after_stop_words_total_number_of_words
                       / before_stop_words_total_number_of_words)
else:
    kept_percentage = 0.0
# Format to one decimal instead of `round(x, 3) * 100`, which can print float
# artifacts such as 56.99999999999999.
print("We kept {:.1f}% of the words in the corpus".format(kept_percentage))

We kept 48.1% of the words in the corpus


### Latent Dirichlet Allocation

In [99]:
# Collect the cleaned plots as a plain list of token lists, one inner list per movie.
tokens = df_plots['plots_without_stopwords'].tolist()
# TODO understand and enable n-gram detection with gensim Phrases, e.g.:
# bigram_model = Phrases(tokens)
# trigram_model = Phrases(bigram_model[tokens], min_count=1)
# tokens = list(trigram_model[bigram_model[tokens]])
# Preview the first plot only; printing the whole corpus floods the notebook output.
tokens[:1]

hell


In [100]:
# NOTE(review): leftover debug cell — it only prints a marker string. The NameError in
# the saved output cannot come from this code (stale output); candidate for removal.
print("sdd")

NameError: name 'xxx' is not defined

#### Dictionary & Corpus

In [None]:
# TODO understand and modify the method if needed
# Build the dictionary and the corpus from the token lists with the nlp_helper
# function (what it returns exactly is not visible here).
# The result is bound to `dictionary`, the spelling the LDA cell reads; the
# earlier spelling `dictionnary` left that cell with a NameError on a fresh run.
dictionary, corpus = build_dictionnary_and_corpus(tokens)
# Keep the old spelling as an alias in case other cells still refer to it.
dictionnary = dictionary

#### LDA Model

In [None]:
# Seed NumPy's global random generator before training so reruns start from the same state.
# NOTE(review): presumably create_lda_model draws from the global NumPy state — confirm in nlp_helper.
np.random.seed(9999)
# Train the LDA model with 10 topics and 10 passes; `corpus` and `dictionary`
# must already be defined in the kernel.
lda_model = create_lda_model(corpus, dictionary, num_topics=10, passes=10)

In [None]:
# Get the topics from the trained model: 10 topics, 10 words each.
# NOTE(review): num_topics repeats the value used at training time — keep the two in sync.
topics = get_topics(lda_model, num_topics=10, num_words=10)

In [None]:
# For each movie plot, get its topic distribution (i.e. the probability of each topic).
# NOTE(review): presumably one entry per document in `corpus`, in the same order — confirm in nlp_helper.
topic_distributions = get_topic_distribution(lda_model, corpus)