In this notebook we'll explore topic modeling to discover broad themes in a collection of movie summaries.

In [1]:
import nltk
import re
import gensim
from gensim import corpora
import operator

nltk.download('stopwords')
from nltk.corpus import stopwords

import numpy as np
import random

random.seed(1)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/filipesantos/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def read_stopwords(filename):
    stopwords={}
    with open(filename) as file:
        for line in file:
            stopwords[line.rstrip()]=1
    return stopwords

Since we're running topic modeling on texts with lots of names, we'll add the Jockers list of stopwords (which includes character names) to our stoplist.

In [3]:
stop_words = {k:1 for k in stopwords.words('english')}
stop_words.update(read_stopwords("../data/jockers.stopwords"))
stop_words["'s"]=1
stop_words=list(stop_words.keys())

In [4]:
def filter(word, stopwords):
    
    """ Function to exclude words from a text """
    
    # no stopwords
    if word in stopwords:
        return False
    
    # has to contain at least one letter
    if re.search("[A-Za-z]", word) is not None:
        return True
    
    return False

In [5]:
def read_docs(plotFile, metadataFile, stopwords):
    
    names={}
    box={}
    
    with open(metadataFile, encoding="utf-8") as file:
        for line in file:
            cols=line.rstrip().split("\t")
            idd=cols[0]
            name=cols[2]
            boxoffice=cols[4]
            if len(boxoffice) != 0:
                box[idd]=int(boxoffice)
                names[idd]=name
    
    n=5000
    target_movies={}


    sorted_box = sorted(box.items(), key=operator.itemgetter(1), reverse=True)
    for k, v in sorted_box[:n]:
        target_movies[k]=names[k]
    
    docs=[]
    names=[]
   
    with open(plotFile, encoding="utf-8") as file:
        for line in file:
            cols=line.rstrip().split("\t")
            idd=cols[0]
            text=cols[1]
            
            if idd in target_movies:
                tokens=nltk.word_tokenize(text.lower())
                tokens=[x for x in tokens if filter(x, stopwords)]
                docs.append(tokens)
                name=target_movies[idd]
                names.append(name)
    return docs, names

We'll read in summaries of the 5,000 movies with the highest box office revenues.

In [6]:
metadataFile="../data/movie.metadata.tsv"
plotFile="../data/plot_summaries.txt"
data, doc_names=read_docs(plotFile, metadataFile, stop_words)

We will convert the movie summaries into a bag-of-words representation using gensim's [corpora.dictionary](https://radimrehurek.com/gensim/corpora/dictionary.html) methods.

In [14]:
# Create vocab from data; restrict vocab to only the top 10K terms that show up in at least 5 documents 
# and no more than 50% of all documents

dictionary = corpora.Dictionary(data)
dictionary.filter_extremes(no_below=5, no_above=.5, keep_n=10000)

Dictionary<10000 unique tokens: ['12-year-old', 'able', 'accompanied', 'advice', 'afterward']...>


In [15]:
# Replace dataset with numeric ids words in vocab (and exclude all other words)
corpus = [dictionary.doc2bow(text) for text in data]

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [9]:
num_topics=20

Now let's run a topic model on this data using gensim's built-in LDA.

In [10]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=num_topics, 
                                           passes=10,
                                           alpha='auto')

We can get a sense of what the topics are by printing the top 10 words with highest $P(word \mid topic)$ for each topic

In [11]:
for i in range(num_topics):
    print("topic %s:\t%s" % (i, ' '.join([term for term, freq in lda_model.show_topic(i, topn=10)])))

topic 0:	japanese boat camp water tank shark connor english holmes mask
topic 1:	house finds find room tells goes body tries night car
topic 2:	men town war president american west government army general wife
topic 3:	bond turner flynn agent frost griffin campbell skull sir nixon
topic 4:	case judge court shaw trial law firm attorney sin company
topic 5:	life film mother family time father begins story years finds
topic 6:	earth ship alien planet space robot crew aliens nuclear miller
topic 7:	find city battle castle escape group world fight kill return
topic 8:	dracula collins toys hayes machine cutter race apprentice count armstrong
topic 9:	team game play duke coach football win player players playing
topic 10:	tells show relationship n't father wedding asks job family band
topic 11:	captain plane mission men killed team crew bomb orders officer
topic 12:	banks ally powell hawk gin harvard paulie triangle clock bank
topic 13:	dr. kill creature killed vampire infected blood killing 

Another way of understanding topics is to print out the documents that have the highest topic representation -- i.e., for a given topic $k$, the documents with highest $P(topic=k | document)$.  How much do the documents listed here align with your understanding of the topics?

In [12]:
topic_model=lda_model 

topic_docs=[]
for i in range(num_topics):
    topic_docs.append({})
for doc_id in range(len(corpus)):
    doc_topics=topic_model.get_document_topics(corpus[doc_id])
    for topic_num, topic_prob in doc_topics:
        topic_docs[topic_num][doc_id]=topic_prob

for i in range(num_topics):
    print("%s\n" % ' '.join([term for term, freq in topic_model.show_topic(i, topn=10)]))
    sorted_x = sorted(topic_docs[i].items(), key=operator.itemgetter(1), reverse=True)
    for k, v in sorted_x[:5]:
        print("%s\t%.3f\t%s" % (i,v,doc_names[k]))
    print()
    
    

japanese boat camp water tank shark connor english holmes mask

0	0.660	Jaws 2
0	0.655	Elizabeth: The Golden Age
0	0.639	Shark Night 3D
0	0.632	Without a Clue
0	0.568	Jaws

house finds find room tells goes body tries night car

1	0.985	Paranormal Activity 4
1	0.977	Monster House
1	0.927	The Collector
1	0.912	Friday the 13th Part 3: 3D
1	0.898	Paranormal Activity 2

men town war president american west government army general wife

2	0.969	Il Divo
2	0.893	The Distinguished Gentleman
2	0.878	Appaloosa
2	0.874	Fair Game
2	0.844	First Family

bond turner flynn agent frost griffin campbell skull sir nixon

3	0.453	Casino Royale
3	0.390	From Russia with Love
3	0.388	Never Say Never Again
3	0.337	GoldenEye
3	0.291	Diamonds Are Forever

case judge court shaw trial law firm attorney sin company

4	0.547	The Devil's Advocate
4	0.508	Rollover
4	0.406	The Verdict
4	0.375	Guilty as Sin
4	0.374	The Associate

life film mother family time father begins story years finds

5	0.981	Once Around
5	0.968	B

In [16]:
print(lda_model.print_topics())

[(0, '0.015*"japanese" + 0.014*"boat" + 0.014*"camp" + 0.011*"water" + 0.009*"tank" + 0.008*"shark" + 0.007*"connor" + 0.007*"english" + 0.006*"holmes" + 0.006*"mask"'), (1, '0.011*"house" + 0.007*"finds" + 0.006*"find" + 0.006*"room" + 0.006*"tells" + 0.005*"goes" + 0.005*"body" + 0.005*"tries" + 0.004*"night" + 0.004*"car"'), (2, '0.008*"men" + 0.006*"town" + 0.006*"war" + 0.006*"president" + 0.004*"american" + 0.004*"west" + 0.004*"government" + 0.004*"army" + 0.004*"general" + 0.003*"wife"'), (3, '0.137*"bond" + 0.042*"turner" + 0.022*"flynn" + 0.019*"agent" + 0.018*"frost" + 0.018*"griffin" + 0.017*"campbell" + 0.016*"skull" + 0.015*"sir" + 0.014*"nixon"'), (4, '0.032*"case" + 0.028*"judge" + 0.024*"court" + 0.024*"shaw" + 0.019*"trial" + 0.017*"law" + 0.016*"firm" + 0.015*"attorney" + 0.014*"sin" + 0.013*"company"'), (5, '0.011*"life" + 0.009*"film" + 0.009*"mother" + 0.008*"family" + 0.007*"time" + 0.007*"father" + 0.005*"begins" + 0.005*"story" + 0.005*"years" + 0.004*"finds"')

In [20]:
import pyLDAvis
import pyLDAvis.gensim  # don't skip this

In [19]:
pip install pyLDAvis

Collecting pyLDAvis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting funcy (from pyLDAvis)
  Downloading funcy-2.0-py2.py3-none-any.whl (30 kB)
Collecting FuzzyTM>=0.4.0 (from gensim->pyLDAvis)
  Downloading FuzzyTM-2.0.5-py3-none-any.whl (29 kB)
Collecting pyfume (from FuzzyTM>=0.4.0->gensim->pyLDAvis)
  Downloading pyFUME-0.2.25-py3-none-any.whl (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.1/67.1 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
Collecting simpful (from pyfume->FuzzyTM>=0.4.0->gensim->pyLDAvis)
  Downloading simpful-2.11.0-py3-none-any.whl (32 kB)
Collecting fst-pso (from pyfume->FuzzyTM>=0.4.0->gensim->pyLDAvis)
  Downloading fst-pso-1.8.1.tar.gz (18 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting miniful (from fst-pso->pyfume->FuzzyTM>=0.4.0->gensim->pyLDAvis)
  Downlo

In [22]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
vis