In [2]:
import numpy as np
from time import time

STORYLINES_LOC = '../data/movie_storylines.bigdata'

In [3]:
# read in storylines, ignore movies with no storyline
storylines = {}
with open(STORYLINES_LOC) as f:
    for line in f:
        data = line.rstrip().split('|')
        if data[1] != '':
            if data[0] not in storylines:
                storylines[data[0]] = data[1]

In [4]:
import nltk, re
from nltk import word_tokenize
from nltk.corpus import stopwords

chars = ['{','}','#','%','&','\(','\)','\[','\]','<','>',',', '!', '.', ';',
'?', '*', '\\', '\/', '~', '_','|','=','+','^',':','\"','\'','@','-']

In [5]:
MIN_NUM_TOKENS = 5

# from Assignment 1 code
porter = nltk.PorterStemmer() # also lancaster stemmer
wnl = nltk.WordNetLemmatizer()
stopWords = stopwords.words("english")
words = {}
movie_tokens = {}
movie_index_mapping = {}
index_movie_mapping = {}

# get words and counts
movie_last_index = 0
for movie_ID, storyline in storylines.iteritems():
    storyline = storyline.decode('latin1')
    # remove noisy characters; tokenize
    raw = re.sub('[%s]' % ''.join(chars), ' ', storyline)
    tokens = word_tokenize(raw)
    # convert to lower case
    tokens = [w.lower() for w in tokens]
    tokens = [w for w in tokens if w not in stopWords]
    tokens = [wnl.lemmatize(t) for t in tokens]
    tokens = [porter.stem(t) for t in tokens]
    
    # only want some tokens
    if len(tokens) >= MIN_NUM_TOKENS:
        if movie_ID not in movie_index_mapping:
            for t in tokens:
                # this is a hack but much faster than lookup each
                # word within many dict keys
                try:
                    words[t] = words[t]+1
                except:
                    words[t] = 1
            movie_tokens[movie_ID] = tokens
            movie_index_mapping[movie_ID] = movie_last_index
            index_movie_mapping[movie_last_index] = movie_ID
            movie_last_index += 1

In [6]:
MIN_OCCURS = 5

# remove all ones that have less than MIN_OCCURS occurrences
# assign indices
word_last_index = 0
word_index_mapping = {}
index_word_mapping = {}
for word, count in words.iteritems():
    if count >= MIN_OCCURS:
        word_index_mapping[word] = word_last_index
        index_word_mapping[word_last_index] = word
        word_last_index += 1

In [7]:
# from the scipy.sparse library, do the importation of the csc_matrix
from scipy.sparse import csc_matrix

# prepare binary feature matrix, rows = movies, columns = actors
matrix = np.zeros(shape=(len(movie_index_mapping), len(word_index_mapping)), dtype=np.uint8)

for movie_ID, movie_index in movie_index_mapping.iteritems():
    for word in movie_tokens[movie_ID]:
        if word in word_index_mapping:
            word_index = word_index_mapping[word]
            matrix[movie_index, word_index] += 1

smatrix = csc_matrix(matrix)

In [8]:
import lda

NUM_TOPICS = 20

start = time()
lda_model = lda.LDA(n_topics=NUM_TOPICS, n_iter=1500, random_state=420)
lda_model.fit(smatrix)
print 'Time taken (s): %d\n' % (time() - start)



Time taken (s): 145



In [9]:
n_top_words = 8
for i, topic_dist in enumerate(lda_model.topic_word_):
    topic_words = []
    for word_index in np.argsort(topic_dist)[:-n_top_words:-1]:
        topic_words.append(index_word_mapping[word_index])
    print 'Topic {}: {}'.format(i, ' '.join([word.encode('latin1') for word in topic_words]))

Topic 0: school high friend student girl colleg teacher
Topic 1: war american armi stori soldier world u
Topic 2: murder polic drug cop prison kill crime
Topic 3: get two bank gang plan take go
Topic 4: film movi life stori music show star
Topic 5: murder find kill town hous killer investig
Topic 6: team game win player play race coach
Topic 7: love meet woman life friend new marri
Topic 8: life stori love world young man famili
Topic 9: get money work job new make life
Topic 10: love young fall king woman daughter stori
Topic 11: get go one find want day time
Topic 12: famili dog name young boy find town
Topic 13: evil world power must vampir becom forc
Topic 14: island find ship crew travel group captain
Topic 15: year father mother famili old son child
Topic 16: agent secret must fbi team assassin u
Topic 17: human earth world alien scientist power planet
Topic 18: life work becom take dr doctor howev
Topic 19: new york citi art camp american time


In [10]:
# get movie name associated with each ID
from json import loads
MOVIE_DATA_LOC = '../data/movies.bigdata'

# read in movie data
movie_data = []
with open(MOVIE_DATA_LOC) as f:
    for line in f:
        movie_data.append(loads(line))
        
# mapping of movie_id to name
movie_name_mapping = {}

for entry in movie_data:
    movie_ID = entry['data']['tconst']
    if movie_ID not in movie_name_mapping:
        movie_name_mapping[movie_ID] = entry['data']['title']

In [11]:
for i in xrange(10): # first 10 documents
    movie_name = movie_name_mapping[index_movie_mapping[i]]
    print "{} (top topic: {})".format(movie_name, lda_model.doc_topic_[i].argmax())

The Seeker: The Dark Is Rising (top topic: 13)
Before Sunset (top topic: 11)
Penelope (top topic: 10)
The Wedding Planner (top topic: 7)
Parting Glances (top topic: 8)
Summerhill (top topic: 18)
Celebrity Sex Tape (top topic: 4)
When in Rome (top topic: 11)
The Contender (top topic: 18)
Spy Kids: All the Time in the World in 4D (top topic: 13)


In [12]:
topic_movie_mapping = {}
for movie_index in index_movie_mapping:
    top_topic = lda_model.doc_topic_[movie_index].argmax()
    top_topic_prob = lda_model.doc_topic_[movie_index][top_topic]
    if top_topic not in topic_movie_mapping:
        topic_movie_mapping[top_topic] = []
    topic_movie_mapping[top_topic].append((index_movie_mapping[movie_index], top_topic_prob))
    
for topic in xrange(NUM_TOPICS):
    movies = topic_movie_mapping[topic]
    movies = sorted(movies, key=lambda x: x[1], reverse=True)
    print topic
    for movie_ID, _ in movies[:10]:
        print "%s (%s)" % (movie_name_mapping[movie_ID], movie_ID)
    print

0
Teachers (tt0088242)
The Prince (tt1087470)
Pretty in Pink (tt0091790)
High School Musical 2 (tt0810900)
American Reunion (tt1605630)
Mac & Devin Go to High School (tt1870425)
Slap Her, She's French! (tt0187512)
High School Musical 3: Senior Year (tt0962726)
High School Musical (tt0475293)
Mantivities (tt2402963)

1
Eleanor and Franklin (tt0074464)
Farewell to Manzanar (tt0074518)
Countdown to War (tt0204980)
9th Company (tt0417397)
Lion of the Desert (tt0081059)
Rambo III (tt0095956)
The Road to Freedom (tt1469387)
The Bang Bang Club (tt1173687)
Into the Storm (tt0992993)
Skin (tt0964586)

2
Q & A (tt0100442)
Shakedown (tt0096087)
McQ (tt0071824)
Backdraft (tt0101393)
The Late Show (tt0076301)
Maniac Cop (tt0095583)
Stakeout (tt0094025)
Magnum Force (tt0070355)
The Green Mile (tt0120689)
Bending the Rules (tt1715320)

3
Scorched (tt0286947)
Screwed (tt0156323)
City of Industry (tt0118859)
Smokey and the Bandit (tt0076729)
3 Strikes (tt0199290)
Heist (tt0252503)
Metro (tt0119664)
Kno

In [13]:
storylines['tt0816692']

'In the near future, Earth has been devastated by drought and famine, causing a scarcity in food and extreme changes in climate. When humanity is facing extinction, a mysterious rip in the space-time continuum is discovered, giving mankind the opportunity to widen its lifespan. A group of explorers must travel beyond our solar system in search of a planet that can sustain life. The crew of the Endurance are required to think bigger and go further than any human in history as they embark on an interstellar voyage into the unknown. Coop, the pilot of the Endurance, must decide between seeing his children again and the future of the human race.'

In [24]:
import operator
for word in sorted(words.keys(), key=lambda x: words[x], reverse=True)[:100]:
    print ' '.join([word for __ in xrange(words[word])])

life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life life 