In [1]:
import sklearn.metrics.pairwise as pw
import numpy as np

X = np.matrix([1,-13])
Y = np.matrix([-34,13])
pw.cosine_similarity(X, Y)

array([[-0.42772402]])

In [2]:
import io, json
from collections import OrderedDict

def read_data(path):
    with io.open(path, 'r', encoding = 'latin-1') as f:
        movies = json.load(f)
        return OrderedDict({(movie['title'],int(movie['year'])):movie['plot'] for movie in movies}.items())
    
movies = read_data('data.json')
print(len(movies))

147238


In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]
vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = 'english') 
feat_vec = vectorizer.fit_transform(movies.values())
vocab = vectorizer.get_feature_names()
print('Number of words in vocabulary:', len(vocab))

Number of words in vocabulary: 208140


In [4]:
import math

def similar_vectors(X, v, n = 6, batch_size = 10000):
    num_vecs = X.shape[0]
    num_batches = math.ceil(num_vecs/batch_size)
    min_indices = np.zeros(num_batches*n, dtype=np.int)
    min_dists = np.ones(num_batches*n, dtype=np.float)
    for batch in range(num_batches):
        dists = pw.pairwise_distances(X[batch*batch_size:min((batch+1)*batch_size,num_vecs)], \
                                      v, metric='cosine')
        ind_min = np.argpartition(dists[:,0], n)[:n]
        min_indices[batch*n:(batch+1)*n] = ind_min + batch*batch_size
        min_dists[batch*n:(batch+1)*n] = dists[ind_min][:,0]
    ind_min = np.argpartition(min_dists, n)[:n]   
    best_ind = min_indices[ind_min]
    best_dist = min_dists[ind_min]
    return best_ind, best_dist
    
def similar_movies(title, feat_vec, vectorizer, n = 6):
    v = vectorizer.transform([movies[title]])
    best_ind, best_dist = similar_vectors(feat_vec, v, n)
    key_list = list(movies.keys())
    for i in range(n):
        sim_title = key_list[best_ind[i]]
        print(sim_title)
        print(best_dist[i])
        print(movies[sim_title])
    
similar_movies(('Easy A',2010),feat_vec,vectorizer)

('Easy A', 2010)
-2.22044604925e-16
After a little white lie about losing her virginity gets out, a clean cut high school girl sees her life paralleling Hester Prynne's in "The Scarlet Letter," which she is currently studying in school - until she decides to use the rumor mill to advance her social and financial standing. High school student Olive Penderghast (Emma Stone) finds herself the victim of her school's "rumor mill" when she lies to her best friend Rhiannon (Alyson Michalka) about a weekend tryst with a fictional college freshman. Word quickly spreads of Olive's promiscuity and, much to her surprise, she welcomes the attention. When she agrees to help out a bullied friend by pretending to sleep with him, her image rapidly degrades to a more lascivious state and her world begins to spin out of control. As she helps more and more of her classmates and her lies continue to escalate, Olive must find a way to save face before the school's religious fanatic Marianne (Amanda Bynes) g

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = 'english') 
tfidf_vec = tfidf_vectorizer.fit_transform(movies.values())

similar_movies(('Easy A',2010),tfidf_vec,tfidf_vectorizer)

('Olive Prepares', 2008)
0.5319884352
Olive Prepares was inspired by, and is based on, time I spent working at home as an artist, living with fellow animator and filmmaker Helen Hill. Olive has spent too long cooped up by herself trying to make art. She starts to see things as drawn animation. Then, a new tenant catches Olive in an intriguing position, is curious and invites her to tea. The invitation adds panic to Olive's already crazy self, but after awhile there's nothing for her to do but embrace her surreal world, take a breath and go upstairs.
('Olive Kitteridge', 2014)
0.519125160644
A look at a seemingly placid New England town that is actually wrought with illicit affairs, crime and tragedy, all told through the lens of Olive, whose wicked wit and harsh demeanor mask a warm but troubled heart and staunch moral center. The story spans 25 years and focuses on Olive's relationships with her husband, Henry, the good-hearted and kindly town pharmacist; their son, Christopher, who r

In [14]:
from sklearn.decomposition import LatentDirichletAllocation

def similar_movies2(title, feat_vec, vectorizer, vectorizer2, n = 6):
    v = vectorizer2.transform(vectorizer.transform([movies[title]]))
    best_ind, best_dist = similar_vectors(feat_vec, v, n)
    key_list = list(movies.keys())
    for i in range(n):
        sim_title = key_list[best_ind[i]]
        print(sim_title)
        print(best_dist[i])
        print(movies[sim_title])

n_topics = 20
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=4,
                                learning_method='online', learning_offset=50.,
                                random_state=0)
lda_vec = lda.fit_transform(feat_vec)
similar_movies2(('Easy A',2010),lda_vec,vectorizer,lda)

('Easy A', 2010)
2.22044604925e-16
After a little white lie about losing her virginity gets out, a clean cut high school girl sees her life paralleling Hester Prynne's in "The Scarlet Letter," which she is currently studying in school - until she decides to use the rumor mill to advance her social and financial standing. High school student Olive Penderghast (Emma Stone) finds herself the victim of her school's "rumor mill" when she lies to her best friend Rhiannon (Alyson Michalka) about a weekend tryst with a fictional college freshman. Word quickly spreads of Olive's promiscuity and, much to her surprise, she welcomes the attention. When she agrees to help out a bullied friend by pretending to sleep with him, her image rapidly degrades to a more lascivious state and her world begins to spin out of control. As she helps more and more of her classmates and her lies continue to escalate, Olive must find a way to save face before the school's religious fanatic Marianne (Amanda Bynes) ge

In [51]:
n_topics = 20
def list_topics(lda, vocab, n_topics, n_words = 10):
    for n in range(n_topics):
        top_inds = np.argpartition(lda.components_[n], -n_words)[-n_words:]
        topics = [vocab[i] for i in top_inds]
        print('Topic ',str(n+1)+':',topics)
        
list_topics(lda, vocab, n_topics)

Topic  1: ['way', 'house', 'like', 'make', 'home', 'big', 'day', 'time', 'just', 'new']
Topic  2: ['demon', 'zombie', 'world', 'joey', 'evil', 'hunter', 'fight', 'nick', 'battle', 'vs']
Topic  3: ['cooking', 'chicken', 'bruce', 'cook', 'al', 'food', 'jeff', 'restaurant', 'qv', 'host']
Topic  4: ['audrey', 'virtual', 'damon', 'uld', 'thriller', 'logan', 'robot', 'japan', 'japanese', 'computer']
Topic  5: ['tyler', 'powers', 'angel', 'jesse', 'jack', 'tony', 'tom', 'jonathan', 'luke', 'evil']
Topic  6: ['years', 'lives', 'world', 'man', 'love', 'story', 'old', 'family', 'life', 'young']
Topic  7: ['lana', 'arthur', 'queen', 'gold', 'el', 'boxing', 'royal', 'prince', 'castle', 'king']
Topic  8: ['alex', 'wedding', 'high', 'sex', 'relationship', 'new', 'love', 'school', 'marriage', 'party']
Topic  9: ['isis', 'vampires', 'beast', 'calvin', 'count', 'spongebob', 'goa', 'vampire', 'blood', 'moon']
Topic  10: ['contestants', 'teams', 'win', 'game', 'competition', 'season', 'chris', 'team', 'w