In this notebook we use the previously generated plot similarities to reproduce the paper's approach. The code from the previous notebook has been collected into the `Featurizer` class in `featurizer.py`.

In [20]:
from featurizer import Featurizer
# Notebook-level featurizer shared by the cells below (first_find reads it as a global).
# load_train reads 'data.json' -- presumably the movie corpus; confirm in featurizer.py.
cf = Featurizer(plot_vectorizer = 'binary', tokenizer = None, lda = False)
cf.load_train('data.json')

In [21]:
# Look up a movie by a fragment of its title. The recorded output is a list of
# (title, year) tuples, here matching 'The Fast and the Furious: Tokyo Drift' (2006).
cf.find_movie('Tokyo Drift')

[('The Fast and the Furious: Tokyo Drift', '2006')]

To combine all of our features linearly (as the paper does), we need a weight for each feature; we obtain these weights by fitting a linear regression on some training data.

In [22]:
from featurizer import Featurizer
import numpy as np
from scipy.sparse import issparse

def first_find(title, year = None):
    """Return the first movie matching ``title`` (and optionally ``year``).

    Convenience wrapper around ``cf.find_movie`` that unwraps the first hit.
    It reads the notebook-level featurizer ``cf`` as a global, so ``cf`` must
    be defined when this function is called.

    Parameters
    ----------
    title : str
        Title, or title fragment, forwarded to ``cf.find_movie``.
    year : optional
        Forwarded unchanged to ``cf.find_movie``; ``None`` by default.

    Returns
    -------
    The first element of the sequence returned by ``cf.find_movie``.

    Raises
    ------
    IndexError
        If the lookup returns no matches.
    """
    matches = cf.find_movie(title, year)
    if len(matches) == 0:
        # Same exception type plain indexing would raise, but the message
        # now says which lookup came back empty.
        raise IndexError('no movie found for title={!r}, year={!r}'.format(title, year))
    return matches[0]

def training_row(A, B, score, featurizer):
    """Build one regression training row for the movie pair ``(A, B)``.

    The pair's feature vector is obtained from ``featurizer.single_features``
    and the similarity ``score`` is appended as the last column.

    Parameters
    ----------
    A, B
        The two movies, passed through to ``featurizer.single_features``.
    score
        Target value for the pair; stored in the final column.
    featurizer
        Object exposing ``single_features(A, B)``, which may return either a
        SciPy sparse matrix or a dense array-like.

    Returns
    -------
    numpy.ndarray
        A 2-D array with a single row: the pair's features followed by the score.
    """
    fv = featurizer.single_features(A, B)
    # toarray()/asarray() give a plain ndarray. todense() would give np.matrix,
    # which survives hstack/vstack/hsplit and is rejected by recent scikit-learn.
    features = fv.toarray() if issparse(fv) else np.asarray(fv)
    # Promote a 1-D feature vector to one row so it lines up with the (1, 1) score.
    features = np.atleast_2d(features)
    target = np.array(score).reshape((1, 1))
    return np.hstack((features, target))

In [23]:
import io, json

def get_train_data(path, encoding = 'latin-1'):
    """Load labelled movie pairs from a JSON file.

    The file is expected to hold a JSON array of objects of the form
    ``{"movie1": {"title": ..., "year": ...},
       "movie2": {"title": ..., "year": ...},
       "score": ...}``.

    Parameters
    ----------
    path : str
        Location of the JSON file.
    encoding : str, optional
        Text encoding used to decode the file. Defaults to ``'latin-1'``, the
        value that used to be hard-coded. Pass ``'utf-8'`` for a UTF-8 file so
        that non-ASCII titles decode correctly.

    Returns
    -------
    list of tuple
        One ``((title1, year1), (title2, year2), score)`` tuple per entry,
        in file order.
    """
    with io.open(path, 'r', encoding=encoding) as f:
        data = json.load(f)
    # The file is fully parsed at this point, so the handle can be closed
    # before the records are reshaped.
    return [
        ((pair['movie1']['title'], pair['movie1']['year']),
         (pair['movie2']['title'], pair['movie2']['year']),
         pair['score'])
        for pair in data
    ]

# Labelled movie pairs used to fit the regression:
# a list of ((title, year), (title, year), score) tuples read from training.json.
training_data = get_train_data('training.json')

In [24]:
def training_vectors(training_data, featurizer):
    """Assemble the regression inputs from labelled movie pairs.

    Every ``(A, B, score)`` entry of ``training_data`` is converted to a row by
    ``training_row``; the rows are stacked vertically and the final column
    (the score) is separated from the columns before it.

    Returns
    -------
    tuple
        ``(features, targets)``: ``targets`` is the last column of the stacked
        rows, kept 2-D, and ``features`` is everything to its left.
    """
    rows = []
    for first, second, score in training_data:
        rows.append(training_row(first, second, score, featurizer))
    stacked = np.vstack(rows)
    # Splitting at index -1 yields [all-but-last columns, last column].
    features, targets = np.hsplit(stacked, [-1])
    return features, targets

# Split the labelled pairs into the feature matrix A and the target column b.
A, b = training_vectors(training_data, cf)
# Dimension check: the recorded run shows 9 training pairs with 3 features each.
print(A.shape)
print(b.shape)

(9, 3)
(9, 1)


In [26]:
from sklearn.linear_model import LinearRegression

def lin_reg(A, b):
    """Fit a ``LinearRegression`` on ``(A, b)`` and report how well it fits.

    Parameters
    ----------
    A
        Feature matrix, one row per training pair.
    b
        Target values aligned with the rows of ``A``.

    Returns
    -------
    tuple
        ``(r2, coef)``: ``r2`` is the model's ``score`` evaluated on the same
        ``A`` and ``b`` it was fitted on (an in-sample figure), and ``coef`` is
        the fitted ``coef_`` attribute.
    """
    model = LinearRegression().fit(A, b)
    return model.score(A, b), model.coef_

# Fit on the full training set. lin_reg scores the model on the same data it
# was fitted on, so the R^2 printed here is an in-sample value.
r2, coef = lin_reg(A,b)
print('R^2 value:',r2)

R^2 value: 0.49418843026


In [7]:
# Drop the notebook-level featurizer; linear_test below builds its own instance.
# NOTE(review): first_find reads the global `cf` and raises NameError once this has run.
del cf

Sample runs.

In [9]:
def linear_test(training_data, path, plot_vectorizer = 'count', tokenizer = None, lda = False,
                use_genre_vecs = False, queries = None, n = 15):
    """Fit the linear feature weights for one featurizer setup and print sample results.

    Builds a fresh ``Featurizer`` with the given options, loads ``path`` into
    it, fits a linear regression on ``training_data`` and prints the R^2 of
    that fit followed by the movies most similar to each query movie.

    Parameters
    ----------
    training_data
        Iterable of ``(A, B, score)`` entries, as consumed by ``training_vectors``.
    path : str
        File handed to ``Featurizer.load_train``.
    plot_vectorizer, tokenizer, lda, use_genre_vecs
        Forwarded unchanged to the ``Featurizer`` constructor.
    queries : iterable of (label, movie), optional
        Movies to print recommendations for. ``label`` is inserted into the
        ``'Similar to <label>:'`` heading and ``movie`` is the ``(title, year)``
        key passed to ``similar_movies``. Defaults to the four sample movies
        this notebook has always used.
    n : int, optional
        Number of similar movies requested per query (default 15).

    Returns
    -------
    None
        Results are printed, not returned.
    """
    if queries is None:
        # Labels are spelled so the printed headings match earlier recorded runs.
        queries = (
            ('the Avengers', ('The Avengers', '2012')),
            ('Burlesque', ('Burlesque', '2010/I')),
            ('Gladiator', ('Gladiator', '2000')),
            ('Tokyo Drift', ('The Fast and the Furious: Tokyo Drift', '2006')),
        )
    featurizer = Featurizer(plot_vectorizer = plot_vectorizer, tokenizer = tokenizer,
                            lda = lda, use_genre_vecs = use_genre_vecs)
    featurizer.load_train(path)
    A, b = training_vectors(training_data, featurizer)
    r2, coef = lin_reg(A, b)
    print('R^2:', r2)
    for label, movie in queries:
        print('Similar to ' + label + ':',
              featurizer.similar_movies(coef, movie, movies = None, n = n))
    
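# Example single run ('binary' is a plot_vectorizer value, so it must be passed by keyword after the data path):
#linear_test(training_data, 'data.json', plot_vectorizer = 'binary', tokenizer = None, use_genre_vecs = True)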

In [14]:
# Featurizer settings to sweep; linear_test runs once per combination.
plot_vects = ['count', 'binary', 'tfidf']
tokenizers = [None]
genre_vecs = [False, True]
datas = ['data.json', 'ne_data.json']

# One flat loop over the grid. The generator preserves the nesting order:
# genre vectors (outermost) > data file > tokenizer > plot vectorizer (innermost).
for gv, data, tokenizer, plot_vect in (
        (g, d, t, p)
        for g in genre_vecs
        for d in datas
        for t in tokenizers
        for p in plot_vects):
    # Header identifying the configuration whose results follow.
    print('Plot vectorizer:', plot_vect)
    print('Tokenizer:', tokenizer)
    print('Using genre vectors:', gv)
    print('Using data:', data)
    linear_test(training_data, path=data, plot_vectorizer=plot_vect,
                tokenizer=tokenizer, use_genre_vecs=gv)

Plot vectorizer: count
Tokenizer: None
Using genre vectors: False
Using data: data.json
R^2: 0.541549138789
Similar to the Avengers: [('Team Knight Rider', '1997'), ('Return of Hours', '2013'), ('Stargate', '1994'), ('Mission Park', '2010'), ('Dark World: Duel of the Assassins', '2003'), ('Ritânâ', '2002'), ('Einhänder', '1997'), ('Dune Warriors', '1991'), ('2012', '2009/I'), ('Raiders of the Sun', '1992'), ('Airboss II: Preemptive Strike', '1998'), ('Superman Returns', '2006'), ('Intergalactic Combat', '2007'), ('Independence Day', '1996'), ('Onslaught', '2015')]
Similar to Burlesque: [('The Song', '2014/I'), ('Save the Last Dance', '2001'), ('Beat Girl', '2013'), ('Liberty Heights', '1999'), ('The Other Side of Love', '2016'), ('The Company', '2003'), ('A Slipping-Down Life', '1999'), ('1400', '2015'), ('Between Love & Goodbye', '2008'), ('Sine novela', '2007'), ('Yellow', '2006/I'), ('Once', '2007'), ('Musiek vir die Agtergrond', '2013'), ('Listen to Your Heart', '2010'), ('Julien &

In [8]:
# Same sweep as the earlier grid cell, but iterating with the data file outermost.
# NOTE(review): the recorded R^2 for (count, None, False, data.json) under this cell
# differs from the earlier cell's, and the execution counts are out of order --
# presumably featurizer.py or the training data changed between runs; re-run from
# a fresh kernel to confirm.
plot_vects = ['count', 'binary', 'tfidf']
tokenizers = [None]
genre_vecs = [False, True]
datas = ['data.json', 'ne_data.json']

# One flat loop over the grid. The generator preserves the nesting order:
# data file (outermost) > tokenizer > genre vectors > plot vectorizer (innermost).
for data, tokenizer, gv, plot_vect in (
        (d, t, g, p)
        for d in datas
        for t in tokenizers
        for g in genre_vecs
        for p in plot_vects):
    # Header identifying the configuration whose results follow.
    print('Plot vectorizer:', plot_vect)
    print('Tokenizer:', tokenizer)
    print('Using genre vectors:', gv)
    print('Using data:', data)
    linear_test(training_data, path=data, plot_vectorizer=plot_vect,
                tokenizer=tokenizer, use_genre_vecs=gv)

Plot vectorizer: count
Tokenizer: None
Using genre vectors: False
Using data: data.json
R^2: 0.035111619399
Similar to the Avengers: [('Foxcatcher', '2014'), ('Promised Land', '2012'), ('Alex Cross', '2012'), ('With This Ring', '2015'), ('Madtown', '2016'), ("Ultimate Iron Man: The Making of 'Iron Man 2'", '2010'), ('Avengers: Age of Ultron', '2015'), ('Cleveland Abduction', '2015'), ('Made in Cleveland', '2013'), ('Draft Day', '2014/I'), ('The Avengers', '2012'), ('Criminal Activities', '2015'), ('Avengers: Age of Ultron - From the Inside Out: Making of Avengers - Age of Ultron', '2015'), ('Captain America: The Winter Soldier', '2014'), ('Thor', '2011')]
Similar to Burlesque: [('Baggage Claim', '2013'), ('Annie', '1999'), ('Fuller House', '2016'), ('17 Again', '2009'), ('Burlesque', '2010/I'), ('Beacon Hill', '2004'), ('The Ten Commandments: The Musical', '2006'), ('Fired Up!', '2009'), ('Strange Fruit', '2008'), ('Dance Your Ass Off', '2009'), ('Sex Tax: Based on a True Story', '2010

In [12]:
# Sweep restricted to runs with genre vectors enabled, starting with ne_data.json.
# The recorded run of this cell was stopped by a KeyboardInterrupt before finishing.
plot_vects = ['count', 'binary', 'tfidf']
tokenizers = [None]
genre_vecs = [True]
datas = ['ne_data.json', 'data.json']

# One flat loop over the grid. The generator preserves the nesting order:
# data file (outermost) > tokenizer > genre vectors > plot vectorizer (innermost).
for data, tokenizer, gv, plot_vect in (
        (d, t, g, p)
        for d in datas
        for t in tokenizers
        for g in genre_vecs
        for p in plot_vects):
    # Header identifying the configuration whose results follow.
    print('Plot vectorizer:', plot_vect)
    print('Tokenizer:', tokenizer)
    print('Using genre vectors:', gv)
    print('Using data:', data)
    linear_test(training_data, path=data, plot_vectorizer=plot_vect,
                tokenizer=tokenizer, use_genre_vecs=gv)

Plot vectorizer: count
Tokenizer: None
Using genre vectors: True
Using data: ne_data.json
R^2: 0.668943685437
Similar to the Avengers: [('Songbirth', '2015'), ('The Godfather Family: A Look Inside', '1990'), ('Songbook', '2008'), ('Maple Flavour Films', '2008'), ('Nigger', '2012'), ('Rock & Roll', '1995'), ('Pick Up the Mic', '2006'), ("The Making of Steven Spielberg's 'Jaws'", '1995'), ('The Pod', '2014'), ('Misconception', '2009'), ('Trinity and Beyond: The Atomic Bomb Movie', '1995'), ('The Summer of Massacre', '2012'), ('The Celluloid Closet', '1995'), ('Tardisodes', '2006'), ('The First Measured Century', '2000')]
Similar to Burlesque: [('The Godfather Family: A Look Inside', '1990'), ('Maple Flavour Films', '2008'), ('Nigger', '2012'), ('The Celluloid Closet', '1995'), ('Tardisodes', '2006'), ('The Summer of Massacre', '2012'), ('Pick Up the Mic', '2006'), ('Rock & Roll', '1995'), ('The First Measured Century', '2000'), ('The Pod', '2014'), ('Misconception', '2009'), ("The Making

KeyboardInterrupt: 