In [67]:
from json import loads, dumps

import sklearn.linear_model as lm
from sklearn.cross_validation import KFold

import numpy as np

from sklearn.metrics import mean_squared_error, r2_score

from math import sqrt

from time import time

MOVIE_DATA_LOC = '../data/movies.bigdata'

In [2]:
movie_data = []
with open(MOVIE_DATA_LOC) as f:
    for line in f:
        movie_data.append(loads(line))

In [3]:
def rmse(test_classes_true, test_classes_pred):
    return sqrt(mean_squared_error(test_classes_true, test_classes_pred))

In [4]:
def pprint(json_data):
    print dumps(json_data, indent=4, separators=(',', ': '))

In [158]:
# mapping of actor_id to (index, name)
actor_id_mapping = {}

# mapping if index to (actor_id, name)
actor_index_mapping = {}

# mapping of movie_id to (index, title, set(actor_ids...), rating)
movie_id_mapping = {}

# mapping of index to (movie_id, title, set(actor_ids...), rating)
movie_index_mapping = {}

actor_last_index = 0
movie_last_index = 0
for movie in movie_data:
    if ('cast_summary' in movie['data']) and ('rating' in movie['data']):
        movie_id = movie['data']['tconst']
        if movie_id not in movie_id_mapping: # some movie_IDs are repeated for some reason...
            actor_ids = set()
            for cast_member in movie['data']['cast_summary']:
                actor_id = cast_member['name']['nconst']
                actor_ids.add(actor_id)
                if actor_id not in actor_id_mapping:
                    actor_name = cast_member['name']['name']
                    actor_id_mapping[actor_id] = (actor_last_index, actor_name)
                    actor_index_mapping[actor_last_index] = (actor_id, actor_name)
                    actor_last_index += 1
                    
            movie_id_mapping[movie_id] = (movie_last_index, movie['data']['title'], actor_ids, movie['data']['rating'])
            movie_index_mapping[movie_last_index] = (movie_id, movie['data']['title'], actor_ids, movie['data']['rating'])
            movie_last_index += 1
            
# prepare binary feature matrix, rows = movies, columns = actors
actor_matrix = np.zeros(shape=(len(movie_id_mapping), len(actor_id_mapping)), dtype=np.uint8)
ratings = np.empty(shape=len(movie_id_mapping), dtype=float)

for movie_id in movie_id_mapping:
    movie_index = movie_id_mapping[movie_id][0]
    for actor_id in movie_id_mapping[movie_id][2]:
        actor_index = actor_id_mapping[actor_id][0]
        actor_matrix[movie_index, actor_index] = 1
    ratings[movie_index] = movie_id_mapping[movie_id][3]

In [150]:
# mapping of director_id to (index, name)
director_id_mapping = {}

# mapping if index to (director_id, name)
director_index_mapping = {}

# mapping of movie_id to (index, title, set(director_ids...), rating)
movie_id_mapping = {}

# mapping of index to (movie_id, title, set(director_ids...), rating)
movie_index_mapping = {}

director_last_index = 0
movie_last_index = 0
for movie in movie_data:
    if ('directors_summary' in movie['data']) and ('rating' in movie['data']):
        movie_id = movie['data']['tconst']
        if movie_id not in movie_id_mapping: # some movie_IDs are repeated for some reason...
            director_ids = set()
            for director in movie['data']['directors_summary']:
                director_id = director['name']['nconst']
                director_ids.add(director_id)
                if director_id not in director_id_mapping:
                    director_name = director['name']['name']
                    director_id_mapping[director_id] = (director_last_index, director_name)
                    director_index_mapping[director_last_index] = (director_id, director_name)
                    director_last_index += 1
            
            movie_id_mapping[movie_id] = (movie_last_index, movie['data']['title'], director_ids, movie['data']['rating'])
            movie_index_mapping[movie_last_index] = (movie_id, movie['data']['title'], director_ids, movie['data']['rating'])
            movie_last_index += 1
    
# prepare binary feature matrix, rows = movies, columns = actors
director_matrix = np.zeros(shape=(len(movie_id_mapping), len(director_id_mapping)), dtype=np.uint8)
ratings = np.empty(shape=len(movie_id_mapping), dtype=float)

for movie_id in movie_id_mapping:
    movie_index = movie_id_mapping[movie_id][0]
    for director_id in movie_id_mapping[movie_id][2]:
        director_index = director_id_mapping[director_id][0]
        director_matrix[movie_index, director_index] = 1
    ratings[movie_index] = movie_id_mapping[movie_id][3]

In [6]:
def predict(train_feature_matrix, train_classes, test_feature_matrix, test_classes_real, classifier):
    classifier.fit(train_feature_matrix, train_classes)
    test_classes_pred = classifier.predict(test_feature_matrix)
    return test_classes_pred, test_classes_real

In [7]:
# cross validation predict
def cv_predict(feature_matrix, classes, classifier, k):
    kf = KFold(len(classes), n_folds=k)
    results = []
    for train, test in kf:
        train_feature_matrix = feature_matrix[train]
        train_classes = classes[train]
        test_feature_matrix = feature_matrix[test]
        test_classes_real = classes[test]
        results.append(predict(train_feature_matrix, train_classes, test_feature_matrix, test_classes_real, classifier))
    return results

In [56]:
lasso_alpha = 0.000575877769539

In [19]:
regressor = lm.Lasso(alpha=lasso_alpha)
regressor.fit(matrix, ratings)



LassoCV(alphas=None, copy_X=True, cv=None, eps=0.001, fit_intercept=True,
    max_iter=1000, n_alphas=100, n_jobs=1, normalize=False, positive=False,
    precompute='auto', tol=0.0001, verbose=False)

In [163]:
matrix_copy = np.copy(actor_matrix)
np.random.shuffle(matrix_copy)

tt_index = len(matrix_copy) / 5

regressor2 = lm.Lasso(alpha=1.0)
regressor2.fit(matrix_copy[tt_index:], ratings[tt_index:])

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute='auto', tol=0.0001,
   warm_start=False)

In [167]:
print regressor2.intercept_
print np.sum(regressor2.coef_)

print np.mean(ratings[tt_index:]), np.median(ratings[tt_index:])

print rmse(regressor2.predict(matrix_copy[:tt_index]), ratings[:tt_index])

mean_preds = [np.mean(ratings[tt_index:])] * len(ratings[:tt_index])
print rmse(mean_preds, ratings[:tt_index])

6.30079702444
0.0
6.30079702444 6.4
1.01999494743
1.01999494743


In [15]:
regressor = LinearRegression()

K = 5

total_rmse = 0
total_r2_score = 0
start_time = time()

for pred_values, true_values in cv_predict(matrix, ratings, regressor, K):
    total_rmse += rmse(true_values, pred_values)
    total_r2_score += r2_score(true_values, pred_values)
    
avg_rmse = total_rmse / K
avg_r2_score = total_r2_score / K

print avg_rmse, avg_r2_score, time() - start_time

2.85427722152e+13 -8.0587005462e+26 1550.96899986


In [157]:
for movie_index, features in enumerate(director_matrix[:10]):
    print movie_index_mapping[movie_index][1]
    for director_index, value in enumerate(features):
        if value == 1:
            print director_index_mapping[director_index][1]
    print '-' * 80

Pulp Fiction
Quentin Tarantino
--------------------------------------------------------------------------------
The Amazing Spider-Man 2
Marc Webb
--------------------------------------------------------------------------------
The Shawshank Redemption
Frank Darabont
--------------------------------------------------------------------------------
Star Wars
George Lucas
--------------------------------------------------------------------------------
Back to the Future
Robert Zemeckis
--------------------------------------------------------------------------------
The Breakfast Club
John Hughes
--------------------------------------------------------------------------------
The Goonies
Richard Donner
--------------------------------------------------------------------------------
The Silence of the Lambs
Jonathan Demme
--------------------------------------------------------------------------------
Jurassic Park
Steven Spielberg
-----------------------------------------------------------