In [1]:
from json import loads, dumps

from sklearn.linear_model import LinearRegression, Lasso, LassoCV
from sklearn.cross_validation import KFold

import numpy as np

from sklearn.metrics import mean_squared_error, r2_score

from math import sqrt

from time import time

from scipy.sparse import csc_matrix

# Path of the movie file; the loading cell parses every line of it as one JSON document.
MOVIE_DATA_LOC = '../data/movies.bigdata'
# Path of the cast file; the loading cell splits every line of it on commas, and the
# resulting fields are later unpacked as (movie id, actor id, actor name).
MOVIE_ACTOR_DATA_LOC = '../data/movie_casts.bigdata'

In [2]:
# Load the movie records: every line of the file is parsed as its own JSON
# document and the parsed objects are kept in file order.
movie_data = []
with open(MOVIE_DATA_LOC) as f:
    movie_data.extend(loads(line) for line in f)

In [3]:
# read in movie actor data
# Every line becomes a tuple of comma-separated fields; the code that builds
# the lookup tables unpacks each tuple as (movie id, actor id, actor name).
# Only the first two commas are treated as separators, so an actor name that
# itself contains a comma stays in a single field instead of producing a
# four-element tuple that would break that three-way unpacking.
movie_actor_data = []
with open(MOVIE_ACTOR_DATA_LOC) as f:
    for line in f:
        # rstrip() drops the trailing newline (and any other trailing whitespace)
        movie_actor_data.append(tuple(line.rstrip().split(',', 2)))

In [4]:
# ---- lookup tables derived from the cast records ----

# actor_id -> actor name (the first name seen for an id is kept)
actor_name_mapping = {}

# actor_id -> dense integer index, handed out in order of first appearance
actor_index_mapping = {}

# movie_id -> set of actor_ids appearing in that movie
movie_cast_mapping = {}

actor_last_index = 0
for movie_ID, actor_ID, actor_name in movie_actor_data:
    actor_name_mapping.setdefault(actor_ID, actor_name)
    if actor_ID not in actor_index_mapping:
        actor_index_mapping[actor_ID] = actor_last_index
        actor_last_index += 1
    movie_cast_mapping.setdefault(movie_ID, set()).add(actor_ID)

# ---- lookup tables derived from the movie records ----

# movie_id -> title
movie_name_mapping = {}

# movie_id -> rating (-1. when the record carries no rating)
movie_rating_mapping = {}

# movie_id -> dense integer index; only movies with a rating and a cast get one
movie_index_mapping = {}

movie_last_index = 0
for entry in movie_data:
    try:
        movie_rating = float(entry['data']['rating'])
    except KeyError:
        # sentinel value for indicating lack of rating
        movie_rating = -1.
    details = entry['data']
    movie_ID = details['tconst']
    movie_name = details['title']

    # the first record seen for a movie id decides its title and rating
    movie_name_mapping.setdefault(movie_ID, movie_name)
    movie_rating_mapping.setdefault(movie_ID, movie_rating)

    if movie_ID in movie_index_mapping:
        continue
    # needs to have a rating and a cast
    is_rated = movie_rating_mapping[movie_ID] != -1.
    if not (is_rated and movie_ID in movie_cast_mapping):
        continue
    # year filter
    movie_year = int(details['year'])
    if movie_year >= 0:
        movie_index_mapping[movie_ID] = movie_last_index
        movie_last_index += 1

# Report the size of every table on one space-separated line. A single
# pre-formatted string is printed so the output is the same whether `print`
# is the statement or the function.
print(' '.join(str(len(table)) for table in (
    actor_name_mapping, actor_index_mapping, movie_cast_mapping,
    movie_name_mapping, movie_rating_mapping, movie_index_mapping)))

221334 221334 9624 9868 9868 9409
221334 221334 9624 9868 9868 9409


In [5]:
def rmse(test_classes_true, test_classes_pred):
    """Return the root-mean-squared error of the predictions.

    The value is the square root of ``mean_squared_error`` evaluated on the
    true values and the predicted values, in that argument order.
    """
    squared_error = mean_squared_error(test_classes_true, test_classes_pred)
    return sqrt(squared_error)

In [6]:
def predict(train_feature_matrix, train_classes, test_feature_matrix, test_classes_real, classifier):
    """Fit ``classifier`` on the training data and predict the test rows.

    ``classifier`` is fitted in place via ``fit(train_feature_matrix,
    train_classes)`` and then asked to ``predict(test_feature_matrix)``.

    Returns a ``(predicted, real)`` tuple; ``test_classes_real`` is passed
    through untouched so callers get predictions and ground truth together.
    """
    classifier.fit(train_feature_matrix, train_classes)
    return classifier.predict(test_feature_matrix), test_classes_real

In [7]:
# cross validation predict
def cv_predict(feature_matrix, classes, classifier, k):
    """Run ``predict`` once per fold of a k-fold split and collect the results.

    The folds come from ``KFold(len(classes), n_folds=k)``; for each
    ``(train, test)`` index pair, ``feature_matrix`` and ``classes`` are
    indexed with those indices and handed to ``predict`` together with the
    same ``classifier`` object.

    Returns a list holding one ``predict`` result per fold, in fold order.
    """
    folds = KFold(len(classes), n_folds=k)
    return [
        predict(feature_matrix[train_idx], classes[train_idx],
                feature_matrix[test_idx], classes[test_idx], classifier)
        for train_idx, test_idx in folds
    ]

In [8]:
# Binary cast matrix: one row per indexed movie, one column per actor;
# a cell is 1 when that actor appears in that movie's cast, otherwise 0.
# NOTE(review): the dense uint8 array holds len(movie_index_mapping) *
# len(actor_index_mapping) bytes before it is converted to sparse form.
num_movies = len(movie_index_mapping)
num_actors = len(actor_index_mapping)
matrix = np.zeros(shape=(num_movies, num_actors), dtype=np.uint8)

# target vector: ratings[i] is the rating of the movie stored in row i
ratings = np.empty(shape=num_movies, dtype=float)

for movie_ID, movie_index in movie_index_mapping.items():
    # columns of every actor in this movie's cast, set in a single assignment
    cast_columns = [actor_index_mapping[actor_ID] for actor_ID in movie_cast_mapping[movie_ID]]
    matrix[movie_index, cast_columns] = 1
    ratings[movie_index] = movie_rating_mapping[movie_ID]

# compressed sparse column copy, used as the regression input
smatrix = csc_matrix(matrix)

In [10]:
# Cross-validate a Lasso regressor on the sparse cast matrix and report the
# fold-averaged RMSE and R^2 together with the elapsed wall-clock seconds.
regressor = Lasso(alpha=0.01)

# number of cross-validation folds
K = 10

start_time = time()
fold_results = cv_predict(smatrix, ratings, regressor, K)

# each fold result is a (predicted, true) pair; the metrics take (true, predicted)
total_rmse = sum(rmse(true_values, pred_values)
                 for pred_values, true_values in fold_results)
total_r2_score = sum(r2_score(true_values, pred_values)
                     for pred_values, true_values in fold_results)

avg_rmse = total_rmse / K
avg_r2_score = total_r2_score / K

# one pre-formatted string, so the three values print space-separated exactly
# as before whether `print` is the statement or the function
print('%s %s %s' % (avg_rmse, avg_r2_score, time() - start_time))
print(regressor)

1.13005315916 -0.235606378468 41.6980001926
Lasso(alpha=0.01, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute='auto', tol=0.0001,
   warm_start=False)
1.13005315916 -0.235606378468 41.6980001926
Lasso(alpha=0.01, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute='auto', tol=0.0001,
   warm_start=False)
