In [41]:
from json import loads, dumps

from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import KFold

import numpy as np

from sklearn.metrics import mean_squared_error, r2_score

from math import sqrt

from time import time

MOVIE_DATA_LOC = '../data/movies.bigdata'
MOVIE_ACTOR_DATA_LOC = '../data/movie_casts.bigdata'

In [None]:
# read in movie data
movie_data = []
with open(MOVIE_DATA_LOC) as f:
    for line in f:
        movie_data.append(loads(line))

In [None]:
# read in movie actor data
movie_actor_data = []
with open(MOVIE_ACTOR_DATA_LOC) as f:
    for line in f:
        movie_actor_data.append(tuple(line.rstrip().split(',')))

In [None]:
# mapping of actor_id to (index, name)
actor_id_mapping = {}

# mapping of index to (actor_id, name)
actor_index_mapping = {}

# mapping of movie_id to (index, title, set(actor_ids...), rating)
movie_id_mapping = {}

# mapping of index to (movie_id, title, set(actor_ids...), rating)
movie_index_mapping = {}

actor_last_index = 0
movie_last_index = 0
for movie in movie_data:
    if ('cast_summary' in movie['data']) and ('rating' in movie['data']):
        actor_ids = set()
        for cast_member in movie['data']['cast_summary']:
            actor_id = cast_member['name']['nconst']
            actor_ids.add(actor_id)
            if actor_id not in actor_id_mapping:
                actor_name = cast_member['name']['name']
                actor_id_mapping[actor_id] = (actor_last_index, actor_name)
                actor_index_mapping[actor_last_index] = (actor_id, actor_name)
                actor_last_index += 1
        movie_id = movie['data']['tconst']
        if movie_id not in movie_id_mapping: # some movie_IDs are repeated for some reason...
            movie_id_mapping[movie_id] = (movie_last_index, movie['data']['title'], actor_ids, movie['data']['rating'])
            movie_index_mapping[movie_last_index] = (movie_id, movie['data']['title'], actor_ids, movie['data']['rating'])
            movie_last_index += 1


In [33]:
def rmse(test_classes_true, test_classes_pred):
    return sqrt(mean_squared_error(test_classes_true, test_classes_pred))

In [34]:
def predict(train_feature_matrix, train_classes, test_feature_matrix, test_classes_real, classifier):
    classifier.fit(train_feature_matrix, train_classes)
    test_classes_pred = classifier.predict(test_feature_matrix)
    return test_classes_pred, test_classes_real

In [35]:
# cross validation predict
def cv_predict(feature_matrix, classes, classifier, k):
    kf = KFold(len(classes), n_folds=k)
    results = []
    for train, test in kf:
        train_feature_matrix = feature_matrix[train]
        train_classes = classes[train]
        test_feature_matrix = feature_matrix[test]
        test_classes_real = classes[test]
        results.append(predict(train_feature_matrix, train_classes, test_feature_matrix, test_classes_real, classifier))
    return results

In [39]:
# prepare binary feature matrix, rows = movies, columns = actors
matrix = np.empty(shape=(len(movie_id_mapping), len(actor_id_mapping)), dtype=np.uint8)
matrix.fill(-1)

ratings = np.empty(shape=len(movie_id_mapping), dtype=float)

for movie_id in movie_id_mapping:
    movie_index = movie_id_mapping[movie_id][0]
    for actor_id in movie_id_mapping[movie_id][2]:
        actor_index = actor_id_mapping[actor_id][0]
        matrix[movie_index, actor_index] = 1
    ratings[movie_index] = movie_id_mapping[movie_id][3]

In [40]:
regressor = LinearRegression()

K = 5

total_rmse = 0
total_r2_score = 0
start_time = time()

for pred_values, true_values in cv_predict(matrix[:100], ratings, regressor, K):
    total_rmse += rmse(true_values, pred_values)
    total_r2_score += r2_score(true_values, pred_values)
    
avg_rmse = total_rmse / K
avg_r2_score = total_r2_score / K

print avg_rmse, avg_r2_score, time() - start_time

ValueError: Cannot have number of folds n_folds=5 greater than the number of samples: 0.