In [1]:
import csv
import itertools
import time
import pickle
import random
import copy

import numpy as np
import math

from collections import defaultdict
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics import roc_curve, auc

from scikits.crab.models.classes import MatrixPreferenceDataModel, MatrixBooleanPrefDataModel
from scikits.crab.metrics import *
from scikits.crab.similarities import UserSimilarity
from scikits.crab.recommenders.knn import UserBasedRecommender
from scikits.crab.recommenders.svd.classes import MatrixFactorBasedRecommender
from scikits.crab.recommenders.knn.item_strategies import AllPossibleItemsStrategy, ItemsNeighborhoodStrategy

# Primary Data Holder for Reviews Information
The reviews dataset allows us to load and pass information to Crab to be analyzed. It also lets us remove and add data as needed for producing personalized recommendations.

In [332]:
class reviews_dataset():
    '''
    Specialized class for reading recommendations csv files produced
    by our rotten tomatoes parsing script.

    Implements the Crab (http://muricoca.github.io/crab/index.html)
    interface to be used with their recommendation system implementations

    namely:
    data - mapping of reviewer names to their {movie id: score} preferences
    user_ids - set of the reviewer names currently present in data
    item_ids - mapping of movie ids to their titles

    Additional bookkeeping kept by this class:
    movie_ids - mapping of titles to the set of movie ids carrying that title
    reviews - the raw rows that were loaded successfully
    invalid_scores - the raw rows skipped because their score was not numeric
    '''

    def __init__(self, fileloc):
        """
        Read the tab-delimited file at fileloc and load every row into the
        data structures described on the class.
        """
        self.data = {}

        self.item_ids = {}
        self.movie_ids = {}

        self.user_ids = set()

        self.invalid_scores = []
        self.reviews = []

        with open(fileloc) as f:
            reader = csv.reader(f, delimiter='\t', quotechar='\"')

            for row in reader:
                self.__append_row(row)

    def __append_row(self, row):
        """
        Record a single parsed row.

        Columns used: 0 = reviewer name, 1 = movie id, 2 = title, 6 = score.
        Rows whose score cannot be parsed as a number are stored in
        invalid_scores and otherwise ignored.
        """
        name = row[0]
        mid = int(row[1])
        title = row[2]
        score = row[6]

        try:
            # Normalize score to a 1-5 scale: a raw score s becomes 1 + 4*s
            # (assumes the raw score is a fraction in [0, 1] - TODO confirm)
            score = 1 + (float(score) * 4)
        except (TypeError, ValueError):
            # Only parse failures are treated as "invalid score"; anything
            # else is a real error and should surface.
            self.invalid_scores.append(row)
            return

        self.reviews.append(row)

        self.user_ids.add(name)

        self.item_ids[mid] = title
        self.movie_ids.setdefault(title, set()).add(mid)

        self.data.setdefault(name, {})[mid] = score

    def get_review_counts(self):
        """
        Return a list of (movie id, review count) tuples, sorted by the
        highest number of review counts.
        """
        counts = {}
        for prefs in self.data.values():
            for mid in prefs:
                counts[mid] = counts.get(mid, 0) + 1

        return sorted(counts.items(), key=lambda x: x[1], reverse=True)

    def get_score_counts(self):
        """
        Return the number of non-empty cells in the data dictionary
        (i.e. the total number of scores over all reviewers).
        """
        return sum(len(prefs) for prefs in self.data.values())

    def get_title(self, mid):
        """
        Returns title string for movie id
        """
        return self.item_ids[mid]

    def get_mid(self, title):
        """
        Returns the set of movie ids stored for the given title, since
        there may be more than one mid per title.

        The returned set is the internal one; copy it before modifying.
        """
        return self.movie_ids[title]

    def add_reviewer(self, name, scores):
        """
        Given a name and scores ({movie id: score}), add a reviewer to this
        dataset

        Note: This will overwrite any data currently in place!
        """
        self.remove_reviewer(name)

        self.data[name] = scores
        self.user_ids.add(name)

    def remove_reviewer(self, name):
        """
        Removes and returns the scores for a given reviewer (in the case of
        saving them). Returns None when the reviewer is not in the dataset.
        """
        if name not in self.data:
            return None

        scores = self.data[name]
        del(self.data[name])
        self.user_ids.remove(name)

        return scores

    def remove_movie(self, mid):
        """
        Removes and returns ratings for a given movie as a
        {reviewer name: score} dictionary. Returns None when the movie id is
        not in the dataset.
        """
        if mid not in self.item_ids:
            return None

        scores = {}
        for name in self.data:
            if mid in self.data[name]:
                scores[name] = self.data[name][mid]
                del(self.data[name][mid])

        title = self.item_ids[mid]
        del(self.item_ids[mid])
        self.movie_ids[title].remove(mid)

        return scores

    def add_movie(self, mid, title, scores):
        """
        given id, title, and scores ({reviewer name: score}), add them to the
        data set for this object. Reviewers that are not in the dataset yet
        are created.

        Note: This will overwrite any existing data!
        """
        self.remove_movie(mid)

        for name in scores:
            # setdefault: a reviewer may have been removed (or never loaded)
            # since the scores were captured; indexing directly would raise
            # KeyError after some scores had already been written.
            self.data.setdefault(name, {})[mid] = scores[name]
            self.user_ids.add(name)

        self.item_ids[mid] = title

        self.movie_ids.setdefault(title, set()).add(mid)
        
        
        

# Basic Functional Testing

To make sure that everything in the previous class works, we set up a small movie set with 6 movies and a few hundred reviews.

In [333]:
test_loc = "/Users/eho/Documents/HES/Machine Learning/Final/Data/test.csv"
mvs = reviews_dataset(test_loc)

In [334]:
mvs.item_ids

{91976: 'Grey, The',
 93840: 'Cabin in the Woods, The',
 103249: 'World War Z',
 104272: 'Blackfish',
 107069: 'Lone Survivor',
 109895: 'Bad Words'}

In [335]:
mvs.data['A.O. Scott'][91976]

3.4

In [336]:
mvs.get_mid("World War Z")

{103249}

In [337]:
mvs.get_review_counts()

[(93840, 199),
 (103249, 197),
 (107069, 159),
 (91976, 150),
 (104272, 97),
 (109895, 95)]

In [338]:
ao_scott_scores = mvs.remove_reviewer("A.O. Scott")

In [339]:
ao_scott_scores

{91976: 3.4, 103249: 3.4}

In [340]:
"A.O. Scott" in mvs.data

False

In [341]:
mvs.add_reviewer("A.O. Scott", ao_scott_scores)

In [342]:
"A.O. Scott" in mvs.data

True

In [343]:
revs = mvs.remove_movie(93840)

In [344]:
len(revs)

199

In [345]:
mvs.item_ids

{91976: 'Grey, The',
 103249: 'World War Z',
 104272: 'Blackfish',
 107069: 'Lone Survivor',
 109895: 'Bad Words'}

In [346]:
mvs.add_movie(93840, "Cabin in the Woods, The", revs)
len(mvs.item_ids)

6

In [347]:
mvs.item_ids

{91976: 'Grey, The',
 93840: 'Cabin in the Woods, The',
 103249: 'World War Z',
 104272: 'Blackfish',
 107069: 'Lone Survivor',
 109895: 'Bad Words'}

# Experimenting with Crab

It took a fair bit of time to get Crab working. Its documentation is woefully lacking, and there are still a few spots in the code that have typos in them. That being said, we found both the neighbors model and the matrix model to be adequate implementations of both systems. We use our test data set here to make sure everything is working properly.

In [348]:
model = MatrixPreferenceDataModel(mvs.data)
print(model)

MatrixPreferenceDataModel (420 by 6)
         91976      93840      103249     104272     107069   ...
A.A. Dow    ---     4.200000   3.000000      ---     3.000000
A.O. Sco 3.400000      ---     3.400000      ---        ---
Adam Ros    ---     4.200000      ---        ---        ---
Al Alexa    ---        ---        ---        ---     3.800000
Alan Jon    ---     4.200000      ---        ---        ---
Alex Zan    ---     5.000000      ---        ---        ---
Ali Gray    ---     5.000000      ---        ---        ---
Alistair    ---        ---        ---     4.200000      ---
Amber Wi    ---        ---        ---     4.600000      ---
Amy Bian 2.000000      ---        ---        ---        ---
Amy Curt    ---     4.000000      ---        ---        ---
Amy Nich    ---        ---        ---        ---     2.600000
Anders W    ---        ---     2.600000      ---     2.600000
Andrea C 4.200000      ---     2.600000      ---        ---
Andy Lea    ---     4.200000   4.200000      --- 

In [349]:
def get_recommendations(reviews, recommender, userid):
    """
    Ask the recommender for userid's recommendations and swap each movie id
    for its title.

    Each recommendation is read as (movie id, value); the result is a list of
    (title, value) tuples in the order the recommender produced them.
    """
    titled = []
    for rec in recommender.recommend(userid):
        titled.append((reviews.get_title(rec[0]), rec[1]))
    return titled

In [350]:
def get_similar_users(reviews, recommender, userid, n=15):
    """
    Return whatever the recommender reports as the users most similar to
    userid, passing n as the how_many argument.

    reviews is accepted for signature symmetry with get_recommendations but
    is not used.
    """
    neighbours = recommender.most_similar_users(userid, how_many=n)
    return neighbours

In [351]:
"""
There are many other options for distance metrics other available in

https://github.com/muricoca/crab/blob/master/scikits/crab/metrics/pairwise.py
"""
users_similarity = UserSimilarity(model, pairwise.cosine_distances)
recommender = UserBasedRecommender(model, users_similarity, with_preference=True)

In [352]:
mvs.data["A.O. Scott"]

{91976: 3.4, 103249: 3.4}

In [353]:
user = "A.O. Scott"
print(mvs.data[user])
print(get_recommendations(mvs, recommender, user))

{91976: 3.4, 103249: 3.4}
[('Blackfish', 4.1314285714285708), ('Cabin in the Woods, The', 4.0886614173228342), ('Lone Survivor', 3.659259259259259), ('Bad Words', 3.5019047619047625)]


In [354]:
get_similar_users(mvs, recommender, user, n=2)

array(['Brad Miska', 'Edward Douglas', 'Jordan Farley'], 
      dtype='|S14')

# So Does This Actually Scale?
We tried applying it to the full dataset of reviews, but it was painfully slow (the neighbors method especially was non-functional). So we cut the dataset down to only look at recent reviews from 2012, 2013, 2014, and 2015. The matrix model was also a drastic improvement in both efficiency and performance.

In [41]:
loc = "/Users/eho/Documents/HES/Machine Learning/Final/Data/recent_reviews.csv"
all_mvs = reviews_dataset(loc)

In [42]:
model = MatrixPreferenceDataModel(all_mvs.data)
print(model)

MatrixPreferenceDataModel (1987 by 2206)
         89745      91485      91500      91529      91535    ...
A.A. Dow    ---        ---        ---        ---        ---
A.O. Sco 0.400000      ---        ---        ---        ---
AP Kryza    ---        ---        ---        ---        ---
AV Club     ---        ---        ---        ---        ---
Aaron Ar    ---        ---        ---        ---        ---
Aaron Me    ---        ---        ---        ---        ---
Aaron Ne    ---        ---        ---        ---        ---
Aaron Ya    ---     0.400000      ---        ---     0.400000
Aarti Jh    ---        ---        ---        ---        ---
Abbey Be    ---        ---        ---        ---        ---
Abby Gar    ---        ---        ---        ---        ---
Abby Wes    ---        ---        ---        ---        ---
Abhimany    ---        ---        ---        ---        ---
Abhinav     ---        ---        ---        ---        ---
Abigail     ---        ---        ---        ---   

In [43]:
# users_similarity = UserSimilarity(model, pairwise.euclidean_distances)
# recommender = UserBasedRecommender(model, users_similarity, with_preference=True)
recommender = MatrixFactorBasedRecommender(model)

In [44]:
rec = recommender.estimate_preference("A.O. Scott", 91485)
rec

0.57882195445260931

# I need to rate some movies!

I wanted personalized recommendations, so I just rated some of the movies in the dataset to see how it did for my preferences

In [45]:
movies = all_mvs.get_review_counts()

In [46]:
def rate_some_movies(reviews, my_scores=None):
    """
    If you don't have any ratings, use this to start ranking movies in order
    by review count. Can pass in a filled dictionary if you have already done
    this for some movies.

    reviews   - object providing get_review_counts() and get_title(mid)
    my_scores - optional {movie id: score} dictionary; movies already in it
                are skipped and it is updated in place. A fresh dictionary
                is created when omitted.

    Returns the {movie id: score} dictionary. A rating r typed in as 0 - 5 is
    stored as r / 5.0.

    Input handling per movie: an empty line skips the movie, an integer
    outside 0 - 5 stops the session, and anything that is not an integer is
    reported and skipped so that a typo does not lose the ratings entered
    so far.
    """
    # A literal {} default would be shared between calls, so ratings from one
    # session would silently leak into the next.
    if my_scores is None:
        my_scores = {}

    print("Input movie ratings 0 - 5 (enter nothing to continue or enter any other number to exit)")

    for entry in reviews.get_review_counts():
        mid = entry[0]
        if mid in my_scores:
            continue

        # Look the title up on the dataset that was passed in, not on a
        # notebook-level global.
        print("%s: " % reviews.get_title(mid))

        answer = raw_input()
        if not answer:
            continue

        try:
            rating = int(answer)
        except ValueError:
            print("Could not read %r as a whole number; skipping this movie" % answer)
            continue

        if rating < 0 or rating > 5:
            break

        my_scores[mid] = rating / 5.0

    return my_scores

In [85]:
#scores = rate_some_movies(all_mvs)

In [86]:
# Commented out to not overwrite
#f = open("eriks_reviews.pickle", 'wb')
#pickle.dump(scores, f)
#f.close()

In [87]:
# Did this previously and wrote my ratings to a file
# NOTE: unpickling can execute arbitrary code - only load pickle files you created yourself.
with open("eriks_reviews.pickle", 'rb') as scores_file:
    eriks_scores = pickle.load(scores_file)

In [88]:
all_mvs.data["Erik Holum"] = eriks_scores

In [89]:
# Number of movies I rated (the saved output of this cell shows 65)
len(eriks_scores)

65

# Timing the Recommendations

So looking below it takes about 90 seconds to perform the matrix factorization for the data we currently have. 

In [62]:
start = time.time()
model = MatrixPreferenceDataModel(all_mvs.data)
print(time.time() - start)

3.65814399719


In [63]:
start = time.time()
my_recommender = MatrixFactorBasedRecommender(model,
                                             items_selection_strategy=ItemsNeighborhoodStrategy,
                                             n_features=20,
                                             learning_rate=.01,
                                             n_interations=50,
                                             with_preference=True)
print(time.time() - start)

91.8303189278


In [120]:
movies = ["%d %s" % (x[0],all_mvs.get_title(x[0])) for x in all_mvs.get_review_counts()]
c = 20
movies[c:c+10]

['112623 Dawn of the Planet of the Apes',
 '103228 Pacific Rim',
 '97752 Cloud Atlas',
 '117529 Jurassic World',
 '110102 Captain America: The Winter Soldier',
 '98961 Zero Dark Thirty',
 '96610 Looper',
 '102445 Star Trek Into Darkness',
 '93840 Cabin in the Woods, The',
 '102407 Great Gatsby, The']

In [73]:
# I've tested this with a bunch of movies that I've seen, and it's never 
# off by more than one star. So that's cool!!!
for i in range(10000000, 10000014):
    print("%s %f" % (all_mvs.get_title(i), my_recommender.estimate_preference("Erik Holum", i)))

Star Wars VII: The Force Awakens 0.747579
Sisters 0.652037
Peanuts Movie, The 0.676708
Son of Saul 0.704667
Hunger Games Mockingjay Part 2, The 0.800000
Good Dinosaur, The 0.627532
Creed 0.713339
Krampus 0.566268
Spectre 0.400000
Hateful Eight, The 0.626948
In the Heart of the Sea 0.560637
Martian, The 1.000000
Pan 0.516011
Last Witch Hunter, The 0.418982


# Testing and Verification

We use the standard but controversial Root Mean Squared Error to evaluate our recommender's performance. We start off just by splitting the scraped scores into cross-validation sets, and found that the RMSE was .63! This is unsurprising since we used professionally produced data on a small set, and it makes sense that professional reviewers' preferences would be more accurate over time than the average user's, as we expect them to be more "consistent" in their ratings.

In [355]:
loc = "/Users/eho/Documents/HES/Machine Learning/Final/Data/recent_reviews.csv"
all_mvs = reviews_dataset(loc)

In [356]:
# Keep only the reviewers that have more than 50 reviews
reviewers = [name for name in all_mvs.user_ids if len(all_mvs.data[name]) > 50]
print(len(reviewers))

# Draw a seeded random sample of 100 of them, without replacement
np.random.seed(1000)
reviewers = np.random.choice(reviewers, size=100, replace=False)

447


In [357]:
def split_dictionary(dictionary, size=.3, random=None):
    """
    Randomly split a dictionary into two parts of size and 1 - size.

    dictionary - the mapping to split; it is not modified
    size       - fraction of the keys (rounded down) to move to the second part
    random     - seed for the split; None gives a different split every call.
                 (The name shadows the random module inside this function; it
                 is kept because callers pass it by keyword.)

    Return deep copies of the new dictionaries as [rest, sampled], where
    sampled holds int(len(dictionary) * size) entries and rest holds the
    remainder. Keys keep their original Python types.
    """
    keys = list(dictionary.keys())
    n_sampled = int(len(keys) * size)

    # A private generator keeps the split reproducible without reseeding the
    # global NumPy RNG as a side effect.
    rng = np.random.RandomState(random)

    if keys:
        # Sample positions rather than the keys themselves: choosing from the
        # key list would hand back NumPy scalars (and coerce mixed key types)
        # instead of the original key objects.
        positions = rng.choice(len(keys), size=n_sampled, replace=False)
    else:
        positions = []

    olddict = copy.deepcopy(dictionary)
    newdict = {}
    for pos in positions:
        key = keys[pos]
        newdict[key] = olddict.pop(key)

    return [olddict, newdict]

d = {a:a+1 for a in range(10)}
old, new = split_dictionary(d, size=.3, random=None)
print(old)
print(new)

{0: 1, 1: 2, 3: 4, 4: 5, 5: 6, 7: 8, 9: 10}
{8: 9, 2: 3, 6: 7}


In [423]:
def evaluate_reviewers(names, test_scores=None, mvs_set=None, cv_size=.3, random_state=None, fileloc=None):
    """
    Given the names of a set of reviewers, splits their review data into cross-validation set and
    a training set, builds a cf model using all other data, then evaluates the predictions
    produced for the CV set.

    names        - iterable of reviewer names to evaluate
    test_scores  - optional {name: {movie id: score}}; when given, each name is (re)added to the
                   dataset with these scores before the split
    mvs_set      - reviews_dataset to evaluate against (ignored when fileloc is given)
    cv_size      - fraction of each reviewer's scores that is held out
    random_state - seed passed to split_dictionary (the same seed is used for every reviewer)
    fileloc      - path of a reviews file; when given a fresh reviews_dataset is loaded from it

    Returns [revs, movies, actual_scores, predicted_scores]: four parallel lists with, per held-out
    score, the reviewer name, [movie id, title], the held-out score and the predicted score.

    Raises ValueError when neither mvs_set nor fileloc is supplied, and KeyError when a name is
    not present in the dataset. The reviewers' full score sets are restored in the dataset before
    returning or raising.
    """

    # Allow specification of reviews file location
    if fileloc:
        # Load from the path that was passed in, not from a notebook-level variable
        mvs_set = reviews_dataset(fileloc)

    if mvs_set is None:
        raise ValueError("evaluate_reviewers needs either mvs_set or fileloc")

    # Add scores if specified
    if test_scores:
        for name in names:
            mvs_set.add_reviewer(name, test_scores[name])

    all_scores = {}
    actual_scores = []
    predicted_scores = []
    revs = []
    movies = []

    try:
        # Remove data from data set
        for name in names:
            removed = mvs_set.remove_reviewer(name)
            if removed is None:
                raise KeyError("reviewer %r is not in the dataset" % (name,))
            all_scores[name] = removed

        # Split into a CV set
        train_set = {}
        test_set = {}
        for name in names:
            kept, held_out = split_dictionary(all_scores[name],
                                              size=cv_size,
                                              random=random_state)
            train_set[name] = kept
            test_set[name] = held_out

        # Add the training set to the data set
        for name in names:
            mvs_set.add_reviewer(name, train_set[name])

        # Perform the matrix factorization
        # The real problem with crab is that this has to be recomputed every time!
        # We note that we have a problem with "overfitting" and hence limit the iterations to 30.
        model = MatrixPreferenceDataModel(mvs_set.data)
        # NOTE(review): n_interations is spelled exactly as in the original call; presumably that
        # is the keyword crab expects - confirm before "correcting" it.
        my_recommender = MatrixFactorBasedRecommender(model,
                                                      items_selection_strategy=ItemsNeighborhoodStrategy,
                                                      n_features=20,
                                                      learning_rate=.01,
                                                      n_interations=30,
                                                      with_preference=True)

        # The model is not modified below, so its item ids are fetched once
        model_items = model.item_ids()

        # For each name and each left out rating, get the scores and append them to our results
        for name in names:
            cv = test_set[name]
            for mid in cv:
                if mid not in model_items:
                    # This can happen if we remove the only review for that movie. In that
                    # case we're not that interested anyway.
                    continue
                actual_scores.append(cv[mid])
                predicted_scores.append(my_recommender.estimate_preference(name, mid))
                revs.append(name)
                movies.append([mid, mvs_set.get_title(mid)])

    finally:
        # No matter what be sure we put the data back (only for reviewers that were
        # actually removed above)
        for name in all_scores:
            mvs_set.add_reviewer(name, all_scores[name])

    return [revs, movies, actual_scores, predicted_scores]


In [419]:
start = time.time()
results = evaluate_reviewers(reviewers, 
                             fileloc="/Users/eho/Documents/HES/Machine Learning/Final/Data/recent_reviews.csv", 
                             cv_size=.2, 
                             random_state=10000)
print(time.time() - start)

68.7227740288


In [421]:
# Shocking? Maybe not since we are comparing netflix reviews to netflix reviews. Recall that the prize for
# the netflix competition had an RMSE of .85
# NOTE(review): the call that produced `results` was given the recent_reviews.csv file, so
# "netflix reviews" above presumably means the scraped critic reviews - confirm the wording.
# results[2] holds the held-out (actual) scores and results[3] the predicted scores.
math.sqrt(mean_squared_error(results[2], results[3]))

0.6306454695244874

# Actual User Ratings Comparison

The 550 MB ratings file in ml-20 has far too many movies for us to actually evaluate in a simple iPython notebook. Here we are filtering out the reviews for only the movies in our all_mvs set. We still end up with 181122 ratings, more than we are interested in using now. We select a random set of 650 users, then split each set into a training set and a test set for cross-validation.

Using a set of 3696 reviews, we found our RMSE to be .867, so less than a star rating off. The next step would be to build a matrix factorization algorithm that actually produces results in a more efficient manner.

In [375]:
# Copy only movies that we are interested in
# Both files are opened in a with-block so they are closed (and the output flushed)
# even if a line fails to parse.
with open("/Users/eho/Documents/HES/Machine Learning/movies/ml-20m/ratings.csv", "r") as ratings_in, \
        open("test_ratings.csv", "w") as ratings_out:
    # The first line is consumed and not copied (presumably the CSV header)
    ratings_in.readline()
    for line in ratings_in:
        # Second comma-separated field is the movie id
        mid = int(line.split(',')[1])
        if mid in all_mvs.item_ids:
            ratings_out.write(line)

In [425]:
# Build {user id (str): {movie id (int): score (float)}} from the filtered ratings file.
user_ratings = {}

# with-block so the file handle is closed once the ratings are loaded
with open("test_ratings.csv", "r") as ratings_file:
    for line in ratings_file:
        fields = line.split(',')
        userId = str(fields[0])
        mid = int(fields[1])
        score = float(fields[2])

        user_ratings.setdefault(userId, {})[mid] = score

len(user_ratings)

12602

In [426]:
# 12602 is a lot, lets remove the empty ones and ones with less than 5 (there are many)
# Iterate over a snapshot of the keys: entries are deleted from the dictionary inside the loop.
for uid in list(user_ratings.keys()):
    if len(user_ratings[uid]) < 5:
        del(user_ratings[uid])
len(user_ratings)

6705

In [432]:
test_user_ratings = split_dictionary(user_ratings, size=.1, random=1000)[1]
len(test_user_ratings)

670

In [433]:
uids = []
for uid in test_user_ratings:
    uids.append(uid)
    all_mvs.add_reviewer(uid, test_user_ratings[uid])

In [434]:
# Run our tests, do this in sets of 50, assuming that that many uids won't disrupt the model (since there are ~2000
# professionals)
start = time.time()

# One entry per batch; each entry is the 4-element list returned by evaluate_reviewers
ratings_results = []

# range(0, 550, 50) yields 11 batches, i.e. only the first 550 entries of uids are evaluated.
# NOTE(review): the saved output of the sampling cell shows 670 users - confirm that stopping
# at 550 is intended.
for i in range(0, 550, 50):
    tmp_users = uids[i:i+50]
    tmp_scores = {uid:test_user_ratings[uid] for uid in tmp_users}
    # Because fileloc is passed, evaluate_reviewers builds its own reviews_dataset for each batch
    # rather than using all_mvs; the batch's ratings reach that dataset through test_scores.
    ratings_results.append(evaluate_reviewers(tmp_users, 
                           fileloc="/Users/eho/Documents/HES/Machine Learning/Final/Data/recent_reviews.csv", 
                           cv_size=.3, 
                           random_state=10000,
                           test_scores=tmp_scores))
print(time.time() - start)

782.425244093


In [442]:
# Concatenate the four parallel result lists of every batch into one set of four lists
all_results = [[], [], [], []]
for batch in ratings_results:
    for column in range(4):
        all_results[column] += batch[column]

In [444]:
# .86 isn't too shabby. It's a small test set, but we haven't done a lot of work for this. I think it shows promise
math.sqrt(mean_squared_error(all_results[2], all_results[3]))

0.8655136031204704

In [445]:
# With 3696 test scores our RMSE was .866
# all_results itself always has 4 entries (one list per result column), so count one column
len(all_results[2])

3696

In [468]:
# with-block so the pickle file is flushed and closed right after writing
with open("all_ratings_results.pickle", 'wb') as results_file:
    pickle.dump(all_results, results_file)

In [36]:
# NOTE: unpickling can execute arbitrary code - only load pickle files you created yourself.
with open("all_ratings_results.pickle", 'rb') as results_file:
    all_results = pickle.load(results_file)

In [30]:
all_results[3][0]

3.5052290928927783

In [34]:
# Dumping truth and predictions
# One "actual,predicted" line per test score; the with-block closes the file even on error
with open("all_ratings_results.csv", "w") as results_csv:
    for actual, predicted in zip(all_results[2], all_results[3]):
        results_csv.write("%f,%f\n" % (actual, predicted))

# How Will People Like Star Wars?

I set up a Google Form to gather reviews from friends, family, and classmates, just to let them know what to expect when choosing whether to wait in line for Star Wars or to give Sisters a much needed extra $10 on its opening weekend.

Surprisingly, according to our recommender, there are a few people who shouldn't spend the money on Star Wars (including my mother).

In [540]:
# Load the classmates' ratings into class_scores: {name: {movie id: score}}.
loc = "/Users/eho/Documents/HES/Machine Learning/Final/Data/classmate_ratings.csv"
# NOTE: the file handle opened here is never explicitly closed.
reader = csv.reader(open(loc), delimiter=',', quotechar='\"')

class_scores = {}

# First row is the header; its first two cells are dropped and the remaining cells are
# used as movie titles. reader.next() is the Python 2 iterator API.
header = reader.next()[2:]
# get_mid returns the dataset's own set of ids for a title, so work on deep copies.
movies = [copy.deepcopy(all_mvs.get_mid(x)) for x in header]
# NOTE(review): an extra id is added to the id sets of the first three columns; the reason
# is not evident from this cell - confirm.
movies[0].add(10000005)
movies[1].add(10000011)
movies[2].add(10000013)

for line in reader:
    # Column 1 is the person's name; a later row with the same name replaces the earlier one.
    name = line[1]
    scores = {}
    
    for i in range(len(movies)):
        # pop() + add() reads one element of the set without changing the set's contents.
        # NOTE(review): set.pop() returns an arbitrary element, so for a set holding more than
        # one id the id picked is not guaranteed to be the same for every row - confirm.
        mid = movies[i].pop()
        movies[i].add(mid)
        # Score for the i-th movie column (the first two columns are not scores)
        s = line[i+2]
        try:
            scores[mid] = float(s)
        except:
            # Any failure to parse the cell (bare except) leaves this movie unrated for this person
            continue
    
    class_scores[name] = scores
        

In [541]:
loc = "/Users/eho/Documents/HES/Machine Learning/Final/Data/recent_reviews.csv"
all_mvs = reviews_dataset(loc)

In [542]:
for c in class_scores:
    all_mvs.add_reviewer(c, class_scores[c])

In [544]:
model = MatrixPreferenceDataModel(all_mvs.data)
my_recommender = MatrixFactorBasedRecommender(model,
                                              items_selection_strategy=ItemsNeighborhoodStrategy,
                                              n_features=20,
                                              learning_rate=.01,
                                              n_interations=30,
                                              with_preference=True)

In [545]:
sw = 10000000
sis = 10000001

for c in class_scores:
    sisters = my_recommender.estimate_preference(c, sis)
    starwars = my_recommender.estimate_preference(c, sw)
    print("%s %f %f" % (c, starwars, sisters))

Sarah 4.532940 3.815769
ABC 2.723338 3.243975
Ankit 3.791182 3.606406
Nick 4.004851 3.443085
merri.huang@gmail.com 4.023050 3.476290
Luke 3.177488 3.428144
Dan 4.212531 3.629121
Gaby 3.568885 3.399609
Britta 4.449709 3.598436
Moma G 4.051785 3.514519
MT 3.686679 3.404661
Movie H8er 0.735812 2.754545
chollum@msn.com 3.748584 3.444994
Nils 4.885844 3.832616
Linda 2.914382 3.145942
