In [2]:
import csv
import itertools
import time
import pickle

from collections import defaultdict

In [50]:
class reviews_dataset():
    '''
    Specialized class for reading recommendations csv files produced
    by our rotten tomatoes parsing script.
    
    Implements the Crab (http://muricoca.github.io/crab/index.html)
    interface to be used with their recommendation system implementations
    
    namely:
    data - mapping of reviewer name -> {movie id: score}
    user_ids - set of reviewer names that have an entry in data
    item_ids - mapping of movie ids to their titles
    
    Extra bookkeeping kept alongside the above:
    movie_ids - mapping of title -> set of movie ids carrying that title
    reviews - raw rows whose score parsed as a float
    invalid_scores - raw rows whose score could not be parsed
    
    NOTE(review): other cells in this notebook call a get_user_id() method
    that is not defined here - presumably it was removed from an earlier
    version of this class; confirm before re-running those cells.
    '''
    
    def __init__(self, fileloc):
        """
        Read the tab-delimited reviews file at fileloc and index every row.
        """
        self.data = {}
        
        self.item_ids = {}
        self.movie_ids = {}
        
        self.user_ids = set()
        
        self.invalid_scores = []
        self.reviews = []
        
        with open(fileloc) as f:
            reader = csv.reader(f, delimiter='\t', quotechar='\"')
            
            for row in reader:
                self.__append_row(row)
                
    def __append_row(self, row):
        """
        Index one parsed row. Columns used: 0 = reviewer name, 1 = movie id,
        2 = title, 6 = score. Rows whose score is not a number are stored in
        invalid_scores and otherwise ignored.
        """
        name = row[0]
        mid = int(row[1])
        title = row[2]
        
        # Only a failed float conversion marks the row as invalid; anything
        # else (e.g. KeyboardInterrupt) must not be swallowed.
        try:
            score = float(row[6])
        except (TypeError, ValueError):
            self.invalid_scores.append(row)
            return
        
        self.reviews.append(row)
        
        self.user_ids.add(name)
        
        self.item_ids[mid] = title
        self.movie_ids.setdefault(title, set()).add(mid)
        
        self.data.setdefault(name, {})[mid] = score
        
    def get_review_counts(self):
        """
        Return a list of (movie id, review count) tuples, sorted by
        review count from highest to lowest.
        """
        counts = {}
        for scores in self.data.values():
            for mid in scores:
                counts[mid] = counts.get(mid, 0) + 1
        
        return sorted(counts.items(), key=lambda x: x[1], reverse=True)
    
    def get_score_counts(self):
        """
        Return the number of non-empty cells in the data dictionary
        """
        return sum(len(scores) for scores in self.data.values())
    
    def get_title(self, mid):
        """
        Returns title string for movie id (KeyError if the id is unknown)
        """
        return self.item_ids[mid]
    
    def get_mid(self, title):
        """
        Returns the movie ids for the given title as a set, since there may
        be more than one mid per title (KeyError if the title is unknown).
        """
        return self.movie_ids[title]
    
    def add_reviewer(self, name, scores):
        """
        Given a name and scores ({movie id: score}), add a reviewer to this
        dataset. The scores dict is stored as-is, not copied.
        
        Note: This will overwrite any data currently in place!
        """
        self.remove_reviewer(name)
        
        self.data[name] = scores
        self.user_ids.add(name)
        
    def remove_reviewer(self, name):
        """
        Removes and returns the scores for a given reviewer (in the case of
        saving them). Returns None if the reviewer is not in the dataset.
        """
        if name not in self.data:
            return None
        
        scores = self.data.pop(name)
        # discard, not remove: a reviewer written straight into self.data
        # is never registered in user_ids and must not raise here.
        self.user_ids.discard(name)
        
        return scores
    
    def remove_movie(self, mid):
        """
        Removes and returns ratings for a given movie as {reviewer: score}.
        Returns None if the movie id is not in the dataset.
        
        Reviewers left without any score keep their (now empty) entry.
        """
        if mid not in self.item_ids:
            return None
        
        scores = {}
        for name in self.data:
            if mid in self.data[name]:
                scores[name] = self.data[name].pop(mid)
        
        title = self.item_ids.pop(mid)
        if title in self.movie_ids:
            self.movie_ids[title].discard(mid)
        
        return scores
    
    def add_movie(self, mid, title, scores):
        """
        given id, title, and scores ({reviewer: score}), add them to the data
        set for this object. Reviewers that are not in the dataset yet are
        created.
        
        Note: This will overwrite any existing data!
        """
        self.remove_movie(mid)
        
        for name in scores:
            # Create the reviewer on demand instead of failing halfway
            # through, after the old ratings have already been removed.
            self.data.setdefault(name, {})[mid] = scores[name]
            self.user_ids.add(name)
        
        self.item_ids[mid] = title
        self.movie_ids.setdefault(title, set()).add(mid)
        
        
        

# Basic Tests for the Class Above

In [7]:
test_loc = "/Users/eho/Documents/HES/Machine Learning/Final/Data/test.csv"
mvs = reviews_dataset(test_loc)

In [8]:
mvs.item_ids

{91976: 'Grey, The',
 93840: 'Cabin in the Woods, The',
 103249: 'World War Z',
 104272: 'Blackfish',
 107069: 'Lone Survivor',
 109895: 'Bad Words'}

In [9]:
mvs.data['A.O. Scott'][91976]

0.6

In [10]:
mvs.get_mid("World War Z")

{103249}

In [11]:
mvs.get_review_counts()

[(93840, 199),
 (103249, 197),
 (107069, 159),
 (91976, 150),
 (104272, 97),
 (109895, 95)]

In [12]:
ao_scott_scores = mvs.remove_reviewer("A.O. Scott")

In [13]:
ao_scott_scores

{91976: 0.6, 103249: 0.6}

In [14]:
"A.O. Scott" in mvs.data

False

In [15]:
mvs.add_reviewer("A.O. Scott", ao_scott_scores)

In [16]:
"A.O. Scott" in mvs.data

True

In [17]:
revs = mvs.remove_movie(93840)

In [18]:
len(revs)

199

In [19]:
mvs.item_ids

{91976: 'Grey, The',
 103249: 'World War Z',
 104272: 'Blackfish',
 107069: 'Lone Survivor',
 109895: 'Bad Words'}

In [20]:
mvs.add_movie(93840, "Cabin in the Woods, The", revs)

# Experimenting with Crab

In [46]:
from scikits.crab.models.classes import MatrixPreferenceDataModel, MatrixBooleanPrefDataModel
from scikits.crab.metrics import *
from scikits.crab.similarities import UserSimilarity
from scikits.crab.recommenders.knn import UserBasedRecommender
from scikits.crab.recommenders.svd.classes import MatrixFactorBasedRecommender
from scikits.crab.recommenders.knn.item_strategies import AllPossibleItemsStrategy, ItemsNeighborhoodStrategy

In [11]:
model = MatrixPreferenceDataModel(mvs.data)
print(model)

MatrixPreferenceDataModel (420 by 6)
         91976      93840      103249     104272     107069   ...
A.A. Dow    ---     0.800000   0.500000      ---     0.500000
A.O. Sco 0.600000      ---     0.600000      ---        ---
Adam Ros    ---     0.800000      ---        ---        ---
Al Alexa    ---        ---        ---        ---     0.700000
Alan Jon    ---     0.800000      ---        ---        ---
Alex Zan    ---     1.000000      ---        ---        ---
Ali Gray    ---     1.000000      ---        ---        ---
Alistair    ---        ---        ---     0.800000      ---
Amber Wi    ---        ---        ---     0.900000      ---
Amy Bian 0.250000      ---        ---        ---        ---
Amy Curt    ---     0.750000      ---        ---        ---
Amy Nich    ---        ---        ---        ---     0.400000
Anders W    ---        ---     0.400000      ---     0.400000
Andrea C 0.800000      ---     0.400000      ---        ---
Andy Lea    ---     0.800000   0.800000      --- 

In [12]:
def get_recommendations(reviews, recommender, userid):
    """
    Ask the recommender for recommendations for one user and return them
    as (movie title, value) pairs.

    reviews must provide get_user_id() and get_title(); recommender must
    provide recommend(). The first element of each recommendation is used
    as the movie id, the second is passed through unchanged.
    """
    user_key = reviews.get_user_id(userid)
    titled = []
    for rec in recommender.recommend(user_key):
        titled.append((reviews.get_title(rec[0]), rec[1]))
    return titled

In [13]:
def get_similar_users(reviews, recommender, userid, n=15):
    """
    Return whatever the recommender reports as the n users most similar
    to the given user.

    The user is first translated with reviews.get_user_id(); n is passed
    to most_similar_users() as how_many.
    """
    user_key = reviews.get_user_id(userid)
    return recommender.most_similar_users(user_key, how_many=n)

In [14]:
"""
There are many other options for distance metrics other available in

https://github.com/muricoca/crab/blob/master/scikits/crab/metrics/pairwise.py
"""
# NOTE(review): `pairwise` is never imported by name in this notebook; presumably it
# comes from `from scikits.crab.metrics import *` in the import cell - confirm.
# Build a user-user similarity from cosine distances over `model`, then a
# user-based recommender on top of it.
users_similarity = UserSimilarity(model, pairwise.cosine_distances)
recommender = UserBasedRecommender(model, users_similarity, with_preference=True)

In [15]:
mvs.data[mvs.get_user_id(2)]

{93840: 0.8}

In [16]:
user = 10
print(mvs.data[mvs.get_user_id(user)])
print(get_recommendations(mvs, recommender, user))

  similarities = similarities[~np.isnan(prefs)]
  similarities = similarities[~np.isnan(prefs)]


{93840: 0.6, 91976: 0.6, 107069: 0.4, 103249: 0.4}
[('Blackfish', 0.7926428571428572), ('Bad Words', 0.60706666666666675)]


In [17]:
get_similar_users(mvs, recommender, 10, n=10)

array(["Karen D'Souza", 'Tim Martain', 'Jack Rodgers', 'Rich Phippen',
       "Michael O'Sullivan", 'Michael Nordine', 'Mara Reinstein',
       'Curtis Woloschuk', 'Connie Ogle', 'Steve Macfarlane', 'Mark Ellis'], 
      dtype='|S18')

# So Does This Actually Scale?
We found that the KNN method is unbearably slow at producing recommendations based on the entire dataset. Matrix factorization is WAAAAY faster.

In [25]:
loc = "/Users/eho/Documents/HES/Machine Learning/Final/Data/recent_reviews.csv"
all_mvs = reviews_dataset(loc)

In [68]:
model = MatrixPreferenceDataModel(all_mvs.data)
print(model)

MatrixPreferenceDataModel (1969 by 2192)
         89745      91485      91500      91529      91535    ...
A.A. Dow    ---        ---        ---        ---        ---
A.O. Sco 0.400000      ---        ---        ---        ---
AP Kryza    ---        ---        ---        ---        ---
AV Club     ---        ---        ---        ---        ---
Aaron Ar    ---        ---        ---        ---        ---
Aaron Me    ---        ---        ---        ---        ---
Aaron Ne    ---        ---        ---        ---        ---
Aaron Ya    ---     0.400000      ---        ---     0.400000
Aarti Jh    ---        ---        ---        ---        ---
Abbey Be    ---        ---        ---        ---        ---
Abby Gar    ---        ---        ---        ---        ---
Abby Wes    ---        ---        ---        ---        ---
Abhimany    ---        ---        ---        ---        ---
Abhinav     ---        ---        ---        ---        ---
Abigail     ---        ---        ---        ---   

In [69]:
# users_similarity = UserSimilarity(model, pairwise.euclidean_distances)
# recommender = UserBasedRecommender(model, users_similarity, with_preference=True)
recommender = MatrixFactorBasedRecommender(model)

In [75]:
rec = recommender.estimate_preference("A.O. Scott", 91485)
rec

0.57207704042536567

# I need to rate some movies!

In [76]:
movies = all_mvs.get_review_counts()

In [77]:
def rate_some_movies(reviews, my_scores=None):
    """
    If you don't have any ratings, use this to start ranking movies in order
    by review count. Can pass in a filled dictionary if you have already done
    this for some movies.

    reviews   - dataset providing get_review_counts() and get_title()
    my_scores - optional {movie id: score} dict; movies already in it are
                skipped and new ratings are added to it in place

    For each movie the user types a whole number from 0 to 5:
      - empty input skips the movie
      - input that is not a whole number skips the movie
      - any number outside 0 - 5 ends the session

    Returns the dictionary of scores, each stored as rating / 5.0.
    """
    # A fresh dict per call: a mutable default argument would be shared by
    # every call and leak one person's ratings into the next session.
    if my_scores is None:
        my_scores = {}

    # raw_input on Python 2, input on Python 3.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    print("Input movie ratings 0 - 5 (enter nothing to continue or enter any other number to exit)")

    for entry in reviews.get_review_counts():
        mid = entry[0]
        if mid in my_scores:
            continue

        # Titles come from the dataset that was passed in, not from a global.
        print("%s: " % reviews.get_title(mid))

        answer = read_line().strip()
        if not answer:
            continue

        try:
            stars = int(answer)
        except ValueError:
            # Do not abort the whole session over a typo.
            print("Not a whole number, skipping this movie")
            continue

        if stars < 0 or stars > 5:
            break

        my_scores[mid] = stars / 5.0

    return my_scores

In [78]:
#scores = rate_some_movies(all_mvs)

In [35]:
# Did this previously and wrote my ratings to a file
# NOTE: pickle.load can execute arbitrary code; only load pickles you created yourself.
# The with-block closes the file handle once the ratings are loaded.
with open("eriks_reviews.pickle", 'rb') as f:
    eriks_scores = pickle.load(f)

In [37]:
# Saving is disabled so the ratings file is not overwritten.
# Opening a file in 'wb' mode truncates it immediately, so the open() call
# itself must be guarded too - commenting out only the dump still wipes
# the file. Set the flag to True to write `scores` back to disk.
SAVE_ERIKS_SCORES = False
if SAVE_ERIKS_SCORES:
    with open("eriks_reviews.pickle", 'wb') as f:
        pickle.dump(scores, f)

In [79]:
# Go through add_reviewer so that user_ids is updated together with data.
all_mvs.add_reviewer("Erik Holum", eriks_scores)

In [80]:
# I rated 76 movies
len(eriks_scores)

76

# Okay How Does This Do?

As the timings below show, building the data model takes a few seconds, while the matrix factorization itself takes roughly 100 seconds for the data we currently have.

In [118]:
start = time.time()
model = MatrixPreferenceDataModel(all_mvs.data)
print(time.time() - start)

3.22317695618


In [119]:
# Time how long constructing the matrix-factorization recommender takes.
# NOTE(review): `n_interations` looks like a typo for "n_iterations"; presumably it
# matches the keyword spelling of the installed crab release - confirm before renaming.
# NOTE(review): ItemsNeighborhoodStrategy is passed as a class, not an instance -
# TODO confirm that is what MatrixFactorBasedRecommender expects.
start = time.time()
my_recommender = MatrixFactorBasedRecommender(model,
                                             items_selection_strategy=ItemsNeighborhoodStrategy,
                                             n_features=20,
                                             learning_rate=.01,
                                             n_interations=50,
                                             with_preference=True)
print(time.time() - start)

98.8014740944


In [120]:
movies = ["%d %s" % (x[0],all_mvs.get_title(x[0])) for x in all_mvs.get_review_counts()]
c = 20
movies[c:c+10]

['112623 Dawn of the Planet of the Apes',
 '103228 Pacific Rim',
 '97752 Cloud Atlas',
 '117529 Jurassic World',
 '110102 Captain America: The Winter Soldier',
 '98961 Zero Dark Thirty',
 '96610 Looper',
 '102445 Star Trek Into Darkness',
 '93840 Cabin in the Woods, The',
 '102407 Great Gatsby, The']

In [121]:
# I've tested this with a bunch of movies that I've seen, and it's never 
# off by more than one star. So that's cool!!!
my_recommender.estimate_preference("Erik Holum", 102407)

0.54495644019721579

In [122]:
all_mvs.get_title(102407)

'Great Gatsby, The'

# Sarah's Scores

In [21]:
# Load Sarah's saved ratings; the with-block closes the file handle afterwards.
# NOTE: pickle.load can execute arbitrary code; only load pickles you created yourself.
with open("sarahs_reviews.pickle", 'rb') as f:
    s_scores = pickle.load(f)

'f = open("sarahs_reviews.pickle", \'rb\')\ns_scores = pickle.load(f)'

In [98]:
s_scores = rate_some_movies(all_mvs)

Input movie ratings 0 - 5 (enter nothing to continue or enter any other number to exit)
Dark Knight Rises, The: 
-1


In [171]:
# Save Sarah's ratings. The with-block closes the file, which guarantees the
# pickled data is flushed to disk instead of sitting in an open handle.
with open("sarahs_reviews.pickle", 'wb') as f:
    pickle.dump(s_scores, f)

In [173]:
# Go through add_reviewer so that user_ids is updated together with data.
all_mvs.add_reviewer("Sarah GW", s_scores)

In [174]:
model = MatrixPreferenceDataModel(all_mvs.data)

In [175]:
my_recommender = MatrixFactorBasedRecommender(model)

In [205]:
all_mvs.get_mid("Inside Out")

{134853}

In [206]:
my_recommender.estimate_preference("Sarah GW", 134853)

0.90489631958761718

In [207]:
all_mvs.get_mid("Furious 7")

{130634}

In [99]:
my_recommender.estimate_preference("Sarah GW", 130634)

# Bringing Sample Set Down

Maybe 2000 movies and ~110,000 reviews is too much for testing. Let's just choose the 500 most reviewed movies and try and tune performance using that.

In [1]:
# This package isn't that useful
from scikits.crab.metrics import cross_validation

In [32]:
loc = "/Users/eho/Documents/HES/Machine Learning/Final/Data/recent_reviews.csv"
all_mvs = reviews_dataset(loc)

In [33]:
movies = all_mvs.get_review_counts()

In [34]:
# Only keep top 500 movies: `movies` is sorted by review count, so everything
# from position 500 onwards is dropped from the dataset.
for entry in movies[500:]:
    mid = entry[0]
    all_mvs.remove_movie(mid)

In [36]:
print(len(all_mvs.item_ids))

500


In [44]:
# Remove all empty reviewers
# Collect the names first so the dict is not modified while iterating over it,
# then drop them through remove_reviewer so user_ids stays in sync with data.
empty = set(name for name in all_mvs.data if len(all_mvs.data[name]) == 0)
for name in empty:
    all_mvs.remove_reviewer(name)

In [48]:
model = MatrixPreferenceDataModel(all_mvs.data)

In [49]:
# Now how long does it take to factorize the matrix?
# NOTE(review): `n_interations` looks like a typo for "n_iterations"; presumably it
# matches the keyword spelling of the installed crab release - confirm before renaming.
# NOTE(review): ItemsNeighborhoodStrategy is passed as a class, not an instance -
# TODO confirm that is what MatrixFactorBasedRecommender expects.
start = time.time()
my_recommender = MatrixFactorBasedRecommender(model,
                                             items_selection_strategy=ItemsNeighborhoodStrategy,
                                             n_features=20,
                                             learning_rate=.01,
                                             n_interations=50,
                                             with_preference=True)
print(time.time() - start)

66.8191010952
