In [2]:
import csv
import itertools
import time
import pickle

from collections import defaultdict

In [3]:
class reviews_dataset():
    '''
    Specialized class for reading recommendations csv files produced
    by our rotten tomatoes parsing script.

    Implements the Crab (http://muricoca.github.io/crab/index.html)
    interface to be used with their recommendation system implementations

    Attributes:
    data - mapping of reviewer name to a {movie id: score} dict
    user_ids - mapping of reviewer names to their assigned integer id
    user_names - mapping of assigned integer id back to reviewer name
    item_ids - mapping of movie ids to their titles
    movie_ids - mapping of title to the set of movie ids carrying it
    reviews - rows whose score parsed as a float
    invalid_scores - rows whose score could not be parsed
    '''

    def __init__(self, fileloc):
        """
        Read the tab-delimited file at `fileloc` and index every row.

        Each row is expected to hold the reviewer name in column 0, an
        integer movie id in column 1, the title in column 2 and the score
        in column 6.
        """
        self.data = {}

        self.item_ids = {}
        self.movie_ids = {}
        self.user_ids = {}
        self.user_names = {}

        self.invalid_scores = []
        self.reviews = []

        with open(fileloc) as f:
            reader = csv.reader(f, delimiter='\t', quotechar='\"')

            for row in reader:
                self.__append_row(row)

    def __append_row(self, row):
        """
        Index a single parsed csv row.

        Rows whose score is not a number are stored in `invalid_scores`
        and otherwise ignored.
        """
        name = row[0]
        mid = int(row[1])
        title = row[2]
        score = row[6]

        try:
            score = float(score)
        except (TypeError, ValueError):
            # Only a non-numeric score marks the row invalid; anything else
            # (e.g. KeyboardInterrupt) should still propagate.
            self.invalid_scores.append(row)
            return

        self.reviews.append(row)

        # Membership must be tested on the name-keyed dict. user_names is
        # keyed by integer uid, so testing the name there never matches and
        # would hand the same reviewer a new uid on every row.
        if name not in self.user_ids:
            uid = len(self.user_names)
            self.user_names[uid] = name
            self.user_ids[name] = uid

        self.item_ids[mid] = title
        # Several movie ids may share one title, hence a set per title.
        self.movie_ids.setdefault(title, set()).add(mid)

        self.data.setdefault(name, {})[mid] = score

    def get_review_counts(self):
        """
        Return a list of (movie id, review count) tuples, sorted by the
        highest number of reviews first.
        """
        counts = {}
        for ratings in self.data.values():
            for mid in ratings:
                counts[mid] = counts.get(mid, 0) + 1

        return sorted(counts.items(), key=lambda x: x[1], reverse=True)

    def get_user_id(self, uid):
        """
        Returns the reviewer name assigned to the integer id `uid`.
        """
        return self.user_names[uid]

    def get_title(self, mid):
        """
        Returns title string for movie id
        """
        return self.item_ids[mid]

    def get_mid(self, title):
        """
        Returns the set of movie ids for the given title (a set, since
        there may be more than one mid per title).
        """
        return self.movie_ids[title]

In [4]:
# Load the small test file into a reviews_dataset.
# NOTE(review): absolute, machine-specific path; this cell only runs where that file exists.
test_loc = "/Users/eho/Documents/HES/Machine Learning/Final/Data/test.csv"
mvs = reviews_dataset(test_loc)

In [5]:
# Movie id -> title mapping built while loading the test file.
mvs.item_ids

{91976: 'Grey, The',
 93840: 'Cabin in the Woods, The',
 103249: 'World War Z',
 104272: 'Blackfish',
 107069: 'Lone Survivor',
 109895: 'Bad Words'}

In [6]:
# Show the reviewer name stored under each of the integer ids 0-9.
["%d : %s" % (x, mvs.get_user_id(x)) for x in range(10)]

['0 : Fr. Chris Carpenter',
 '1 : Shawn Levy',
 '2 : Martin Roberts',
 '3 : Susan Granger',
 '4 : Eugene Novikov',
 '5 : David Keyes',
 '6 : Greg Maki',
 '7 : Jaime N. Christley',
 '8 : Kevin Carr',
 '9 : Amy Curtis']

In [7]:
# data is keyed by reviewer name, then by movie id; the value is the float score.
mvs.data['A.O. Scott'][91976]

0.6

In [8]:
# Title lookup returns a set of movie ids, since ids are collected per title.
mvs.get_mid("World War Z")

{103249}

In [9]:
# (movie id, review count) pairs, most-reviewed first.
mvs.get_review_counts()

[(93840, 199),
 (103249, 197),
 (107069, 159),
 (91976, 150),
 (104272, 97),
 (109895, 95)]

# Experimenting with Crab

In [10]:
from scikits.crab.models.classes import MatrixPreferenceDataModel, MatrixBooleanPrefDataModel
from scikits.crab.metrics import *
from scikits.crab.similarities import UserSimilarity
from scikits.crab.recommenders.knn import UserBasedRecommender
from scikits.crab.recommenders.svd.classes import MatrixFactorBasedRecommender
from scikits.crab.recommenders.knn.item_strategies import AllPossibleItemsStrategy, ItemsNeighborhoodStrategy

In [11]:
# Wrap the {reviewer name: {movie id: score}} dict in Crab's preference data model.
model = MatrixPreferenceDataModel(mvs.data)
print(model)

MatrixPreferenceDataModel (420 by 6)
         91976      93840      103249     104272     107069   ...
A.A. Dow    ---     0.800000   0.500000      ---     0.500000
A.O. Sco 0.600000      ---     0.600000      ---        ---
Adam Ros    ---     0.800000      ---        ---        ---
Al Alexa    ---        ---        ---        ---     0.700000
Alan Jon    ---     0.800000      ---        ---        ---
Alex Zan    ---     1.000000      ---        ---        ---
Ali Gray    ---     1.000000      ---        ---        ---
Alistair    ---        ---        ---     0.800000      ---
Amber Wi    ---        ---        ---     0.900000      ---
Amy Bian 0.250000      ---        ---        ---        ---
Amy Curt    ---     0.750000      ---        ---        ---
Amy Nich    ---        ---        ---        ---     0.400000
Anders W    ---        ---     0.400000      ---     0.400000
Andrea C 0.800000      ---     0.400000      ---        ---
Andy Lea    ---     0.800000   0.800000      --- 

In [12]:
def get_recommendations(reviews, recommender, userid):
    """
    Ask `recommender` for recommendations for one reviewer and return them
    as (title, value) pairs.

    `userid` is the integer id assigned by `reviews`; it is translated to
    the reviewer name before being handed to the recommender. The first
    element of every recommendation is treated as a movie id and replaced
    by its title, the second element is passed through unchanged.
    """
    reviewer = reviews.get_user_id(userid)

    titled = []
    for rec in recommender.recommend(reviewer):
        titled.append((reviews.get_title(rec[0]), rec[1]))
    return titled

In [13]:
def get_similar_users(reviews, recommender, userid, n=15):
    """
    Return whatever `recommender.most_similar_users` yields for the
    reviewer with integer id `userid`, requesting `n` neighbours.

    The integer id is translated to the reviewer name via `reviews`
    before the recommender is queried.
    """
    reviewer = reviews.get_user_id(userid)
    neighbours = recommender.most_similar_users(reviewer, how_many=n)
    return neighbours

In [14]:
"""
There are many other options for distance metrics other available in

https://github.com/muricoca/crab/blob/master/scikits/crab/metrics/pairwise.py
"""
# User-to-user similarity based on cosine distance, fed into a user-based recommender.
# `pairwise` is not imported by name in this notebook; presumably it arrives
# through the wildcard import of scikits.crab.metrics — TODO confirm.
users_similarity = UserSimilarity(model, pairwise.cosine_distances)
recommender = UserBasedRecommender(model, users_similarity, with_preference=True)

In [15]:
# Ratings recorded for the reviewer stored under integer id 2.
mvs.data[mvs.get_user_id(2)]

{93840: 0.8}

In [16]:
# Print what this reviewer already rated, then what the recommender suggests for them.
user = 10
print(mvs.data[mvs.get_user_id(user)])
print(get_recommendations(mvs, recommender, user))

  similarities = similarities[~np.isnan(prefs)]
  similarities = similarities[~np.isnan(prefs)]


{93840: 0.6, 91976: 0.6, 107069: 0.4, 103249: 0.4}
[('Blackfish', 0.7926428571428572), ('Bad Words', 0.60706666666666675)]


In [17]:
# Reviewers the recommender ranks as most similar to the reviewer with integer id 10.
get_similar_users(mvs, recommender, 10, n=10)

array(["Karen D'Souza", 'Tim Martain', 'Jack Rodgers', 'Rich Phippen',
       "Michael O'Sullivan", 'Michael Nordine', 'Mara Reinstein',
       'Curtis Woloschuk', 'Connie Ogle', 'Steve Macfarlane', 'Mark Ellis'], 
      dtype='|S18')

# So Does This Actually Scale?
We found that the KNN method is unbearably slow at producing recommendations from the entire dataset. Matrix factorization is far faster.

In [18]:
# Load the larger recent_reviews.csv file into a second reviews_dataset.
# NOTE(review): absolute, machine-specific path; this cell only runs where that file exists.
loc = "/Users/eho/Documents/HES/Machine Learning/Final/Data/recent_reviews.csv"
all_mvs = reviews_dataset(loc)

In [112]:
# Rebinds `model` to a preference model built from the larger dataset.
model = MatrixPreferenceDataModel(all_mvs.data)
print(model)

MatrixPreferenceDataModel (1969 by 2192)
         89745      91485      91500      91529      91535    ...
A.A. Dow    ---        ---        ---        ---        ---
A.O. Sco 0.400000      ---        ---        ---        ---
AP Kryza    ---        ---        ---        ---        ---
AV Club     ---        ---        ---        ---        ---
Aaron Ar    ---        ---        ---        ---        ---
Aaron Me    ---        ---        ---        ---        ---
Aaron Ne    ---        ---        ---        ---        ---
Aaron Ya    ---     0.400000      ---        ---     0.400000
Aarti Jh    ---        ---        ---        ---        ---
Abbey Be    ---        ---        ---        ---        ---
Abby Gar    ---        ---        ---        ---        ---
Abby Wes    ---        ---        ---        ---        ---
Abhimany    ---        ---        ---        ---        ---
Abhinav     ---        ---        ---        ---        ---
Abigail     ---        ---        ---        ---   

In [113]:
# The KNN setup is left commented out; the matrix-factorization recommender
# is used on the larger dataset instead. This rebinds `recommender`.
# users_similarity = UserSimilarity(model, pairwise.euclidean_distances)
# recommender = UserBasedRecommender(model, users_similarity, with_preference=True)
recommender = MatrixFactorBasedRecommender(model)

In [115]:
# Estimate one reviewer's preference for one movie id, bracketed by timestamps.
# NOTE(review): start/end are captured but the elapsed time (end - start) is
# never displayed by this cell; only `rec` is shown.
start = time.time()
rec = recommender.estimate_preference("A.O. Scott", 91485)
end = time.time()
rec

0.58577247659859422

# I need to rate some movies!

In [19]:
# (movie id, review count) pairs for the larger dataset, most-reviewed first.
# NOTE: `movies` is rebound to a list of strings in a later cell.
movies = all_mvs.get_review_counts()

In [20]:
def rate_some_movies(reviews, my_scores=None):
    """
    If you don't have any ratings, use this to start ranking movies in order
    by review count. Can pass in a filled dictionary if you have already done
    this for some movies.

    Parameters:
    reviews - object providing get_review_counts() and get_title(mid)
    my_scores - optional {movie id: score} dict; movies already present in
                it are not asked again and new ratings are added to it

    For every remaining movie the title is printed and one line is read:
    - an empty line skips the movie
    - an integer 0 - 5 is stored as that number divided by 5.0
    - any other integer ends the session
    - anything that is not an integer is reported and the movie is skipped

    Returns the dictionary of scores (the one passed in, if any).
    """
    # Create the dict per call. A `{}` default is built once and shared by
    # every call, so a second person's session would start out containing
    # the first person's ratings.
    if my_scores is None:
        my_scores = {}

    # raw_input only exists on Python 2; fall back to input on Python 3.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    print("Input movie ratings 0 - 5 (enter nothing to continue or enter any other number to exit)")

    for entry in reviews.get_review_counts():
        mid = entry[0]
        if mid in my_scores:
            continue

        # Look the title up on the dataset that was passed in rather than
        # on a notebook-level global.
        print("%s: " % reviews.get_title(mid))

        answer = read_line()
        if not answer:
            continue

        try:
            stars = int(answer)
        except ValueError:
            # A typo should not throw away the ratings collected so far.
            print("Not a whole number, skipping this movie")
            continue

        if stars < 0 or stars > 5:
            break

        my_scores[mid] = stars / 5.0

    return my_scores

In [141]:
# Interactive: reads one line from stdin per movie, so this cell cannot run unattended.
scores = rate_some_movies(all_mvs)

Input movie ratings 0 - 5 (enter nothing to continue or enter any other number to exit)
Dark Knight Rises, The: 
5
Skyfall: 
4
The Rise: 

Avengers, The: 
4
Argo: 
4
Gravity: 
3
Man of Steel: 
1
Mad Max: Fury Road: 
5
Hunger Games, The: 
4
Iron Man 3: 
4
Godzilla: 

Guardians of the Galaxy: 
5
12 Years a Slave: 

Interstellar: 
5
Gone Girl: 

Inside Out: 

Avengers: Age of Ultron: 
3
Hobbit: An Unexpected Journey, The: 
2
Prometheus: 
2
Edge of Tomorrow: 
5
Dawn of the Planet of the Apes: 

Pacific Rim: 
5
Cloud Atlas: 
3
Jurassic World: 
3
Captain America: The Winter Soldier: 
2
Looper: 

Zero Dark Thirty: 
4
Star Trek Into Darkness: 
4
Cabin in the Woods, The: 

Great Gatsby, The: 

World War Z: 

Lincoln: 
5
Boyhood: 
3
Thor: The Dark World: 
2
In The Dark: 

American Hustle: 
4
Hunger Games: Catching Fire, The: 
4
Grand Budapest Hotel, The: 
4
Birdman: 

Captain Phillips: 
3
The Hunger Games: Mockingjay - Part 1: 
3
Mission: Impossible - Rogue Nation: 
4
Men in Black III (M.III.B.)

In [148]:
# Add the collected ratings as a new reviewer entry; data is keyed by reviewer name.
all_mvs.data["Erik Holum"] = scores

# Okay How Does This Do?

In [21]:
# Rebuild the preference model from the current contents of all_mvs.data.
model = MatrixPreferenceDataModel(all_mvs.data)

In [154]:
# Matrix-factorization recommender over the rebuilt model.
my_recommender = MatrixFactorBasedRecommender(model)

In [182]:
# Label every movie as "<id> <title>" in review-count order, then peek at a slice
# to find ids worth querying. Rebinds `movies` to a list of strings.
movies = ["%d %s" % (x[0],all_mvs.get_title(x[0])) for x in all_mvs.get_review_counts()]
movies[300:310]

['127130 Mistress America',
 '103075 Purge, The',
 '94953 Wanderlust',
 '114601 This Is Where I Leave You',
 '107949 Invisible Woman, The',
 '116977 Dumb and Dumber To',
 "95105 Madagascar 3: Europe's Most Wanted",
 '120132 Annie',
 '104760 Getaway',
 '113068 For a Woman (Pour une femme)']

In [180]:
# I've tested this with a bunch of movies that I've seen, and it's never 
# off by more than one star. So that's cool!!!
# Stored ratings are stars / 5, so presumably the estimate is on that same
# 0-1 scale (one star = 0.2) — TODO confirm.
my_recommender.estimate_preference("Erik Holum", 112138)

0.65705482179388308

In [167]:
# Persist the ratings so the interactive session does not have to be repeated.
# The with-block closes the file, which guarantees the pickle is flushed to disk.
with open("eriks_reviews.pickle", 'wb') as f:
    pickle.dump(scores, f)

# Sarah's Scores

In [23]:
# Reload ratings saved by an earlier session; the file must already exist.
# pickle.load can execute arbitrary code, so only load files you wrote yourself.
# The with-block closes the file handle once the data is read.
with open("sarahs_reviews.pickle", 'rb') as f:
    s_scores = pickle.load(f)

'f = open("sarahs_reviews.pickle", \'rb\')\ns_scores = pickle.load(f)'

In [170]:
# Interactive rating session for a second person; reads from stdin per movie.
s_scores = rate_some_movies(all_mvs)

In [171]:
# Persist the second set of ratings.
# The with-block closes the file, which guarantees the pickle is flushed to disk.
with open("sarahs_reviews.pickle", 'wb') as f:
    pickle.dump(s_scores, f)

In [173]:
# Add the second set of ratings as another reviewer entry, keyed by name.
all_mvs.data["Sarah GW"] = s_scores

In [174]:
# Rebuild the preference model from the current contents of all_mvs.data.
model = MatrixPreferenceDataModel(all_mvs.data)

In [175]:
# Matrix-factorization recommender over the rebuilt model; rebinds my_recommender.
my_recommender = MatrixFactorBasedRecommender(model)

In [205]:
# Find the movie id(s) for this title so it can be passed to the recommender.
all_mvs.get_mid("Inside Out")

{134853}

In [206]:
# Estimated preference of this reviewer for the movie id looked up above.
my_recommender.estimate_preference("Sarah GW", 134853)

0.90489631958761718

In [207]:
# Find the movie id(s) for this title so it can be passed to the recommender.
all_mvs.get_mid("Furious 7")

{130634}

In [1]:
# NOTE(review): the execution count is In [1], i.e. this cell was run first on a
# kernel where my_recommender had not been defined yet, hence the NameError
# in the saved output. Re-run the cells above before this one.
my_recommender.estimate_preference("Sarah GW", 130634)

NameError: name 'my_recommender' is not defined