# The MovieLens dataset: loading and first look

The MovieLens data is spread across three files. We'll load each file using the pd.read_table function:

In [2]:
import pandas as pd
import numpy as np

In [3]:
users = pd.read_table('users.dat',
                      sep='::', header=None, 
                      names=['user_id', 'gender', 'age', 'occupation', 'zip'],engine='python')

ratings = pd.read_table('ratings.dat',
                        sep='::', header=None, 
                        names=['user_id', 'movie_id', 'rating', 'timestamp'],engine='python')

movies = pd.read_table('movies.dat',
                       sep='::', header=None, 
                       names=['movie_id', 'title', 'genres'],engine='python')

# show how one of them looks
ratings.head(5)

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [4]:
movielens = pd.merge(pd.merge(ratings, users), movies)
movielens.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5,978298413,M,56,16,70072,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4,978220179,M,25,12,32793,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1193,4,978199279,M,25,7,22903,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1193,5,978158471,M,50,1,95350,One Flew Over the Cuckoo's Nest (1975),Drama


In [5]:
# let's work with a smaller subset for speed reasons
movielens = movielens.ix[np.random.choice(movielens.index, size=10000, replace=False)]
print(movielens.shape)
print(movielens.user_id.nunique())
print(movielens.movie_id.nunique())

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


(10000, 10)
3691
2251


In [6]:
user_ids_larger_1 = pd.value_counts(movielens.user_id, sort=False) > 1
user_ids_larger_1 = user_ids_larger_1[user_ids_larger_1].index

movielens = movielens.select(lambda l: movielens.loc[l, 'user_id'] in user_ids_larger_1)
print(movielens.shape)
assert np.all(movielens.user_id.value_counts() > 1)

(8470, 10)


  after removing the cwd from sys.path.


In [7]:
def assign_to_set(df):
    sampled_ids = np.random.choice(df.index,
                                   size=np.int64(np.ceil(df.index.size * 0.2)),
                                   replace=False)
    df.ix[sampled_ids, 'for_testing'] = True
    return df

movielens['for_testing'] = False
grouped = movielens.groupby('user_id', group_keys=False).apply(assign_to_set)
movielens_train = movielens[grouped.for_testing == False]
movielens_test = movielens[grouped.for_testing == True]
print(movielens.shape)
print(movielens_train.shape)
print(movielens_test.shape)
assert len(movielens_train.index & movielens_test.index) == 0

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """


(8470, 11)
(5825, 11)
(2645, 11)


In [8]:
movielens_train.to_csv('my_generated_movielens_train.csv')
movielens_test.to_csv('my_generated_movielens_test.csv')

In [9]:
def compute_rmse(y_pred, y_true):
    """ Compute Root Mean Squared Error. """
    
    return np.sqrt(np.mean(np.power(y_pred - y_true, 2)))

In [10]:
def evaluate(estimate_f):
    """ RMSE-based predictive performance evaluation with pandas. """
    
    ids_to_estimate = zip(movielens_test.user_id, movielens_test.movie_id)
    estimated = np.array([estimate_f(u,i) for (u,i) in ids_to_estimate])
    real = movielens_test.rating.values
    return compute_rmse(estimated, real)

In [11]:
def my_estimate_function(user_id, movie_id):
    return 3

In [12]:
print('RMSE for my estimate function: %s' % evaluate(my_estimate_function))

RMSE for my estimate function: 1.256815070319809


In [13]:
# subset version (hosted notebook)
movielens_train = pd.read_csv('my_generated_movielens_train.csv', index_col=0)
movielens_test = pd.read_csv('my_generated_movielens_test.csv', index_col=0)

In [14]:
movielens_train.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres,for_testing
291463,3641,2890,2,966481345,M,25,0,92008,Three Kings (1999),Drama|War,False
803670,4666,2944,4,964810020,M,35,1,53704,"Dirty Dozen, The (1967)",Action|War,False
43474,5406,1,5,960304768,F,35,0,49504,Toy Story (1995),Animation|Children's|Comedy,False
723052,302,2416,1,976504783,M,18,4,4901,Back to School (1986),Comedy,False
552625,3285,208,3,968212606,M,25,4,44706,Waterworld (1995),Action|Adventure,False


In [15]:
movielens_test.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres,for_testing
127971,4490,590,5,965008271,M,35,5,70123,Dances with Wolves (1990),Adventure|Drama|Western,False
505294,5046,2795,4,962499360,M,25,16,60614,Vacation (1983),Comedy,False
260840,5981,2716,2,956932021,M,35,7,1776,Ghostbusters (1984),Comedy|Horror,False
991009,3942,3310,2,966380307,M,18,4,49855,"Kid, The (1921)",Action,False
194475,4153,653,5,965340458,M,50,17,16801,Dragonheart (1996),Action|Adventure|Fantasy,False


## Content-based filtering

Recommend based on the user's rating history.

Generic expression (notice how this is kind of a 'row-based' approach):

$$ \newcommand{\aggr}{\mathop{\rm aggr}\nolimits} r{u,i} = \aggr{i' \in I(u)} [r_{u,i'}] $$

A simple example using the mean as an aggregation function:

$$ r_{u,i} = \bar ru = \frac{\sum{i' \in I(u)} r_{u,i'}}{|I(u)|} $$

## Collaborative filtering

Recommend based on other user's rating histories.

Generic expression (notice how this is kind of a 'col-based' approach):

$$ \newcommand{\aggr}{\mathop{\rm aggr}\nolimits} r{u,i} = \aggr{u' \in U(i)} [r_{u',i}] $$

A simple example using the mean as an aggregation function:

$$ r_{u,i} = \bar ri = \frac{\sum{u' \in U(i)} r_{u',i}}{|U(i)|} $$

## Hybrid solutions

The literature has lots of examples of systems that try to combine the strengths of the two main approaches. This can be done in a number of ways:

Combine the predictions of a content-based system and a collaborative system.
Incorporate content-based techniques into a collaborative approach.
Incorporarte collaborative techniques into a content-based approach.
Unifying model.

### Challenges

#### Availability of item metadata

Content-based techniques are limited by the amount of metadata that is available to describe an item. There are domains in which feature extraction methods are expensive or time consuming, e.g., processing multimedia data such as graphics, audio/video streams. In the context of grocery items for example, it's often the case that item information is only partial or completely missing. Examples include:

Ingredients
Nutrition facts
Brand
Description
County of origin

#### New user problem

A user has to have rated a sufficient number of items before a recommender system can have a good idea of what their preferences are. In a content-based system, the aggregation function needs ratings to aggregate.

#### New item problem

Collaborative filters rely on an item being rated by many users to compute aggregates of those ratings. Think of this as the exact counterpart of the new user problem for content-based systems.

#### Data sparsity

When looking at the more general versions of content-based and collaborative systems, the success of the recommender system depends on the availability of a critical mass of user/item iteractions. We get a first glance at the data sparsity problem by quantifying the ratio of existing ratings vs $|U|x|I|$. A highly sparse matrix of interactions makes it difficult to compute similarities between users and items. As an example, for a user whose tastes are unusual compared to the rest of the population, there will not be any other users who are particularly similar, leading to poor recommendations.

## Minimal reco engine v1.0: simple mean ratings

### Content-based filtering using mean ratings

With this table-like representation of the ratings data, a basic content-based filter becomes a one-liner function.

In [16]:
def content_mean(user_id, movie_id):
    """ Simple content-filtering based on mean ratings. """
    
    user_condition = movielens_train.user_id == user_id
    return movielens_train.loc[user_condition, 'rating'].mean()

print('RMSE for estimate1: %s' % evaluate(content_mean))

RMSE for estimate1: 1.2666441562178004


## 1. Simple collaborative filtering using mean ratings

In [26]:
# write an 'estimate' function that computes the mean rating of a particular user
def collab_mean(user_id, movie_id):
    # first, index into all ratings of this movie
    # second, compute the mean of those ratings
    user_condition = movielens_train.user_id != user_id
    item_condition = movielens_train.movie_id == movie_id
    rating_by_others= movielens_train.loc[user_condition&item_condition]
    if rating_by_others.empty:
        print(3)
    else:
        return rating_by_others.rating.mean()
  
# try it out for a user_id, movie_id pair
print(collab_mean(4653,2648))

3.5


In [27]:
movielens_train.groupby('gender')['rating'].mean()

gender
F    3.542029
M    3.551181
Name: rating, dtype: float64

In [28]:
movielens_train.groupby(['gender', 'age'])['rating'].mean()

gender  age
F       1      3.666667
        18     3.368852
        25     3.550459
        35     3.524138
        45     3.645669
        50     3.574468
        56     4.045455
M       1      3.617021
        18     3.503038
        25     3.529700
        35     3.550388
        45     3.537313
        50     3.739550
        56     3.687500
Name: rating, dtype: float64

In [29]:
# transform the ratings frame into a ratings matrix
ratings_mtx_df = movielens_train.pivot_table(values='rating',
                                             index='user_id',
                                             columns='movie_id')
ratings_mtx_df.head(3)

movie_id,1,2,3,4,5,6,7,9,10,11,...,3927,3928,3929,3930,3932,3946,3947,3948,3949,3950
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5,,,,,,,,,,,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,


In [30]:
# grab another subsquare of the ratings matrix to actually diplay some real entries!
ratings_mtx_df.loc[11:16, 1196:1200]

movie_id,1196,1197,1198,1199,1200
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
11,,,,,
14,,,,,


In [31]:
movielens_train.pivot_table(values='rating', index='age', columns='gender', aggfunc='mean')

gender,F,M
age,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.666667,3.617021
18,3.368852,3.503038
25,3.550459,3.5297
35,3.524138,3.550388
45,3.645669,3.537313
50,3.574468,3.73955
56,4.045455,3.6875


In [32]:
movielens_train.pivot_table(values='rating', index='age', columns='gender', 
                            aggfunc=[np.mean, np.std])

Unnamed: 0_level_0,mean,mean,std,std
gender,F,M,F,M
age,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,3.666667,3.617021,1.414214,1.155691
18,3.368852,3.503038,1.170282,1.137581
25,3.550459,3.5297,1.08199,1.149921
35,3.524138,3.550388,1.135052,1.074527
45,3.645669,3.537313,0.980181,1.060002
50,3.574468,3.73955,1.082504,1.047121
56,4.045455,3.6875,0.805636,1.013674


### Minimal reco engine v1.1: implicit sim functions

We're going to need a user index from the users portion of the dataset. This will allow us to retrieve information given a specific user_id in a more convenient way:

In [33]:
user_info = users.set_index('user_id')
user_info.head(5)

Unnamed: 0_level_0,gender,age,occupation,zip
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,F,1,10,48067
2,M,56,16,70072
3,M,25,15,55117
4,M,45,7,2460
5,M,25,20,55455


In [34]:
user_id = 3
user_info.loc[user_id, 'gender']

'M'

### Collaborative-based filtering using implicit sim functions

Using the pandas aggregation framework we will build a collaborative filter that estimates ratings using an implicit sim(u,u') function to compare different users.

In [37]:
def collab_gender(user_id, movie_id):
    """ Collaborative filtering using an implicit sim(u,u') based on gender. """
    
    user_condition = movielens_train.user_id != user_id
    movie_condition = movielens_train.movie_id == movie_id
    ratings_by_others = movielens_train.loc[user_condition & movie_condition]
    if ratings_by_others.empty: 
        return 3.0
    
    means_by_gender = ratings_by_others.pivot_table('rating', index='movie_id', columns='gender')
    user_gender = user_info.ix[user_id, 'gender']
    if user_gender in means_by_gender.columns: 
        return means_by_gender.loc[movie_id, user_gender]
    else:
        return means_by_gender.loc[movie_id].mean()

print('RMSE for collab_gender: %s' % evaluate(collab_gender))

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  # This is added back by InteractiveShellApp.init_path()


RMSE for collab_gender: 1.1931558959102566


In [39]:
class CollabGenderReco:
    """ Collaborative filtering using an implicit sim(u,u'). """

    def learn(self):
        """ Prepare datastructures for estimation. """
        
        self.means_by_gender = movielens_train.pivot_table('rating', index='movie_id', columns='gender')

    def estimate(self, user_id, movie_id):
        """ Mean ratings by other users of the same gender. """
        
        if movie_id not in self.means_by_gender.index: 
            return 3.0
        
        user_gender = user_info.ix[user_id, 'gender']
        if ~np.isnan(self.means_by_gender.ix[movie_id, user_gender]):
            return self.means_by_gender.ix[movie_id, user_gender]
        else:
            return self.means_by_gender.ix[movie_id].mean()

reco = CollabGenderReco()
reco.learn()
print('RMSE for CollabGenderReco: %s' % evaluate(reco.estimate))

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  from ipykernel import kernelapp as app
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  app.launch_new_instance()
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


RMSE for CollabGenderReco: 1.1931558959102566


In [40]:
class CollabAgeReco:
    """ Collaborative filtering using an implicit sim(u,u'). """

    def learn(self):
        """ Prepare datastructures for estimation. """
        
        self.means_by_age = movielens_train.pivot_table('rating', index='movie_id', columns='age')

    def estimate(self, user_id, movie_id):
        """ Mean ratings by other users of the same gender. """
        
        if movie_id not in self.means_by_age.index: 
            return 3.0
        
        user_age = user_info.ix[user_id, 'age']
        if ~np.isnan(self.means_by_age.ix[movie_id, user_age]):
            return self.means_by_age.ix[movie_id, user_age]
        else:
            return self.means_by_age.ix[movie_id].mean()

reco = CollabAgeReco()
reco.learn()
print('RMSE for CollabAgeReco: %s' % evaluate(reco.estimate))

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  from ipykernel import kernelapp as app
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  app.launch_new_instance()
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


RMSE for CollabAgeReco: 1.2390161519825953


In [41]:
class CollabOccupationReco:
    """ Collaborative filtering using an implicit sim(u,u'). """

    def learn(self):
        """ Prepare datastructures for estimation. """
        
        self.means_by_occupation = movielens_train.pivot_table('rating', index='movie_id', columns='occupation')

    def estimate(self, user_id, movie_id):
        """ Mean ratings by other users of the same gender. """
        
        if movie_id not in self.means_by_occupation.index: 
            return 3.0
        
        user_occupation = user_info.ix[user_id, 'occupation']
        if ~np.isnan(self.means_by_occupation.ix[movie_id, user_occupation]):
            return self.means_by_occupation.ix[movie_id, user_occupation]
        else:
            return self.means_by_occupation.ix[movie_id].mean()

reco = CollabOccupationReco()
reco.learn()
print('RMSE for CollabOccupationReco: %s' % evaluate(reco.estimate))

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  from ipykernel import kernelapp as app
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  app.launch_new_instance()
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


RMSE for CollabOccupationReco: 1.223496055719031


In [42]:
class CollabZipReco:
    """ Collaborative filtering using an implicit sim(u,u'). """

    def learn(self):
        """ Prepare datastructures for estimation. """
        
        self.means_by_zip = movielens_train.pivot_table('rating', index='movie_id', columns='zip')

    def estimate(self, user_id, movie_id):
        """ Mean ratings by other users of the same gender. """
        
        if movie_id not in self.means_by_zip.index: 
            return 3.0
        
        user_zip = user_info.ix[user_id, 'zip']
        if ~np.isnan(self.means_by_zip.ix[movie_id, user_zip]):
            return self.means_by_zip.ix[movie_id, user_zip]
        else:
            return self.means_by_zip.ix[movie_id].mean()

reco = CollabZipReco()
reco.learn()
print('RMSE for CollabZipReco: %s' % evaluate(reco.estimate))

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  from ipykernel import kernelapp as app
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  app.launch_new_instance()
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


RMSE for CollabZipReco: 1.142360132827884


### Minimal reco engine v1.2: custom similarity functions

#### A few similarity functions

These were all written to operate on two pandas Series, each one representing the rating history of two different users. You can also apply them to any two feature vectors that describe users or items. In all cases, the higher the return value, the more similar two Series are. You might need to add checks for edge cases, such as divisions by zero, etc.

Euclidean 'similarity'
$$ sim(x,y) = \frac{1}{1 + \sqrt{\sum (x - y)^2}}$$

In [43]:
def euclidean(s1, s2):
    """Take two pd.Series objects and return their euclidean 'similarity'."""
    diff = s1 - s2
    return 1 / (1 + np.sqrt(np.sum(diff ** 2)))

        - Cosine similarity
$$ sim(x,y) = \frac{(x . y)}{\sqrt{(x . x) (y . y)}} $$

In [44]:
def cosine(s1, s2):
    """Take two pd.Series objects and return their cosine similarity."""
    return np.sum(s1 * s2) / np.sqrt(np.sum(s1 ** 2) * np.sum(s2 ** 2))

        -Pearson correlation
$$ sim(x,y) = \frac{(x - \bar x).(y - \bar y)}{\sqrt{(x - \bar x).(x - \bar x) * (y - \bar y)(y - \bar y)}} $$

In [45]:
def pearson(s1, s2):
    """Take two pd.Series objects and return a pearson correlation."""
    s1_c = s1 - s1.mean()
    s2_c = s2 - s2.mean()
    return np.sum(s1_c * s2_c) / np.sqrt(np.sum(s1_c ** 2) * np.sum(s2_c ** 2))

        -Jaccard similarity
$$ sim(x,y) = \frac{(x . y)}{(x . x) + (y . y) - (x . y)} $$

In [46]:
def jaccard(s1, s2):
    dotp = np.sum(s1 * s2)
    return dotp / (np.sum(s1 ** 2) + np.sum(s2 ** 2) - dotp)

def binjaccard(s1, s2):
    dotp = (s1.index & s2.index).size
    return dotp / (s1.sum() + s2.sum() - dotp)

### Collaborative-based filtering using custom sim functions

In [48]:
class CollabPearsonReco:
    """ Collaborative filtering using a custom sim(u,u'). """

    def learn(self):
        """ Prepare datastructures for estimation. """
        
        self.all_user_profiles = movielens.pivot_table('rating', index='movie_id', columns='user_id')

    def estimate(self, user_id, movie_id):
        """ Ratings weighted by correlation similarity. """
        
        user_condition = movielens_train.user_id != user_id
        movie_condition = movielens_train.movie_id == movie_id
        ratings_by_others = movielens_train.loc[user_condition & movie_condition]
        if ratings_by_others.empty: 
            return 3.0
        
        ratings_by_others.set_index('user_id', inplace=True)
        their_ids = ratings_by_others.index
        their_ratings = ratings_by_others.rating
        their_profiles = self.all_user_profiles[their_ids]
        user_profile = self.all_user_profiles[user_id]
        sims = their_profiles.apply(lambda profile: pearson(profile, user_profile), axis=0)
        ratings_sims = pd.DataFrame({'sim': sims, 'rating': their_ratings})
        ratings_sims = ratings_sims[ratings_sims.sim > 0]
        if ratings_sims.empty:
            return their_ratings.mean()
        else:
            return np.average(ratings_sims.rating, weights=ratings_sims.sim)
        
reco = CollabPearsonReco()
reco.learn()
print('RMSE for CollabPearsonReco: %s' % evaluate(reco.estimate))

  """


RMSE for CollabPearsonReco: 1.0733275667805626
