In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.core.debugger import Tracer

In [3]:
%load_ext autoreload

In [4]:
%autoreload 2

In [29]:
from surprise import *
import os
from surprise.accuracy import rmse

# Loading Data 

## Loading with Pandas

In [6]:

# Root folder of the MovieLens 100k files. It is derived from the current
# user's home directory (the same way the Surprise loading cell builds its
# paths) instead of a hard-coded /Users/<name> path, so the notebook is not
# tied to one machine. The trailing slash is kept on purpose: the cells below
# build file names with `basefile + '<file>'`.
basefile = os.path.expanduser('~') + '/.surprise_data/ml-100k/ml-100k/'

# User file: pipe-separated, one row per user, no header row in the file.
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(basefile+'u.user', sep='|', names=u_cols,
                    encoding='latin-1')

users.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [7]:
# Ratings file: tab-separated, one (user, movie, rating, timestamp) row per
# rating, with no header row, so the column names are supplied here.
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv(
    basefile + 'u.data',
    sep='\t',
    names=r_cols,
    encoding='latin-1')
ratings.head()


Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [8]:
# Items file: pipe-separated, one row per movie. The first five columns are
# descriptive; the remaining ones are per-genre columns (0/1 in the rows
# displayed below).
i_cols = [
    'movie id', 'movie title', 'release date', 'video release date', 'IMDb URL',
    'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

items = pd.read_csv(
    basefile + 'u.item',
    sep='|',
    names=i_cols,
    encoding='latin-1')
items.head()

Unnamed: 0,movie id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [9]:
# u1.base / u1.test form one random train/test split of the ratings; both use
# the same tab-separated layout as the full ratings file.
trainset2 = pd.read_csv(basefile+'u1.base', sep='\t', names=r_cols)
testset2 = pd.read_csv(basefile+'u1.test', sep='\t', names=r_cols)

# Size summary of the split. `format` already stringifies its arguments, and
# `nunique()` counts distinct ids directly.
print('number of movies: {0}'.format(trainset2.movie_id.nunique()))
print('number of users: {0}'.format(trainset2.user_id.nunique()))
print('length of train set: {0}'.format(len(trainset2)))
print('length of test set: {0}'.format(len(testset2)))

# Only the last expression of a cell is rendered, so a single preview is kept
# (the test split, as before); a `trainset2.head()` in the middle of the cell
# was never displayed.
testset2.head()


number of movies: 1650
number of users: 943
length of train set: 80000
length of test set: 20000


Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,1,6,5,887431973
1,1,10,3,875693118
2,1,12,5,878542960
3,1,14,5,874965706
4,1,17,3,875073198


## Loading with Surprise Package

In [11]:
# Paths of the predefined u1 train/test files inside the Surprise data folder.
train_file = os.path.expanduser('~') + '/.surprise_data/ml-100k/ml-100k/u1.base'
test_file = os.path.expanduser('~') + '/.surprise_data/ml-100k/ml-100k/u1.test'

# Build the dataset once from the single (train, test) pair; the files are
# parsed with the built-in 'ml-100k' reader format.
data = Dataset.load_from_folds([(train_file, test_file)], Reader('ml-100k'))

# Exactly one fold was supplied, so take it directly instead of running an
# empty loop just to leave `trainset` / `testset` bound afterwards.
trainset, testset = next(iter(data.folds()))

In [12]:
# First ten (user id, item id, rating) tuples of the Surprise test fold.
testset[:10]

[(u'1', u'6', 5.0),
 (u'1', u'10', 3.0),
 (u'1', u'12', 5.0),
 (u'1', u'14', 5.0),
 (u'1', u'17', 3.0),
 (u'1', u'20', 4.0),
 (u'1', u'23', 4.0),
 (u'1', u'24', 3.0),
 (u'1', u'27', 2.0),
 (u'1', u'31', 3.0)]

In [13]:
# Show the repr of the Surprise training fold (an opaque Trainset object, not a list).
trainset

<surprise.dataset.Trainset instance at 0x1104fb680>

# Collaborative Filtering

Recommendation using collaborative filtering involves computing:
- user-user similarity
- item-item similarity? 
- making predictions of items, from similar users. 



## Baseline Predictions

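The standard baseline model predicts each rating from the global mean rating $\mu$ plus a user bias $b_u$ and an item bias $b_i$:

$$ \hat{r}_{ui} = \mu + b_u + b_i $$

The hand-rolled version below is a simpler variant: it averages the global mean, the user's mean rating $\bar{r}_u$ and the movie's mean rating $\bar{r}_i$,

$$ \hat{r}_{ui} = \frac{\mu + \bar{r}_u + \bar{r}_i}{3} $$

and drops a term (dividing by the number of remaining terms) when the user or the movie does not appear in the training set.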


In [14]:
# calculate baselines: the global mean rating plus the mean rating of every
# user and of every movie seen in the training split.
user_list = trainset2.user_id.unique()
n_users = len(user_list)

item_list = trainset2.movie_id.unique()
n_items = len(item_list)

mu = trainset2.rating.mean()

# One grouped pass per key instead of one boolean-mask scan of the whole
# frame per user / per movie. The results are plain dicts keyed by id, so
# membership tests (`user in b_u`) and lookups work as before.
b_u = trainset2.groupby('user_id')['rating'].mean().to_dict()
b_i = trainset2.groupby('movie_id')['rating'].mean().to_dict()
    

In [15]:
# make predictions 
for index, row in testset2.iterrows():
    nsum=1
    if row['movie_id'] in b_i:
        b_i_row = b_i[row['movie_id']]
        nsum+=1
    else:
        b_i_row = 0
    if row['user_id'] in b_u:
        b_u_row = b_u[row['user_id']]
        nsum+=1
    else:
        b_u_row = 0

        
    testset2.loc[index,'predicted_rating'] = (mu + b_u_row + b_i_row)/nsum


In [26]:
# Preview the test frame, which now carries the `predicted_rating` column.
testset2.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp,predicted_rating
0,1,6,5,887431973,3.53661
1,1,10,3,875693118,3.695515
2,1,12,5,878542960,3.869312
3,1,14,5,874965706,3.703277
4,1,17,3,875073198,3.463462


In [28]:
# Root-mean-squared error of the hand-rolled baseline on the test split.
residuals = testset2['rating'] - testset2['predicted_rating']
MSE = np.mean(residuals ** 2)
RMSE = np.sqrt(MSE)
print(RMSE)

1.03664184189


### Using the surprise package with added algorithm


In [21]:
class myBaseline(AlgoBase):
    """Mean-of-means baseline implemented as a Surprise algorithm.

    The estimate for a (user, item) pair is the average of the training
    set's global mean rating, the user's mean rating and the item's mean
    rating. A term is left out (and the divisor reduced accordingly) when
    the trainset does not know the user or the item.
    """

    def __init__(self):
        # The base class has to be initialised before anything else is done.
        AlgoBase.__init__(self)

    def estimate(self, u, i):
        """Return the estimated rating of user ``u`` for item ``i``.

        ``u`` and ``i`` are the ids used to index ``trainset.ur`` and
        ``trainset.ir`` respectively.
        """
        train = self.trainset

        # The global mean is always available; the user and item means are
        # appended only when the trainset has ratings for them.
        means = [train.global_mean]
        if train.knows_user(u):
            means.append(np.mean([rating for (_, rating) in train.ur[u]]))
        if train.knows_item(i):
            means.append(np.mean([rating for (_, rating) in train.ir[i]]))

        return sum(means) / len(means)


In [22]:
# Instantiate the custom baseline algorithm and train it on the training fold.
# NOTE(review): newer Surprise releases appear to use `fit()` in place of
# `train()` - confirm against the installed version before upgrading.
algo = myBaseline()
algo.train(trainset)


In [23]:
# Score every (user, item) pair of the test fold with the custom baseline and
# tabulate the result: raw ids, true rating (rui), estimate (est) and details.
predictions = algo.test(testset)
df = pd.DataFrame(
    predictions,
    columns=['uid', 'iid', 'rui', 'est', 'details'])
df.head()

Unnamed: 0,uid,iid,rui,est,details
0,1,6,5,3.53661,{u'was_impossible': False}
1,1,10,3,3.695515,{u'was_impossible': False}
2,1,12,5,3.869312,{u'was_impossible': False}
3,1,14,5,3.703277,{u'was_impossible': False}
4,1,17,3,3.463462,{u'was_impossible': False}


In [24]:
# RMSE of the Surprise-hosted custom baseline, computed from true vs.
# estimated ratings in the predictions frame.
squared_error = (df['rui'] - df['est']) ** 2
MSE = np.mean(squared_error)
np.sqrt(MSE)

1.0366418418889307

### Using the surprise package with their baseline

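- Surprise's `BaselineOnly` does not take plain means: it fits the user and item biases $b_u$ and $b_i$ with an optimisation procedure (ALS here, as the log line in the output below shows).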


In [30]:
# Train Surprise's built-in BaselineOnly algorithm on the same training fold,
# predict the test fold, and report its RMSE (printed and returned by `rmse`).
algo2 = BaselineOnly()
algo2.train(trainset)
predictions2 = algo2.test(testset)
rmse(predictions2)

Estimating biases using als...
RMSE: 0.9599


0.95994383330777366

In [31]:
# Tabulate the BaselineOnly predictions. This rebinds `df`, replacing the
# frame that held the custom baseline's predictions.
df = pd.DataFrame(
    predictions2,
    columns=['uid', 'iid', 'rui', 'est', 'details'])
df.head()

Unnamed: 0,uid,iid,rui,est,details
0,1,6,5,3.6292,{u'was_impossible': False}
1,1,10,3,3.909437,{u'was_impossible': False}
2,1,12,5,4.444498,{u'was_impossible': False}
3,1,14,5,3.924356,{u'was_impossible': False}
4,1,17,3,3.361724,{u'was_impossible': False}


## KNN

In [None]:
                
# algo = KNNBasic()                                                       

# for fold_i, (trainset, testset) in enumerate(data.folds()):
#     print('fold {0}'.format(str(fold_i)))
#     algo.train(trainset)                             
#     predictions = algo.test(testset)
#     rmse(predictions)                                                             
#     dump.dump('./dump_file', predictions, algo)