In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [7]:
# Reading user data
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
usr=pd.read_csv("./ml-100k/u.user",sep='|',names=u_cols)

In [8]:
usr.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [30]:
# Reading items file
i_cols = ['movie_id', 'movie title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

items = pd.read_csv('ml-100k/u.item', sep='|', names=i_cols,encoding='latin-1')

In [31]:
items.head()

Unnamed: 0,movie_id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [32]:
# Reading rating file
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=r_cols,encoding='latin-1')
ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [33]:
#Dataset Information
print("User: {0}".format(usr.shape))
print("Items: {0}".format(items.shape))
print("Ratings: {}".format(ratings.shape))

User: (943, 5)
Items: (1682, 24)
Ratings: (100000, 4)


In [34]:
#Building the model

# Model will be based on user-user similarity and item-item similarity

##checking how many unique users and unique movies are there

n_users=ratings.user_id.unique().shape[0]
n_items=items.movie_id.unique().shape[0]

In [52]:
# Creating a user-item matrix

data_matrix=np.zeros((n_users,n_items))
for line in ratings.itertuples():
    data_matrix[line[1]-1, line[2]-1] = line[3]

In [54]:
data_matrix.shape

(943, 1682)

In [55]:
from sklearn.metrics.pairwise import pairwise_distances 
user_similarity = pairwise_distances(data_matrix, metric='cosine')
item_similarity = pairwise_distances(data_matrix.T, metric='cosine')

In [61]:
user_similarity.shape

(943, 943)

In [62]:
item_similarity.shape

(1682, 1682)

In [67]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #We use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred
 

In [92]:
from sklearn.metrics import mean_squared_error
from math import sqrt

# Function to calculate RMSE
def rmse(pred, actual):
    # Ignore nonzero terms.
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return sqrt(mean_squared_error(pred, actual))

In [95]:
user_prediction = predict(data_matrix, user_similarity, type='user')
item_prediction = predict(data_matrix, item_similarity, type='item')
# RMSE on the test data
print('User-based CF RMSE: ' + str(rmse(user_prediction, data_matrix)))
print('Item-based CF RMSE: ' + str(rmse(item_prediction, data_matrix)))

User-based CF RMSE: 2.963475328997318
Item-based CF RMSE: 3.392143861739501


In [94]:
user_prediction

array([[ 2.06532606,  0.73430275,  0.62992381, ...,  0.39359041,
         0.39304874,  0.3927712 ],
       [ 1.76308836,  0.38404019,  0.19617889, ..., -0.08837789,
        -0.0869183 , -0.08671183],
       [ 1.79590398,  0.32904733,  0.15882885, ..., -0.13699223,
        -0.13496852, -0.13476488],
       ..., 
       [ 1.59151513,  0.27526889,  0.10219534, ..., -0.16735162,
        -0.16657451, -0.16641377],
       [ 1.81036267,  0.40479877,  0.27545013, ..., -0.00907358,
        -0.00846587, -0.00804858],
       [ 1.8384313 ,  0.47964837,  0.38496292, ...,  0.14686675,
         0.14629808,  0.14641455]])

In [84]:
import turicreate
from turicreate import SFrame

NameError: name 'ratings_train' is not defined

In [79]:
from sklearn.model_selection import train_test_split
ratings_train, ratings_test= train_test_split(ratings, test_size=0.2)

In [85]:
train_data = SFrame(ratings_train)
test_data = SFrame(ratings_test)

In [86]:
popularity_model = turicreate.popularity_recommender.create(train_data, user_id='user_id', item_id='movie_id', target='rating')

In [87]:
popularity_recomm = popularity_model.recommend(users=[1,2,3,4,5],k=5)
popularity_recomm.print_rows(num_rows=25)

+---------+----------+-------+------+
| user_id | movie_id | score | rank |
+---------+----------+-------+------+
|    1    |   1104   |  5.0  |  1   |
|    1    |   1594   |  5.0  |  2   |
|    1    |   1201   |  5.0  |  3   |
|    1    |   1293   |  5.0  |  4   |
|    1    |   1536   |  5.0  |  5   |
|    2    |   1104   |  5.0  |  1   |
|    2    |   1594   |  5.0  |  2   |
|    2    |   1201   |  5.0  |  3   |
|    2    |   1293   |  5.0  |  4   |
|    2    |   1536   |  5.0  |  5   |
|    3    |   1104   |  5.0  |  1   |
|    3    |   1594   |  5.0  |  2   |
|    3    |   1201   |  5.0  |  3   |
|    3    |   1293   |  5.0  |  4   |
|    3    |   1536   |  5.0  |  5   |
|    4    |   1104   |  5.0  |  1   |
|    4    |   1594   |  5.0  |  2   |
|    4    |   1201   |  5.0  |  3   |
|    4    |   1293   |  5.0  |  4   |
|    4    |   1536   |  5.0  |  5   |
|    5    |   1104   |  5.0  |  1   |
|    5    |   1594   |  5.0  |  2   |
|    5    |   1201   |  5.0  |  3   |
|    5    | 

In [88]:
#Training the model
item_sim_model = turicreate.item_similarity_recommender.create(train_data, user_id='user_id', item_id='movie_id', target='rating', similarity_type='cosine')

#Making recommendations
item_sim_recomm = item_sim_model.recommend(users=[1,2,3,4,5],k=5)
item_sim_recomm.print_rows(num_rows=25)

+---------+----------+---------------------+------+
| user_id | movie_id |        score        | rank |
+---------+----------+---------------------+------+
|    1    |   176    |  0.7517882816426389 |  1   |
|    1    |   173    |  0.7334425567506669 |  2   |
|    1    |   216    |  0.7067253576205658 |  3   |
|    1    |    82    |  0.6473590028178584 |  4   |
|    1    |   191    |  0.6448366354714643 |  5   |
|    2    |   100    |  0.8934629546835068 |  1   |
|    2    |   237    |  0.7661151784531613 |  2   |
|    2    |    7     |  0.6658614747067715 |  3   |
|    2    |   258    |  0.662309910388703  |  4   |
|    2    |   181    |  0.604636292508308  |  5   |
|    3    |   313    |  0.6156851242889058 |  1   |
|    3    |   286    |  0.5328296558423475 |  2   |
|    3    |   748    | 0.49118169058452954 |  3   |
|    3    |   301    |  0.4884781403975053 |  4   |
|    3    |   269    |  0.4690023118799383 |  5   |
|    4    |   258    |  0.8715049965041024 |  1   |
|    4    | 