In [1]:
import pandas as pd

# Load the user table; the column names are supplied here via `names`
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(
    'ml-100k/u.user',
    sep='|',
    names=u_cols,
    encoding='latin-1',
)

# One row per user, so the row count is the number of users
n_users = len(users)
print('Number of users: ', n_users)

Number of users:  943


In [20]:
# Load the train/test rating splits; both are tab-separated with the same four columns
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

ratings_base, ratings_test = (
    pd.read_csv(path, sep='\t', names=r_cols, encoding='latin-1')
    for path in ('ml-100k/ua.base', 'ml-100k/ua.test')
)

# Plain arrays whose columns follow the order of r_cols
rate_train = ratings_base.values
rate_test = ratings_test.values

print('Number of training rates: ', len(rate_train))
print('Number of test rates: ', len(rate_test))

Number of training rates:  90570
Number of test rates:  9430


In [21]:
# Load the item (movie) table: five descriptive columns followed by 19 genre columns
info_cols = ['movie id', 'movie title', 'release date', 'video release date', 'IMDb URL']
genre_cols = [
    'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
i_cols = info_cols + genre_cols

items = pd.read_csv('ml-100k/u.item', sep='|', names=i_cols, encoding='latin-1')

# One row per movie
n_items = len(items)
print('Number of items:', n_items)

Number of items: 1682


In [22]:
import numpy as np

# Raw item table as an array, one row per movie
X0 = items.values
# Keep only the trailing 19 columns, i.e. the genre columns named at the end of i_cols
# (presumably 0/1 indicators -- confirm against the dataset README)
genre_slice = slice(-19, None)
X_train_counts = X0[:, genre_slice]

In [23]:
#tfidf
from sklearn.feature_extraction.text import TfidfTransformer
# TF-IDF weighting over the genre columns with smoothed IDF and L2 row normalisation
transformer = TfidfTransformer(norm='l2', smooth_idf=True)
# The genre matrix is passed as nested Python lists, then the result is densified
genre_rows = X_train_counts.tolist()
tfidf = transformer.fit_transform(genre_rows).toarray()
# From here on, each row of tfidf is the feature vector of one item

In [24]:
def get_items_rated_by_user(rate_matrix, user_id):
    """Return the items rated by one user together with the given ratings.

    Parameters
    ----------
    rate_matrix : 2-D array whose column 0 holds user ids, column 1 item ids
        and column 2 the rating. Ids in the matrix are 1-based.
    user_id : 0-based user index (matched against ``user_id + 1`` in column 0).

    Returns
    -------
    (item_ids, scores) : tuple of 1-D arrays; ``item_ids`` are shifted to
        0-based, ``scores`` are the corresponding ratings. Both are empty when
        the user has no rows in ``rate_matrix``.
    """
    # Row positions belonging to this user (stored ids start at 1)
    rows = np.flatnonzero(rate_matrix[:, 0] == user_id + 1)
    return (rate_matrix[rows, 1] - 1, rate_matrix[rows, 2])

In [27]:
from sklearn.linear_model import Ridge
from sklearn import linear_model

# Fit one ridge regressor per user on the items that user rated in the training split.
# Column n of W stores the user's feature weights and b[0, n] the intercept.
d = tfidf.shape[1]
W = np.zeros((d, n_users))
b = np.zeros((1, n_users))

for n in range(n_users):
    # 0-based item indices and ratings for user n
    ids, scores = get_items_rated_by_user(rate_train, n)
    Xhat = tfidf[ids, :]

    # fit() returns the estimator itself, so construction and fitting can be chained
    clf = Ridge(alpha=0.01, fit_intercept=True).fit(Xhat, scores)
    W[:, n], b[0, n] = clf.coef_, clf.intercept_

In [28]:
# Predicted score for every (item, user) pair: rows are items, columns are users;
# the per-user intercept row b is broadcast over all items
Yhat = tfidf @ W + b

In [34]:
# Spot-check a single user (0-based index) on the held-out test ratings
n = 100
ids, scores = get_items_rated_by_user(rate_test, n)
predicted = Yhat[ids, n].round(2)

print('User id: ', n)
print('Rated movies ids: ', ids)
print('True ratings: ', scores)
print('Predicted ratings: ', predicted)

User id:  100
Rated movies ids:  [221 251 280 281 303 368 404 470 595 828]
True ratings:  [3 3 2 3 3 2 4 3 3 3]
Predicted ratings:  [3.28 3.5  3.07 2.28 3.23 2.11 2.83 3.29 2.69 3.28]


In [38]:
# Evaluating model using Root Mean Squared Error

def evaluate(Yhat, rates, W, b):
    """Root mean squared error of predicted ratings against observed ratings.

    Parameters
    ----------
    Yhat : 2-D array of predictions indexed as ``Yhat[item, user]`` (0-based).
    rates : 2-D array whose column 0 holds 1-based user ids, column 1 holds
        1-based item ids and column 2 the observed rating.
    W, b : unused; accepted only so existing calls keep working.

    Returns
    -------
    The RMSE over all rows of ``rates`` whose user has a column in ``Yhat``;
    ``nan`` when there is no such row.
    """
    rates = np.asarray(rates)
    # Convert the stored 1-based ids to 0-based indices into Yhat
    user_idx = rates[:, 0].astype(int) - 1
    item_idx = rates[:, 1].astype(int) - 1

    # Only users that have a column in Yhat are scored; the user count comes
    # from Yhat itself rather than from a notebook-level global.
    known = (user_idx >= 0) & (user_idx < Yhat.shape[1])
    if not known.any():
        return np.nan

    # All errors in one indexing step instead of rescanning `rates` per user
    err = rates[known, 2] - Yhat[item_idx[known], user_idx[known]]
    return np.sqrt(np.mean(err * err))

# Report the error on both splits; the labels are padded so the values line up
for label, split in (('RMSE for training:', rate_train),
                     ('RMSE for test    :', rate_test)):
    print(label, evaluate(Yhat, split, W, b))

RMSE for training: 0.9089804562826721
RMSE for test    : 1.2703282700393035
