In [1]:

!pip install pandas



In [2]:
import pandas as pd 

u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('ml-100k/u.user', sep='|', names=u_cols, encoding='latin-1')

n_users = users.shape[0]
print(users.head(3))
n_users

   user_id  age sex  occupation zip_code
0        1   24   M  technician    85711
1        2   53   F       other    94043
2        3   23   M      writer    32067


943

In [3]:
import numpy as np
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

ratings_base = pd.read_csv('ml-100k/ua.base', sep='\t', names=r_cols, encoding='latin-1')
ratings_test = pd.read_csv('ml-100k/ub.test', sep='\t', names=r_cols, encoding='latin-1')

rate_train = ratings_base.to_numpy()
rate_test = ratings_test.to_numpy()

print(rate_train[0])
print(rate_train.shape, rate_test.shape)

[        1         1         5 874965758]
(90570, 4) (9430, 4)


In [4]:
i_cols = ['movie id', 'movie title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

items = pd.read_csv('ml-100k/u.item', sep='|', names=i_cols, encoding='latin-1')

n_items = items.shape[0]
print(items.shape)
print(n_items)


(1682, 24)
1682


In [5]:
X0 = items.to_numpy()
X_train = X0[:, -19:]
print(X_train.shape)

(1682, 19)


In [6]:
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer(smooth_idf=True, norm ='l2')
tfidf = transformer.fit_transform(X_train.tolist()).toarray()

In [7]:
def get_items_by_user(rate_matrix, user_id):
    
    y = rate_matrix[:, 0]
#     print(y)
    ids = np.where(y == user_id + 1)[0]
    item_ids = rate_matrix[ids, 1] - 1
    scores = rate_matrix[ids, 2]
    
    return (item_ids, scores)


In [10]:
from sklearn.linear_model import Ridge
from sklearn import linear_model 

d = tfidf.shape[1]
W = np.zeros((d, n_users))
b = np.zeros((1, n_users))

for n in range(n_users):
    
    ids, scores = get_items_by_user(rate_train, n)
    clf = Ridge(alpha=.01, fit_intercept=True)
    Xhat = tfidf[ids, :]
    
    clf.fit(Xhat, scores)
    
    W[:, n] = clf.coef_
    b[0, n] = clf.intercept_

Yhat = tfidf.dot(W) + b
np.set_printoptions(precision=2) 
n = 10
ids, scores = get_items_by_user(rate_test, 10)

print(ids)
print(scores)
print(Yhat[ids, n])

[189 229 311 516 560 659 713 719 740 745]
[3 4 4 2 2 3 4 1 5 4]
[3.5  3.09 3.88 3.92 3.35 3.95 3.72 3.32 3.37 3.35]


In [15]:
def evaluate(Yhat, rates, W, b):
    se = 0
    cnt = 0
    for n in range(n_users):
        ids, scores_truth = get_items_by_user(rates, n)
        scores_pred = Yhat[ids, n]
        e = scores_truth - scores_pred 
        se += (e*e).sum(axis = 0)
        cnt += e.size 
    return np.sqrt(se/cnt)


print('RMSE for training:', evaluate(Yhat, rate_train, W, b))
print('RMSE for test    :', evaluate(Yhat, rate_test, W, b))


RMSE for training: 0.908980456282672
RMSE for test    : 0.8053347669373638
