In [1]:
import numpy as np
import scipy.io as sio
import matplotlib.pyplot as plt

In [2]:
mat=sio.loadmat('Recommendation_movies.mat')
mat.keys()

dict_keys(['__header__', '__version__', '__globals__', 'Y', 'R'])

In [3]:
Y,R=mat['Y'],mat['R']
Y.shape,R.shape

((1682, 943), (1682, 943))

In [4]:
param_mat=sio.loadmat('Recommendation_movieParams.mat')
param_mat.keys()

dict_keys(['__header__', '__version__', '__globals__', 'X', 'Theta', 'num_users', 'num_movies', 'num_features'])

In [5]:
X,Theta,nu,nm,nf=param_mat['X'],param_mat['Theta'],param_mat['num_users'],param_mat['num_movies'],param_mat['num_features']

In [6]:
X.shape,Theta.shape,nu,nm,nf

((1682, 10),
 (943, 10),
 array([[943]], dtype=uint16),
 array([[1682]], dtype=uint16),
 array([[10]], dtype=uint8))

In [7]:
nu=int(nu)
nm=int(nm)
nf=int(nf)
nu,nm,nf

(943, 1682, 10)

In [8]:
def serialize(X,Theta):
    return np.append(X.flatten(),Theta.flatten())

In [9]:
def deserialize(params,nm,nu,nf):
    X=params[:nm*nf].reshape(nm,nf)
    Theta=params[nm*nf:].reshape(nu,nf)
    return X,Theta

In [10]:
def costFunction(params,Y,R,nm,nu,nf,lamda):
    X,Theta=deserialize(params,nm,nu,nf)
    error=0.5*np.square((X@Theta.T-Y)*R).sum()
    reg1=0.5*lamda*np.square(X).sum()
    reg2=0.5*lamda*np.square(Theta).sum()
    return error+reg1+reg2

In [11]:
users=4
movies=5
features=3
X_sub=X[:movies,:features]
Theta_sub=Theta[:users,:features]
Y_sub=Y[:movies,:users]
R_sub=R[:movies,:users]
cost1=costFunction(serialize(X_sub,Theta_sub),Y_sub,R_sub,movies,users,features,lamda=0)
cost1

22.224603725685675

In [12]:
def costGradient(params,Y,R,nm,nu,nf,lamda):
    X,Theta=deserialize(params,nm,nu,nf)
    X_grad=((X@Theta.T-Y)*R)@Theta+lamda*X
    Theta_grad=((X@Theta.T-Y)*R).T@X+lamda*Theta
    return serialize(X_grad,Theta_grad)

In [13]:
my_ratings=np.zeros((nm,1))
my_ratings[27]=5
my_ratings[28]=5
my_ratings[49]=5
my_ratings[134]=5
my_ratings[140]=5
my_ratings[175]=5
my_ratings[200]=2
my_ratings[203]=4
my_ratings[226]=5
my_ratings[227]=5
my_ratings[228]=5
my_ratings[229]=5
my_ratings[230]=5
my_ratings[342]=5
my_ratings[402]=5
my_ratings[422]=5
my_ratings[448]=5
my_ratings[449]=5

In [14]:
Y=np.c_[Y,my_ratings]
R=np.c_[R,my_ratings!=0]

In [15]:
Y.shape

(1682, 944)

In [16]:
nm,nu=Y.shape

In [17]:
def normalizeRatings(Y,R):
    Y_mean=(Y.sum(axis=1)/R.sum(axis=1)).reshape(-1,1)
    Y_norm=(Y-Y_mean)*R
    return Y_norm,Y_mean

In [18]:
Y_norm,Y_mean=normalizeRatings(Y,R)

In [19]:
X=np.random.random((nm,nf))
Theta=np.random.random((nu,nf))
params=serialize(X,Theta)
lamda=5

In [20]:
from scipy.optimize import minimize
res=minimize(fun=costFunction,x0=params,args=(Y_norm,R,nm,nu,nf,lamda),method='TNC',jac=costGradient,options={'maxiter':100})

  res=minimize(fun=costFunction,x0=params,args=(Y_norm,R,nm,nu,nf,lamda),method='TNC',jac=costGradient,options={'maxiter':100})


In [21]:
params_fit=res.x

In [22]:
fit_X,fit_Theta=deserialize(params_fit,nm,nu,nf)

In [23]:
Y_pred=fit_X@fit_Theta.T

In [24]:
y_pred=Y_pred[:,-1]+Y_mean.flatten()

In [25]:
index=np.argsort(-y_pred)

In [26]:
index[:10]

array([ 120,   49,  180,  171,  173,   95,  312,  194, 1466,  209],
      dtype=int64)

In [27]:
movies=[]
with open('movie_ids.txt','r',encoding='latin 1')as f:
    for line in f:
        tokens=line.strip().split(' ')
        movies.append(' '.join(tokens[1:]))

In [28]:
len(movies)

1682

In [29]:
for i in range(10):
    print(index[i],movies[index[i]],y_pred[index[i]])

120 Independence Day (ID4) (1996) 5.449120842440642
49 Star Wars (1977) 5.352125183811118
180 Return of the Jedi (1983) 5.319959550159248
171 Empire Strikes Back, The (1980) 5.182303379024724
173 Raiders of the Lost Ark (1981) 5.170746823498372
95 Terminator 2: Judgment Day (1991) 5.109846973143337
312 Titanic (1997) 5.042844020713496
194 Terminator, The (1984) 5.026389017143506
1466 Saint of Fort Washington, The (1993) 5.0090202813542275
209 Indiana Jones and the Last Crusade (1989) 5.008614086836323
