In [0]:
"""

  This code performs matrix completion based on Collaborative Filtering. The data set
  used is the MovieLens small dataset.

  This code has been modified baed on the lab_3 of the course DL&BDP given at the VUB.

"""

In [0]:
!pip install XlsxWriter

In [0]:
#%%

### Step 1: Load data and create masks
import numpy as np
import xlsxwriter

def load_rating(fname, N=610, M=9725):
    ''' load rating file with the format: UserID::MovieID::Rating::Timestamp
    Can be used with MovieLens100K & MovieLens1M
    Params:
        - fname: file name
        - N: number of users
        - M: number of items (e.g. movies)
    '''
    R = np.zeros((N, M))    # Matrix of zeros of size (N, M)

    movies_list = np.zeros(M)
    idx = 0

    with open(fname, 'r') as fin:
        lines = fin.readlines()

        for line in lines[1:]:
            splt = line.strip().split(',')
            if splt[0] != '':
              uid = int(splt[0]) - 1
              if splt[1] != '':
                movie_id = int(splt[1])
                if uid < 610:
                  if movie_id in movies_list:
                      mid = np.where(movies_list == movie_id)
                  else:
                      movies_list[idx] = movie_id
                      mid = idx
                      idx += 1

                  r = float(splt[2])
                  
                  R[uid,mid] = r

    print(R.shape)
    return R
    

N = 610
M = 9725
# This number is the number of users and movies taken from the data base.

data = load_rating("ratings.csv", N, M)

(610, 9725)


In [0]:
workbook = xlsxwriter.Workbook('data_set.xlsx')
worksheet = workbook.add_worksheet()
row = 0
col = 0

for line in data:
  col = 0
  if row%100 == 0:
    print(row)
  for el in line:
    worksheet.write(row, col, el)
    col = col + 1
  row = row + 1

workbook.close()

0
100
200
300
400
500
600


In [3]:
#%%

# In data, we've got the matrix coming from ratings.csv, 
#   which is a sparse one of dimension (610, 9725).

from random import randint

# create mask matrix
def create_mask(X):
    '''

    :param X:
    :return: Matrix of 1 and 0
    '''
    mask = X > 0
    return mask.astype(int)

def removes_entries_mask(mask, frac):
    '''

    :param data:
    :param frac:
    :return: Matrix of 1 and 0 depending on the removed entries
    '''
    matrix = np.random.choice([0, 1], size=(mask.shape), p=[frac, 1-frac])
    mask = np.multiply(mask, matrix)
    return mask.astype(int)

data_test = data[:N, :M]

# The next print are used to verify the number of entries of each data set.

cnt_1 = 0
for line in data_test:
    for el_1 in line:
        if el_1 != 0:
            cnt_1 += 1

print("Number of entries inside the matrix 'data_test' : " + str(cnt_1))

mask_data_set = create_mask(data_test)  #mask_data_set -> Based on the data set
rmv_entries_mask = removes_entries_mask(mask_data_set, 1/10)
data_train = np.multiply(data[:N, :M], rmv_entries_mask)
mask_data_train = create_mask(data_train)  #mask_data_train -> Based on the data train
# The data train possess less entries than the data test, therefore a mask needs to
# be computed for each data set.

cnt_2 = 0
for line, linee in zip(data_train, data_test):
    for el_1, el_2 in zip(line, linee):
        if el_1 != el_2:
            cnt_2 += 1

print("Number of entries of the matrix 'data_train' different from the matrix 'data_test' : " + str(cnt_2))

print("Ratio between the removed entries and the number of entries : " + str(cnt_2/cnt_1))

cnt_3 = 0
for line in mask_data_set:
    for el_1 in line:
        if el_1 != 0:
            cnt_3 += 1

print("Number of entries inside the matrix 'mask_data_test' : " + str(cnt_3))

cnt_4 = 0
for line in mask_data_train:
    for el_1 in line:
        if el_1 != 0:
            cnt_4 += 1

print("Number of entries inside the matrix 'mask_data_train' : " + str(cnt_4))

NameError: ignored

In [0]:
workbook = xlsxwriter.Workbook('mask_data_set.xlsx')
worksheet = workbook.add_worksheet()
row = 0
col = 0

for line in mask_data_set:
  col = 0
  if row%100 == 0:
    print(row)
  for el in line:
    worksheet.write(row, col, el)
    col = col + 1
  row = row + 1

workbook.close()

0
100
200
300
400
500
600


In [0]:
#%%
# This part of the code remains unchanged from the lab

### Step 2: Implement functions to calculate cost and gradients
# This function computes the cost that we want to minimize
def compute_cost(U, V, R, _lambda, mask):
    # Equation 10 from
    '''
    Inputs:
        U: A matrix contains users' feature
        V: A matrix contains movies' feature
        R: A matrix contains ground truth (size: n_movies x n_users)
        _lambda: Regularization weight
        mask: The binary mask matrix
    Output:
        loss: (scalar) the mean square error over known elements in R
    '''
    assert U.shape[1] == V.shape[1]
    assert U.shape[0] == R.shape[0]
    assert V.shape[0] == R.shape[1]
    assert R.shape == mask.shape
    numerator = (((np.matmul(U,V.T) - R)**2) * mask).sum()
    return (0.5 * numerator / mask.sum() + 0.5 * _lambda * (U**2).sum()
            + 0.5 * _lambda * (V**2).sum())


# This function computes partial derivatives of the cost function w.r.t. movie and user features
def compute_gradient(U, V, R, _lambda, mask):
    '''
    Inputs:
        U: A matrix contains users' feature
        V: A matrix contains movies' feature
        R: A matrix contains ground truth (size: n_movies x n_users)
        _lambda: Regularization weight
        mask: The binary mask matrix
    Output:
        grad_U: gradients of the cost function w.r.t. U, must have the same shape as U
        grad_V: gradients of the cost function w.r.t. U, must have the same shape as V
    '''
    assert U.shape[1] == V.shape[1]
    assert U.shape[0] == R.shape[0]
    assert V.shape[0] == R.shape[1]
    assert R.shape == mask.shape
    mul = np.matmul(U, V.T)
    gradient_U = np.matmul((mul - R) * mask, V) + _lambda * U
    gradient_V = np.matmul(((mul - R) * mask).T, U) + _lambda * V

    return gradient_U, gradient_V


In [0]:
#%%
# In this part, the lambda, alpha and training_iterations has been modified after
# some experiments to match obtain a satisfactory training

import time
t = time.time()
### Step 3: Training
n_features = 150
# randomly initialize U and V
U = 0.25 * np.random.randn(n_users, n_features)
V = 0.25 * np.random.randn(n_movies, n_features)
# regularization weight
_lambda = 0.00005
# learning rate
alpha = 0.001
# number of training iteration
training_iterations = 250

counter = 0
while counter < training_iterations:
    # perform one step of training
    grad_U, grad_V = compute_gradient(U, V, data_train, _lambda, mask_data_train)

    U = U - alpha * grad_U
    V = V - alpha * grad_V

    cost = compute_cost(U, V, data_train, _lambda, mask_data_train)
    # increase counter
    counter += 1
    if counter % 10 == 0:
        print("Iteration:",counter,"cost: ",cost)
print(str(time.time() - t))

Iteration: 10 cost:  4.5623943090853025
Iteration: 20 cost:  3.2691276469497383
Iteration: 30 cost:  2.99202238132196
Iteration: 40 cost:  2.883759022129001
Iteration: 50 cost:  2.831142595835094
Iteration: 60 cost:  2.802236605708085
Iteration: 70 cost:  2.785089202607082
Iteration: 80 cost:  2.774395299244973
Iteration: 90 cost:  2.767501885963081
Iteration: 100 cost:  2.7629636727679685
Iteration: 110 cost:  2.759941952995736
Iteration: 120 cost:  2.757926005278313
Iteration: 130 cost:  2.7565927657357507
Iteration: 140 cost:  2.7557314210472796
Iteration: 150 cost:  2.755200778737052
Iteration: 160 cost:  2.7549041650877846
Iteration: 170 cost:  2.7547741348460653
Iteration: 180 cost:  2.7547628822878645
Iteration: 190 cost:  2.7548360711144566
Iteration: 200 cost:  2.754968771520113
Iteration: 210 cost:  2.755142728339054
Iteration: 220 cost:  2.755344489088772
Iteration: 230 cost:  2.755564099096284
Iteration: 240 cost:  2.7557941777857122
Iteration: 250 cost:  2.7560292556885195

In [0]:
#%%

# Step 4: Evaluation function
def RMSE(A, B, mask):
    ''' Root mean square error
    '''
    rmse = np.sqrt(np.sum(np.multiply(mask, np.square(A - B))) / np.sum(mask))
    return rmse

def MAE(A, B, mask):
    ''' Mean absolute error
    '''
    mae = np.sum(np.multiply(mask, np.square(A - B))) / np.sum(mask)
    return mae

In [0]:
#%%

### Step 5: Evaluate the model
# make prediction
prediction = np.dot(U, V.T)

# Compute RMSE and MAE on the training set

print("RMSE_train: ", RMSE(data_train, prediction, mask_data_train))
print("MAE_train: ", MAE(data_train, prediction, mask_data_train))

# Compute RMSE and MAE on the testing set
print("RMSE_test: ",RMSE(data_test, prediction, mask_data_set))
print("MAE_test: ",MAE(data_test, prediction, mask_data_set))

# To compute the validation, it is done only on the predictions of the matrix
# factorization. To avoid evaluating the reconstruction of the data train.

mask_prediction = mask_data_set - mask_data_train;

# Compute RMSE and MAE on the testing set
print("RMSE_prediction: ",RMSE(data_test, prediction, mask_prediction))
print("MAE_prediction: ",MAE(data_test, prediction, mask_prediction))


RMSE_train:  0.08540715309085423
MAE_train:  0.007294381799084612
RMSE_test:  0.5554127550831367
MAE_test:  0.30848332850904037
RMSE_prediction:  1.7324116946934949
MAE_prediction:  3.001250279910787


In [0]:
import xlsxwriter

workbook = xlsxwriter.Workbook('prediction.xlsx')
worksheet = workbook.add_worksheet()
row = 0
col = 0

for line in prediction:
  col = 0
  if row%100 == 0:
    print(row)
    print(line)
  for el in line:
    worksheet.write(col, row, el)
    col = col + 1
  row = row + 1

workbook.close()


0
[4.84736209 3.70927932 4.55980349 ... 1.80436184 2.13492027 1.05812267]
100
[4.0704561  3.00227169 3.87933075 ... 2.89589947 1.45280024 1.37953385]
200
[5.55482541 3.9733254  4.0074259  ... 2.55748091 1.66123339 1.76782399]
300
[3.84068147 2.24245704 3.02804476 ... 1.69745515 1.18250619 0.66788104]
400
[3.96328647 2.16042466 3.36248019 ... 2.29911203 1.39100853 1.51923238]
500
[3.93782857 3.79770557 3.27392604 ... 0.92641932 1.63389587 0.59942973]
600
[4.49609374 2.66641339 4.33837024 ... 1.83846937 2.45526097 0.75598333]


In [0]:
workbook = xlsxwriter.Workbook('prediction_mask.xlsx')
worksheet = workbook.add_worksheet()
row = 0
col = 0

prediction_mask = np.multiply(mask_data_set, prediction)

for line in prediction_mask:
  col = 0
  if row%100 == 0:
    print(row)
    print(line)
  for el in line:
    worksheet.write(row, col, el)
    col = col + 1
  row = row + 1

workbook.close()


0
[4.84736209 3.70927932 4.55980349 ... 0.         0.         0.        ]
100
[0. 0. 0. ... 0. 0. 0.]
200
[5.55482541 0.         0.         ... 0.         0.         0.        ]
300
[0. 0. 0. ... 0. 0. 0.]
400
[3.96328647 0.         0.         ... 0.         0.         0.        ]
500
[0.         3.79770557 3.27392604 ... 0.         0.         0.        ]
600
[4.49609374 0.         0.         ... 0.         0.         0.        ]
