In [1]:
import numpy as np

from db_handler import DBHandler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error

In [34]:
db = DBHandler()
db.setup_test_table()

R = {}
row = 0
col = 0

for x in db.read_data("comp3208-train-small.csv"):
    list_parts = x.strip().split(',')

    if list_parts[2] == "rating":
        continue
    
    user = int(list_parts[0]) - 1
    item = int(list_parts[1]) - 1
    rating = float(list_parts[2])
        
    if int(user) > row:
        row = user
    if item > col:
        col = item
    
    if user in R:
        R[user][item] = rating
    else:
        R[user] = {item: rating}
        
row += 1
col += 1

INFO 2021-02-26 19:13:58,575 logging started
INFO 2021-02-26 19:13:58,577 Setting up test table
INFO 2021-02-26 19:13:58,857 Reading data from database: comp3208-train-small.csv


In [33]:
db = DBHandler()
db.setup_test_table()

row = [] # user
col = [] # item
rating = []

for x in db.read_data("comp3208-train-small.csv"):
    line_split = x.strip().split(',')
    if line_split[2] == "rating":
        continue
    row.append(int(line_split[0]) - 1)
    col.append(int(line_split[1]) - 1)
    rating.append(float(line_split[2]))
    count += 1
    

INFO 2021-02-26 19:12:07,503 logging started
INFO 2021-02-26 19:12:07,505 Setting up test table
INFO 2021-02-26 19:12:08,015 Reading data from database: comp3208-train-small.csv


9425745


In [28]:
from scipy.sparse import coo_matrix

R = coo_matrix((rating, (row, col)), shape=((max(row) + 1), (max(col) + 1)))

# Latent Factor Model

In [25]:
from numpy.linalg import norm

def calc_mse(R, U, V, lamda=0.02):
    ratings = R.data
    rows = R.row
    cols = R.col
    total_error = 0
    
    for index in range(len(ratings)):
        rating = ratings[index]
        user = rows[index]
        item = cols[index]
        
        # lambda for regularisation
        if rating > 0:
            total_error = total_error + pow(rating - np.dot(U[user,:], V[:,item]), 2) + lamda * (pow(norm(U[user,:]), 2) + pow(norm(V[:,item]), 2))
    print(len(ratings))
    
    return total_error / len(ratings)

In [26]:
def gradient_descent(R, K, lam = 0.02, steps = 10, learning_rate = 0.01):
    # using K create matricies U and V
    M, N = R.shape
    np.random.seed(0)
    U = np.random.rand(M, K)
    np.random.seed(1)
    V = np.random.rand(K, N)
    mse = calc_mse(R, U, V)
    
    for step in range(steps):
        for index in range(len(R.data)):
            # --> you can parallelise this
            rating = R.data[index]
            user = R.row[index]
            item = R.col[index]
            # ignore 0 ratings as its means that no rating has been made
            if rating > 0:
                loss = rating - np.dot(U[user,:], V[:,item])
                U[user,:] = U[user,:] + learning_rate * 2 * (loss * V[:,item] - lam * U[user,:])
                V[:,item] = V[:,item] + learning_rate * 2 * (loss * U[user,:] - lam * V[:,item])
            # <-- end
        mse = calc_mse(R, U, V)
    print("Final MSE:", mse)
    
    return U, V

In [35]:
from numpy.linalg import norm

# dict version
def calc_mse(R, U, V, lamda=0.02):
    total_error = 0
    count = 0
    
    for user, ratings in R.items():
        for item, rating in ratings.items():
            total_error = total_error + pow(rating - np.dot(U[user,:], V[:,item]), 2) + lamda * (pow(norm(U[user,:]), 2) + pow(norm(V[:,item]), 2))
            count += 1
    
    return total_error / count

In [40]:
# dict version
def gradient_descent(R, K, shape, lam = 0.02, steps = 10, learning_rate = 0.01):
    # using K create matricies U and V
    M, N = shape
    np.random.seed(0)
    U = np.random.rand(M, K)
    np.random.seed(1)
    V = np.random.rand(K, N)
    # mse = calc_mse(R, U, V)
    
    for step in range(steps):
        # split R into n chunks, aggregrate the U and V of all the processes
        # --> you can parallelise this
        for user, ratings in R.items():
            for item, rating in ratings.items():
                loss = rating - np.dot(U[user,:], V[:,item])
                U[user,:] = U[user,:] + learning_rate * 2 * (loss * V[:,item] - lam * U[user,:])
                V[:,item] = V[:,item] + learning_rate * 2 * (loss * U[user,:] - lam * V[:,item])
        # <-- end
    mse = calc_mse(R, U, V)
    print("Final MSE:", mse)
    
    return U, V

In [41]:
U, V = gradient_descent(R, K=3, shape=(row, col), lam=0.01, steps=10, learning_rate=0.001) #shape=(row, col)

Final MSE: 1.0490034627151001
