In [2]:
import numpy as np
import pandas as pd

from db_handler import DBHandler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score

from surprise import Dataset
from surprise import Reader
from surprise import KNNWithMeans
from surprise.model_selection import cross_validate
from surprise import accuracy

import random

In [3]:
# Load the training CSV into a nested dict: R[user][item] = rating.
db = DBHandler()
db.setup_test_table()

R = {}      # user index -> {item index: rating}
row = 0     # largest user index seen so far; turned into a size after the loop
col = 0     # largest item index seen so far; turned into a size after the loop
count = 0   # number of distinct (user, item) pairs stored in R

# Each value yielded by read_data is treated as one comma-separated text line.
for x in db.read_data("comp3208-train-small.csv"):
    list_parts = x.strip().split(',')

    # Skip blank/short lines (which would otherwise raise IndexError) and the
    # header row, whose third field is the literal column name.
    if len(list_parts) < 3 or list_parts[2] == "rating":
        continue

    # Shift ids down by one (presumably the file uses 1-based ids -- confirm).
    user = int(list_parts[0]) - 1
    item = int(list_parts[1]) - 1
    rating = float(list_parts[2])

    row = max(row, user)
    col = max(col, item)

    user_ratings = R.setdefault(user, {})
    # Count a pair only the first time it appears: a repeated (user, item)
    # line overwrites the stored rating, so counting it again would make
    # `count` larger than the number of entries actually held in R.
    if item not in user_ratings:
        count += 1
    user_ratings[item] = rating

# Convert the largest zero-based indices into dimension sizes.
row += 1
col += 1

INFO 2021-05-12 09:06:18,128 logging started
INFO 2021-05-12 09:06:18,150 Setting up test table
INFO 2021-05-12 09:06:18,159 Reading data from database: comp3208-train-small.csv


In [33]:
# Load the training CSV as three parallel lists (user index, item index,
# rating), the triplet layout that scipy's coo_matrix constructor accepts.
# NOTE(review): this rebinds `row`, `col` and `rating` to lists.
db = DBHandler()
db.setup_test_table()

row = [] # user
col = [] # item
rating = []

# Each value yielded by read_data is treated as one comma-separated text line.
for x in db.read_data("comp3208-train-small.csv"):
    line_split = x.strip().split(',')
    # Skip blank/short lines (which would otherwise raise IndexError) and the
    # header row, whose third field is the literal column name.
    if len(line_split) < 3 or line_split[2] == "rating":
        continue
    # Shift ids down by one (presumably the file uses 1-based ids -- confirm).
    row.append(int(line_split[0]) - 1)
    col.append(int(line_split[1]) - 1)
    rating.append(float(line_split[2]))

# The number of rows read is kept under its own name. The global `count` is
# deliberately left untouched: incrementing it here without initialising it
# raised NameError on a fresh kernel and, when `count` already held a value
# from another cell, inflated it so that later `shape=count` arrays were
# oversized.
num_triplets = len(rating)
    

INFO 2021-02-26 19:12:07,503 logging started
INFO 2021-02-26 19:12:07,505 Setting up test table
INFO 2021-02-26 19:12:08,015 Reading data from database: comp3208-train-small.csv


9425745


In [28]:
from scipy.sparse import coo_matrix

# Assemble a sparse rating matrix from the parallel row/col/rating lists.
# Each dimension is one larger than the biggest zero-based index present.
# NOTE(review): this rebinds `R`. Another cell in this notebook iterates `R`
# with `.items()` as a dict of dicts -- confirm the intended run order.
n_users = max(row) + 1
n_items = max(col) + 1
R = coo_matrix((rating, (row, col)), shape=(n_users, n_items))

In [4]:
# Flatten the nested dict R[user][movie] = rating into three aligned arrays
# of length `count`: position i of each array describes the same rating.
# np.empty leaves the slots uninitialised; the loop below fills them in order.
user_list = np.empty(count)
movie_list = np.empty(count)
true = np.empty(count)
counter = 0  # number of slots written so far

for user, info in R.items():
    # Continue numbering from where the previous user stopped; `counter`
    # always holds the one-based position of the entry being written.
    for counter, (movie, rating) in enumerate(info.items(), start=counter + 1):
        slot = counter - 1
        user_list[slot] = user
        movie_list[slot] = movie
        true[slot] = rating

In [19]:
# Baseline: draw every prediction uniformly at random from [0.5, 5).

# Reseed NumPy's global generator so the draw is the same on every run.
np.random.seed(0)
pred = np.random.uniform(low=0.5, high=5, size=count)

In [13]:
# Baseline: predict the constant 3.5 for every one of the `count` ratings.

pred = np.full(count, 3.5)

In [20]:
def rounder(x):
    """Turn a rating into an integer class label by doubling and rounding it."""
    return int(round(x * 2))


# Element-wise version of `rounder`, used to build labels for F1 / recall.
vfunc = np.vectorize(rounder)

# Discretise once and reuse the labels for both classification metrics.
true_labels = vfunc(true)
pred_labels = vfunc(pred)

mse = mean_squared_error(true, pred)

print("MAE:  " + str(mean_absolute_error(true, pred)))
# RMSE is taken as sqrt(MSE) rather than via `squared=False`, which newer
# scikit-learn releases deprecate and then remove.
print("RMSE: " + str(np.sqrt(mse)))
print("MSE:  " + str(mse))
print("F1:  " + str(f1_score(true_labels, pred_labels, average='weighted')))
print("Recall:  " + str(recall_score(true_labels, pred_labels, average='weighted')))

MAE:  1.5050328707452476
RMSE: 1.8433856578623602
MSE:  3.3980706836126466
F1:  0.11627098689859201
Recall:  0.10246383707600831


In [15]:
# Hold-out evaluation of KNNWithMeans: fit on the first 75% of the shuffled
# ratings and predict the remaining 25%.
split = int(count * 0.75)

# A freshly seeded generator is used for each array, so all three
# equal-length arrays receive the same permutation and stay aligned.
# The shuffle is in place.
for column in (user_list, movie_list, true):
    random.Random(0).shuffle(column)

# Training portion only: everything before `split`.
ratings_dict = {
    label: values[:split].tolist()
    for label, values in (
        ("user", user_list),
        ("movie", movie_list),
        ("rating", true),
    )
}

df = pd.DataFrame(ratings_dict)
reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(df[["user", "movie", "rating"]], reader)

# Cosine similarity with user_based switched off.
sim_options = dict(name="cosine", user_based=False)
algo = KNNWithMeans(sim_options=sim_options)

trainset = data.build_full_trainset()
algo.fit(trainset)

# Only the held-out slots pred[split:] are written below; pred[:split] keeps
# whatever np.empty left there and must not be read.
pred = np.empty(shape=count)

for x in range(split, count):
    prediction = algo.predict(user_list[x], movie_list[x])
    pred[x] = prediction.est

Computing the cosine similarity matrix...
Done computing similarity matrix.


In [17]:
# Score only the held-out tail: entries from `split` onwards.
held_true = true[split:]
held_pred = pred[split:]

# Discretise once and reuse the labels for both classification metrics.
held_true_labels = vfunc(held_true)
held_pred_labels = vfunc(held_pred)

held_mse = mean_squared_error(held_true, held_pred)

print("MAE:  " + str(mean_absolute_error(held_true, held_pred)))
# RMSE is taken as sqrt(MSE) rather than via `squared=False`, which newer
# scikit-learn releases deprecate and then remove.
print("RMSE: " + str(np.sqrt(held_mse)))
print("MSE:  " + str(held_mse))
print("F1:  " + str(f1_score(held_true_labels, held_pred_labels, average='weighted')))
print("Recall:  " + str(recall_score(held_true_labels, held_pred_labels, average='weighted')))

MAE:  0.6652407275158023
RMSE: 0.8788317162470406
MSE:  0.7723451854817188
F1:  0.2479312521937782
Recall:  0.26075341712933553
