## Sample Data Generation

In [1]:
from samples import generate_sample_data
import json

# Build a synthetic data set and persist it as JSON so later cells can reload
# it from disk instead of regenerating it.
# NOTE(review): the meaning of the three size arguments is defined by
# generate_sample_data — confirm against its signature.
sample_data = generate_sample_data(1000, 1000, 200)
with open("samples/data.json", mode="w") as json_file:
    json.dump(sample_data, fp=json_file)

In [2]:
import json
from random import randrange, sample

# Reload the generated data set from disk.
with open("samples/data.json", "r") as f:
    sample_data = json.load(f)

user_ratings = sample_data["ratings"]

# Table dimensions; an empty table yields zero columns instead of an IndexError.
n_rows = len(user_ratings)
n_cols = len(user_ratings[0]) if user_ratings else 0

# Hold out 20% of the cells of the ratings table as the test set.
test_set_size = int(n_rows * n_cols * 0.2)

# Draw DISTINCT cells. Independent randrange() draws sample with replacement,
# which can pick the same (row, column) pair several times: the held-out set
# then has fewer unique cells than intended and repeated cells are counted
# more than once when the test error is computed. Sampling flat indices
# without replacement and unflattening with divmod avoids that.
test_set = [
    divmod(flat_index, n_cols)
    for flat_index in sample(range(n_rows * n_cols), test_set_size)
]

In [1]:
# Small hand-written example: a fully populated 6 x 5 ratings table.
user_ratings = [
    [5, 4, 5, 3, 3],  # row 0
    [3, 2, 2, 4, 1],  # row 1
    [3, 4, 3, 5, 4],  # row 2
    [5, 1, 4, 2, 4],  # row 3
    [2, 3, 4, 1, 1],  # row 4
    [2, 3, 4, 2, 5],  # row 5
]

# Held-out cells as (row, column) index pairs, grouped by row.
test_set = [
    (0, 0), (0, 3),
    (1, 1), (1, 4),
    (2, 0), (2, 4),
    (3, 2),
    (4, 1), (4, 3),
    (5, 0),
]

In [4]:
# Hand-written 10 x 5 example with gaps: -1 marks a cell that has no rating.
user_ratings = [
    [ 5,  4,  4, -1,  5],
    [-1,  3,  5,  3,  4],
    [ 5,  2, -1,  2,  3],
    [-1,  2,  3,  1,  2],
    [ 4, -1,  5,  4,  5],
    [ 5,  3, -1,  3,  5],
    [ 3,  2,  3,  2, -1],
    [ 5,  3,  4, -1,  5],
    [ 4,  2,  5,  4, -1],
    [ 5, -1,  5,  3,  4],
]

# One held-out (row, column) cell per row; each one points at an existing rating.
test_set = [
    (0, 4),
    (1, 3),
    (2, 3),
    (3, 1),
    (4, 2),
    (5, 0),
    (6, 1),
    (7, 1),
    (8, 0),
    (9, 0),
]

In [15]:
import requests
import zipfile
import io

# Download the zip file
url = "https://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
# The timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=60)
# Fail loudly on an HTTP error (4xx/5xx) instead of passing an error page to
# ZipFile, which would otherwise surface as a confusing BadZipFile.
response.raise_for_status()

# Unzip the dataset into a folder; the context manager closes the archive
# once extraction has finished.
with zipfile.ZipFile(io.BytesIO(response.content)) as z:
    z.extractall("data/")

In [2]:
import pandas as pd
from random import sample

# Load the raw MovieLens ratings.
ratings_df = pd.read_csv("data/ml-latest-small/ratings.csv")

# Reshape into a dense table with one row per userId and one column per
# movieId. Cells without a rating are marked with -1.
ratings_pivot = (
    ratings_df
    .pivot(index='userId', columns='movieId', values='rating')
    .fillna(-1)
)
user_ratings = ratings_pivot.values.tolist()

n_rows = len(user_ratings)
n_cols = len(user_ratings[0]) if user_ratings else 0
print("User ratings table created with dimensions:",
      n_rows, "rows x", n_cols, "columns")

# Collect the (row, column) position of every cell that holds a real rating
# (anything other than the -1 marker), in row-major order.
valid_entries = [
    (row_idx, col_idx)
    for row_idx, row in enumerate(user_ratings)
    for col_idx, rating in enumerate(row)
    if rating != -1
]

# Hold out 20% of the rated cells, drawn without replacement. The size can
# never exceed the number of valid entries, so no extra clamping is needed.
test_set_size = int(len(valid_entries) * 0.2)
test_set = sample(valid_entries, test_set_size)

User ratings table created with dimensions: 610 rows x 9724 columns


## Least Squares Optimization Predictor (Baseline)

In [3]:
from predictors.least_squares import LeastSquaresPredictor
from utils import get_test_set_matrix, remove_test_set, root_mean_square_error, root_mean_square_error_entries


# Derive the training and test views of the ratings from the held-out
# (row, column) pairs in test_set.
# NOTE(review): presumably remove_test_set hides the test cells and
# get_test_set_matrix keeps only them — confirm in utils.
training_data = remove_test_set(user_ratings, test_set)
test_data = get_test_set_matrix(user_ratings, test_set)
# Fit the baseline predictor on the training view.
# NOTE(review): lmda=0.5 looks like a regularisation strength — confirm in
# LeastSquaresPredictor.
baseline = LeastSquaresPredictor(training_data=training_data, lmda=0.5)
baseline.train()
# Show the fitted vector stored on the predictor (the training log refers to
# "user and item biases").
print(baseline.b)
# Predict only the held-out cells, then the complete table.
test_predictions = baseline.predict(test_set)
all_predictions = baseline.predict_all()
print(f"{test_predictions = }")
print(f"{all_predictions = }")
# Training error: full-table predictions scored against training_data.
# Test error: held-out predictions scored against test_data at the test_set positions.
rmse_training = root_mean_square_error(all_predictions, training_data)
rmse_test = root_mean_square_error_entries(test_predictions, test_set, test_data)
print(f"{rmse_training = }")
print(f"{rmse_test = }")

Constructing relevant matrices...
Calculating user and item biases...


  self.b = spsolve(AAT, rhs)


Training done.
[ 0.72214951 -0.03266515 -1.64916279 ... -0.01992345 -0.01992345
  0.        ]
Predicting entries...


100%|██████████| 20167/20167 [00:00<00:00, 1743225.46it/s]


Finished predicting entries.
Predicting all...


100%|██████████| 610/610 [00:01<00:00, 305.90it/s]


Finished predicting all.
test_predictions = array([2.13617428, 2.87920618, 4.30863644, ..., 3.90042121, 3.26603948,
       3.73401233], shape=(20167,))
all_predictions = array([[4.68351995, 4.30024199, 3.8850486 , ..., 4.20060214, 4.20060214,
        4.22052559],
       [3.92870529, 3.54542733, 3.13023395, ..., 3.44578749, 3.44578749,
        3.46571093],
       [2.31220765, 1.92892969, 1.51373631, ..., 1.82928985, 1.82928985,
        1.84921329],
       ...,
       [3.64523163, 3.26195367, 2.84676028, ..., 3.16231382, 3.16231382,
        3.18223727],
       [3.57322219, 3.18994423, 2.77475085, ..., 3.09030439, 3.09030439,
        3.11022783],
       [4.07563719, 3.69235923, 3.27716585, ..., 3.59271939, 3.59271939,
        3.61264283]], shape=(610, 9724))
rmse_training = np.float64(0.7840059420212793)
rmse_test = np.float64(0.8625055414499949)


## Neighbor Correlations Predictor (based on Least Squares Optimization) (Improved)

In [4]:
import numpy as np
from predictors.neighbor_correlations import Correlation, NeighborCorrelationsPredictor
from utils.neighbor_selection import most_similar, two_most_similar_skip_masked, two_most_similar

# Disabled debugging stub: when uncommented it replaces baseline.predict_all
# with a function returning a fixed 6 x 5 array (NaN in some cells), so the
# neighbor predictor can be checked against known baseline values.
# baseline.predict_all = lambda quiet=False: np.array(
#     [
#         [np.nan, 2.7, 3.3, np.nan, 4.5],
#         [4.1, np.nan, 3.5, 4.9, np.nan],
#         [np.nan, 3.8, 2.5, 4.2, np.nan],
#         [2.8, 3.1, np.nan, 2.6, 4.8],
#         [3.3, np.nan, 3.7, np.nan, 2.4],
#         [np.nan, 3.9, 4.0, 1.5, 3.9],
#     ]
# )

# Build the neighbor-correlation predictor on top of the trained baseline,
# using the USER correlation mode and the two_most_similar neighbor selector.
improved = NeighborCorrelationsPredictor(baseline=baseline, correlation=Correlation.USER)
improved.train(two_most_similar)
# Predict only the held-out cells, then the complete table.
test_predictions = improved.predict(test_set)
all_predictions = improved.predict_all()
# NOTE(review): the recorded output shows warnings on the lines dividing by
# d_sum and NaN in the trailing columns of all_predictions — presumably a zero
# d_sum for some cells; confirm in NeighborCorrelationsPredictor.
print(f"{test_predictions = }")
print(f"{all_predictions = }")

Calculating cosine similarity coefficients...
Making neighbor table...


100%|██████████| 9724/9724 [00:19<00:00, 494.20it/s]


Finished training.
Predicting entries...


  result[idx] += self.cosine_coefficients[i][j] / d_sum * error[u][j]
100%|██████████| 20167/20167 [00:01<00:00, 16361.15it/s]


Finished predicting entries.
Predicting all...


  result[u][i] += (
100%|██████████| 9724/9724 [03:05<00:00, 52.37it/s]


Finished predicting all.
test_predictions = array([3.16605987, 4.01064942, 5.        , ..., 3.6392779 , 3.3685562 ,
       3.96203816], shape=(20167,))
all_predictions = array([[4.32168064, 3.96410379, 3.09015887, ...,        nan,        nan,
               nan],
       [4.46083327, 3.37709213, 3.49893619, ...,        nan,        nan,
               nan],
       [2.42861043, 1.84802949, 1.        , ...,        nan,        nan,
               nan],
       ...,
       [2.74954562, 3.13644934, 2.29970926, ...,        nan,        nan,
               nan],
       [3.21138288, 2.05000198, 3.569707  , ...,        nan,        nan,
               nan],
       [4.58947713, 3.41321729, 2.98726576, ...,        nan,        nan,
               nan]], shape=(610, 9724))


## Latent Factor Predictor

In [9]:
# Small hand-written example for the latent factor model: a fully populated
# 8 x 6 ratings table.
user_ratings = [
    [3, 4, 5, 3, 2, 3],  # row 0
    [3, 2, 3, 4, 2, 1],  # row 1
    [4, 4, 4, 5, 3, 2],  # row 2
    [3, 5, 4, 4, 3, 4],  # row 3
    [2, 1, 2, 2, 3, 1],  # row 4
    [3, 5, 5, 4, 4, 3],  # row 5
    [3, 5, 5, 3, 2, 2],  # row 6
    [2, 3, 3, 2, 1, 2],  # row 7
]

# Held-out cells as (row, column) index pairs, grouped by row (row 4 has none).
test_set = [
    (0, 0),
    (1, 1),
    (2, 3), (2, 4),
    (3, 0), (3, 1),
    (5, 1), (5, 4),
    (6, 0), (6, 2),
    (7, 1), (7, 3),
]

In [None]:
import numpy as np
from predictors.latent_factor import LatentFactorPredictor
from utils import get_test_set_matrix, remove_test_set

# Derive the training and test views of the ratings from the held-out
# (row, column) pairs in test_set.
# NOTE(review): the recorded output below shows factor shapes (2, 610) and
# (2, 9724), i.e. this cell was last run against the MovieLens table rather
# than the small table defined in the preceding cell — results depend on which
# user_ratings is live in the kernel.
training_data = remove_test_set(user_ratings, test_set)
test_data = get_test_set_matrix(user_ratings, test_set)
# u, i: row and column counts of the training table; k: number of latent factors.
u, i = training_data.shape
k = 2
# Start both factor matrices as all-ones: p is (k, u) and q is (k, i).
# NOTE(review): lmda=0.5 looks like a regularisation strength — confirm in
# LatentFactorPredictor.
latent = LatentFactorPredictor(
    training_data=training_data,
    k=k,
    p=np.ones(shape=(k,u), dtype=np.float64),
    q=np.ones(shape=(k,i), dtype=np.float64),
    lmda=0.5,
)
print(f"{latent.p = }")
print(f"{latent.q = }")
# Running count of training iterations performed so far; the training cell adds to it.
t = 0

latent.p = array([[1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.]], shape=(2, 610))
latent.q = array([[1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.]], shape=(2, 9724))


In [58]:
# Run another batch of training iterations and keep the running total in t.
# This cell is cumulative: every execution trains further and increases t.
n_new_iterations = 20
latent.train(n_new_iterations)
t = t + n_new_iterations
print(f"Total: {t} iterations")

Preparing data for training...
Performing alternating least squares...


100%|██████████| 20/20 [00:05<00:00,  3.37it/s]

Finished training.
Total: 50 iterations





In [59]:
# Predict the complete table first, then only the held-out (row, column) cells.
all_predictions = latent.predict_all()
test_predictions = latent.predict(test_set)
print(f"{all_predictions = }")
print(f"{test_predictions = }")
# Show the factor matrices in their current (trained) state.
print(f"{latent.p = }")
print(f"{latent.q = }")

Predicting all...


100%|██████████| 610/610 [01:26<00:00,  7.04it/s]


Finished predicting all.
Predicting entries...


100%|██████████| 20167/20167 [00:00<00:00, 189201.56it/s]

Finished predicting entries.
all_predictions = array([[ 4.80416546,  4.28099755,  3.82643674, ...,  1.67528032,
         1.67528032,  3.91038507],
       [ 3.98287644,  3.64771165,  3.25778744, ...,  0.05685052,
         0.05685052,  3.26882145],
       [ 1.8234691 ,  1.84229546,  1.64092992, ..., -2.30211272,
        -2.30211272,  1.54362462],
       ...,
       [ 3.73422912,  3.36201765,  3.00412435, ...,  0.83672787,
         0.83672787,  3.04891295],
       [ 3.52748774,  3.15165375,  2.81678823, ...,  1.11784635,
         1.11784635,  2.87349323],
       [ 4.21779697,  3.66341197,  3.27694102, ...,  2.75562431,
         2.75562431,  3.40713031]], shape=(610, 9724))
test_predictions = array([3.28590933, 3.99862446, 4.12939798, ..., 3.88483389, 3.17121486,
       3.77733723], shape=(20167,))
latent.p = array([[ 1.55849765e+00,  2.71055468e+00,  3.72020919e+00, ...,
         1.70706408e+00,  1.26385942e+00,  6.90071853e-05],
       [ 2.35188742e+00,  5.58266770e-01, -2.17658457e+00, 




In [60]:
from utils import root_mean_square_error_entries, root_mean_square_error

# Training error: full-table predictions scored against training_data.
rmse_training = root_mean_square_error(all_predictions, training_data)
# Test error: held-out predictions scored against test_data at the test_set positions.
rmse_test = root_mean_square_error_entries(test_predictions, test_set, test_data)
print(f"{rmse_training = }")
print(f"{rmse_test = }")

rmse_training = np.float64(0.7179494373525677)
rmse_test = np.float64(0.9718108515217198)
