## Sample Data Generation

In [1]:
import json

from samples import generate_sample_data

# Build a synthetic dataset once and persist it as JSON; the loading cell below
# reads the same path ("samples/data.json") back in.
# NOTE(review): the meaning of the three positional arguments is defined by
# samples.generate_sample_data and is not visible here — confirm before tuning.
sample_data = generate_sample_data(1000, 1000, 200)

with open("samples/data.json", "w") as out_file:
    json.dump(sample_data, out_file)

In [9]:
import json
from random import Random, randrange  # randrange is no longer used here; kept in case other cells rely on it

# Fixed seed so the held-out split is identical on every Restart & Run All.
TEST_SPLIT_SEED = 0
# Fraction of all (row, column) cells of the rating matrix held out for testing.
TEST_FRACTION = 0.2

with open("samples/data.json", "r") as f:
    sample_data = json.load(f)

user_ratings = sample_data["ratings"]

# Matrix dimensions; an empty rating list yields an empty test set instead of
# raising IndexError on user_ratings[0].
n_rows = len(user_ratings)
n_cols = len(user_ratings[0]) if n_rows else 0
test_set_size = int(n_rows * n_cols * TEST_FRACTION)

# Sample flat cell indices WITHOUT replacement so that no (row, column) pair is
# drawn twice. Independent randrange() draws repeat cells, which silently
# shrinks the number of distinct held-out entries below the requested fraction.
flat_indices = Random(TEST_SPLIT_SEED).sample(range(n_rows * n_cols), test_set_size)

# divmod turns a flat index back into a (row, column) tuple.
test_set = [divmod(flat_index, n_cols) for flat_index in flat_indices]

In [3]:
# Small hand-written rating matrix (6 rows x 5 columns) for quick experiments.
# NOTE: assigning these names replaces any `user_ratings` / `test_set` that an
# earlier cell produced.
user_ratings = [
    [5, 4, 5, 3, 3],  # row 0
    [3, 2, 2, 4, 1],  # row 1
    [3, 4, 3, 5, 4],  # row 2
    [5, 1, 4, 2, 4],  # row 3
    [2, 3, 4, 1, 1],  # row 4
    [2, 3, 4, 2, 5],  # row 5
]

# Held-out cells as (row index, column index) pairs, grouped by row.
test_set = [
    (0, 0), (0, 3),
    (1, 1), (1, 4),
    (2, 0), (2, 4),
    (3, 2),
    (4, 1), (4, 3),
    (5, 0),
]

In [4]:
# Hand-written 10 x 5 rating matrix.
# NOTE(review): -1 presumably marks a missing rating — confirm against the
# predictors / utils that consume this matrix.
# NOTE: assigning these names replaces any `user_ratings` / `test_set` that an
# earlier cell produced.
user_ratings = [
    [5, 4, 4, -1, 5],   # row 0
    [-1, 3, 5, 3, 4],   # row 1
    [5, 2, -1, 2, 3],   # row 2
    [-1, 2, 3, 1, 2],   # row 3
    [4, -1, 5, 4, 5],   # row 4
    [5, 3, -1, 3, 5],   # row 5
    [3, 2, 3, 2, -1],   # row 6
    [5, 3, 4, -1, 5],   # row 7
    [4, 2, 5, 4, -1],   # row 8
    [5, -1, 5, 3, 4],   # row 9
]

# One held-out (row index, column index) pair per row; none of them points at a
# -1 cell.
test_set = [
    (0, 4),
    (1, 3),
    (2, 3),
    (3, 1),
    (4, 2),
    (5, 0),
    (6, 1),
    (7, 1),
    (8, 0),
    (9, 0),
]

## Least Squares Optimization Predictor (Baseline)

In [16]:
from predictors.least_squares import LeastSquaresPredictor
from utils import (
    get_test_set_matrix,
    remove_test_set,
    root_mean_square_error,
)

# Derive the two evaluation matrices from the current `user_ratings` / `test_set`.
# NOTE(review): presumably remove_test_set hides the held-out cells and
# get_test_set_matrix keeps only them — confirm in utils.
training_data = remove_test_set(user_ratings, test_set)
test_data = get_test_set_matrix(user_ratings, test_set)

# Fit the baseline predictor.
# NOTE(review): lmda=0 presumably turns regularisation off — confirm in
# LeastSquaresPredictor.
baseline = LeastSquaresPredictor(
    training_data=training_data,
    lmda=0,
)
baseline.train()

# Predict the held-out cells first, then the full matrix (order kept so the
# progress output appears in the same sequence).
test_predictions = baseline.predict(test_set)
all_predictions = baseline.predict_all()
print(f"{test_predictions = }")
print(f"{all_predictions = }")

# Score the full prediction matrix against the training and the test matrix,
# in that order.
rmse_training, rmse_test = (
    root_mean_square_error(all_predictions, reference)
    for reference in (training_data, test_data)
)
print(f"{rmse_training = }")
print(f"{rmse_test = }")

Constructing relevant matrices...
Calculating user and item biases...


  self.b = spsolve(AAT, rhs)


Training done.
Predicting entries...


100%|██████████| 200000/200000 [00:00<00:00, 2559734.40it/s]


Finish predicting entries.
Predicting all...


100%|██████████| 1000/1000 [00:00<00:00, 3061.74it/s]

Finish predicting all.
test_predictions = array([2.97127183, 2.96800439, 2.95684838, ..., 3.01460806, 2.94151792,
       2.94466463], shape=(200000,))
all_predictions = array([[2.91371091, 2.96432825, 2.93525303, ..., 2.94395739, 2.91429782,
        2.96025262],
       [2.94957233, 3.00018967, 2.97111445, ..., 2.97981881, 2.95015924,
        2.99611404],
       [2.93657768, 2.98719502, 2.9581198 , ..., 2.96682416, 2.93716459,
        2.9831194 ],
       ...,
       [2.95093625, 3.00155359, 2.97247838, ..., 2.98118274, 2.95152317,
        2.99747797],
       [2.94461929, 2.99523663, 2.96616141, ..., 2.97486577, 2.9452062 ,
        2.99116101],
       [2.91271923, 2.96333657, 2.93426136, ..., 2.94296572, 2.91330615,
        2.95926095]], shape=(1000, 1000))
rmse_training = np.float64(0.4520819118695603)
rmse_test = np.float64(0.4534208831219679)





## Neighbor Correlations Predictor (based on Least Squares Optimization) (Improved)

In [6]:
import numpy as np
from predictors.neighbor_correlations import (
    Correlation,
    NeighborCorrelationsPredictor,
)
from utils.neighbor_selection import most_similar, two_most_similar_skip_masked

# Debugging aid (disabled): uncomment to replace baseline.predict_all with a
# fixed 6 x 5 matrix instead of the trained baseline's output.
# baseline.predict_all = lambda : np.array(
#     [
#         [np.nan, 2.7, 3.3, np.nan, 4.5],
#         [4.1, np.nan, 3.5, 4.9, np.nan],
#         [np.nan, 3.8, 2.5, 4.2, np.nan],
#         [2.8, 3.1, np.nan, 2.6, 4.8],
#         [3.3, np.nan, 3.7, np.nan, 2.4],
#         [np.nan, 3.9, 4.0, 1.5, 3.9],
#     ]
# )

# Wrap the `baseline` predictor created in the least-squares cell; this cell
# therefore requires that cell to have been run first.
improved = NeighborCorrelationsPredictor(
    baseline=baseline,
    correlation=Correlation.ITEM,
)

# The neighbor-selection strategy is passed to train() as a callable.
improved.train(two_most_similar_skip_masked)

# Held-out cells first, then the full matrix.
test_predictions = improved.predict(test_set)
all_predictions = improved.predict_all()
print(f"{test_predictions = }")
print(f"{all_predictions = }")

Calculating cosine similarity coefficients...
Predicting all...


100%|██████████| 10/10 [00:00<00:00, 158875.15it/s]


Finish predicting all.
Making neighbor table...


100%|██████████| 10/10 [00:00<00:00, 14553.45it/s]


Finish training.
Predicting entries...


100%|██████████| 10/10 [00:00<00:00, 262144.00it/s]


Finish predicting entries.
Predicting all...


100%|██████████| 10/10 [00:00<00:00, 182361.04it/s]

Finish predicting all.
test_predictions = array([5.        , 3.37752525, 2.34823468, 1.        , 4.30218855,
       4.96604771, 1.16377497, 3.64015152, 4.24705387, 4.86694105])
all_predictions = array([[5.        , 3.98947811, 3.98947811, 3.30046397, 5.        ],
       [5.        , 2.58459596, 4.85927655, 3.37752525, 4.10858586],
       [4.81355219, 2.18644781, 4.44934389, 2.34823468, 2.81355219],
       [2.86694105, 1.        , 2.71212121, 1.28787879, 1.71212121],
       [4.45700839, 2.57112795, 4.30218855, 4.48779461, 5.        ],
       [4.96604771, 3.51557239, 4.47311398, 3.51557239, 4.48442761],
       [2.96927609, 1.16377497, 3.03072391, 1.96927609, 2.83495163],
       [4.2790404 , 3.64015152, 4.16414141, 3.34974747, 4.76910407],
       [4.24705387, 2.44155275, 5.        , 4.33038721, 4.06144781],
       [4.86694105, 2.55537792, 4.71212121, 3.28787879, 3.71212121]])





## Latent Factor Predictor

In [None]:
# Hand-written 8 x 6 rating matrix used by the latent-factor cell below.
# NOTE: assigning these names replaces any `user_ratings` / `test_set` that an
# earlier cell produced.
user_ratings = [
    [3, 4, 5, 3, 2, 3],  # row 0
    [3, 2, 3, 4, 2, 1],  # row 1
    [4, 4, 4, 5, 3, 2],  # row 2
    [3, 5, 4, 4, 3, 4],  # row 3
    [2, 1, 2, 2, 3, 1],  # row 4
    [3, 5, 5, 4, 4, 3],  # row 5
    [3, 5, 5, 3, 2, 2],  # row 6
    [2, 3, 3, 2, 1, 2],  # row 7
]

# Held-out cells as (row index, column index) pairs, grouped by row
# (row 4 has no held-out cell).
test_set = [
    (0, 0),
    (1, 1),
    (2, 3), (2, 4),
    (3, 0), (3, 1),
    (5, 1), (5, 4),
    (6, 0), (6, 2),
    (7, 1), (7, 3),
]

In [None]:
import numpy as np
from predictors.latent_factor import LatentFactorPredictor
from utils import (
    get_test_set_matrix,
    remove_test_set,
    root_mean_square_error,
)

# Derive the evaluation matrices from the current `user_ratings` / `test_set`.
training_data = remove_test_set(user_ratings, test_set)
test_data = get_test_set_matrix(user_ratings, test_set)

# u and i are the two dimensions of the training matrix; k is the number of
# rows of each factor matrix.
u, i = training_data.shape
k = 2

# Both factor matrices start as all-ones float64 arrays: p is (k, u), q is (k, i).
initial_p = np.ones((k, u), dtype=np.float64)
initial_q = np.ones((k, i), dtype=np.float64)

latent = LatentFactorPredictor(
    training_data=training_data,
    k=k,
    p=initial_p,
    q=initial_q,
    lmda=0.2,
)

# NOTE(review): 20 is presumably the number of training iterations — confirm in
# LatentFactorPredictor.train.
latent.train(20)
# Alternative training routine, currently disabled:
# latent.old_train(20)

# Full matrix first, then the held-out cells (same order as before).
all_predictions = latent.predict_all()
test_predictions = latent.predict(test_set)
print(f"{all_predictions = }")
print(f"{test_predictions = }")
print(f"{latent.p = }")
print(f"{latent.q = }")

100%|██████████| 20/20 [00:01<00:00, 10.11it/s]
100%|██████████| 20/20 [00:01<00:00, 10.93it/s]


all_predictions = array([[2.96695063, 2.99222471, 2.88142223, ..., 2.91243597, 2.95074402,
        2.96091005],
       [2.94688007, 2.99882528, 2.97398004, ..., 2.98040445, 2.94858573,
        2.99579658],
       [2.94251972, 2.99024744, 2.95229495, ..., 2.9624706 , 2.94147664,
        2.98289255],
       ...,
       [2.95831017, 3.00555099, 2.96503621, ..., 2.97594132, 2.9567687 ,
        2.99737952],
       [3.06004992, 3.0562819 , 2.84729368, ..., 2.9063786 , 3.02354703,
        2.99278005],
       [2.91956061, 2.96665191, 2.92815743, ..., 2.93849349, 2.91835056,
        2.95907827]], shape=(1000, 1000))
test_predictions = array([2.97408996, 2.9643107 , 2.95656438, ..., 2.99682169, 3.01586525,
       3.0226445 ], shape=(200000,))
latent.p = array([[0.89285607, 1.01045071, 0.9898839 , ..., 0.9917736 , 0.78345222,
        0.98094422],
       [1.09664304, 1.00247261, 1.01437154, ..., 1.02221633, 1.2274797 ,
        1.00731017]], shape=(2, 1000))
latent.q = array([[1.20817119, 1.3566352

In [15]:
# Score the most recent `all_predictions` against the training matrix and the
# test matrix, in that order. Relies on `all_predictions`, `training_data`,
# `test_data` and `root_mean_square_error` from previously executed cells.
rmse_training, rmse_test = (
    root_mean_square_error(all_predictions, reference)
    for reference in (training_data, test_data)
)
print(f"{rmse_training = }")
print(f"{rmse_test = }")

rmse_training = np.float64(0.4499102135656826)
rmse_test = np.float64(0.4527622419868144)
