# Demonstration of Different Predictors and Recommenders

## Sample Data Generation

### Toy Example Data

In [None]:
from scipy.sparse import csr_matrix

# Toy ratings table with 6 rows and 5 columns (rows are presumably users and
# columns items, following the name `user_ratings`). Every cell holds a
# non-zero value, so the sparse matrix stores all 30 entries.
user_ratings = csr_matrix(
    [
        [5, 4, 5, 3, 3],
        [3, 2, 2, 4, 1],
        [3, 4, 3, 5, 4],
        [5, 1, 4, 2, 4],
        [2, 3, 4, 1, 1],
        [2, 3, 4, 2, 5],
    ]
)

# (row, column) positions held out for evaluation, one line per row index.
test_set = [
    (0, 0), (0, 3),
    (1, 1), (1, 4),
    (2, 0), (2, 4),
    (3, 2),
    (4, 1), (4, 3),
    (5, 0),
]

In [1]:
from scipy.sparse import csr_matrix

# Toy ratings table with 10 rows and 5 columns. Each row contains exactly one
# 0, which the sparse matrix does not store (40 stored entries in total).
user_ratings = csr_matrix(
    [
        [5, 4, 4, 0, 5],
        [0, 3, 5, 3, 4],
        [5, 2, 0, 2, 3],
        [0, 2, 3, 1, 2],
        [4, 0, 5, 4, 5],
        [5, 3, 0, 3, 5],
        [3, 2, 3, 2, 0],
        [5, 3, 4, 0, 5],
        [4, 2, 5, 4, 0],
        [5, 0, 5, 3, 4],
    ]
)

# One held-out (row, column) pair per row: row r is paired with the r-th
# column index listed below. Every selected cell holds a non-zero value.
test_set = list(zip(range(10), (4, 3, 3, 1, 2, 0, 1, 1, 0, 0)))

### Random Data

In [1]:
import json

from samples import generate_sample_data

# Generate a synthetic dataset with the project helper and cache it as JSON so
# the next cell can reload it without regenerating.
# NOTE(review): the meaning of the three arguments is defined by
# samples.generate_sample_data and is not visible here — confirm before tuning.
sample_data = generate_sample_data(1000, 1000, 200)
with open("samples/data.json", "w") as out_file:
    out_file.write(json.dumps(sample_data))

In [2]:
import json
from random import sample

# Reload the synthetic dataset cached by the previous cell.
with open("samples/data.json", "r") as f:
    sample_data = json.load(f)

user_ratings = sample_data["ratings"]

# Candidate test entries are the (user, item) cells that actually hold a rating.
# Assumes a falsy value (0 / null) marks a missing rating, matching the
# convention used for the MovieLens matrix in this notebook — confirm against
# samples.generate_sample_data.
valid_entries = [
    (user, item)
    for user, row in enumerate(user_ratings)
    for item, rating in enumerate(row)
    if rating
]

# Hold out 20% of the rated cells. `sample` draws without replacement, so no
# (user, item) pair can appear twice in the test set (independent `randrange`
# draws could repeat a pair and double-count it in the error metrics).
test_set_size = int(len(valid_entries) * 0.2)
test_set = sample(valid_entries, test_set_size)

### Real Datasets (from MovieLens)

In [1]:
import io
import zipfile
from pathlib import Path

import requests


def download_and_extract(url, dest="data/", *, force=False, timeout=60):
    """Download a zip archive from ``url`` and unpack it into ``dest``.

    Args:
        url: HTTP(S) location of the ``.zip`` file.
        dest: Directory the archive is extracted into.
        force: Re-download even if the expected output folder already exists.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        ``pathlib.Path`` of the folder the archive is expected to unpack into
        (``dest`` joined with the zip file's base name).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        zipfile.BadZipFile: If the downloaded payload is not a valid zip file.
    """
    dest_dir = Path(dest)
    # The MovieLens archives used below unpack into a top-level folder named
    # after the zip file (e.g. data/ml-latest-small/), so that folder doubles
    # as a "already downloaded" marker.
    target = dest_dir / Path(url.rsplit("/", 1)[-1]).stem
    if target.is_dir() and not force:
        print(f"{target} already exists, skipping download.")
        return target

    response = requests.get(url, timeout=timeout)
    # Fail with a clear HTTP error instead of an obscure BadZipFile later on.
    response.raise_for_status()
    with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
        archive.extractall(dest_dir)
    return target


# Download and unzip the latest small dataset, then the latest full dataset.
# The full archive is large; it is only needed when reading from data/ml-latest/.
for url in (
    "https://files.grouplens.org/datasets/movielens/ml-latest-small.zip",
    "https://files.grouplens.org/datasets/movielens/ml-latest.zip",
):
    download_and_extract(url)

In [1]:
import pandas as pd
from random import sample
from scipy.sparse import csr_matrix

# Read the ratings and movies CSV
ratings_df = pd.read_csv("data/ml-latest-small/ratings.csv")
movies_df = pd.read_csv("data/ml-latest-small/movies.csv")

# Read the ratings and movies CSV (WARNING: FULL DATASET)
# print("Reading CSV files...")
# ratings_df = pd.read_csv("data/ml-latest/ratings.csv")
# movies_df = pd.read_csv("data/ml-latest/movies.csv")

# Convert the CSV into a user ratings table.
# Build a sparse (CSR) matrix where each row represents a user and each column
# a movie. Ratings that are absent from the CSV are simply not stored, so they
# read back as 0.
print("Pivotting data...")

# csr_matrix sums values that share the same (row, col) position, so a repeated
# (userId, movieId) pair would silently produce an inflated rating.
assert not ratings_df.duplicated(["userId", "movieId"]).any(), (
    "ratings contain duplicate (userId, movieId) pairs"
)

# Map the raw ids onto contiguous 0-based row / column indices.
user_ids = sorted(ratings_df["userId"].unique())
movie_ids = sorted(ratings_df["movieId"].unique())
user_id_map = {uid: i for i, uid in enumerate(user_ids)}
movie_id_map = {mid: j for j, mid in enumerate(movie_ids)}

rows = ratings_df["userId"].map(user_id_map).values
cols = ratings_df["movieId"].map(movie_id_map).values
data = ratings_df["rating"].values

user_ratings = csr_matrix((data, (rows, cols)), shape=(len(user_ids), len(movie_ids)))

# Column j of user_ratings corresponds to movie_ids[j]. The mapping must come
# from the rated movies, not from movies_df: movies.csv can list movies without
# any rating, which would shift every position after the first such movie.
movie_id_mappings = [int(mid) for mid in movie_ids]

print(
    "User ratings table created with dimensions:",
    user_ratings.shape[0],
    "rows x",
    user_ratings.shape[1],
    "columns",
)

print("Making test sets...")
# Get all indices with an existing (non zero) rating
valid_entries = list(zip(*user_ratings.nonzero()))
test_set_size = int(len(valid_entries) * 0.2)

# Randomly select test_set_size distinct entries from the valid entries
# (20% of them, so the size can never exceed the population).
test_set = sample(valid_entries, test_set_size)
print("Test set created with size:", len(test_set))
print("Done.")

Pivotting data...
User ratings table created with dimensions: 610 rows x 9724 columns
Making test sets...
Test set created with size: 20167
Done.


## Rating Predictors

### Least Squares Optimization Predictor (Baseline)

In [2]:
from predictors.least_squares import LeastSquaresPredictor
from utils import get_test_set_matrix, remove_test_set, root_mean_square_error, root_mean_square_error_entries


# Split the ratings using the test_set positions chosen in the data cell.
# Presumably remove_test_set returns the ratings without the held-out entries
# and get_test_set_matrix returns only the held-out entries — confirm in utils.
training_data = remove_test_set(user_ratings, test_set)
test_data = get_test_set_matrix(user_ratings, test_set)
# Train the baseline on the training split only.
# NOTE(review): lmda=0 looks like "no regularisation" — confirm in LeastSquaresPredictor.
baseline = LeastSquaresPredictor(training_data=training_data, lmda=0)
baseline.train()
# Predictions for the held-out positions, then for the whole table.
test_predictions = baseline.predict(test_set)
all_predictions = baseline.predict_all()
print(f"{test_predictions = }")
print(f"{all_predictions.data = }")
print(f"{training_data.data = }")
# Error against the training ratings and against the held-out ratings.
rmse_training = root_mean_square_error(all_predictions, training_data)
rmse_test = root_mean_square_error_entries(test_predictions, test_set, test_data)
print(f"{rmse_training = }")
print(f"{rmse_test = }")

Constructing relevant matrices...


Calculating user and item biases...
Training done.
Predicting entries...


100%|██████████| 10/10 [00:00<00:00, 153637.51it/s]


Finished predicting entries.
Predicting all...


100%|██████████| 30/30 [00:00<00:00, 655360.00it/s]

Finished predicting all.
test_predictions = array([4.61784512, 3.48611111, 2.78114478, 1.        , 4.78998316,
       4.87710438, 1.22895623, 2.91919192, 4.78451178, 4.60984848])
all_predictions.data = array([5.        , 3.09385522, 4.8956229 , 2.89141414, 4.69318182,
       4.41540404, 4.10311448, 2.18644781, 3.71043771, 2.49494949,
       1.28787879, 2.21717172, 4.90488215, 3.58291246, 4.51220539,
       2.96043771, 3.55513468, 4.48442761, 3.1456229 , 3.03072391,
       1.8236532 , 4.83585859, 4.7209596 , 4.44318182, 2.86784512,
       4.66961279, 3.46254209, 4.49494949, 3.28787879, 4.21717172])
training_data.data = array([5, 4, 4, 3, 5, 4, 5, 2, 3, 3, 1, 2, 4, 4, 5, 3, 3, 5, 3, 3, 2, 5,
       4, 5, 2, 5, 4, 5, 3, 4])
rmse_training = np.float64(0.3970719531355135)
rmse_test = np.float64(0.5848469310964864)





### Neighbor Correlations Predictor (based on Least Squares Optimization) (Improved)

In [10]:
import numpy as np
from predictors.neighbor_correlations import Correlation, NeighborCorrelationsPredictor
from utils.neighbor_selection import most_similar, two_most_similar_skip_masked, two_most_similar

# Disabled debugging stub: when uncommented it replaces baseline.predict_all
# with a function returning a fixed 6x5 table in which np.nan marks some cells.
# baseline.predict_all = lambda quiet=False: np.array(
#     [
#         [np.nan, 2.7, 3.3, np.nan, 4.5],
#         [4.1, np.nan, 3.5, 4.9, np.nan],
#         [np.nan, 3.8, 2.5, 4.2, np.nan],
#         [2.8, 3.1, np.nan, 2.6, 4.8],
#         [3.3, np.nan, 3.7, np.nan, 2.4],
#         [np.nan, 3.9, 4.0, 1.5, 3.9],
#     ]
# )

# Build the neighbour-based predictor on top of the trained `baseline` object,
# using the USER correlation mode.
improved = NeighborCorrelationsPredictor(baseline=baseline, correlation=Correlation.USER)
# two_most_similar is passed as the neighbour-selection strategy; most_similar
# and two_most_similar_skip_masked are imported above as alternatives.
# NOTE(review): the saved output of this cell stops with
# "ValueError: The truth value of an array with more than one element is
# ambiguous" right after "Calculating cosine similarity coefficients..." —
# presumably raised inside train(); confirm and fix in the predictor module.
improved.train(two_most_similar)
test_predictions = improved.predict(test_set)
all_predictions = improved.predict_all()
print(f"{test_predictions = }")
print(f"{all_predictions = }")

Calculating cosine similarity coefficients...


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all().

### Latent Factor Predictor

In [9]:
# Small fully-populated ratings table (8 rows x 6 columns) kept as a plain
# nested list rather than a sparse matrix.
user_ratings = [
    [3, 4, 5, 3, 2, 3],
    [3, 2, 3, 4, 2, 1],
    [4, 4, 4, 5, 3, 2],
    [3, 5, 4, 4, 3, 4],
    [2, 1, 2, 2, 3, 1],
    [3, 5, 5, 4, 4, 3],
    [3, 5, 5, 3, 2, 2],
    [2, 3, 3, 2, 1, 2],
]

# Held-out (row, column) positions, one line per row index; row 4 has none.
test_set = [
    (0, 0),
    (1, 1),
    (2, 3), (2, 4),
    (3, 0), (3, 1),
    (5, 1), (5, 4),
    (6, 0), (6, 2),
    (7, 1), (7, 3),
]

In [2]:
import numpy as np
from predictors.latent_factor import LatentFactorPredictor
from utils import get_test_set_matrix, remove_test_set

# Split the ratings with the project helpers, using the current test_set.
training_data = remove_test_set(user_ratings, test_set)
test_data = get_test_set_matrix(user_ratings, test_set)

# u and i are the two dimensions of the training table; k is the number of
# rows given to each factor matrix below.
u, i = training_data.shape
k = 2

# Both factor matrices start as all-ones float64 arrays: p is (k, u), q is (k, i).
latent = LatentFactorPredictor(
    training_data=training_data,
    k=k,
    p=np.ones((k, u), dtype=np.float64),
    q=np.ones((k, i), dtype=np.float64),
    lmda=0.2,
)
print(f"{latent.p = }")
print(f"{latent.q = }")

# Counter referenced only by the commented-out bookkeeping in the training cell.
t = 0

latent.p = array([[1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.]], shape=(2, 610))
latent.q = array([[1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.]], shape=(2, 9724))


In [6]:
# Fit the latent-factor model. The argument is presumably the number of
# training iterations (the saved progress bar for this cell counts to 100) —
# confirm against LatentFactorPredictor.train.
latent.train(100)
# Disabled bookkeeping for incremental training runs (running total kept in `t`).
# t += 20
# print(f"Total: {t} iterations")

Preparing data for training...
Performing alternating least squares...


100%|██████████| 100/100 [00:08<00:00, 11.91it/s]

Finished training.





In [7]:
# Predict only the held-out test positions; the full-table prediction and its
# printout are left commented out.
# all_predictions = latent.predict_all()
test_predictions = latent.predict(test_set)
# print(f"{all_predictions = }")
print(f"{test_predictions = }")
# Show the factor matrices as they stand after training.
print(f"{latent.p = }")
print(f"{latent.q = }")

Predicting entries...


100%|██████████| 20167/20167 [00:00<00:00, 124545.44it/s]

Finished predicting entries.
test_predictions = array([4.07793265, 2.55603855, 2.40243762, ..., 2.77691777, 3.50032545,
       4.57620272], shape=(20167,))
latent.p = array([[ 1.62403868,  1.83073924, -4.85863659, ...,  1.1920012 ,
         1.16988134,  0.14669571],
       [ 2.66737572,  1.76125342,  4.56756953, ...,  2.05877798,
         1.99417846,  3.338372  ]], shape=(2, 610))
latent.q = array([[ 1.04391677,  0.98010346,  0.83856835, ...,  0.30166179,
         0.30166179, -0.24922798],
       [ 1.19333163,  1.03091986,  0.99673874, ...,  1.13461325,
         1.13461325,  0.92082421]], shape=(2, 9724))





In [8]:
from utils import root_mean_square_error_entries, root_mean_square_error

# Only the test error is computed: the training error needs all_predictions,
# which the prediction cell leaves commented out.
# rmse_training = root_mean_square_error(all_predictions, training_data)
rmse_test = root_mean_square_error_entries(test_predictions, test_set, test_data)
# print(f"{rmse_training = }")
print(f"{rmse_test = }")

rmse_test = np.float64(0.936999622670139)


## Making Recommendations

### Plain Recommender

In [6]:
from recommenders.plain import PlainRecommender

# Take the table dimensions from the data the predictor was built on.
# `len(user_ratings)` / `len(user_ratings[0])` only work for nested lists;
# a scipy sparse matrix raises TypeError on len(), whereas `.shape` is
# available on training_data in every case.
n_users, n_items = training_data.shape

# Recommender that uses the trained latent-factor predictor as-is.
recommender = PlainRecommender(predictor=latent, users=n_users, items=n_items)

### Pure Score Boost

In [7]:
import random
from recommenders.score_boost import ScoreBoostRecommender

# Take the table dimensions from the data the predictor was built on;
# len() is not defined for scipy sparse matrices, `.shape` always is.
n_users, n_items = training_data.shape

# Give a random bid in [0, 1) to up to 50 distinct items. The sample size is
# capped at the number of items so small toy tables do not raise ValueError.
bids = [
    (idx, random.random()) for idx in random.sample(range(n_items), k=min(50, n_items))
]

# NOTE(review): alpha and beta are tuning parameters of ScoreBoostRecommender;
# their exact effect is defined in recommenders.score_boost — confirm there.
paid_recommender = ScoreBoostRecommender(
    predictor=latent,
    users=n_users,
    items=n_items,
    bids=bids,
    alpha=0.1,
    beta=50,
    # 20 slots, every fourth one (0, 4, 8, 12, 16) flagged True.
    promotion_slots=[x % 4 == 0 for x in range(20)],
)

# Show the bids from highest to lowest.
print("Bids:", sorted(bids, reverse=True, key=lambda x: x[1]))

Bids: [(7453, 0.9589942423383562), (7332, 0.9536443366636993), (8700, 0.9489936905621729), (9248, 0.9397598989113214), (6302, 0.9068013002138909), (8436, 0.846906227825697), (2173, 0.8406623230266436), (1982, 0.8340563027496675), (1280, 0.8333641553038239), (1146, 0.8098692294726698), (2051, 0.8023050293707962), (7933, 0.781652244287606), (6719, 0.7415775243675684), (3773, 0.7402966517616344), (3800, 0.7199998517457488), (4164, 0.7121445257941096), (6918, 0.6308846214258115), (9120, 0.6225530927666422), (8139, 0.6132826052726236), (2774, 0.6067683021854938), (1473, 0.6039860630527522), (9001, 0.5883723503072925), (4188, 0.5732438926473697), (5501, 0.5509626691825118), (7510, 0.5501842325241326), (1601, 0.5131548547013771), (3207, 0.4720190037075762), (5275, 0.4595292982980288), (2571, 0.4360051641613203), (4163, 0.4159653177675847), (8110, 0.4064911143371722), (4341, 0.3683120391126532), (5872, 0.32961915273822406), (30, 0.3188970688401671), (1989, 0.2835109754433609), (7002, 0.2406703

In [8]:
from random import randrange

# print(recommender.users, recommender.items)
# Pick a random valid user index. randrange excludes its upper bound, so the
# result is always in [0, n_users - 1]; an inclusive draw up to n_users could
# select an index one past the last user. The user count is read from
# training_data.shape because len() is not defined for scipy sparse matrices.
user = randrange(training_data.shape[0])
# user = 261

# Compare the top-20 lists of the plain and the bid-boosted recommender for the same user.
print("Without promotion:", user, list(recommender.recommend_items(user, 20)))
print("With promotion:", user, list(paid_recommender.recommend_items(user, 20)))

[3.19136317 2.38667721 2.38502876 2.29403273 2.18955267 2.15025637
 2.08836099 2.05003535 1.96335046 1.94384418 1.93884868 1.90443902
 1.90443902 1.89399796 1.88095726 1.87526962 1.85883522 1.85883522
 1.84525491 1.84233014]
Without promotion: 441 [np.int64(5022), np.int64(9302), np.int64(4392), np.int64(1947), np.int64(374), np.int64(2800), np.int64(4770), np.int64(6218), np.int64(3106), np.int64(2374), np.int64(7242), np.int64(8136), np.int64(7794), np.int64(5641), np.int64(1028), np.int64(6601), np.int64(2304), np.int64(1508), np.int64(9314), np.int64(5445)]
With promotion: 441 [np.int64(2051), np.int64(5022), np.int64(9302), np.int64(4392), np.int64(4188), np.int64(1947), np.int64(374), np.int64(2800), np.int64(2173), np.int64(4770), np.int64(6218), np.int64(3106), np.int64(6302), np.int64(2374), np.int64(7242), np.int64(7794), np.int64(1982), np.int64(8136), np.int64(5641), np.int64(1028)]


## Works Cited
> F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets: History and Context. ACM Transactions on Interactive Intelligent Systems (TiiS) 5, 4: 19:1–19:19. <https://doi.org/10.1145/2827872>