# Demostration of Different Predictors and Recommenders

## Sample Data Generation

### Toy Example Data

In [10]:
from scipy.sparse import csr_matrix
user_ratings = csr_matrix([
    [5, 4, 5, 3, 3],
    [3, 2, 2, 4, 1],
    [3, 4, 3, 5, 4],
    [5, 1, 4, 2, 4],
    [2, 3, 4, 1, 1],
    [2, 3, 4, 2, 5],
])

test_set = [
    (0, 0),
    (0, 3),
    (1, 1),
    (1, 4),
    (2, 0),
    (2, 4),
    (3, 2),
    (4, 1),
    (4, 3),
    (5, 0),
]
training_set = []
for i in range(user_ratings.shape[0]):
    for j in range(user_ratings.shape[1]):
        if (i, j) not in test_set and user_ratings[i, j] != 0:
            training_set.append((i, j))

In [1]:
from scipy.sparse import csr_matrix

user_ratings = csr_matrix(
    [
        [5, 4, 4, 0, 5],
        [0, 3, 5, 3, 4],
        [5, 2, 0, 2, 3],
        [0, 2, 3, 1, 2],
        [4, 0, 5, 4, 5],
        [5, 3, 0, 3, 5],
        [3, 2, 3, 2, 0],
        [5, 3, 4, 0, 5],
        [4, 2, 5, 4, 0],
        [5, 0, 5, 3, 4],
    ]
)
test_set = [
    (0, 4),
    (1, 3),
    (2, 3),
    (3, 1),
    (4, 2),
    (5, 0),
    (6, 1),
    (7, 1),
    (8, 0),
    (9, 0),
]
training_set = []
for i in range(user_ratings.shape[0]):
    for j in range(user_ratings.shape[1]):
        if (i, j) not in test_set and user_ratings[i, j] != 0:
            training_set.append((i, j))

### Random Data

In [1]:
from samples import generate_sample_data
import json

sample_data = generate_sample_data(1000, 1000, 200)
with open("samples/data.json", "w") as f:
    json.dump(sample_data, f)

In [2]:
import json
from random import randrange
with open("samples/data.json", "r") as f:
    sample_data = json.load(f)

user_ratings = sample_data["ratings"]
test_set_size = int(len(user_ratings) * len(user_ratings[0]) * 0.2)
test_set = [(randrange(0, len(user_ratings)), randrange(0, len(user_ratings[0]))) for _ in range(test_set_size)]

### Real Datasets (from MovieLens)

In [1]:
import requests
import zipfile
import io

# Download the latest (small) dataset
url = "https://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
response = requests.get(url)
z = zipfile.ZipFile(io.BytesIO(response.content))

# Unzip the dataset into a folder
z.extractall("data/")

# Download the latest (full) dataset
url = "https://files.grouplens.org/datasets/movielens/ml-latest.zip"
response = requests.get(url)
z = zipfile.ZipFile(io.BytesIO(response.content))

# Unzip the dataset into a folder
z.extractall("data/")

In [1]:
import pandas as pd
from random import sample
from scipy.sparse import csr_matrix

print("Reading CSV files...")
# Read the ratings and movies CSV
# ratings_df = pd.read_csv("data/ml-latest-small/ratings.csv")
# movies_df = pd.read_csv("data/ml-latest-small/movies.csv")

# Read the ratings and movies CSV (WARNING: FULL DATASET)
ratings_df = pd.read_csv("data/ml-latest/ratings.csv")
movies_df = pd.read_csv("data/ml-latest/movies.csv")

# Convert the CSV into a user ratings table
# Create a dense matrix where each row represents a user and each column a movie.
# Missing ratings are filled with 0.
print("Pivotting data...")
user_ids = sorted(ratings_df["userId"].unique())
movie_ids = sorted(ratings_df["movieId"].unique())
user_id_map = {uid: i for i, uid in enumerate(user_ids)}
movie_id_map = {mid: j for j, mid in enumerate(movie_ids)}

rows = ratings_df["userId"].map(user_id_map).values
cols = ratings_df["movieId"].map(movie_id_map).values
data = ratings_df["rating"].values

user_ratings = csr_matrix((data, (rows, cols)), shape=(len(user_ids), len(movie_ids)))
movie_id_mappings = movies_df["movieId"].to_list()

print(
    "User ratings table created with dimensions:",
    user_ratings.shape[0],
    "rows x",
    user_ratings.shape[1],
    "columns",
)

print("Making test sets...")
# Get all indices with an existing (non zero) rating
valid_entries = list(zip(*user_ratings.nonzero()))
shuffuled_valid_entries = sample(valid_entries, k=len(valid_entries))
test_set_size = int(len(valid_entries) * 0.2)

# Randomly select test_set_size indices from the valid entries
test_set = shuffuled_valid_entries[:test_set_size]
training_set = shuffuled_valid_entries[test_set_size:]
print("Test set created with size:", len(test_set))
print("Done.")

Reading CSV files...
Pivotting data...
User ratings table created with dimensions: 330975 rows x 83239 columns
Making test sets...
Test set created with size: 6766432
Done.


In [2]:
from utils import get_test_set_matrix, remove_test_set

training_data = remove_test_set(user_ratings, test_set)
test_data = get_test_set_matrix(user_ratings, test_set)

Removing test set entries from training data...


100%|██████████| 6766432/6766432 [00:13<00:00, 503934.02it/s] 
100%|██████████| 305637/305637 [00:06<00:00, 47508.14it/s]


Done removing test set entries.
Getting test set matrix...


100%|██████████| 6766432/6766432 [00:12<00:00, 535814.91it/s] 
100%|██████████| 305637/305637 [00:08<00:00, 37454.17it/s]


Done getting test set matrix.


## Rating Predictors

### Least Squares Optimiation Predictor (Baseline)

In [2]:
from predictors.least_squares import LeastSquaresPredictor
from utils import get_test_set_matrix, remove_test_set, root_mean_square_error, root_mean_square_error_entries


training_data = remove_test_set(user_ratings, test_set)
test_data = get_test_set_matrix(user_ratings, test_set)
baseline = LeastSquaresPredictor(shape=user_ratings.shape, lmda=0.2)
baseline.train(training_data=training_data)
test_predictions = baseline.predict(test_set)
training_predictions = baseline.predict(training_set)
print(f"{test_predictions = }")
print(f"{training_predictions = }")
print(f"{training_data.data = }")
rmse_training = root_mean_square_error_entries(training_predictions, training_set, training_data)
rmse_test = root_mean_square_error_entries(test_predictions, test_set, test_data)
print(f"{rmse_training = }")
print(f"{rmse_test = }")

Removing test set entries from training data...


100%|██████████| 20167/20167 [00:00<00:00, 2659870.09it/s]
100%|██████████| 609/609 [00:00<00:00, 35635.20it/s]


Done removing test set entries.
Getting test set matrix...


100%|██████████| 20167/20167 [00:00<00:00, 3867166.31it/s]
100%|██████████| 609/609 [00:00<00:00, 30638.13it/s]

Done getting test set matrix.
Constructing relevant matrices...
Calculating user and item biases...





Training done.
Predicting entries...


100%|██████████| 20167/20167 [00:00<00:00, 1702867.33it/s]


Finished predicting entries.
Predicting entries...


100%|██████████| 80669/80669 [00:00<00:00, 1834682.49it/s]

Finished predicting entries.
test_predictions = array([3.1549098 , 2.13517279, 3.91370611, ..., 3.16089698, 2.94281606,
       3.80851318], shape=(20167,))
training_predictions = array([2.20713821, 4.42210064, 3.24345784, ..., 4.06639076, 2.31312133,
       3.88218236], shape=(80669,))
training_data.data = array([4., 4., 4., ..., 4., 5., 3.], shape=(80669,))





rmse_training = np.float64(0.7784721291176996)
rmse_test = np.float64(0.8750824501836783)


### Neighbor Correlations Predictor (based on Least Sqaures Optimization) (Improved)
As this requires calculating cosine coefficient for every single pair of items (or users, depending on the correlation chosen), this is extremely computationally expensive. Not recommended for data with a large number of items (or users).

In [3]:
from predictors.neighbor_correlations import Correlation, NeighborCorrelationsPredictor
from utils.neighbor_selection import most_similar, two_most_similar_skip_masked, two_most_similar

# baseline.predict_all = lambda quiet=False: np.array(
#     [
#         [np.nan, 2.7, 3.3, np.nan, 4.5],
#         [4.1, np.nan, 3.5, 4.9, np.nan],
#         [np.nan, 3.8, 2.5, 4.2, np.nan],
#         [2.8, 3.1, np.nan, 2.6, 4.8],
#         [3.3, np.nan, 3.7, np.nan, 2.4],
#         [np.nan, 3.9, 4.0, 1.5, 3.9],
#     ]
# )

improved = NeighborCorrelationsPredictor(baseline=baseline, correlation=Correlation.USER)
improved.train(training_data, most_similar)
test_predictions = improved.predict(test_set)
training_predictions = improved.predict(training_set)
print(f"{test_predictions = }")
print(f"{training_predictions = }")

Calculating cosine similarity coefficients...
Making neighbor table...


100%|██████████| 9724/9724 [00:09<00:00, 1035.61it/s]


Finished training.
Predicting entries...


100%|██████████| 20167/20167 [00:00<00:00, 89265.66it/s]


Finished predicting entries.
Predicting entries...


100%|██████████| 80669/80669 [00:00<00:00, 93488.19it/s]

Finished predicting entries.
test_predictions = array([2.6100543 , 1.53398421, 3.19146798, ..., 3.68492274, 3.60581692,
       3.53463773], shape=(20167,))
training_predictions = array([2.20713821, 5.        , 5.        , ..., 3.78632189, 2.49114979,
       4.06934193], shape=(80669,))





### Latent Factor Predictor

In [12]:
from scipy.sparse import csr_matrix

user_ratings = csr_matrix(
    [
        [3, 4, 5, 3, 2, 3],
        [3, 2, 3, 4, 2, 1],
        [4, 4, 4, 5, 3, 2],
        [3, 5, 4, 4, 3, 4],
        [2, 1, 2, 2, 3, 1],
        [3, 5, 5, 4, 4, 3],
        [3, 5, 5, 3, 2, 2],
        [2, 3, 3, 2, 1, 2],
    ]
)
test_set = [
    (0, 0),
    (1, 1),
    (2, 3),
    (2, 4),
    (3, 0),
    (3, 1),
    (5, 1),
    (5, 4),
    (6, 0),
    (6, 2),
    (7, 1),
    (7, 3),
]
training_set = []
for i in range(user_ratings.shape[0]):
    for j in range(user_ratings.shape[1]):
        if (i, j) not in test_set and user_ratings[i, j] != 0:
            training_set.append((i, j))

In [3]:
import numpy as np
from predictors.latent_factor import LatentFactorPredictor

u, i = training_data.shape
k = 2
latent = LatentFactorPredictor(
    shape=training_data.shape,
    k=k,
    p=np.ones(shape=(k,u), dtype=np.float64),
    q=np.ones(shape=(k,i), dtype=np.float64),
    lmda=0.2,
)
print(f"{latent.p = }")
print(f"{latent.q = }")
t = 0

latent.p = array([[1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.]], shape=(2, 330975))
latent.q = array([[1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.]], shape=(2, 83239))


In [4]:
latent.train(training_data=training_data, iterations=100)
# t += 20
# print(f"Total: {t} iterations")

Preparing data for training...
Performing alternating least squares...


100%|██████████| 100/100 [06:42<00:00,  4.02s/it]

Finished training.





In [5]:
training_predictions = latent.predict(training_set)
test_predictions = latent.predict(test_set)
print(f"{training_predictions = }")
print(f"{test_predictions = }")
print(f"{latent.p = }")
print(f"{latent.q = }")

Predicting entries...


100%|██████████| 27065730/27065730 [05:30<00:00, 81902.65it/s] 


Finished predicting entries.
Predicting entries...


100%|██████████| 6766432/6766432 [01:21<00:00, 83480.80it/s]

Finished predicting entries.
training_predictions = array([3.96516049, 1.07460525, 1.51735067, ..., 2.37272822, 3.72034409,
       4.55430946], shape=(27065730,))
test_predictions = array([2.93717671, 3.78370921, 2.32394171, ..., 2.8064506 , 3.87181031,
       3.53199553], shape=(6766432,))
latent.p = array([[1.23401304, 1.01703323, 1.31431579, ..., 1.07690985, 0.80702365,
        0.46423131],
       [0.90829139, 0.97683121, 1.12561913, ..., 0.89869366, 0.97421212,
        0.77556854]], shape=(2, 330975))
latent.q = array([[1.54200499, 2.90340254, 2.99129576, ..., 1.72557883, 1.46030548,
        1.45280941],
       [2.50311193, 0.44089136, 0.14449253, ..., 1.91721313, 1.2657258 ,
        2.02169956]], shape=(2, 83239))





In [6]:
from utils import root_mean_square_error_entries, root_mean_square_error

rmse_training = root_mean_square_error_entries(training_predictions, training_set, training_data)
rmse_test = root_mean_square_error_entries(test_predictions, test_set, test_data)
print(f"{rmse_training = }")
print(f"{rmse_test = }")

rmse_training = np.float64(0.8073885656987553)
rmse_test = np.float64(0.8371278613256798)


In [7]:
import pickle

with open('models/latent', 'wb') as f:
    pickle.dump(latent, f)

## Making Recommendations

### Plain Recommender

In [8]:
from recommenders.plain import PlainRecommender

recommender = PlainRecommender(
    predictor=latent, users=user_ratings.shape[0], items=user_ratings.shape[1]
)

### Pure Score Boost

In [9]:
import random
from recommenders.score_boost import ScoreBoostRecommender

bids = [
    (idx, random.random()) for idx in random.sample(range(user_ratings.shape[1]), k=50)
]
paid_recommender = ScoreBoostRecommender(
    predictor=latent,
    users=user_ratings.shape[0],
    items=user_ratings.shape[1],
    bids=bids,
    alpha=0.1,
    beta=50,
    promotion_slots=[True if x % 4 == 0 else False for x in range(20)]
)
print("Bids:", sorted(bids, reverse=True, key=lambda x: x[1]))

Bids: [(55286, 0.9902403880506729), (54738, 0.9772386033079312), (64276, 0.9651193228294344), (3869, 0.947772013374295), (42794, 0.8986451166915099), (28595, 0.8941967902781269), (80144, 0.8873161444495223), (40783, 0.8747367723727055), (54554, 0.8731391878636571), (65552, 0.8685857450545172), (45586, 0.8535867166808995), (64479, 0.8516122509142184), (70450, 0.8505061069245037), (47754, 0.8293073384094706), (16924, 0.825801620356268), (46910, 0.8083320033082336), (78917, 0.8067832103131113), (1474, 0.804966835097827), (47686, 0.7959335915196324), (3767, 0.7862560632745693), (29478, 0.7627977849820808), (80449, 0.7494890094926858), (81921, 0.7311121142057868), (67775, 0.6868019702258791), (41341, 0.6801606656452964), (4174, 0.6639739807788596), (46939, 0.6589829717782755), (51980, 0.6462623107839929), (53783, 0.5875176115718594), (18528, 0.5862603713234524), (36710, 0.5839041896510366), (76385, 0.5366528932021754), (37402, 0.5172234829056793), (27860, 0.5120540635978971), (50837, 0.4615

In [12]:
from random import randint

# print(recommender.users, recommender.items)
# user = randint(0, user_ratings.shape[0])
user = 158336
print("Without promotion:", user, [movie_id_mappings[x] for x in recommender.recommend_items(user, 20)])
print("With promotion:", user, [movie_id_mappings[x] for x in paid_recommender.recommend_items(user, 20)])

Without promotion: 158336 [236125, 207796, 155637, 276235, 179189, 207369, 155399, 162514, 178403, 192287, 276123, 276097, 213644, 238032, 66904, 271739, 103741, 103769, 154592, 87700]
With promotion: 158336 [188597, 215003, 214142, 215001, 220766, 205663, 249232, 266080, 151773, 214342, 214260, 214174, 210173, 213644, 189143, 162514, 172303, 236125, 277580, 131777]


## Work Cited
> F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets: History and Context. ACM Transactions on Interactive Intelligent Systems (TiiS) 5, 4: 19:1–19:19. <https://doi.org/10.1145/2827872>