# Demonstration of Different Predictors and Recommenders

## Sample Data Generation

### Toy Example Data

In [None]:
from scipy.sparse import csr_matrix

# Toy ratings table: 6 rows x 5 columns, every cell holds a rating.
# Other data cells in this notebook assign the same two names
# (user_ratings, test_set); run only the dataset you want to use.
user_ratings = csr_matrix(
    [
        [5, 4, 5, 3, 3],
        [3, 2, 2, 4, 1],
        [3, 4, 3, 5, 4],
        [5, 1, 4, 2, 4],
        [2, 3, 4, 1, 1],
        [2, 3, 4, 2, 5],
    ]
)

# (row, column) index pairs; the predictor cells pass this list to
# remove_test_set / get_test_set_matrix together with user_ratings.
test_set = [
    (0, 0), (0, 3),
    (1, 1), (1, 4),
    (2, 0), (2, 4),
    (3, 2),
    (4, 1), (4, 3),
    (5, 0),
]

In [1]:
from scipy.sparse import csr_matrix

# Second toy ratings table: 10 rows x 5 columns. Cells written as 0 are not
# stored by csr_matrix (presumably they stand for "no rating" -- confirm).
# This cell assigns the same names (user_ratings, test_set) as the other
# data cells in this notebook; run only the dataset you want to use.
user_ratings = csr_matrix(
    [
        [5, 4, 4, 0, 5],
        [0, 3, 5, 3, 4],
        [5, 2, 0, 2, 3],
        [0, 2, 3, 1, 2],
        [4, 0, 5, 4, 5],
        [5, 3, 0, 3, 5],
        [3, 2, 3, 2, 0],
        [5, 3, 4, 0, 5],
        [4, 2, 5, 4, 0],
        [5, 0, 5, 3, 4],
    ]
)

# One (row, column) pair per row of the table; each points at a non-zero cell.
test_set = [
    (0, 4),
    (1, 3),
    (2, 3),
    (3, 1),
    (4, 2),
    (5, 0),
    (6, 1),
    (7, 1),
    (8, 0),
    (9, 0),
]

### Random Data

In [1]:
import json

from samples import generate_sample_data

# Build a synthetic dataset and persist it as JSON so the loading cell can
# reuse it without regenerating.
# NOTE(review): the meaning of the three arguments (1000, 1000, 200) is defined
# by generate_sample_data, which is not visible here -- confirm before tuning.
sample_data = generate_sample_data(1000, 1000, 200)

with open("samples/data.json", "w") as data_file:
    json.dump(sample_data, data_file)

In [2]:
import json
from random import sample

# Load the synthetic dataset previously written to samples/data.json.
with open("samples/data.json", "r") as f:
    sample_data = json.load(f)

# The table is indexed as a sequence of equal-length rows below.
user_ratings = sample_data["ratings"]
n_rows = len(user_ratings)
n_cols = len(user_ratings[0]) if n_rows else 0  # tolerate an empty table

# Hold out 20% of all cells. Flat cell indices are sampled WITHOUT replacement
# and then split into (row, column) with divmod, so no cell can appear twice
# in the test set (independent randrange draws produced duplicates).
# NOTE(review): cells are drawn from the whole table, including any unrated
# cells the generator may produce -- confirm that this is intended.
test_set_size = int(n_rows * n_cols * 0.2)
test_set = [divmod(flat_index, n_cols) for flat_index in sample(range(n_rows * n_cols), test_set_size)]

### Real Datasets (from MovieLens)

In [1]:
import io
import zipfile

import requests


def download_and_extract(url, dest="data/", timeout=60):
    """Download a zip archive over HTTP and unpack it into ``dest``.

    Args:
        url: Address of the zip archive.
        dest: Directory the archive is extracted into.
        timeout: Seconds to wait for the server (connect / between reads)
            before giving up.

    Raises:
        requests.HTTPError: If the server answers with an error status. Without
            this check an error page would only surface later as a confusing
            ``zipfile.BadZipFile``.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    # The whole archive is held in memory; the context manager releases the
    # ZipFile handle once extraction is done.
    with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
        archive.extractall(dest)


# Download the latest (small) dataset and unzip it into data/
download_and_extract("https://files.grouplens.org/datasets/movielens/ml-latest-small.zip")

# Download the latest (full) dataset and unzip it into data/
download_and_extract("https://files.grouplens.org/datasets/movielens/ml-latest.zip")

In [1]:
import pandas as pd
from random import sample
from scipy.sparse import csr_matrix

print("Reading CSV files...")
# Read the ratings and movies CSV
# ratings_df = pd.read_csv("data/ml-latest-small/ratings.csv")
# movies_df = pd.read_csv("data/ml-latest-small/movies.csv")

# Read the ratings and movies CSV (WARNING: FULL DATASET)
ratings_df = pd.read_csv("data/ml-latest/ratings.csv")
movies_df = pd.read_csv("data/ml-latest/movies.csv")

# Convert the CSV into a user ratings table.
# The table is a sparse CSR matrix where each row represents a user and each
# column a movie. Missing ratings are simply not stored (they read back as 0).
print("Pivotting data...")
user_ids = sorted(ratings_df["userId"].unique())
movie_ids = sorted(ratings_df["movieId"].unique())
# Map the raw ids to contiguous 0-based row / column indices.
user_id_map = {uid: i for i, uid in enumerate(user_ids)}
movie_id_map = {mid: j for j, mid in enumerate(movie_ids)}

rows = ratings_df["userId"].map(user_id_map).values
cols = ratings_df["movieId"].map(movie_id_map).values
data = ratings_df["rating"].values

user_ratings = csr_matrix((data, (rows, cols)), shape=(len(user_ids), len(movie_ids)))
# Column j of user_ratings corresponds to movie_ids[j] (sorted ids that occur
# in ratings.csv). movies.csv follows its own row order and can list movies
# that have no rating, so its movieId column does not line up with the matrix
# columns; the lookup is therefore built from movie_ids.
# NOTE(review): assumes movie_id_mappings is meant as "column index -> movieId".
movie_id_mappings = [int(movie_id) for movie_id in movie_ids]

print(
    "User ratings table created with dimensions:",
    user_ratings.shape[0],
    "rows x",
    user_ratings.shape[1],
    "columns",
)

print("Making test sets...")
# Get all indices with an existing (non zero) rating
valid_entries = list(zip(*user_ratings.nonzero()))
# 20% of the existing ratings; never larger than the number of valid entries.
test_set_size = int(len(valid_entries) * 0.2)

# Randomly select test_set_size distinct entries from the valid entries
test_set = sample(valid_entries, test_set_size)
print("Test set created with size:", len(test_set))
print("Done.")

Reading CSV files...
Pivotting data...
User ratings table created with dimensions: 330975 rows x 83239 columns
Making test sets...
Test set created with size: 6766432
Done.


## Rating Predictors

### Least Squares Optimization Predictor (Baseline)

In [None]:
from predictors.least_squares import LeastSquaresPredictor
from utils import get_test_set_matrix, remove_test_set, root_mean_square_error, root_mean_square_error_entries


# Derive the two evaluation inputs from the full table: training_data comes
# from remove_test_set and test_data from get_test_set_matrix, both given the
# same (user_ratings, test_set) pair.
training_data = remove_test_set(user_ratings, test_set)
test_data = get_test_set_matrix(user_ratings, test_set)
# NOTE(review): lmda=0.2 is presumably the regularization weight -- confirm
# against LeastSquaresPredictor.
baseline = LeastSquaresPredictor(training_data=training_data, lmda=0.2)
# The predictor is trained before any prediction is requested.
baseline.train()
# Predictions for the held-out cells only, then for the whole table.
test_predictions = baseline.predict(test_set)
all_predictions = baseline.predict_all()
print(f"{test_predictions = }")
print(f"{all_predictions.data = }")
print(f"{training_data.data = }")
# Training error compares the full prediction table with training_data; test
# error compares the per-entry predictions with test_data at the test_set cells.
rmse_training = root_mean_square_error(all_predictions, training_data)
rmse_test = root_mean_square_error_entries(test_predictions, test_set, test_data)
print(f"{rmse_training = }")
print(f"{rmse_test = }")

Removing test set entries from training data...


100%|██████████| 6766432/6766432 [00:29<00:00, 231213.38it/s]
100%|██████████| 305862/305862 [00:06<00:00, 46284.61it/s]


Done removing test set entries.
Getting test set matrix...


100%|██████████| 6766432/6766432 [00:38<00:00, 174396.53it/s]
100%|██████████| 305862/305862 [00:08<00:00, 34573.11it/s]


Done getting test set matrix.
Constructing relevant matrices...
Calculating user and item biases...


### Neighbor Correlations Predictor (based on Least Squares Optimization) (Improved)

In [3]:
import numpy as np
from predictors.neighbor_correlations import Correlation, NeighborCorrelationsPredictor
from utils.neighbor_selection import most_similar, two_most_similar_skip_masked, two_most_similar

# Disabled override: when uncommented it replaces baseline.predict_all with a
# stub returning a fixed 6 x 5 array (NaN in some cells), i.e. the shape of the
# 6 x 5 toy dataset rather than of the MovieLens table.
# baseline.predict_all = lambda quiet=False: np.array(
#     [
#         [np.nan, 2.7, 3.3, np.nan, 4.5],
#         [4.1, np.nan, 3.5, 4.9, np.nan],
#         [np.nan, 3.8, 2.5, 4.2, np.nan],
#         [2.8, 3.1, np.nan, 2.6, 4.8],
#         [3.3, np.nan, 3.7, np.nan, 2.4],
#         [np.nan, 3.9, 4.0, 1.5, 3.9],
#     ]
# )

# Build the improved predictor on top of the already-trained `baseline` from
# the least squares cell, using user-based correlation.
# NOTE(review): the recorded run of this cell stopped with
# "ValueError: The truth value of an array with more than one element is
# ambiguous" right after "Calculating cosine similarity coefficients...";
# the failing code is inside the predictor, not visible here.
improved = NeighborCorrelationsPredictor(baseline=baseline, correlation=Correlation.USER)
# Only two_most_similar is used as the neighbor selection function; the other
# two imported selectors are alternatives that are not referenced in this cell.
improved.train(two_most_similar)
test_predictions = improved.predict(test_set)
all_predictions = improved.predict_all()
print(f"{test_predictions = }")
print(f"{all_predictions = }")

Calculating cosine similarity coefficients...


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all().

### Latent Factor Predictor

In [9]:
from scipy.sparse import csr_matrix

# Toy ratings table for the latent factor example: 8 rows x 6 columns, every
# cell holds a rating.
# It is wrapped in csr_matrix, like the other toy datasets in this notebook,
# because the recommender cells read user_ratings.shape, which a plain list of
# lists does not provide.
user_ratings = csr_matrix(
    [
        [3, 4, 5, 3, 2, 3],
        [3, 2, 3, 4, 2, 1],
        [4, 4, 4, 5, 3, 2],
        [3, 5, 4, 4, 3, 4],
        [2, 1, 2, 2, 3, 1],
        [3, 5, 5, 4, 4, 3],
        [3, 5, 5, 3, 2, 2],
        [2, 3, 3, 2, 1, 2],
    ]
)
# (row, column) index pairs passed to remove_test_set / get_test_set_matrix.
test_set = [
    (0, 0),
    (1, 1),
    (2, 3),
    (2, 4),
    (3, 0),
    (3, 1),
    (5, 1),
    (5, 4),
    (6, 0),
    (6, 2),
    (7, 1),
    (7, 3),
]

In [3]:
import numpy as np

from predictors.latent_factor import LatentFactorPredictor
from utils import get_test_set_matrix, remove_test_set

# Split the ratings table into the training part and the held-out part.
training_data = remove_test_set(user_ratings, test_set)
test_data = get_test_set_matrix(user_ratings, test_set)

# k is the number of rows of both factor matrices: p is (k x table rows) and
# q is (k x table columns), each initialised to all ones.
k = 2
u, i = training_data.shape
latent = LatentFactorPredictor(
    training_data=training_data,
    k=k,
    p=np.ones((k, u), dtype=np.float64),
    q=np.ones((k, i), dtype=np.float64),
    lmda=0.2,
)
print(f"{latent.p = }")
print(f"{latent.q = }")

# Iteration counter referenced by the commented-out bookkeeping in the
# training cell.
t = 0

Removing test set entries from training data...


100%|██████████| 6766432/6766432 [00:07<00:00, 856388.87it/s] 
100%|██████████| 305867/305867 [00:06<00:00, 46375.05it/s]


Done removing test set entries.
Getting test set matrix...


100%|██████████| 6766432/6766432 [00:08<00:00, 787318.25it/s] 
100%|██████████| 305867/305867 [00:08<00:00, 37817.07it/s]


Done getting test set matrix.
latent.p = array([[1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.]], shape=(2, 330975))
latent.q = array([[1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.]], shape=(2, 83239))


In [4]:
# Train the latent factor predictor. The recorded run shows a 100-step
# progress bar, so the argument is presumably the number of alternating least
# squares iterations -- confirm against LatentFactorPredictor.train.
latent.train(100)
# Disabled bookkeeping for incremental training runs (uses the counter `t`).
# t += 20
# print(f"Total: {t} iterations")

Preparing data for training...
Performing alternating least squares...


100%|██████████| 100/100 [06:20<00:00,  3.81s/it]

Finished training.





In [5]:
# Full-table prediction is disabled here; only the test_set cells are predicted.
# all_predictions = latent.predict_all()
test_predictions = latent.predict(test_set)
# print(f"{all_predictions = }")
print(f"{test_predictions = }")
# Show the factor matrices after training for comparison with their all-ones
# initial values.
print(f"{latent.p = }")
print(f"{latent.q = }")

Predicting entries...


100%|██████████| 6766432/6766432 [00:43<00:00, 154736.41it/s]

Finished predicting entries.
test_predictions = array([3.58489921, 4.18723421, 3.87480545, ..., 4.67600189, 1.50073983,
       2.93470728], shape=(6766432,))
latent.p = array([[1.0491592 , 0.97073347, 1.21726924, ..., 0.90782191, 0.88881196,
        0.62112248],
       [0.65405623, 0.92501878, 1.01531951, ..., 0.87314036, 1.03018454,
        0.9212989 ]], shape=(2, 330975))
latent.q = array([[ 3.60450119,  4.55718967,  4.60851564, ...,  1.6804029 ,
         1.54695769,  1.        ],
       [ 0.46085957, -1.21818368, -1.49743794, ...,  1.84620913,
         1.33473082,  1.        ]], shape=(2, 83239))





In [6]:
from utils import root_mean_square_error_entries, root_mean_square_error

# Training RMSE is disabled together with predict_all in the prediction cell;
# only the error on the held-out test_set cells is reported.
# rmse_training = root_mean_square_error(all_predictions, training_data)
rmse_test = root_mean_square_error_entries(test_predictions, test_set, test_data)
# print(f"{rmse_training = }")
print(f"{rmse_test = }")

rmse_test = np.float64(0.8370787523136615)


## Making Recommendations

### Plain Recommender

In [8]:
from recommenders.plain import PlainRecommender

# Recommender driven directly by the latent factor predictor; it is sized from
# the ratings table (rows -> users, columns -> items).
recommender = PlainRecommender(
    predictor=latent,
    users=user_ratings.shape[0],
    items=user_ratings.shape[1],
)

### Pure Score Boost

In [15]:
import random
from recommenders.score_boost import ScoreBoostRecommender

# Choose distinct item columns at random and give each a random bid in [0, 1).
# The count is capped at the number of items because random.sample raises
# ValueError when k exceeds the population size, which would break this cell
# on the toy datasets with fewer than 50 items.
num_bidders = min(50, user_ratings.shape[1])
bids = [
    (idx, random.random())
    for idx in random.sample(range(user_ratings.shape[1]), k=num_bidders)
]
# NOTE(review): the meaning of alpha and beta is defined by
# ScoreBoostRecommender, which is not visible here -- confirm before tuning.
paid_recommender = ScoreBoostRecommender(
    predictor=latent,
    users=user_ratings.shape[0],
    items=user_ratings.shape[1],
    bids=bids,
    alpha=0.1,
    beta=50,
    # 20 flags, True at positions 0, 4, 8, 12 and 16.
    promotion_slots=[slot % 4 == 0 for slot in range(20)],
)
# Show the bids from highest to lowest.
print("Bids:", sorted(bids, reverse=True, key=lambda x: x[1]))

Bids: [(35578, 0.9857356122403614), (24940, 0.9836144124647167), (66678, 0.9565546327049057), (66980, 0.912111671217647), (45457, 0.9109811666088713), (61321, 0.9066169493684465), (13989, 0.874543573549682), (10456, 0.849657074742543), (29338, 0.8305855852107319), (4331, 0.7455251112075354), (80947, 0.7098419354265786), (49215, 0.7008575274819981), (75060, 0.6799938029094219), (36085, 0.6738949284639714), (32880, 0.6550419867271192), (45070, 0.654454699965262), (69238, 0.6331872130915799), (10159, 0.6159643584504674), (8027, 0.6057263002464343), (80124, 0.5888029371139819), (19807, 0.5665873627163729), (19786, 0.5621361453866365), (33317, 0.5488157070109523), (1082, 0.5336509278215987), (15585, 0.5307869398675993), (25761, 0.5091825033199299), (81222, 0.49727522358567844), (5469, 0.4799005630039437), (15637, 0.4437888806072867), (77804, 0.4308181462664381), (77318, 0.42239667167058803), (52673, 0.40805810448361746), (79821, 0.36670815096569376), (70801, 0.35681284956365134), (9629, 0.3

In [16]:
from random import randint

# print(recommender.users, recommender.items)
# randint includes BOTH endpoints, so the upper bound must be the last valid
# row index; using shape[0] itself could select a user that does not exist.
user = randint(0, user_ratings.shape[0] - 1)
# user = 261
# Compare the top-20 lists of the plain and the bid-boosted recommender for
# the same user.
print("Without promotion:", user, list(recommender.recommend_items(user, 20)))
print("With promotion:", user, list(paid_recommender.recommend_items(user, 20)))

[5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5. 5.]
Without promotion: 21251 [np.int64(73246), np.int64(36634), np.int64(64467), np.int64(63232), np.int64(54986), np.int64(61627), np.int64(66531), np.int64(79181), np.int64(81303), np.int64(33769), np.int64(50025), np.int64(79845), np.int64(77454), np.int64(42340), np.int64(70074), np.int64(62918), np.int64(54266), np.int64(36404), np.int64(50038), np.int64(36109)]
With promotion: 21251 [np.int64(19807), np.int64(80868), np.int64(57313), np.int64(70867), np.int64(44393), np.int64(73246), np.int64(44646), np.int64(28092), np.int64(9629), np.int64(57450), np.int64(44596), np.int64(70946), np.int64(66980), np.int64(64743), np.int64(36109), np.int64(49520), np.int64(24168), np.int64(73161), np.int64(74935), np.int64(53074)]


## Work Cited
> F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets: History and Context. ACM Transactions on Interactive Intelligent Systems (TiiS) 5, 4: 19:1–19:19. <https://doi.org/10.1145/2827872>