# Demostration of Different Predictors and Recommenders

## Sample Data Generation

### Toy Example Data

In [10]:
from scipy.sparse import csr_matrix
user_ratings = csr_matrix([
    [5, 4, 5, 3, 3],
    [3, 2, 2, 4, 1],
    [3, 4, 3, 5, 4],
    [5, 1, 4, 2, 4],
    [2, 3, 4, 1, 1],
    [2, 3, 4, 2, 5],
])

test_set = [
    (0, 0),
    (0, 3),
    (1, 1),
    (1, 4),
    (2, 0),
    (2, 4),
    (3, 2),
    (4, 1),
    (4, 3),
    (5, 0),
]
training_set = []
for i in range(user_ratings.shape[0]):
    for j in range(user_ratings.shape[1]):
        if (i, j) not in test_set and user_ratings[i, j] != 0:
            training_set.append((i, j))

In [1]:
from scipy.sparse import csr_matrix

user_ratings = csr_matrix(
    [
        [5, 4, 4, 0, 5],
        [0, 3, 5, 3, 4],
        [5, 2, 0, 2, 3],
        [0, 2, 3, 1, 2],
        [4, 0, 5, 4, 5],
        [5, 3, 0, 3, 5],
        [3, 2, 3, 2, 0],
        [5, 3, 4, 0, 5],
        [4, 2, 5, 4, 0],
        [5, 0, 5, 3, 4],
    ]
)
test_set = [
    (0, 4),
    (1, 3),
    (2, 3),
    (3, 1),
    (4, 2),
    (5, 0),
    (6, 1),
    (7, 1),
    (8, 0),
    (9, 0),
]
training_set = []
for i in range(user_ratings.shape[0]):
    for j in range(user_ratings.shape[1]):
        if (i, j) not in test_set and user_ratings[i, j] != 0:
            training_set.append((i, j))

### Random Data

In [1]:
import pickle
from samples import generate_sample_data

sample_data = generate_sample_data(100000, 5000, 20)
with open("samples/data", "wb") as f:
    pickle.dump(sample_data, f)

In [6]:
import pickle
from random import randrange, sample
with open("samples/data", "rb") as f:
    sample_data = pickle.load(f)

user_ratings = sample_data["ratings"]
test_set_size = int(user_ratings.shape[0] * user_ratings.shape[1] * 0.2)
valid_entries = list(zip(*user_ratings.nonzero()))

shuffuled_valid_entries = sample(valid_entries, k=len(valid_entries))
test_set_size = int(len(valid_entries) * 0.2)

# Randomly select test_set_size indices from the valid entries
test_set = shuffuled_valid_entries[:test_set_size]
training_set = shuffuled_valid_entries[test_set_size:]
# test_set = [(randrange(0, len(user_ratings)), randrange(0, len(user_ratings[0]))) for _ in range(test_set_size)]

### Real Datasets (from MovieLens)

In [1]:
import requests
import zipfile
import io

# Download the latest (small) dataset
url = "https://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
response = requests.get(url)
z = zipfile.ZipFile(io.BytesIO(response.content))

# Unzip the dataset into a folder
z.extractall("data/")

# Download the latest (full) dataset
url = "https://files.grouplens.org/datasets/movielens/ml-latest.zip"
response = requests.get(url)
z = zipfile.ZipFile(io.BytesIO(response.content))

# Unzip the dataset into a folder
z.extractall("data/")

In [1]:
import pandas as pd
from random import sample
from scipy.sparse import csr_matrix

print("Reading CSV files...")
# Read the ratings and movies CSV
ratings_df = pd.read_csv("data/ml-latest-small/ratings.csv")
movies_df = pd.read_csv("data/ml-latest-small/movies.csv")

# Read the ratings and movies CSV (WARNING: FULL DATASET)
# ratings_df = pd.read_csv("data/ml-latest/ratings.csv")
# movies_df = pd.read_csv("data/ml-latest/movies.csv")

# Convert the CSV into a user ratings table
# Create a dense matrix where each row represents a user and each column a movie.
# Missing ratings are filled with 0.
print("Pivotting data...")
user_ids = sorted(ratings_df["userId"].unique())
movie_ids = sorted(ratings_df["movieId"].unique())
user_id_map = {uid: i for i, uid in enumerate(user_ids)}
movie_id_map = {mid: j for j, mid in enumerate(movie_ids)}

rows = ratings_df["userId"].map(user_id_map).values
cols = ratings_df["movieId"].map(movie_id_map).values
data = ratings_df["rating"].values

user_ratings = csr_matrix((data, (rows, cols)), shape=(len(user_ids), len(movie_ids)))
movie_id_mappings = movies_df["movieId"].to_list()

print(
    "User ratings table created with dimensions:",
    user_ratings.shape[0],
    "rows x",
    user_ratings.shape[1],
    "columns",
)

print("Making test sets...")
# Get all indices with an existing (non zero) rating
valid_entries = list(zip(*user_ratings.nonzero()))
shuffuled_valid_entries = sample(valid_entries, k=len(valid_entries))
test_set_size = int(len(valid_entries) * 0.2)

# Randomly select test_set_size indices from the valid entries
test_set = shuffuled_valid_entries[:test_set_size]
training_set = shuffuled_valid_entries[test_set_size:]
print("Test set created with size:", len(test_set))
print("Done.")

Reading CSV files...
Pivotting data...
User ratings table created with dimensions: 610 rows x 9724 columns
Making test sets...
Test set created with size: 20167
Done.


In [7]:
len(valid_entries)

10000000

In [8]:
from utils import get_test_set_matrix, remove_test_set

training_data = remove_test_set(user_ratings, test_set)
test_data = get_test_set_matrix(user_ratings, test_set)

Removing test set entries from training data...


100%|██████████| 2000000/2000000 [00:02<00:00, 896399.90it/s] 
100%|██████████| 100000/100000 [00:02<00:00, 43343.22it/s]


Done removing test set entries.
Getting test set matrix...


100%|██████████| 2000000/2000000 [00:01<00:00, 1046648.60it/s]
100%|██████████| 100000/100000 [00:02<00:00, 35032.83it/s]


Done getting test set matrix.


## Rating Predictors

### Least Squares Optimiation Predictor (Baseline)

In [15]:
from predictors.least_squares import LeastSquaresPredictor
from utils import root_mean_square_error_entries


baseline = LeastSquaresPredictor(shape=user_ratings.shape, lmda=0.2)
baseline.iterative_train(training_data=training_data)
test_predictions = baseline.predict(test_set)
training_predictions = baseline.predict(training_set)
# print(f"{test_predictions = }")
# print(f"{training_predictions = }")
# print(f"{training_data.data = }")
rmse_training = root_mean_square_error_entries(training_predictions, training_set, training_data)
rmse_test = root_mean_square_error_entries(test_predictions, test_set, test_data)
print(f"{rmse_training = }")
print(f"{rmse_test = }")

Constructing relevant matrices for iterative training...
Calculating user and item biases using Conjugate Gradient...


  0%|          | 16/10000 [00:00<03:37, 45.82it/s]


Training done.
Predicting entries...


100%|██████████| 2000000/2000000 [00:08<00:00, 238452.09it/s]


Finished predicting entries.
Predicting entries...


100%|██████████| 8000000/8000000 [00:31<00:00, 250849.68it/s]


Finished predicting entries.
Gathering entries from predictions...


100%|██████████| 8000000/8000000 [00:25<00:00, 312365.70it/s]


Gathering entries from predictions...


100%|██████████| 2000000/2000000 [00:05<00:00, 352209.79it/s]


rmse_training = np.float64(0.3365181452088841)
rmse_test = np.float64(0.34137219775811245)


In [15]:
baseline.train(training_data=training_data)
test_predictions = baseline.predict(test_set)
training_predictions = baseline.predict(training_set)
print(f"{test_predictions = }")
print(f"{training_predictions = }")
print(f"{training_data.data = }")
rmse_training = root_mean_square_error_entries(training_predictions, training_set, training_data)
rmse_test = root_mean_square_error_entries(test_predictions, test_set, test_data)
print(f"{rmse_training = }")
print(f"{rmse_test = }")

Constructing relevant matrices...
Calculating user and item biases...
Training done.
Predicting entries...


100%|██████████| 10/10 [00:00<00:00, 108660.73it/s]


Finished predicting entries.
Predicting entries...


100%|██████████| 30/30 [00:00<00:00, 275337.24it/s]


Finished predicting entries.
test_predictions = [np.float64(4.6178451178451185), np.float64(3.486111111111111), np.float64(2.7811447811447816), np.float64(1.0), np.float64(4.789983164983165), np.float64(4.877104377104377), np.float64(1.2289562289562284), np.float64(2.9191919191919187), np.float64(4.784511784511785), np.float64(4.609848484848484)]
training_predictions = [np.float64(5.0), np.float64(3.093855218855219), np.float64(4.895622895622896), np.float64(2.891414141414141), np.float64(4.693181818181818), np.float64(4.415404040404041), np.float64(4.103114478114478), np.float64(2.1864478114478114), np.float64(3.7104377104377106), np.float64(2.4949494949494944), np.float64(1.2878787878787876), np.float64(2.2171717171717167), np.float64(4.904882154882155), np.float64(3.582912457912458), np.float64(4.5122053872053876), np.float64(2.96043771043771), np.float64(3.5551346801346804), np.float64(4.48442760942761), np.float64(3.1456228956228953), np.float64(3.0307239057239053), np.float64(1.8

100%|██████████| 30/30 [00:00<00:00, 236077.15it/s]


Gathering entries from predictions...


100%|██████████| 10/10 [00:00<00:00, 156503.88it/s]

rmse_training = np.float64(0.5126176872466319)
rmse_test = np.float64(0.5848469310964864)





In [None]:
import pickle

with open('models/baseline', 'wb') as f:
    pickle.dump(baseline, f)
with open('data/training_set', 'wb') as f:
    pickle.dump(training_set, f)
with open('data/training_predictions', 'wb') as f:
    pickle.dump(training_predictions, f)

### Neighbor Correlations Predictor (based on Least Sqaures Optimization) (Improved)
As this requires calculating cosine coefficient for every single pair of items (or users, depending on the correlation chosen), this is extremely computationally expensive. Not recommended for data with a large number of items (or users).

In [22]:
import pickle

with open('models/baseline', 'rb') as f:
    baseline = pickle.load(f)
with open('data/training_set', 'rb') as f:
    training_set = pickle.load(f)
with open('data/baseline_predictions', 'rb') as f:
    training_predictions = pickle.load(f)

FileNotFoundError: [Errno 2] No such file or directory: 'data/training_set'

In [16]:
from predictors.neighbor_correlations import Correlation, NeighborCorrelationsPredictor
from utils.neighbor_selection import most_similar, two_most_similar_skip_masked, two_most_similar
import numpy as np
from scipy.sparse import csr_matrix

# baseline.predict_all = lambda quiet=False: np.array(
#     [
#         [np.nan, 2.7, 3.3, np.nan, 4.5],
#         [4.1, np.nan, 3.5, 4.9, np.nan],
#         [np.nan, 3.8, 2.5, 4.2, np.nan],
#         [2.8, 3.1, np.nan, 2.6, 4.8],
#         [3.3, np.nan, 3.7, np.nan, 2.4],
#         [np.nan, 3.9, 4.0, 1.5, 3.9],
#     ]
# )

improved = NeighborCorrelationsPredictor(baseline=baseline, correlation=Correlation.ITEM)
print("Creating sparse matrix from baseline predictions...")
rows, cols = zip(*training_set)
baseline_prediction = csr_matrix((np.array(training_predictions), (rows, cols)), shape=baseline.shape)
improved.train(training_data, two_most_similar_skip_masked, baseline_predictions=baseline_prediction)
test_predictions = improved.predict(test_set)
training_predictions = improved.predict(training_set)
print(f"{test_predictions = }")
print(f"{training_predictions = }")

Creating sparse matrix from baseline predictions...
Calculating cosine similarity coefficients...
Calculating numerators...
Calculating denominators...
Making neighbor table...


100%|██████████| 100000/100000 [04:45<00:00, 350.78it/s]


Finished training.
Predicting entries...


100%|██████████| 2000000/2000000 [06:20<00:00, 5250.86it/s]


Finished predicting entries.
Predicting entries...


100%|██████████| 8000000/8000000 [24:18<00:00, 5484.98it/s]


Finished predicting entries.
test_predictions = array([2.53018404, 2.6618057 , 2.65310744, ..., 2.59309105, 2.59112454,
       2.56499384], shape=(2000000,))
training_predictions = array([2.60047641, 2.55844541, 2.67766904, ..., 2.61555986, 2.55858723,
       2.57491603], shape=(8000000,))


In [17]:
rmse_training = root_mean_square_error_entries(training_predictions, training_set, training_data)
rmse_test = root_mean_square_error_entries(test_predictions, test_set, test_data)
print(f"{rmse_training = }")
print(f"{rmse_test = }")

Gathering entries from predictions...


100%|██████████| 8000000/8000000 [00:25<00:00, 309402.99it/s]


Gathering entries from predictions...


100%|██████████| 2000000/2000000 [00:05<00:00, 334018.34it/s]


rmse_training = np.float64(0.3340520910273014)
rmse_test = np.float64(0.3437479297468506)


### Latent Factor Predictor

In [17]:
from scipy.sparse import csr_matrix

user_ratings = csr_matrix(
    [
        [3, 4, 5, 3, 2, 3],
        [3, 2, 3, 4, 2, 1],
        [4, 4, 4, 5, 3, 2],
        [3, 5, 4, 4, 3, 4],
        [2, 1, 2, 2, 3, 1],
        [3, 5, 5, 4, 4, 3],
        [3, 5, 5, 3, 2, 2],
        [2, 3, 3, 2, 1, 2],
    ]
)
test_set = [
    (0, 0),
    (1, 1),
    (2, 3),
    (2, 4),
    (3, 0),
    (3, 1),
    (5, 1),
    (5, 4),
    (6, 0),
    (6, 2),
    (7, 1),
    (7, 3),
]
training_set = []
for i in range(user_ratings.shape[0]):
    for j in range(user_ratings.shape[1]):
        if (i, j) not in test_set and user_ratings[i, j] != 0:
            training_set.append((i, j))

In [11]:
import numpy as np
from predictors.latent_factor import LatentFactorPredictor

u, i = training_data.shape
k = 5
latent = LatentFactorPredictor(
    shape=training_data.shape,
    k=k,
    p=np.ones(shape=(k,u), dtype=np.float64),
    q=np.ones(shape=(k,i), dtype=np.float64),
    lmda=0.2,
)
print(f"{latent.p = }")
print(f"{latent.q = }")
t = 0

latent.p = array([[1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.]], shape=(5, 100000))
latent.q = array([[1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.]], shape=(5, 5000))


In [12]:
latent.train(training_data=training_data, max_iterations=10000, tol=1e-4)
# t += 20
# print(f"Total: {t} iterations")

Preparing data for training...
Performing alternating least squares...


 87%|████████▋ | 8660/10000 [3:01:44<28:07,  1.26s/it]  

Finished training.





In [13]:
training_predictions = latent.predict(training_set)
test_predictions = latent.predict(test_set)
print(f"{training_predictions = }")
print(f"{test_predictions = }")
print(f"{latent.p = }")
print(f"{latent.q = }")

Predicting entries...


100%|██████████| 8000000/8000000 [00:46<00:00, 173505.78it/s]


Finished predicting entries.
Predicting entries...


100%|██████████| 2000000/2000000 [00:10<00:00, 184370.66it/s]

Finished predicting entries.
training_predictions = array([2.67166896, 2.62244164, 2.65717642, ..., 2.45798514, 2.65019089,
       2.4884156 ], shape=(8000000,))
test_predictions = array([2.45227672, 2.91494366, 2.48036351, ..., 2.61548735, 2.60261286,
       2.55135824], shape=(2000000,))
latent.p = array([[0.57688248, 0.15458079, 0.36187863, ..., 0.38679631, 0.46600835,
        0.24079034],
       [0.0499925 , 0.28714986, 0.44058903, ..., 0.14953331, 0.35944786,
        0.35633317],
       [0.36054538, 0.45050216, 0.16488546, ..., 0.44992167, 0.28915843,
        0.25110507],
       [0.53410211, 0.16454649, 0.39368848, ..., 0.40303429, 0.29763438,
        0.47025231],
       [0.21567791, 0.65270152, 0.38527362, ..., 0.31313011, 0.31733939,
        0.38812071]], shape=(5, 100000))
latent.q = array([[1.07538268, 1.07047689, 2.34703052, ..., 0.86255349, 1.07254012,
        1.81381774],
       [1.07523625, 1.13362167, 0.72532287, ..., 2.68978183, 1.99626121,
        1.63366244],
       [1




In [14]:
from utils import root_mean_square_error_entries, root_mean_square_error

rmse_test = root_mean_square_error_entries(test_predictions, test_set, test_data)
print(f"{rmse_test = }")
rmse_training = root_mean_square_error_entries(training_predictions, training_set, training_data)
print(f"{rmse_training = }")

Gathering entries from predictions...


  0%|          | 0/2000000 [00:00<?, ?it/s]

100%|██████████| 2000000/2000000 [00:04<00:00, 415989.81it/s]


rmse_test = np.float64(0.3235141820151544)
Gathering entries from predictions...


100%|██████████| 8000000/8000000 [00:20<00:00, 386326.31it/s]


rmse_training = np.float64(0.30021525799094384)


In [12]:
import pickle

with open('models/latent_' + str(k), 'wb') as f:
    pickle.dump(latent, f)

## Making Recommendations

### Model Import

In [25]:
import pickle

with open('models/latent_5', 'rb') as f:
    latent = pickle.load(f)

### Plain Recommender

In [26]:
from recommenders.plain import PlainRecommender

recommender = PlainRecommender(
    predictor=latent, users=user_ratings.shape[0], items=user_ratings.shape[1]
)

### Pure Score Boost

In [27]:
import random
from recommenders.rating_boost import ScoreBoostRecommender

bids = [
    (idx, random.random()) for idx in random.sample(range(user_ratings.shape[1]), k=50)
]
paid_recommender = ScoreBoostRecommender(
    predictor=latent,
    users=user_ratings.shape[0],
    items=user_ratings.shape[1],
    bids=bids,
    alpha=0.1,
    beta=50,
    promotion_slots=[True if x % 4 == 0 else False for x in range(20)]
)
print("Bids:", sorted(bids, reverse=True, key=lambda x: x[1]))

ImportError: cannot import name 'ScoreBoostRecommender' from 'recommenders.rating_boost' (/Users/matthewngan/ESTR3302-project/recommenders/rating_boost.py)

In [None]:
from random import randint

# print(recommender.users, recommender.items)
# user = randint(0, user_ratings.shape[0])
user = 158336
print("Without promotion:", user, [movie_id_mappings[x] for x in recommender.recommend_items(user, 20)])
print("With promotion:", user, [movie_id_mappings[x] for x in paid_recommender.recommend_items(user, 20)])

Without promotion: 158336 [87700, 231059, 203427, 189087, 227901, 215003, 227913, 218125, 257751, 209717, 188597, 189143, 257811, 270476, 188033, 148677, 170835, 211225, 172727, 267538]
With promotion: 158336 [158242, 203427, 87700, 257751, 138446, 170835, 231059, 189087, 179985, 225455, 215003, 148677, 73868, 218125, 263479, 270476, 173121, 209717, 188597, 257871]


### Accuracies

Baseline lambda = 0.2, rmse test = 0.8620847479490242, rmse train = 0.8476892585276561
Latent 2, rmse test = 0.8363286658663873, rmse train = 0.8070657315515645
Latent 5, rmse test = 0.8169738957584134, rmse train = 0.7484386486325613
Latent 8, rmse test = 0.8173996547778605, rmse train = 0.7102392483099188
Latent 10, rmse test = 0.8216205577284467, rmse train = 0.6902388489455976 


## Work Cited
> F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets: History and Context. ACM Transactions on Interactive Intelligent Systems (TiiS) 5, 4: 19:1–19:19. <https://doi.org/10.1145/2827872>