# TensorFlow Collaborative Filtering with Matrix Factorization (ALS)

Proof of concept (PoC).

https://github.com/tensorflow/tensorflow/blob/v1.0.0/tensorflow/contrib/factorization/python/ops/factorization_ops.py

https://github.com/tensorflow/tensorflow/blob/v1.0.0/tensorflow/contrib/factorization/python/ops/factorization_ops_test.py

## Movielens Dataset

https://grouplens.org/datasets/movielens/

http://files.grouplens.org/datasets/movielens/ml-latest-small-README.html

http://files.grouplens.org/datasets/movielens/ml-latest-small.zip

This dataset (ml-latest-small) describes 5-star rating and free-text tagging activity from MovieLens, a movie recommendation service. It contains 100004 ratings and 1296 tag applications across 9125 movies. These data were created by 671 users between January 09, 1995 and October 16, 2016. This dataset was generated on October 17, 2016.

Users were selected at random for inclusion. All selected users had rated at least 20 movies. No demographic information is included. Each user is represented by an id, and no other information is provided.

The data are contained in the files links.csv, movies.csv, ratings.csv and tags.csv.

(See the README linked above for more details.)

In [1]:
import os
import requests
import zipfile

# Where the archive is stored and where it is unpacked.
DATA_DIR = 'movielens'
DATASET_URL = 'http://files.grouplens.org/datasets/movielens/ml-latest-small.zip'
DATASET_FILENAME = DATASET_URL.split('/')[-1]
DATASET_PACKAGE = os.path.join(DATA_DIR, DATASET_FILENAME)
# The archive unpacks into a directory named after the file without '.zip'.
DATASET_PATH = os.path.join(DATA_DIR, os.path.splitext(DATASET_FILENAME)[0])

os.makedirs(DATA_DIR, exist_ok=True)

if not os.path.isfile(DATASET_PACKAGE):
    print('Downloading {}...'.format(DATASET_FILENAME))
    # Download to a temporary name first: an interrupted transfer must not
    # leave a truncated file that the isfile() check above would accept later.
    partial_package = DATASET_PACKAGE + '.part'
    r = requests.get(DATASET_URL, stream=True, timeout=60)
    try:
        # Fail loudly on 4xx/5xx instead of saving the error page as a zip.
        r.raise_for_status()
        with open(partial_package, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
    finally:
        r.close()
    os.replace(partial_package, DATASET_PACKAGE)
    print('Done!')

if not os.path.isdir(DATASET_PATH):
    print('Unpacking {}...'.format(DATASET_PACKAGE))
    with zipfile.ZipFile(DATASET_PACKAGE, 'r') as archive:
        archive.extractall(DATA_DIR)
    print('Done!')

In [2]:
import collections
import csv
import os

# One rating event: who rated what, the score, and when.
Rating = collections.namedtuple('Rating', ['user_id', 'item_id', 'rating', 'timestamp'])


class Dataset(collections.namedtuple('Dataset', ['users', 'items', 'ratings'])):
    """Immutable bundle of ratings plus the distinct user and item ids.

    users: set[str]
    items: set[str]
    ratings: list[Rating]
    """

    __slots__ = ()

    def __str__(self):
        """Return a three-line summary with thousands separators."""
        return ('Users: {:,d}\n'
                'Items: {:,d}\n'
                'Ratings: {:,d}\n').format(self.n_users,
                                           self.n_items,
                                           self.n_ratings)

    @property
    def n_users(self):
        """Number of distinct users."""
        return len(self.users)

    @property
    def n_items(self):
        """Number of distinct items."""
        return len(self.items)

    @property
    def n_ratings(self):
        """Number of ratings."""
        return len(self.ratings)

    def user_ratings(self, user_id):
        """Return the list of ratings made by ``user_id`` (linear scan)."""
        return [r for r in self.ratings if r.user_id == user_id]

    def item_ratings(self, item_id):
        """Return the list of ratings given to ``item_id`` (linear scan)."""
        return [r for r in self.ratings if r.item_id == item_id]

    def filter_ratings(self, users, items):
        """Return ``((user_id, item_id), rating)`` for the selected ratings.

        A rating is kept when its user is in ``users`` and its item is in
        ``items``. Both arguments may be any iterable of hashable ids; they
        are converted to sets once, so list arguments do not cost a linear
        scan per rating and one-shot iterators are not exhausted after the
        first membership test.
        """
        if not isinstance(users, (set, frozenset)):
            users = set(users)
        if not isinstance(items, (set, frozenset)):
            items = set(items)
        return [((r.user_id, r.item_id), r.rating)
                for r in self.ratings
                if r.user_id in users and r.item_id in items]


def new_dataset(ratings):
    """Build a Dataset, deriving the user and item id sets from ``ratings``."""
    user_ids = {rating.user_id for rating in ratings}
    item_ids = {rating.item_id for rating in ratings}
    return Dataset(user_ids, item_ids, ratings)


def load_movielens_ratings(dataset_path):
    """Read ``ratings.csv`` from ``dataset_path`` into a list of Rating.

    The first row is treated as a header and skipped. User and item ids are
    kept as strings; the rating is parsed as float and the timestamp as int.

    Raises:
        FileNotFoundError: if ``ratings.csv`` does not exist in the directory.
        ValueError: if a row does not have exactly four columns or a
            rating/timestamp cannot be parsed.
    """
    ratings_csv = os.path.join(dataset_path, 'ratings.csv')
    if not os.path.isfile(ratings_csv):
        # FileNotFoundError is still an Exception, so callers that catch
        # Exception keep working.
        raise FileNotFoundError('File not found: \'{}\''.format(ratings_csv))
    with open(ratings_csv, newline='', encoding='utf-8') as f:
        reader = csv.reader(f)
        # Skip the header; the default keeps an empty file from raising
        # StopIteration.
        next(reader, None)
        return [Rating(user_id, item_id, float(rating), int(timestamp))
                for user_id, item_id, rating, timestamp in reader]

def load_movielens(dataset_path):
    """Load the ratings found under ``dataset_path`` and wrap them in a Dataset.

    Raises:
        FileNotFoundError: if ``dataset_path`` is not an existing directory.
    """
    if not os.path.isdir(dataset_path):
        # A specific subclass of Exception; callers catching Exception are
        # unaffected.
        raise FileNotFoundError('Path not found: \'{}\''.format(dataset_path))

    ratings = load_movielens_ratings(dataset_path)
    return new_dataset(ratings)


# Load the unpacked "ml-latest-small" ratings and print the summary.
small_dataset = load_movielens('movielens/ml-latest-small')
print('Dataset\n\n' + str(small_dataset))

Dataset

Users: 671
Items: 9,066
Ratings: 100,004



In [3]:
def split_by_time(dataset, train_ratio=0.80):
    """Split a dataset chronologically into (older, newer) datasets.

    Ratings are ordered by timestamp; the first ``train_ratio`` share
    (rounded down) goes to the first dataset and the remainder to the second.
    """
    chronological = list(dataset.ratings)
    chronological.sort(key=lambda rating: rating.timestamp)
    cut = int(len(chronological) * train_ratio)
    older, newer = chronological[:cut], chronological[cut:]
    return new_dataset(older), new_dataset(newer)

# Hold out the newest ratings as test, then split the remainder again
# into train and validation (default 80/20 ratio each time).
train_valid_data, test_data = split_by_time(small_dataset)
train_data, valid_data = split_by_time(train_valid_data)

for label, subset in (('Train', train_data),
                      ('Validation', valid_data),
                      ('Test', test_data)):
    print('{}\n\n{}'.format(label, subset))

Train

Users: 435
Items: 5,668
Ratings: 64,002

Validation

Users: 136
Items: 4,112
Ratings: 16,001

Test

Users: 147
Items: 4,753
Ratings: 20,001



**Train data**

In [4]:
# Every train rating as ((user_id, item_id), rating) for evaluation.
train_eval = [((r.user_id, r.item_id), r.rating) for r in train_data.ratings]
print('Evaluation ratings for train: {:,d}'.format(len(train_eval)))

Evaluation ratings for train: 64,002


**Validation data**

In [5]:
# Only items that also appear in train can be scored during validation.
valid_items = train_data.items.intersection(valid_data.items)
print('Items in train and validation: {:,d}'.format(len(valid_items)))

Items in train and validation: 2,424


In [6]:
# Validation users who rated at least one item that appears in train.
valid_users = {r.user_id
               for r in valid_data.ratings
               if r.item_id in train_data.items}
print('Users in validation with train items: {:,d}'.format(len(valid_users)))

Users in validation with train items: 135


In [7]:
# Keep only users that also appear in train (in-place intersection).
valid_users.intersection_update(train_data.users)
print('Users in train and validation: {:,d}'.format(len(valid_users)))

Users in train and validation: 23


In [8]:
# Validation ratings restricted to users and items known from train.
valid_eval = valid_data.filter_ratings(users=valid_users, items=valid_items)
print('Evaluation ratings for validation: {:,d}'.format(len(valid_eval)))

Evaluation ratings for validation: 944


In [9]:
from enum import Enum

class Cluster(Enum):
    """Activity buckets for users, from most to least active."""

    HEAVY = 'heavy'
    MODERATE = 'moderate'
    LIGHT = 'light'
    ACCIDENTAL = 'accidental'

    def __str__(self):
        # Print the bare label instead of the default 'Cluster.NAME'.
        return '{}'.format(self.value)
    
list(Cluster)

[<Cluster.HEAVY: 'heavy'>,
 <Cluster.MODERATE: 'moderate'>,
 <Cluster.LIGHT: 'light'>,
 <Cluster.ACCIDENTAL: 'accidental'>]

In [10]:
# Group validation users by how many shared-item ratings they have.
valid_user_clusters = collections.defaultdict(list)

for user_id in valid_users:
    # Count this user's ratings on shared items in each split.
    n_train = sum(1 for r in train_data.user_ratings(user_id)
                  if r.item_id in valid_items)
    n_valid = sum(1 for r in valid_data.user_ratings(user_id)
                  if r.item_id in valid_items)
    n_total = n_train + n_valid

    # Too little activity in either split overrides the size buckets.
    if min(n_train, n_valid) < 10 or n_total < 30:
        cluster = Cluster.ACCIDENTAL
    elif n_total > 1000:
        cluster = Cluster.HEAVY
    elif n_total > 100:
        cluster = Cluster.MODERATE
    else:
        cluster = Cluster.LIGHT

    valid_user_clusters[cluster].append(user_id)
    print('user_id={}, (train, valid) = ({}, {}), {}'.format(user_id, n_train, n_valid, cluster))

user_id=624, (train, valid) = (679, 91), moderate
user_id=547, (train, valid) = (893, 158), heavy
user_id=16, (train, valid) = (19, 7), accidental
user_id=580, (train, valid) = (585, 144), moderate
user_id=587, (train, valid) = (338, 35), moderate
user_id=77, (train, valid) = (238, 56), moderate
user_id=529, (train, valid) = (318, 32), moderate
user_id=15, (train, valid) = (827, 53), moderate
user_id=105, (train, valid) = (421, 17), moderate
user_id=561, (train, valid) = (248, 43), moderate
user_id=388, (train, valid) = (597, 6), accidental
user_id=596, (train, valid) = (438, 3), accidental
user_id=380, (train, valid) = (548, 52), moderate
user_id=356, (train, valid) = (12, 3), accidental
user_id=430, (train, valid) = (186, 93), moderate
user_id=292, (train, valid) = (263, 1), accidental
user_id=384, (train, valid) = (349, 31), moderate
user_id=427, (train, valid) = (162, 4), accidental
user_id=150, (train, valid) = (358, 6), accidental
user_id=648, (train, valid) = (123, 48), moderate

In [11]:
# Users per cluster; indexing the defaultdict also creates empty clusters.
for cluster in Cluster:
    print(cluster, len(valid_user_clusters[cluster]))

heavy 1
moderate 14
light 0
accidental 8


In [12]:
# Validation ratings per cluster; clusters with no users or ratings are omitted.
valid_clusters = {}

for cluster in Cluster:
    users = valid_user_clusters[cluster]
    eval_data = valid_data.filter_ratings(users, valid_items) if users else []
    if eval_data:
        valid_clusters[cluster] = eval_data
        print('Evaluation ratings for {}: {:,d}'.format(cluster, len(eval_data)))

Evaluation ratings for heavy: 158
Evaluation ratings for moderate: 748
Evaluation ratings for accidental: 38


**Test data**

In [13]:
# Only items that also appear in train can be scored on the test split.
test_items = train_data.items.intersection(test_data.items)
print('Items in train and test: {:,d}'.format(len(test_items)))

Items in train and test: 2,332


In [14]:
# Test users who rated at least one item that appears in train.
test_users = {r.user_id
              for r in test_data.ratings
              if r.item_id in train_data.items}
print('Users in test with train items: {:,d}'.format(len(test_users)))

Users in test with train items: 145


In [15]:
# Keep only users that also appear in train (in-place intersection).
test_users.intersection_update(train_data.users)
print('Users in train and test: {:,d}'.format(len(test_users)))

Users in train and test: 5


In [16]:
# Test ratings restricted to users and items known from train.
test_eval = test_data.filter_ratings(users=test_users, items=test_items)
print('Evaluation ratings for test: {:,d}'.format(len(test_eval)))

Evaluation ratings for test: 278


## TensorFlow WALS

https://github.com/tensorflow/tensorflow/blob/v1.0.0/tensorflow/contrib/factorization/python/ops/factorization_ops.py

https://github.com/tensorflow/tensorflow/blob/v1.0.0/tensorflow/contrib/factorization/python/ops/factorization_ops_test.py

In [17]:
# Lookup tables in both directions:
#   user id <-> index
#   item id <-> index
IndexMapping = collections.namedtuple(
    'IndexMapping',
    'users_to_idx users_from_idx items_to_idx items_from_idx')

def map_index(values):
    """Number ``values`` in iteration order and return both lookup dicts.

    Returns ``(values_to_idx, values_from_idx)``. If a value repeats, its
    last position wins in ``values_to_idx``.
    """
    values_to_idx = {}
    values_from_idx = {}
    for idx, value in enumerate(values):
        values_from_idx[idx] = value
        values_to_idx[value] = idx
    return values_to_idx, values_from_idx

def new_mapping(dataset):
    """Build the user and item index lookup tables for ``dataset``."""
    user_maps = map_index(dataset.users)
    item_maps = map_index(dataset.items)
    return IndexMapping(*user_maps, *item_maps)

In [18]:
import tensorflow as tf
import numpy as np

from tensorflow.contrib.factorization import WALSModel

class ALSRecommenderModel:
    """Trained factor matrices plus the id <-> index mapping to query them.

    ``user_factors`` and ``item_factors`` are indexed by the integer indices
    stored in ``mapping``; a predicted rating is the dot product of a user
    row and an item row.
    """

    def __init__(self, user_factors, item_factors, mapping):
        self.user_factors = user_factors
        self.item_factors = item_factors
        self.mapping = mapping

    def transform(self, x):
        """Yield ``((user_id, item_id), prediction)`` for each pair in ``x``.

        Pairs whose user or item is missing from the mapping yield 0.0
        instead of raising.
        """
        for user_id, item_id in x:
            i = self.mapping.users_to_idx.get(user_id)
            j = self.mapping.items_to_idx.get(item_id)
            # Index 0 is valid, so compare against None rather than truthiness.
            if i is None or j is None:
                yield (user_id, item_id), 0.0
                continue
            score = np.dot(self.user_factors[i], self.item_factors[j])
            yield (user_id, item_id), score

    def recommend(self, user_id, num_items=10, items_exclude=None):
        """Return up to ``num_items`` ``(item_id, score)`` pairs, best first.

        Args:
            user_id: a user present in the mapping.
            num_items: maximum number of recommendations to return.
            items_exclude: optional container of item ids to skip (for
                example items the user already rated). ``None`` excludes
                nothing.

        Raises:
            KeyError: if ``user_id`` is not in the mapping.
        """
        # None instead of a mutable default set shared across calls.
        excluded = () if items_exclude is None else items_exclude

        i = self.mapping.users_to_idx[user_id]
        scores = np.dot(self.item_factors, self.user_factors[i])
        # sorted() is stable, so ties keep ascending item-index order.
        ranked = sorted(enumerate(scores), key=lambda p: p[1], reverse=True)

        top = []
        for j, score in ranked:
            if len(top) >= num_items:
                break
            item_id = self.mapping.items_from_idx[j]
            if item_id in excluded:
                continue
            top.append((item_id, score))

        return top
    
class ALSRecommender:
    """Trains a factorization model with TensorFlow's contrib ``WALSModel``.

    ``fit`` returns an ``ALSRecommenderModel`` holding the evaluated row
    (user) and column (item) factors and the id <-> index mapping.
    """
    
    def __init__(self, num_factors=10, num_iters=10, reg=1e-1):
        """Store the hyper-parameters.

        num_factors: passed to WALSModel as its third positional argument.
        num_iters: number of row-update + column-update passes in ``train``.
        reg: passed to WALSModel as ``regularization``.
        """
        self.num_factors = num_factors
        self.num_iters = num_iters
        self.regularization = reg

    def fit(self, dataset, verbose=False):
        """Train on ``dataset`` and return an ALSRecommenderModel.

        Everything runs inside a fresh graph and session, so repeated calls
        do not add ops to a shared default graph. With ``verbose`` the
        training RMSE is printed after every iteration.
        """
        # The session is the default session inside the block, which is what
        # the .run() / .eval() calls below rely on.
        with tf.Graph().as_default(), tf.Session() as sess:
            input_matrix, mapping = self.sparse_input(dataset)
            model = self.als_model(dataset)
            self.train(model, input_matrix, verbose)
            # Only the first entry of each factor list is evaluated.
            # NOTE(review): presumably the factors are a single shard with the
            # default WALSModel settings -- confirm.
            row_factor = model.row_factors[0].eval()
            col_factor = model.col_factors[0].eval()
            return ALSRecommenderModel(row_factor, col_factor, mapping)

    def sparse_input(self, dataset):
        """Return ``(SparseTensor, IndexMapping)`` for the dataset ratings.

        The tensor has shape (n_users, n_items); each rating is stored at
        (user index, item index) in the order of ``dataset.ratings``.
        """
        mapping = new_mapping(dataset)

        indices = [(mapping.users_to_idx[r.user_id],
                    mapping.items_to_idx[r.item_id])
                   for r in dataset.ratings]
        values = [r.rating for r in dataset.ratings]
        shape = (dataset.n_users, dataset.n_items)

        return tf.SparseTensor(indices, values, shape), mapping
    
    def als_model(self, dataset):
        """Create the WALSModel sized to the dataset's users x items."""
        # NOTE(review): unobserved_weight=0 presumably means missing entries
        # do not contribute to the loss -- confirm against the WALSModel docs.
        return WALSModel(
            dataset.n_users,
            dataset.n_items,
            self.num_factors,
            regularization=self.regularization,
            unobserved_weight=0)

    def train(self, model, input_matrix, verbose=False):
        """Run ``num_iters`` alternating row and column factor updates.

        Must be called with a default session active. The RMSE op is only
        built when ``verbose`` is true.
        """
        rmse_op = self.rmse_op(model, input_matrix) if verbose else None

        # Element [1] of each returned tuple is the op that is run below.
        row_update_op = model.update_row_factors(sp_input=input_matrix)[1]
        col_update_op = model.update_col_factors(sp_input=input_matrix)[1]

        model.initialize_op.run()
        model.worker_init.run()
        for _ in range(self.num_iters):
            # Update Users
            model.row_update_prep_gramian_op.run()
            model.initialize_row_update_op.run()
            row_update_op.run()
            # Update Items
            model.col_update_prep_gramian_op.run()
            model.initialize_col_update_op.run()
            col_update_op.run()

            if verbose:
                print('RMSE: {:,.3f}'.format(rmse_op.eval()))

    def approx_sparse(self, model, indices, shape):
        """Return a SparseTensor of model predictions at ``indices``.

        Reads the private ``_input_rows`` / ``_input_cols`` attributes of the
        model to look up every row and column factor.
        """
        row_factors = tf.nn.embedding_lookup(
            model.row_factors,
            tf.range(model._input_rows),
            partition_strategy="div")
        col_factors = tf.nn.embedding_lookup(
            model.col_factors,
            tf.range(model._input_cols),
            partition_strategy="div")

        # Split the index pairs into a row-index column and a col-index column.
        row_indices, col_indices = tf.split(indices,
                                            axis=1,
                                            num_or_size_splits=2)
        gathered_row_factors = tf.gather(row_factors, row_indices)
        gathered_col_factors = tf.gather(col_factors, col_indices)
        # NOTE(review): presumably a batched product giving one value per
        # index pair, squeezed down to a vector -- confirm the shapes.
        approx_vals = tf.squeeze(tf.matmul(gathered_row_factors,
                                           gathered_col_factors,
                                           adjoint_b=True))

        return tf.SparseTensor(indices=indices,
                               values=approx_vals,
                               dense_shape=shape)

    def rmse_op(self, model, input_matrix):
        """Build the op computing the RMSE over the entries of ``input_matrix``."""
        approx_matrix = self.approx_sparse(model, input_matrix.indices, input_matrix.dense_shape)
        # err = input - approximation, taken at the stored entries.
        err = tf.sparse_add(input_matrix, approx_matrix * (-1))
        err2 = tf.square(err)
        # Number of stored values, read from the static shape.
        n = input_matrix.values.shape[0].value
        return tf.sqrt(tf.sparse_reduce_sum(err2) / n)


# Train with the default hyper-parameters, printing the RMSE per iteration.
als = ALSRecommender()
als_model = als.fit(train_data, verbose=True)

RMSE: 2.203
RMSE: 0.802
RMSE: 0.641
RMSE: 0.593
RMSE: 0.569
RMSE: 0.554
RMSE: 0.544
RMSE: 0.536
RMSE: 0.530
RMSE: 0.526


In [19]:
# Show true vs. predicted rating for the first validation pairs.
# Slicing instead of indexing range(10) avoids an IndexError when fewer
# than 10 validation ratings are available.
for x, y in valid_eval[:10]:
    _, y_hat = next(als_model.transform([x]))
    print(*x, y, y_hat)

561 39292 4.0 2.14771
561 2916 3.0 4.14338
561 4643 2.5 3.76398
561 33679 4.0 3.10178
561 5991 4.5 2.4092
561 2804 3.0 3.10694
561 5956 4.0 4.69892
561 4025 3.0 2.50858
561 2701 3.5 2.92227
561 368 4.0 3.83185


In [20]:
def _rmse(model, data):
    x, y = zip(*data)
    y_hat = list(r_hat for _, r_hat in model.transform(x))
    return np.sqrt(np.mean(np.square(np.subtract(y, y_hat))))

def eval_rmse(model):
    """Print the model's RMSE on train, validation and each validation cluster."""
    print('RMSE (train): {:,.3f}'.format(_rmse(model, train_eval)))
    print('RMSE (validation): {:,.3f}'.format(_rmse(model, valid_eval)))

    for cluster in Cluster:
        eval_data = valid_clusters.get(cluster)
        # Clusters without evaluation ratings are skipped.
        if eval_data:
            print('RMSE for {}: {:,.3f}'.format(cluster, _rmse(model, eval_data)))

# Report the RMSE of the TensorFlow model held in als_model.
eval_rmse(als_model)

RMSE (train): 0.526
RMSE (validation): 1.789
RMSE for heavy: 2.339
RMSE for moderate: 1.651
RMSE for accidental: 1.758


In [21]:
# Retrain with the hyper-parameters spelled out, then evaluate.
als = ALSRecommender(num_factors=10, num_iters=10, reg=0.1)
print('Training...\n')
als_model = als.fit(train_data, verbose=True)
print('\nEvaluation...\n')
eval_rmse(als_model)

Training...

RMSE: 1.946
RMSE: 0.761
RMSE: 0.627
RMSE: 0.585
RMSE: 0.563
RMSE: 0.549
RMSE: 0.539
RMSE: 0.531
RMSE: 0.525
RMSE: 0.520

Evaluation...

RMSE (train): 0.520
RMSE (validation): 1.666
RMSE for heavy: 2.241
RMSE for moderate: 1.506
RMSE for accidental: 1.833


In [22]:
# valid_clusters: dict[cluster: Cluster, list[((user_id: str, item_id: str), rating: float)]]
# Take the user of the first ((user_id, item_id), rating) entry of the heavy cluster.
first_heavy_entry = valid_clusters[Cluster.HEAVY][0]
user_id = first_heavy_entry[0][0]

# The user's validation ratings on items known from train, best rated first.
user_items = [(r.item_id, r.rating)
              for r in valid_data.ratings
              if r.user_id == user_id and r.item_id in train_data.items]
user_items.sort(key=lambda pair: pair[1], reverse=True)

# Items the user already rated in train must not be recommended again.
items_exclude = {r.item_id for r in train_data.ratings if r.user_id == user_id}

rec_items = als_model.recommend(user_id, items_exclude=items_exclude)

# Dense rank of every validation item by rating (ties share a position).
user_top = {}
p_rating, p = None, 0
print('Top items for {}:\n'.format(user_id))
for i, (item_id, rating) in enumerate(user_items):
    if p_rating is None or p_rating > rating:
        p += 1
        p_rating = rating
    user_top[item_id] = p
    # Only the first 20 entries are printed; all of them are ranked.
    if i < 20:
        print('[{}] {}, {:,.2f}'.format(p, item_id, rating))
print()

# Same dense ranking for the recommendations; the last column is the item's
# position in the user's own validation ranking, or '-' if not rated there.
p_rating, p = None, 0
print('Recommendations for {}:\n'.format(user_id))
for item_id, rating in rec_items:
    if p_rating is None or p_rating > rating:
        p += 1
        p_rating = rating
    print('[{}] {}, {:,.2f}, {}'.format(p, item_id, rating, user_top.get(item_id, '-')))

Top items for 547:

[1] 48516, 5.00
[1] 3310, 5.00
[1] 39183, 5.00
[2] 357, 4.50
[2] 165, 4.50
[2] 1183, 4.50
[2] 30749, 4.50
[2] 4235, 4.50
[2] 3809, 4.50
[2] 3989, 4.50
[2] 954, 4.50
[2] 3022, 4.50
[2] 48774, 4.50
[2] 48696, 4.50
[2] 46578, 4.50
[2] 45210, 4.50
[2] 2245, 4.50
[2] 2150, 4.50
[2] 1288, 4.50
[2] 4349, 4.50

Recommendations for 547:

[1] 940, 8.29, -
[2] 5378, 7.02, -
[3] 2202, 6.44, -
[4] 446, 6.43, -
[5] 383, 6.42, -
[6] 1701, 6.41, -
[7] 125, 6.37, -
[8] 1125, 6.27, -
[9] 2099, 6.23, -
[10] 1627, 5.92, -


## Spark ALS

http://spark.apache.org/docs/2.1.1/ml-collaborative-filtering.html

http://spark.apache.org/docs/2.1.1/api/python/pyspark.ml.html#module-pyspark.ml.recommendation

https://github.com/apache/spark/blob/v2.1.1/examples/src/main/python/ml/als_example.py

In [23]:
import os
import sys

# Location of the local Spark distribution, relative to the working directory.
SPARK_HOME=os.path.abspath('../../software/spark-2.1.1-bin-hadoop2.7')

if not os.path.isdir(SPARK_HOME):
    raise Exception('File not found: {}'.format(SPARK_HOME))

# Environment for Spark; PYSPARK_PYTHON is set to the running interpreter.
os.environ['SPARK_HOME'] = SPARK_HOME
os.environ['SPARK_DRIVER_MEMORY'] = '4g'
os.environ['PYSPARK_PYTHON'] = sys.executable

# Make the bundled pyspark and py4j importable; this must happen before the
# pyspark imports below.
sys.path.append(os.path.join(SPARK_HOME, 'python'))
sys.path.append(os.path.join(SPARK_HOME, 'python', 'lib', 'py4j-0.10.4-src.zip'))

import pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Stop the Spark session when the interpreter exits.
import atexit
atexit.register(lambda: spark.stop())

spark.version

'2.1.1'

In [24]:
# Flatten the train ratings to (user, item, rating) rows with integer ids.
train_plain = [(int(user_id), int(item_id), rating)
               for (user_id, item_id), rating in train_eval]
train_df = spark.createDataFrame(train_plain, ['user', 'item', 'rating'])
train_df.printSchema()
train_df.show(n=5, truncate=False)

root
 |-- user: long (nullable = true)
 |-- item: long (nullable = true)
 |-- rating: double (nullable = true)

+----+----+------+
|user|item|rating|
+----+----+------+
|383 |21  |3.0   |
|383 |47  |5.0   |
|383 |1079|3.0   |
|409 |16  |4.0   |
|409 |21  |5.0   |
+----+----+------+
only showing top 5 rows



In [25]:
# Flatten the validation ratings to (user, item, rating) rows with integer ids.
valid_plain = [(int(user_id), int(item_id), rating)
               for (user_id, item_id), rating in valid_eval]
valid_df = spark.createDataFrame(valid_plain, ['user', 'item', 'rating'])
valid_df.printSchema()
valid_df.show(n=5, truncate=False)

root
 |-- user: long (nullable = true)
 |-- item: long (nullable = true)
 |-- rating: double (nullable = true)

+----+-----+------+
|user|item |rating|
+----+-----+------+
|561 |39292|4.0   |
|561 |2916 |3.0   |
|561 |4643 |2.5   |
|561 |33679|4.0   |
|561 |5991 |4.5   |
+----+-----+------+
only showing top 5 rows



In [26]:
# Flatten the test ratings to (user, item, rating) rows with integer ids.
test_plain = [(int(user_id), int(item_id), rating)
              for (user_id, item_id), rating in test_eval]
test_df = spark.createDataFrame(test_plain, ['user', 'item', 'rating'])
test_df.printSchema()
test_df.show(n=5, truncate=False)

root
 |-- user: long (nullable = true)
 |-- item: long (nullable = true)
 |-- rating: double (nullable = true)

+----+----+------+
|user|item|rating|
+----+----+------+
|547 |7396|4.0   |
|547 |1734|3.5   |
|529 |7158|4.5   |
|529 |3967|3.5   |
|529 |3911|4.0   |
+----+----+------+
only showing top 5 rows



In [27]:
# One Spark DataFrame of (user, item, rating) rows per validation cluster.
valid_clusters_spark = {
    cluster: spark.createDataFrame(
        [(int(user_id), int(item_id), rating)
         for (user_id, item_id), rating in data],
        ['user', 'item', 'rating'])
    for cluster, data in valid_clusters.items()
}

In [28]:
from pyspark.ml.recommendation import ALS as SparkALS
from pyspark.ml.evaluation import RegressionEvaluator

def _rmse_spark(model, df):
    """Return the RMSE of the model's "prediction" column against "rating" on ``df``."""
    rmse_evaluator = RegressionEvaluator(
        metricName="rmse",
        labelCol="rating",
        predictionCol="prediction")
    return rmse_evaluator.evaluate(model.transform(df))

def eval_rmse_spark(model):
    """Print the Spark model's RMSE on train, validation and each validation cluster."""
    rmse = _rmse_spark(model, train_df)
    print('RMSE (train): {:,.3f}'.format(rmse))

    rmse = _rmse_spark(model, valid_df)
    print('RMSE (validation): {:,.3f}'.format(rmse))

    for cluster in Cluster:
        eval_df = valid_clusters_spark.get(cluster)
        # Test for a missing entry explicitly; the truthiness of a DataFrame
        # object says nothing about whether it holds rows.
        if eval_df is None:
            continue
        rmse = _rmse_spark(model, eval_df)
        print('RMSE for {}: {:,.3f}'.format(cluster, rmse))

# Fit Spark ALS on the train DataFrame and report its RMSE.
spark_als = SparkALS(rank=10, maxIter=10, regParam=0.1)
spark_model = spark_als.fit(train_df)
eval_rmse_spark(spark_model)

RMSE (train): 0.597
RMSE (validation): 1.013
RMSE for heavy: 1.123
RMSE for moderate: 0.972
RMSE for accidental: 1.279


In [29]:
# Same parameters as the previous cell, fitted and evaluated again.
spark_als = SparkALS(rank=10, maxIter=10, regParam=0.1)
spark_model = spark_als.fit(train_df)
eval_rmse_spark(spark_model)

RMSE (train): 0.597
RMSE (validation): 1.013
RMSE for heavy: 1.123
RMSE for moderate: 0.972
RMSE for accidental: 1.279


## Tuning

In [30]:
# Full hyper-parameter grid.
default_params = dict(num_factors=[5, 10, 20, 50, 100, 200],
                      num_iters=[5, 10, 25],
                      reg = [1e-5, 1e-3, 1e-1, 0.0, 1])

# Reduced grid for quick runs.
small_params = dict(num_factors=[5, 10, 20],
                    num_iters=[5],
                    reg = [1e-3, 1e-1, 1])

def grid_search(eval_func, params=default_params, verbose=False):
    """Try every parameter combination and return the one with the lowest RMSE.

    Args:
        eval_func: callable ``(num_factors, num_iters, reg) -> rmse``.
        params: dict with the keys 'num_factors', 'num_iters' and 'reg',
            each mapping to the list of values to try.
        verbose: print each combination, its RMSE and best-so-far updates.

    Returns:
        ``((num_factors, num_iters, reg), rmse)`` for the best combination,
        or ``(None, None)`` if every evaluation failed.

    A combination whose evaluation raises an ``Exception`` is skipped (and
    the error is printed when ``verbose``). ``KeyboardInterrupt`` and
    ``SystemExit`` are not swallowed, so a long search can be aborted.
    """
    best_rmse = None
    best_params = None
    for reg in params['reg']:
        for num_iters in params['num_iters']:
            for num_factors in params['num_factors']:
                if verbose:
                    print('\nParams:', num_factors, num_iters, reg)
                try:
                    rmse = eval_func(num_factors, num_iters, reg)
                except Exception as err:
                    # Best effort: one failing combination must not end the search.
                    rmse = None
                    if verbose:
                        print('Error: {}: {}'.format(type(err).__name__, err))
                if verbose:
                    print('RMSE:', '{:,.3f}'.format(rmse) if rmse is not None else '-')
                if rmse is not None and (best_rmse is None or rmse < best_rmse):
                    if verbose:
                        print('best update!')
                    best_rmse = rmse
                    best_params = (num_factors, num_iters, reg)
    return best_params, best_rmse

def tf_eval(num_factors, num_iters, reg):
    """Train the TensorFlow recommender on train_data and return its validation RMSE."""
    recommender = ALSRecommender(num_factors=num_factors,
                                 num_iters=num_iters,
                                 reg=reg)
    return _rmse(recommender.fit(train_data), valid_eval)

# Search the small grid with the TensorFlow recommender and show the winner.
tf_params, tf_score = grid_search(tf_eval, params=small_params, verbose=True)
print()
print('Best Params:\n\nn_factors={}, n_iters={}, reg={}, RMSE={:.3f}'.format(*tf_params, tf_score))


Params: 5 5 0.001
RMSE: 1.693
best update!

Params: 10 5 0.001
RMSE: 1.417
best update!

Params: 20 5 0.001
RMSE: 1.752

Params: 5 5 0.1
RMSE: 1.222
best update!

Params: 10 5 0.1
RMSE: 1.384

Params: 20 5 0.1
RMSE: 1.559

Params: 5 5 1
RMSE: 1.498

Params: 10 5 1
RMSE: 1.744

Params: 20 5 1
RMSE: 2.016

Best Params:

n_factors=5, n_iters=5, reg=0.1, RMSE=1.222


In [31]:
def spark_eval(num_factors, num_iters, reg):
    """Fit Spark ALS on train_df and return its validation RMSE."""
    estimator = SparkALS(rank=num_factors, maxIter=num_iters, regParam=reg)
    return _rmse_spark(estimator.fit(train_df), valid_df)

# Search the small grid with Spark ALS and show the winner.
spark_params, spark_score = grid_search(spark_eval, params=small_params, verbose=True)
print()
print('Best Params:\n\nn_factors={}, n_iters={}, reg={}, RMSE={:.3f}'.format(*spark_params, spark_score))


Params: 5 5 0.001
RMSE: 1.327
best update!

Params: 10 5 0.001
RMSE: 1.412

Params: 20 5 0.001
RMSE: 1.862

Params: 5 5 0.1
RMSE: 0.990
best update!

Params: 10 5 0.1
RMSE: 1.020

Params: 20 5 0.1
RMSE: 1.028

Params: 5 5 1
RMSE: 1.257

Params: 10 5 1
RMSE: 1.259

Params: 20 5 1
RMSE: 1.257

Best Params:

n_factors=5, n_iters=5, reg=0.1, RMSE=0.990


**Test evaluation**

In [32]:
# Retrain with the best TensorFlow parameters and score the test ratings.
num_factors, num_iters, reg = tf_params
als = ALSRecommender(num_factors=num_factors, num_iters=num_iters, reg=reg)
model = als.fit(train_data)
rmse = _rmse(model, test_eval)
print('TensorFlow RMSE for test: {:,.3f}'.format(rmse))

TensorFlow RMSE for test: 1.279


In [33]:
# Refit Spark ALS with the best parameters and score the test ratings.
als = SparkALS(rank=spark_params[0],
               maxIter=spark_params[1],
               regParam=spark_params[2])
model = als.fit(train_df)
rmse = _rmse_spark(model, test_df)
print('Spark RMSE for test: {:,.3f}'.format(rmse))

Spark RMSE for test: 1.151
