# TensorFlow Collaborative Filtering with Matrix Factorization (ALS)

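Proof of concept (PoC) for collaborative filtering with the WALS ops from TensorFlow 1.0's `contrib.factorization` module: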

https://github.com/tensorflow/tensorflow/blob/v1.0.0/tensorflow/contrib/factorization/python/ops/factorization_ops.py

https://github.com/tensorflow/tensorflow/blob/v1.0.0/tensorflow/contrib/factorization/python/ops/factorization_ops_test.py

## Movielens Dataset

https://grouplens.org/datasets/movielens/

http://files.grouplens.org/datasets/movielens/ml-latest-small-README.html

http://files.grouplens.org/datasets/movielens/ml-latest-small.zip

This dataset (ml-latest-small) describes 5-star rating and free-text tagging activity from MovieLens, a movie recommendation service. It contains 100004 ratings and 1296 tag applications across 9125 movies. These data were created by 671 users between January 09, 1995 and October 16, 2016. This dataset was generated on October 17, 2016.

Users were selected at random for inclusion. All selected users had rated at least 20 movies. No demographic information is included. Each user is represented by an id, and no other information is provided.

The data are contained in the files links.csv, movies.csv, ratings.csv and tags.csv.

(See the README linked above for more details.)

In [1]:
import os
import requests
import zipfile

# Where the archive is stored and where it is expected to unpack to.
DATA_DIR = 'movielens'
DATASET_URL = 'http://files.grouplens.org/datasets/movielens/ml-latest-small.zip'
DATASET_FILENAME = DATASET_URL.split('/')[-1]
DATASET_PACKAGE = os.path.join(DATA_DIR, DATASET_FILENAME)
# Archive name without its 4-character '.zip' suffix.
DATASET_PATH = os.path.join(DATA_DIR, DATASET_FILENAME[:-4])

os.makedirs(DATA_DIR, exist_ok=True)

if not os.path.isfile(DATASET_PACKAGE):
    print('Downloading {}...'.format(DATASET_FILENAME))
    r = requests.get(DATASET_URL, stream=True, timeout=60)
    # Fail loudly on an HTTP error instead of saving the error page as the
    # archive, which later runs would then treat as already downloaded.
    r.raise_for_status()
    # Download under a temporary name and rename at the end, so an interrupted
    # transfer never leaves a truncated file at DATASET_PACKAGE.
    partial_package = DATASET_PACKAGE + '.part'
    with open(partial_package, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)
    os.replace(partial_package, DATASET_PACKAGE)
    print('Done!')

if not os.path.isdir(DATASET_PATH):
    print('Unpacking {}...'.format(DATASET_PACKAGE))
    with zipfile.ZipFile(DATASET_PACKAGE, 'r') as f:
        f.extractall(DATA_DIR)
    print('Done!')

In [2]:
import collections
import csv
import os

# A single (user_id, item_id, rating, timestamp) record.
Rating = collections.namedtuple(
    'Rating',
    ['user_id', 'item_id', 'rating', 'timestamp'],
)

class Dataset(collections.namedtuple('Dataset', ['users', 'items', 'ratings'])):
    """Immutable bundle of ratings together with the user and item ids.

    Fields:
        users: set[str]
        items: set[str]
        ratings: list[Rating]
    """

    __slots__ = ()

    @property
    def n_users(self):
        """Size of the ``users`` collection."""
        return len(self.users)

    @property
    def n_items(self):
        """Size of the ``items`` collection."""
        return len(self.items)

    @property
    def n_ratings(self):
        """Size of the ``ratings`` collection."""
        return len(self.ratings)

    def __str__(self):
        """Three-line summary of the user, item and rating counts."""
        return ''.join((
            'Users: {:,d}\n'.format(self.n_users),
            'Items: {:,d}\n'.format(self.n_items),
            'Ratings: {:,d}\n'.format(self.n_ratings),
        ))

    def user_ratings(self, user_id):
        """Return the ratings whose ``user_id`` equals the given one, in order."""
        return [rating for rating in self.ratings if rating.user_id == user_id]

    def item_ratings(self, item_id):
        """Return the ratings whose ``item_id`` equals the given one, in order."""
        return [rating for rating in self.ratings if rating.item_id == item_id]

def new_dataset(ratings):
    """Build a ``Dataset`` whose user and item sets are derived from ``ratings``."""
    user_ids = {rating.user_id for rating in ratings}
    item_ids = {rating.item_id for rating in ratings}
    return Dataset(users=user_ids, items=item_ids, ratings=ratings)


def load_movielens_ratings(dataset_path):
    """Load every rating from ``ratings.csv`` inside ``dataset_path``.

    The first row is treated as a header and skipped. Each remaining non-blank
    row must have exactly four columns: user id, item id, rating, timestamp.
    The ids are kept as strings, the rating is parsed with ``float`` and the
    timestamp with ``int``.

    Returns:
        list of ``Rating`` tuples in file order; empty when the file has no
        data rows.

    Raises:
        FileNotFoundError: ``ratings.csv`` does not exist in ``dataset_path``.
        ValueError: a row has the wrong number of columns or a non-numeric
            rating/timestamp.
    """
    ratings_csv = os.path.join(dataset_path, 'ratings.csv')
    if not os.path.isfile(ratings_csv):
        raise FileNotFoundError('File not found: \'{}\''.format(ratings_csv))
    ratings = []
    with open(ratings_csv, newline='', encoding='utf-8') as f:
        reader = csv.reader(f)
        # Skip the header; the default keeps a zero-byte file from raising
        # StopIteration.
        next(reader, None)
        for row in reader:
            if not row:
                # csv.reader yields an empty list for a blank line.
                continue
            user_id, item_id, rating, timestamp = row
            ratings.append(Rating(user_id,
                                  item_id,
                                  float(rating),
                                  int(timestamp)))
    return ratings

def load_movielens(dataset_path):
    """Load the MovieLens ratings under ``dataset_path`` into a ``Dataset``.

    Raises:
        FileNotFoundError: ``dataset_path`` is not an existing directory.
            (Still an ``Exception`` subclass, so existing handlers keep
            working.)
    """
    if not os.path.isdir(dataset_path):
        raise FileNotFoundError('Path not found: \'{}\''.format(dataset_path))

    ratings = load_movielens_ratings(dataset_path)
    return new_dataset(ratings)


# Load the unpacked ml-latest-small ratings and print the summary counts.
small_dataset = load_movielens('movielens/ml-latest-small')
print(small_dataset)

Users: 671
Items: 9,066
Ratings: 100,004



In [3]:
def split_by_time(dataset, train_ratio=0.80):
    """Split ``dataset`` chronologically into a ``(train, test)`` pair.

    Ratings are ordered by timestamp. The first
    ``int(len(ratings) * train_ratio)`` of them form the train dataset and the
    remainder form the test dataset.
    """
    by_time = sorted(dataset.ratings, key=lambda rating: rating.timestamp)
    cut = int(len(by_time) * train_ratio)
    train = new_dataset(by_time[:cut])
    test = new_dataset(by_time[cut:])
    return train, test

# Chronological split with the default train_ratio (0.80), then print the
# summary of the full, train and test datasets.
train_data, test_data = split_by_time(small_dataset)

print('Dataset\n\n{}'.format(small_dataset))
print('Train\n\n{}'.format(train_data))
print('Test\n\n{}'.format(test_data))

Dataset

Users: 671
Items: 9,066
Ratings: 100,004

Train

Users: 547
Items: 7,356
Ratings: 80,003

Test

Users: 147
Items: 4,753
Ratings: 20,001



In [4]:
# Items present in both splits: only items seen in train are available for
# evaluation on the test split.
common_items = train_data.items & test_data.items
print('Items in train and test: {:,d}'.format(len(common_items)))

Items in train and test: 3,043


In [5]:
# Test users who rated at least one item that also appears in train.
test_users = {
    r.user_id
    for r in test_data.ratings
    if r.item_id in train_data.items
}
print('Users in test with train items: {:,d}'.format(len(test_users)))

Users in test with train items: 146


In [6]:
# Only users that also appear in train can be evaluated on the test split.
common_users = train_data.users & test_users
print('Users in train and test: {:,d}'.format(len(common_users)))

Users in train and test: 22


In [7]:
from enum import Enum

class Cluster(Enum):
    """Labels for the user groups that evaluation results are broken down by."""

    HEAVY = 'heavy'
    MODERATE = 'moderate'
    LIGHT = 'light'
    ACCIDENTAL = 'accidental'
    
    def __str__(self):
        # Render as the bare value (e.g. 'heavy') so format()/print output
        # shows the label without the 'Cluster.' prefix.
        return self.value
    
# Notebook display: list every Cluster member.
list(Cluster)

[<Cluster.HEAVY: 'heavy'>,
 <Cluster.MODERATE: 'moderate'>,
 <Cluster.LIGHT: 'light'>,
 <Cluster.ACCIDENTAL: 'accidental'>]

In [8]:
# Cluster -> list of user ids assigned to it.
users_clusters = collections.defaultdict(list)

for user_id in common_users:
    # Count only the user's ratings on items shared by both splits.
    n_train = sum(1 for r in train_data.user_ratings(user_id) if r.item_id in common_items)
    n_test = sum(1 for r in test_data.user_ratings(user_id) if r.item_id in common_items)

    # Too few ratings on either side (or overall) -> accidental; otherwise
    # bucket by the combined number of ratings.
    if min(n_train, n_test) < 10 or n_train + n_test < 30:
        cluster = Cluster.ACCIDENTAL
    elif n_train + n_test > 1000:
        cluster = Cluster.HEAVY
    elif n_train + n_test > 100:
        cluster = Cluster.MODERATE
    else:
        cluster = Cluster.LIGHT

    users_clusters[cluster].append(user_id)
    print('user_id={}, (train, test) = ({}, {}), {}'.format(user_id, n_train, n_test, cluster))

user_id=359, (train, test) = (12, 32), light
user_id=480, (train, test) = (220, 132), moderate
user_id=73, (train, test) = (881, 336), heavy
user_id=652, (train, test) = (5, 165), accidental
user_id=648, (train, test) = (187, 2), accidental
user_id=78, (train, test) = (171, 87), moderate
user_id=624, (train, test) = (912, 63), moderate
user_id=163, (train, test) = (30, 39), light
user_id=199, (train, test) = (300, 37), moderate
user_id=15, (train, test) = (946, 276), heavy
user_id=68, (train, test) = (112, 2), accidental
user_id=275, (train, test) = (33, 140), moderate
user_id=599, (train, test) = (85, 73), moderate
user_id=529, (train, test) = (314, 85), moderate
user_id=157, (train, test) = (205, 83), moderate
user_id=547, (train, test) = (1052, 63), heavy
user_id=303, (train, test) = (207, 6), accidental
user_id=380, (train, test) = (789, 39), moderate
user_id=458, (train, test) = (18, 58), light
user_id=637, (train, test) = (13, 8), accidental
user_id=48, (train, test) = (296, 81), moderate

In [9]:
# Print how many users were assigned to each cluster.
for cluster in Cluster:
    users = users_clusters[cluster]
    print(cluster, len(users))

heavy 3
moderate 10
light 4
accidental 5


In [10]:
def eval_pairs(users):
    """Return ``((user_id, item_id), rating)`` entries for the given users.

    Entries come from ``test_data.ratings``, in that order, restricted to
    ratings whose user is in ``users`` and whose item is in ``common_items``.

    Args:
        users: iterable of hashable user ids.
    """
    # Callers pass plain lists (the values of users_clusters), so build the
    # lookup once instead of scanning the list for every test rating.
    wanted_users = frozenset(users)
    return [((r.user_id, r.item_id), r.rating)
            for r in test_data.ratings
            if r.user_id in wanted_users
            and r.item_id in common_items]

# Cluster -> evaluation entries (as returned by eval_pairs) for its users.
eval_clusters = dict()

for cluster in Cluster:
    users = users_clusters[cluster]
    pairs = eval_pairs(users)
    eval_clusters[cluster] = pairs
    print('Evaluation ratings for {}: {:,d}'.format(cluster, len(pairs)))

Evaluation ratings for heavy: 675
Evaluation ratings for moderate: 820
Evaluation ratings for light: 183
Evaluation ratings for accidental: 183


In [11]:
# Evaluation entries for every user present in both train and test.
eval_test = eval_pairs(common_users)
print('Evaluation ratings for test: {:,d}'.format(len(eval_test)))

Evaluation ratings for test: 1,861


In [12]:
# Every train rating as a ((user_id, item_id), rating) evaluation entry.
eval_train = [((r.user_id, r.item_id), r.rating) for r in train_data.ratings]
print('Evaluation ratings for train: {:,d}'.format(len(eval_train)))

Evaluation ratings for train: 80,003


In [13]:
# Two-way lookup tables between raw user/item ids and integer indices.
IndexMapping = collections.namedtuple(
    'IndexMapping',
    ['users_to_idx', 'users_from_idx', 'items_to_idx', 'items_from_idx'],
)

def map_index(values):
    """Number ``values`` in iteration order and return both lookup directions.

    Returns:
        ``(value -> index, index -> value)`` as two dicts. When a value occurs
        more than once, its last index is the one kept in the first dict.
    """
    to_idx = {}
    from_idx = {}
    for position, value in enumerate(values):
        from_idx[position] = value
        to_idx[value] = position
    return to_idx, from_idx

def new_mapping(dataset):
    """Build the ``IndexMapping`` for the users and items of ``dataset``."""
    user_maps = map_index(dataset.users)
    item_maps = map_index(dataset.items)
    # Each pair is (to_idx, from_idx), which matches the field order.
    return IndexMapping(*user_maps, *item_maps)

## TensorFlow WALS

https://github.com/tensorflow/tensorflow/blob/v1.0.0/tensorflow/contrib/factorization/python/ops/factorization_ops.py

https://github.com/tensorflow/tensorflow/blob/v1.0.0/tensorflow/contrib/factorization/python/ops/factorization_ops_test.py

In [14]:
import tensorflow as tf
import numpy as np

from tensorflow.contrib.factorization import WALSModel

class ALSRecommenderModel:
    """Factorization model that scores and ranks items for users.

    Attributes (stored as given to the constructor):
        user_factors: indexable by user index, yielding that user's factors.
        item_factors: indexable by item index; also passed whole to ``np.dot``.
        mapping: object exposing the ``users_to_idx``, ``items_to_idx`` and
            ``items_from_idx`` lookups.
    """

    def __init__(self, user_factors, item_factors, mapping):
        self.user_factors = user_factors
        self.item_factors = item_factors
        self.mapping = mapping

    def transform(self, x):
        """Yield ``((user_id, item_id), prediction)`` for each pair in ``x``.

        The prediction is the dot product of the user's and the item's
        factors. Pairs whose user or item is missing from the mapping yield a
        prediction of ``0.0``.
        """
        users_to_idx = self.mapping.users_to_idx
        items_to_idx = self.mapping.items_to_idx
        for user_id, item_id in x:
            if user_id not in users_to_idx or item_id not in items_to_idx:
                yield (user_id, item_id), 0.0
                continue
            u = self.user_factors[users_to_idx[user_id]]
            v = self.item_factors[items_to_idx[item_id]]
            yield (user_id, item_id), np.dot(u, v)

    def recommend(self, user_id, num_items=10, items_exclude=None):
        """Return up to ``num_items`` ``(item_id, score)`` pairs, best first.

        Args:
            user_id: id of the user to recommend for.
            num_items: maximum number of recommendations to return.
            items_exclude: optional container of item ids to leave out;
                ``None`` (the default) excludes nothing.

        Raises:
            KeyError: ``user_id`` is not in the mapping.
        """
        # A None default avoids sharing one mutable set across calls.
        excluded = frozenset() if items_exclude is None else items_exclude

        i = self.mapping.users_to_idx[user_id]
        scores = np.dot(self.item_factors, self.user_factors[i])
        # sorted() is stable, so items with equal scores keep index order.
        ranked = sorted(enumerate(scores), key=lambda p: p[1], reverse=True)

        top = []
        for j, score in ranked:
            if len(top) >= num_items:
                break
            item_id = self.mapping.items_from_idx[j]
            if item_id in excluded:
                continue
            top.append((item_id, score))
        return top
    
class ALSRecommender:
    """Trains an ``ALSRecommenderModel`` with TensorFlow's ``WALSModel``.

    Hyper-parameters given to the constructor:
        num_factors: third positional argument passed to ``WALSModel``.
        num_iters: number of alternating (row, then column) update rounds.
        reg: value passed as ``WALSModel``'s ``regularization`` argument.
    """
    
    def __init__(self, num_factors=10, num_iters=10, reg=1e-1):
        self.num_factors = num_factors
        self.num_iters = num_iters
        self.regularization = reg

    def fit(self, dataset, verbose=False):
        """Train on ``dataset`` and return the resulting ``ALSRecommenderModel``.

        A fresh graph and session are created for every call. With
        ``verbose=True`` the training RMSE is printed after each iteration.
        """
        # `sess` is never referenced directly; the .run()/.eval() calls below
        # take no session argument, so they presumably rely on this session
        # being the default one inside the `with` block.
        with tf.Graph().as_default(), tf.Session() as sess:
            input_matrix, mapping = self.sparse_input(dataset)
            model = self.als_model(dataset)
            self.train(model, input_matrix, verbose)
            # Only element 0 of row_factors/col_factors is read -- assumes
            # the model holds a single shard per side; TODO confirm.
            row_factor = model.row_factors[0].eval()
            col_factor = model.col_factors[0].eval()
            return ALSRecommenderModel(row_factor, col_factor, mapping)

    def sparse_input(self, dataset):
        """Return ``(sparse ratings tensor, IndexMapping)`` for ``dataset``.

        The tensor has shape ``(n_users, n_items)`` with one entry per rating,
        placed at the mapped (user index, item index).
        """
        mapping = new_mapping(dataset)

        # NOTE(review): indices follow dataset.ratings order, not row-major
        # order. The tf.SparseTensor docs describe row-major ordering as the
        # convention most ops assume -- confirm whether the WALS update ops
        # need the input passed through tf.sparse_reorder first.
        indices = [(mapping.users_to_idx[r.user_id],
                    mapping.items_to_idx[r.item_id])
                   for r in dataset.ratings]
        values = [r.rating for r in dataset.ratings]
        shape = (dataset.n_users, dataset.n_items)

        return tf.SparseTensor(indices, values, shape), mapping
    
    def als_model(self, dataset):
        """Create the ``WALSModel`` sized to the users and items of ``dataset``."""
        # unobserved_weight=0 presumably restricts the loss to the observed
        # ratings -- confirm against the WALSModel documentation.
        return WALSModel(
            dataset.n_users,
            dataset.n_items,
            self.num_factors,
            regularization=self.regularization,
            unobserved_weight=0)

    def train(self, model, input_matrix, verbose=False):
        """Run ``num_iters`` rounds of row-then-column factor updates.

        Must be called with a default session active (see ``fit``). With
        ``verbose=True`` the RMSE against ``input_matrix`` is printed after
        every round.
        """
        # The RMSE op is only added to the graph when it will be printed.
        rmse_op = self.rmse_op(model, input_matrix) if verbose else None

        # Element [1] of each returned value is the op that gets run below;
        # presumably the update op of a (new_factors, update_op) pair --
        # TODO confirm against the WALSModel documentation.
        row_update_op = model.update_row_factors(sp_input=input_matrix)[1]
        col_update_op = model.update_col_factors(sp_input=input_matrix)[1]

        model.initialize_op.run()
        model.worker_init.run()
        for _ in range(self.num_iters):
            # Update the row (user) factors: prep, initialize, then update.
            model.row_update_prep_gramian_op.run()
            model.initialize_row_update_op.run()
            row_update_op.run()
            # Update the column (item) factors with the same three steps.
            model.col_update_prep_gramian_op.run()
            model.initialize_col_update_op.run()
            col_update_op.run()

            if verbose:
                print('RMSE: {:,.3f}'.format(rmse_op.eval()))

    def approx_sparse(self, model, indices, shape):
        """Return a sparse tensor of model predictions at ``indices``.

        The result reuses ``indices`` and ``shape``; its values are computed
        from the row and column factors gathered at those indices.
        """
        # Look up the factors for every row/column index. This reads the
        # model's private _input_rows/_input_cols attributes.
        row_factors = tf.nn.embedding_lookup(
            model.row_factors,
            tf.range(model._input_rows),
            partition_strategy="div")
        col_factors = tf.nn.embedding_lookup(
            model.col_factors,
            tf.range(model._input_cols),
            partition_strategy="div")

        # Split the (row, col) index pairs into one column of row indices and
        # one column of column indices.
        row_indices, col_indices = tf.split(indices,
                                            axis=1,
                                            num_or_size_splits=2)
        gathered_row_factors = tf.gather(row_factors, row_indices)
        gathered_col_factors = tf.gather(col_factors, col_indices)
        # Presumably a batched row-factor . col-factor product per index
        # pair, squeezed down to one value per pair -- TODO confirm shapes.
        approx_vals = tf.squeeze(tf.matmul(gathered_row_factors,
                                           gathered_col_factors,
                                           adjoint_b=True))

        return tf.SparseTensor(indices=indices,
                               values=approx_vals,
                               dense_shape=shape)

    def rmse_op(self, model, input_matrix):
        """Return an op computing the RMSE of the model over ``input_matrix``.

        The error is taken only at the indices present in ``input_matrix``.
        """
        approx_matrix = self.approx_sparse(model, input_matrix.indices, input_matrix.dense_shape)
        # input - approximation, as a sparse tensor.
        err = tf.sparse_add(input_matrix, approx_matrix * (-1))
        err2 = tf.square(err)
        # Number of entries, read from the static shape of the values.
        n = input_matrix.values.shape[0].value
        return tf.sqrt(tf.sparse_reduce_sum(err2) / n)


# Train with the default hyper-parameters; verbose=True prints the training
# RMSE after each iteration.
als = ALSRecommender()
als_model = als.fit(train_data, verbose=True)

RMSE: 2.281
RMSE: 0.825
RMSE: 0.649
RMSE: 0.601
RMSE: 0.577
RMSE: 0.561
RMSE: 0.550
RMSE: 0.542
RMSE: 0.536
RMSE: 0.531


In [15]:
# Spot-check the first 10 test entries: print user id, item id, the actual
# rating and the model's prediction.
for k in range(10):
    x, y  = eval_test[k]
    _,  y_hat = list(als_model.transform([x]))[0]
    print(*x, y, y_hat)

359 6385 4.5 5.37492
359 1295 4.5 1.39123
359 3258 3.5 4.17737
359 318 4.0 5.48446
359 858 3.5 5.13797
359 527 4.0 4.98466
359 912 3.0 4.75844
359 922 3.5 4.23665
359 44555 2.5 3.70919
359 1193 4.0 4.95885


In [16]:
def _rmse(model, data):
    x, y = zip(*data)
    y_hat = list(r_hat for _, r_hat in model.transform(x))
    return np.sqrt(np.mean(np.square(np.subtract(y, y_hat))))

def eval_rmse(model):
    """Print the RMSE of ``model`` on train, on test and on each user cluster."""
    print('RMSE (train): {:,.3f}'.format(_rmse(model, eval_train)))
    print('RMSE (test): {:,.3f}'.format(_rmse(model, eval_test)))

    for cluster in Cluster:
        cluster_rmse = _rmse(model, eval_clusters[cluster])
        print('RMSE for {}: {:,.3f}'.format(cluster, cluster_rmse))

# Report train, test and per-cluster RMSE for the model trained above.
eval_rmse(als_model)

RMSE (train): 0.531
RMSE (test): 1.890
RMSE for heavy: 1.855
RMSE for moderate: 1.924
RMSE for light: 1.555
RMSE for accidental: 2.155


In [17]:
# Re-train with explicitly spelled-out hyper-parameters (the same values as
# the ALSRecommender defaults) and evaluate the new model.
als = ALSRecommender(num_factors=10, num_iters=10, reg=0.1)
print('Training...\n')
als_model = als.fit(train_data, verbose=True)
print('\nEvaluation...\n')
eval_rmse(als_model)

Training...

RMSE: 2.033
RMSE: 0.781
RMSE: 0.639
RMSE: 0.597
RMSE: 0.575
RMSE: 0.560
RMSE: 0.549
RMSE: 0.541
RMSE: 0.534
RMSE: 0.529

Evaluation...

RMSE (train): 0.529
RMSE (test): 1.652
RMSE for heavy: 1.663
RMSE for moderate: 1.650
RMSE for light: 1.426
RMSE for accidental: 1.824


In [18]:
# eval_clusters: dict[cluster: Cluster, list[((user_id: str, item_id: str), rating: float)]]
# 0 -> first user-item-pair-and-rating, 0 -> user-item-pair, 0 -> user
user_id = eval_clusters[Cluster.HEAVY][0][0][0] 

# The user's test ratings on items that also exist in train, as
# (item_id, rating) pairs sorted by rating, highest first.
user_items = sorted([(r.item_id, r.rating)
                     for r in test_data.ratings
                     if r.user_id == user_id \
                         and r.item_id in train_data.items],
                    key=lambda r: r[1],
                    reverse=True)

# Items the user already rated in train are excluded from the recommendations.
items_exclude = set(r.item_id for r in train_data.ratings if r.user_id == user_id)

rec_items = als_model.recommend(user_id, items_exclude=items_exclude)

# item_id -> position of the item among the user's test ratings.
# Positions are dense ranks: p only advances when the rating drops, so items
# with equal ratings share the same position.
user_top = dict()
p_rating = None
p = 0
print('Test top items for {}:\n'.format(user_id))
for i, (item_id, rating) in enumerate(user_items):
    if p_rating is None or p_rating > rating:
        p_rating = rating
        p += 1
    user_top[item_id] = p
    # Every item is ranked, but only the first 20 are printed.
    if i < 20:
        print('[{}] {}, {:,.2f}'.format(p, item_id, rating))
print()

# Same dense ranking for the recommended items. The last column is the item's
# position in the user's test ratings, or '-' when the user has no test
# rating for it.
p_rating = None
p = 0
print('Recommendations for {}:\n'.format(user_id))
for item_id, rating in rec_items:
    if p_rating is None or p_rating > rating:
        p_rating = rating
        p += 1
    print('[{}] {}, {:,.2f}, {}'.format(p, item_id, rating, user_top.get(item_id, '-')))

Test top items for 73:

[1] 58559, 5.00
[1] 1201, 5.00
[1] 27773, 5.00
[1] 32, 5.00
[1] 4369, 5.00
[1] 215, 5.00
[2] 36, 4.50
[2] 1218, 4.50
[2] 26547, 4.50
[2] 87192, 4.50
[2] 7099, 4.50
[2] 48774, 4.50
[2] 3328, 4.50
[2] 2948, 4.50
[2] 2692, 4.50
[2] 27022, 4.50
[2] 2144, 4.50
[2] 74458, 4.50
[2] 2467, 4.50
[2] 27317, 4.50

Recommendations for 73:

[1] 994, 5.41, -
[2] 1635, 5.36, -
[3] 2022, 5.30, -
[4] 3060, 5.29, -
[5] 1931, 5.24, -
[6] 1253, 5.13, -
[7] 7256, 5.11, -
[8] 2729, 5.05, -
[9] 1066, 5.02, -
[10] 7771, 5.02, -


## Spark ALS

http://spark.apache.org/docs/2.1.0/ml-collaborative-filtering.html

http://spark.apache.org/docs/2.1.0/api/python/pyspark.ml.html#module-pyspark.ml.recommendation

https://github.com/apache/spark/blob/v2.1.0/examples/src/main/python/ml/als_example.py

In [19]:
import os
import sys

# Location of the local Spark 2.1.0 distribution, relative to the working
# directory; adjust to match your installation.
SPARK_HOME=os.path.abspath('../../software/spark-2.1.0-bin-hadoop2.7')

if not os.path.isdir(SPARK_HOME):
    raise Exception('File not found: {}'.format(SPARK_HOME))

os.environ['SPARK_HOME'] = SPARK_HOME
os.environ['SPARK_DRIVER_MEMORY'] = '4g'
# Point PYSPARK_PYTHON at the interpreter running this notebook.
os.environ['PYSPARK_PYTHON'] = sys.executable

# Make the PySpark and Py4J sources bundled with the distribution importable;
# this must happen before the pyspark imports below.
sys.path.append(os.path.join(SPARK_HOME, 'python'))
sys.path.append(os.path.join(SPARK_HOME, 'python', 'lib', 'py4j-0.10.4-src.zip'))

import pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

import atexit
# Stop the Spark session when the interpreter exits.
atexit.register(lambda: spark.stop())

# Notebook display: the version of the running Spark session.
spark.version

'2.1.0'

In [20]:
# Train ratings as (user, item, rating) rows; the string ids are converted to
# int before building the Spark DataFrame.
train_raw = [(int(r.user_id), int(r.item_id), r.rating) for r in train_data.ratings]
train_df = spark.createDataFrame(train_raw, ['user', 'item', 'rating'])
train_df.printSchema()
train_df.show(5, False)

root
 |-- user: long (nullable = true)
 |-- item: long (nullable = true)
 |-- rating: double (nullable = true)

+----+----+------+
|user|item|rating|
+----+----+------+
|383 |21  |3.0   |
|383 |47  |5.0   |
|383 |1079|3.0   |
|409 |16  |4.0   |
|409 |21  |5.0   |
+----+----+------+
only showing top 5 rows



In [21]:
# Test ratings restricted to users and items that also appear in train (the
# same filter eval_pairs applies for common_users), with ids converted to int.
test_raw = [(int(r.user_id), int(r.item_id), r.rating)
            for r in test_data.ratings
            if r.user_id in common_users \
                and r.item_id in common_items]
test_df = spark.createDataFrame(test_raw, ['user', 'item', 'rating'])
test_df.printSchema()
test_df.show(5, False)

root
 |-- user: long (nullable = true)
 |-- item: long (nullable = true)
 |-- rating: double (nullable = true)

+----+----+------+
|user|item|rating|
+----+----+------+
|359 |6385|4.5   |
|359 |1295|4.5   |
|359 |3258|3.5   |
|359 |318 |4.0   |
|359 |858 |3.5   |
+----+----+------+
only showing top 5 rows



In [22]:
# Cluster -> Spark DataFrame holding that cluster's evaluation entries as
# (user, item, rating) rows with integer ids.
eval_clusters_spark = dict()
for cluster, data in eval_clusters.items():
    eval_data = [(int(user_id), int(item_id), rating) for (user_id, item_id), rating in data]
    eval_df = spark.createDataFrame(eval_data, ['user', 'item', 'rating'])
    eval_clusters_spark[cluster] = eval_df

In [23]:
from pyspark.ml.recommendation import ALS as SparkALS
from pyspark.ml.evaluation import RegressionEvaluator

def eval_rmse_spark(model):
    """Print the RMSE of a fitted Spark model on train, test and each cluster."""
    evaluator = RegressionEvaluator(metricName="rmse",
                                    labelCol="rating",
                                    predictionCol="prediction")

    def score(df):
        # Predict for every row of df, then evaluate the predictions.
        return evaluator.evaluate(model.transform(df))

    print('RMSE (train): {:,.3f}'.format(score(train_df)))
    print('RMSE (test): {:,.3f}'.format(score(test_df)))

    for cluster in Cluster:
        cluster_rmse = score(eval_clusters_spark[cluster])
        print('RMSE for {}: {:,.3f}'.format(cluster, cluster_rmse))

# Fit Spark ALS with rank=10, maxIter=10 and regParam=0.1, then report RMSE.
spark_als = SparkALS(rank=10, maxIter=10, regParam=0.1)
spark_model = spark_als.fit(train_df)
eval_rmse_spark(spark_model)

RMSE (train): 0.605
RMSE (test): 1.007
RMSE for heavy: 1.043
RMSE for moderate: 0.960
RMSE for light: 1.075
RMSE for accidental: 1.009


In [24]:
# Fit and evaluate again with the same hyper-parameters as the previous cell.
spark_als = SparkALS(rank=10, maxIter=10, regParam=0.1)
spark_model = spark_als.fit(train_df)
eval_rmse_spark(spark_model)

RMSE (train): 0.605
RMSE (test): 1.007
RMSE for heavy: 1.043
RMSE for moderate: 0.960
RMSE for light: 1.075
RMSE for accidental: 1.009


## Tuning

In [25]:
# Full hyper-parameter grid (the default for grid_search).
default_params = dict(num_factors=[5, 10, 20, 50, 100, 200],
                      num_iters=[5, 10, 25],
                      reg=[1e-5, 1e-3, 1e-1, 0.0, 1])

# Reduced grid for quicker experiments.
small_params = dict(num_factors=[5, 10, 20, 50],
                    num_iters=[5],
                    reg=[1e-5, 1e-3, 1e-1, 1])


def grid_search(eval_func, params=default_params, verbose=False):
    """Exhaustively search ``params`` for the lowest ``eval_func`` score.

    Args:
        eval_func: callable ``(num_factors, num_iters, reg) -> score`` where
            lower is better.
        params: dict with the lists ``'num_factors'``, ``'num_iters'`` and
            ``'reg'``; every combination is evaluated.
        verbose: print each combination, its score and any failure.

    Returns:
        ``(best_params, best_score)`` where ``best_params`` is a
        ``(num_factors, num_iters, reg)`` tuple. Both are ``None`` when no
        combination produced a usable score.

    A combination whose evaluation raises an ``Exception`` is skipped, as is
    one whose score is NaN. ``KeyboardInterrupt`` and ``SystemExit`` are not
    caught, so a long search can still be interrupted.
    """
    best_score = None
    best_params = None
    for reg in params['reg']:
        for num_iters in params['num_iters']:
            for num_factors in params['num_factors']:
                if verbose:
                    print('\nParams:', num_factors, num_iters, reg)
                try:
                    score = eval_func(num_factors, num_iters, reg)
                except Exception as exc:
                    # One failing combination must not abort the sweep, but
                    # the reason should be visible when asked for.
                    score = None
                    if verbose:
                        print('Failed:', repr(exc))
                if verbose:
                    print('Score:', '{:,.3f}'.format(score) if score is not None else '-')
                # `score == score` is False only for NaN. A NaN must never
                # become the best score, because no later score would compare
                # as lower than it.
                usable = score is not None and score == score
                if usable and (best_score is None or score < best_score):
                    if verbose:
                        print('best update!')
                    best_score = score
                    best_params = (num_factors, num_iters, reg)
    return best_params, best_score

def tf_eval(num_factors, num_iters, reg):
    """Train the TensorFlow recommender with these settings; return its test RMSE."""
    recommender = ALSRecommender(num_factors=num_factors,
                                 num_iters=num_iters,
                                 reg=reg)
    fitted = recommender.fit(train_data)
    return _rmse(fitted, eval_test)

# Search the reduced grid for the TensorFlow recommender (lowest test RMSE).
grid_search(tf_eval, params=small_params, verbose=True)


Params: 5 5 1e-05
Score: 3.791
best update!

Params: 10 5 1e-05
Score: 12.805

Params: 20 5 1e-05
Score: 32.945

Params: 50 5 1e-05
Score: 923.598

Params: 5 5 0.001
Score: 19.489

Params: 10 5 0.001
Score: 3.581
best update!

Params: 20 5 0.001
Score: 4.634

Params: 50 5 0.001
Score: 6.230

Params: 5 5 0.1
Score: 1.870
best update!

Params: 10 5 0.1
Score: 1.557
best update!

Params: 20 5 0.1
Score: 1.724

Params: 50 5 0.1
Score: 2.196

Params: 5 5 1
Score: 1.526
best update!

Params: 10 5 1
Score: 1.763

Params: 20 5 1
Score: 2.047

Params: 50 5 1
Score: 2.377


((5, 5, 1), 1.5261158099815511)

In [27]:
def spark_eval(num_factors, num_iters, reg):
    """Fit Spark ALS with these settings on train; return the RMSE on test."""
    estimator = SparkALS(rank=num_factors, maxIter=num_iters, regParam=reg)
    predictions = estimator.fit(train_df).transform(test_df)
    rmse_evaluator = RegressionEvaluator(metricName="rmse",
                                         labelCol="rating",
                                         predictionCol="prediction")
    return rmse_evaluator.evaluate(predictions)

# Search the same reduced grid for Spark ALS (lowest test RMSE).
grid_search(spark_eval, params=small_params, verbose=True)


Params: 5 5 1e-05
Score: 3.510
best update!

Params: 10 5 1e-05
Score: 2.218
best update!

Params: 20 5 1e-05
Score: 2.032
best update!

Params: 50 5 1e-05
Score: 2.727

Params: 5 5 0.001
Score: 1.611
best update!

Params: 10 5 0.001
Score: 1.457
best update!

Params: 20 5 0.001
Score: 1.662

Params: 50 5 0.001
Score: 2.306

Params: 5 5 0.1
Score: 0.994
best update!

Params: 10 5 0.1
Score: 1.000

Params: 20 5 0.1
Score: 1.006

Params: 50 5 0.1
Score: 1.023

Params: 5 5 1
Score: 1.311

Params: 10 5 1
Score: 1.311

Params: 20 5 1
Score: 1.311

Params: 50 5 1
Score: 1.311


((5, 5, 0.1), 0.9938391167559549)