# TensorFlow Collaborative Filtering with Matrix Factorization (ALS)

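A proof of concept (PoC): factorizing the MovieLens ratings matrix with the Weighted Alternating Least Squares (`WALSModel`) implementation from `tensorflow.contrib.factorization`. The implementation and its tests are linked below.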

https://github.com/tensorflow/tensorflow/blob/v1.0.0/tensorflow/contrib/factorization/python/ops/factorization_ops.py

https://github.com/tensorflow/tensorflow/blob/v1.0.0/tensorflow/contrib/factorization/python/ops/factorization_ops_test.py

## Movielens Dataset

https://grouplens.org/datasets/movielens/

http://files.grouplens.org/datasets/movielens/ml-latest-small-README.html

http://files.grouplens.org/datasets/movielens/ml-latest-small.zip

This dataset (ml-latest-small) describes 5-star rating and free-text tagging activity from MovieLens, a movie recommendation service. It contains 100004 ratings and 1296 tag applications across 9125 movies. These data were created by 671 users between January 09, 1995 and October 16, 2016. This dataset was generated on October 17, 2016.

Users were selected at random for inclusion. All selected users had rated at least 20 movies. No demographic information is included. Each user is represented by an id, and no other information is provided.

The data are contained in the files links.csv, movies.csv, ratings.csv and tags.csv.

(See the README linked above for more details.)

In [1]:
import os
import requests
import zipfile

# Where the dataset lives locally and where it is fetched from.
DATA_DIR = 'movielens'
DATASET_URL = 'http://files.grouplens.org/datasets/movielens/ml-latest-small.zip'
DATASET_FILENAME = DATASET_URL.split('/')[-1]
DATASET_PACKAGE = os.path.join(DATA_DIR, DATASET_FILENAME)
# Directory created by unpacking the archive (archive name without ".zip").
DATASET_PATH = os.path.join(DATA_DIR, os.path.splitext(DATASET_FILENAME)[0])

os.makedirs(DATA_DIR, exist_ok=True)

if not os.path.isfile(DATASET_PACKAGE):
    print('Downloading {}...'.format(DATASET_FILENAME))
    # Download to a temporary name and rename only on success, so that an
    # interrupted or failed download is never mistaken for a complete archive
    # by the `isfile` check on the next run.
    partial_package = DATASET_PACKAGE + '.part'
    response = requests.get(DATASET_URL, stream=True, timeout=60)
    try:
        # Fail loudly on 4xx/5xx instead of saving an error page as a ".zip".
        response.raise_for_status()
        with open(partial_package, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
    finally:
        # Release the streamed connection whether or not the download succeeded.
        response.close()
    os.replace(partial_package, DATASET_PACKAGE)
    print('Done!')

if not os.path.isdir(DATASET_PATH):
    print('Unpacking {}...'.format(DATASET_PACKAGE))
    with zipfile.ZipFile(DATASET_PACKAGE, 'r') as f:
        f.extractall(DATA_DIR)
    print('Done!')

In [2]:
# List the unpacked dataset directory to check which files are available.
os.listdir(DATASET_PATH)

['links.csv',
 'movies.csv',
 'ratings.csv',
 'README.txt',
 'tags.csv',
 '.ipynb_checkpoints']

In [3]:
RATINGS_CSV = os.path.join(DATASET_PATH, 'ratings.csv')

# Peek at the raw file: the header line followed by the first nine rows.
with open(RATINGS_CSV, 'r') as ratings_file:
    head = [ratings_file.readline().strip() for _ in range(10)]
print('\n'.join(head))

userId,movieId,rating,timestamp
1,31,2.5,1260759144
1,1029,3.0,1260759179
1,1061,3.0,1260759182
1,1129,2.0,1260759185
1,1172,4.0,1260759205
1,1263,2.0,1260759151
1,1287,2.0,1260759187
1,1293,2.0,1260759148
1,1339,3.5,1260759125


In [4]:
import collections
import csv

# One parsed row of ratings.csv; ids stay as the raw strings from the file.
Rating = collections.namedtuple('Rating', ['user_id', 'item_id', 'rating', 'timestamp'])

with open(RATINGS_CSV, newline='') as ratings_file:
    rows = csv.reader(ratings_file)
    next(rows)  # the first row is the column header
    ratings = [
        Rating(raw_user, raw_item, float(raw_rating), int(raw_timestamp))
        for raw_user, raw_item, raw_rating, raw_timestamp in rows
    ]

# Chronological order, oldest rating first.
ratings.sort(key=lambda rating: rating.timestamp)

print('Ratings: {:,}'.format(len(ratings)))

Ratings: 100,004


In [5]:
# Two-way mapping between dense row indices and raw user ids.
# Ids are strings, so they are ordered by their integer value.
unique_user_ids = {rating.user_id for rating in ratings}
users_from_idx = {
    idx: user_id
    for idx, user_id in enumerate(sorted(unique_user_ids, key=int))
}
users_to_idx = {user_id: idx for idx, user_id in users_from_idx.items()}

print('User Index:')
for i in range(10):
    print('i={}, user_id={}'.format(i, users_from_idx[i]))

User Index:
i=0, user_id=1
i=1, user_id=2
i=2, user_id=3
i=3, user_id=4
i=4, user_id=5
i=5, user_id=6
i=6, user_id=7
i=7, user_id=8
i=8, user_id=9
i=9, user_id=10


In [6]:
# Two-way mapping between dense column indices and raw item (movie) ids.
# Ids are strings, so they are ordered by their integer value.
unique_item_ids = {rating.item_id for rating in ratings}
items_from_idx = {
    idx: item_id
    for idx, item_id in enumerate(sorted(unique_item_ids, key=int))
}
items_to_idx = {item_id: idx for idx, item_id in items_from_idx.items()}

print('Item Index:')
for j in range(10):
    print('j={}, item_id={}'.format(j, items_from_idx[j]))

Item Index:
j=0, item_id=1
j=1, item_id=2
j=2, item_id=3
j=3, item_id=4
j=4, item_id=5
j=5, item_id=6
j=6, item_id=7
j=7, item_id=8
j=8, item_id=9
j=9, item_id=10


## Draft

In [7]:
import tensorflow as tf
import numpy as np

# Interactive session for the whole notebook: later cells call `.run()` and
# `.eval()` on ops and tensors without passing a session explicitly.
sess = tf.InteractiveSession()

In [8]:
# Build the sparse (users x items) rating matrix P.
#
# `ratings` is ordered by timestamp, so the entries are first sorted by
# (row, col): tf.SparseTensor does not enforce an index order, but its
# documentation states that most sparse ops assume row-major ordering.
entries = sorted(
    (users_to_idx[r.user_id], items_to_idx[r.item_id], r.rating)
    for r in ratings)
indices = [(row, col) for row, col, _ in entries]
values = [value for _, _, value in entries]
n_rows = len(users_from_idx)
n_cols = len(items_from_idx)
shape = (n_rows, n_cols)

P = tf.SparseTensor(indices, values, shape)

print(P)
print()
# Number of cells in the dense matrix (observed + unobserved).
print('Total values: {:,}'.format(n_rows * n_cols))

SparseTensor(indices=Tensor("SparseTensor/indices:0", shape=(100004, 2), dtype=int64), values=Tensor("SparseTensor/values:0", shape=(100004,), dtype=float32), dense_shape=Tensor("SparseTensor/dense_shape:0", shape=(2,), dtype=int64))

Total values: 6,083,286


In [9]:
from tensorflow.contrib.factorization import WALSModel

In [10]:
# Print the WALSModel docstring (loss function, weights, operation modes) for reference.
help(WALSModel)

Help on class WALSModel in module tensorflow.contrib.factorization.python.ops.factorization_ops:

class WALSModel(builtins.object)
 |  A model for Weighted Alternating Least Squares matrix factorization.
 |  
 |  It minimizes the following loss function over U, V:
 |   \\( ||W \odot (A - U V^T) ||_F^2 + \lambda (||U||_F^2 + ||V||_F^2) )\\
 |    where,
 |    A: input matrix,
 |    W: weight matrix,
 |    U, V: row_factors and column_factors matrices,
 |    \\(\lambda)\\: regularization.
 |  Also we assume that W is of the following special form:
 |  \\( W_{ij} = W_0 + R_i * C_j )\\  if \\(A_{ij} \ne 0)\\,
 |  \\(W_{ij} = W_0)\\ otherwise.
 |  where,
 |  \\(W_0)\\: unobserved_weight,
 |  \\(R_i)\\: row_weights,
 |  \\(C_j)\\: col_weights.
 |  
 |  Note that the current implementation supports two operation modes: The default
 |  mode is for the condition where row_factors and col_factors can individually
 |  fit into the memory of each worker and these will be cached. When this
 |  condi

In [11]:
# Hyper-parameters.
k = 10      # number of latent factors per user / item
n = 10      # number of alternating sweeps (one row update + one column update each)
reg = 1e-1  # regularization strength passed to WALSModel

# unobserved_weight=0: only the observed ratings contribute to the loss.
model = WALSModel(
    n_rows,
    n_cols,
    k,
    regularization=reg,
    unobserved_weight=0)

# Full factor matrices, looked up across the model's factor shards.
# n_rows / n_cols are the sizes given to WALSModel above, so they are used
# directly instead of the model's private _input_rows / _input_cols.
row_factors = tf.nn.embedding_lookup(
    model.row_factors,
    tf.range(n_rows),
    partition_strategy="div")
col_factors = tf.nn.embedding_lookup(
    model.col_factors,
    tf.range(n_cols),
    partition_strategy="div")

# Predicted value for every observed (row, col) entry of P: the dot product of
# the matching row-factor and column-factor vectors.
row_indices, col_indices = tf.split(P.indices,
                                    axis=1,
                                    num_or_size_splits=2)
gathered_row_factors = tf.gather(row_factors, row_indices)
gathered_col_factors = tf.gather(col_factors, col_indices)
approx_vals = tf.squeeze(tf.matmul(gathered_row_factors,
                                   gathered_col_factors,
                                   adjoint_b=True))

# RMSE over the observed entries. approx_vals is built from P.indices, so it is
# aligned element-by-element with P.values and the error can be computed on the
# value vectors directly, without a sparse add that depends on index ordering.
rmse_op = tf.sqrt(tf.reduce_mean(tf.square(P.values - approx_vals)))

# Element [1] of each returned tuple is the op that performs the update.
row_update_op = model.update_row_factors(sp_input=P)[1]
col_update_op = model.update_col_factors(sp_input=P)[1]

model.initialize_op.run()
model.worker_init.run()
for _ in range(n):
    # Update Users
    model.row_update_prep_gramian_op.run()
    model.initialize_row_update_op.run()
    row_update_op.run()
    # Update Items
    model.col_update_prep_gramian_op.run()
    model.initialize_col_update_op.run()
    col_update_op.run()

    # Training error after this sweep (measured on the same ratings used to fit).
    print('RMSE: {:,.3f}'.format(rmse_op.eval()))

RMSE: 2.223
RMSE: 0.813
RMSE: 0.644
RMSE: 0.597
RMSE: 0.573
RMSE: 0.559
RMSE: 0.549
RMSE: 0.542
RMSE: 0.537
RMSE: 0.533


In [12]:
# Pull the learned factors out of the graph as NumPy arrays.
# Only the first entry of each factor list is read.
user_factors, item_factors = (
    shards[0].eval() for shards in (model.row_factors, model.col_factors)
)

print('User factors shape:', user_factors.shape)
print('Item factors shape:', item_factors.shape)

User factors shape: (671, 10)
Item factors shape: (9066, 10)


In [13]:
# Pick the user who has submitted the largest number of ratings.
ratings_per_user = collections.Counter(rating.user_id for rating in ratings)
top_entry = ratings_per_user.most_common(1)[0]
user_id, n_ratings = top_entry
print('Most havy user {}: {:,d}'.format(user_id, n_ratings))

Most havy user 547: 2,391


In [14]:
# `ratings` is sorted oldest-first, so scanning it backwards yields the most
# recent 5.0 rating given by the selected user.
newest_first = reversed(ratings)
five_star_by_user = (
    rating for rating in newest_first
    if rating.user_id == user_id and rating.rating == 5.0
)
r = next(five_star_by_user)
print('Last 5-rating from {}:\n'.format(user_id))
print(r)

Last 5-rating from 547:

Rating(user_id='547', item_id='163949', rating=5.0, timestamp=1476419239)


**Calculating rating 'predictions'**

In [15]:
# Locate the factor vectors of the rating's user and item.
i, j = users_to_idx[r.user_id], items_to_idx[r.item_id]
u, v = user_factors[i], item_factors[j]

print('Factors for user {}:\n'.format(r.user_id))
print(u)
print()

print('Factors for item {}:\n'.format(r.item_id))
print(v)
print()

# The approximated rating is the dot product of the two factor vectors.
p = u.dot(v)
diff = r.rating - p
ratio = p / r.rating
print('Approx. rating: {:,.3f}, diff={:,.3f}, {:,.3%}'.format(p, diff, ratio))

Factors for user 547:

[ 0.17587891 -0.09659031 -0.1773065  -0.198863    0.17917389  0.12711532
 -1.15592897  0.12968333  0.27661979 -0.48321217]

Factors for item 163949:

[ 0.45741466 -0.25120869 -0.46113154 -0.51719838  0.46598965  0.33059701
 -3.00630188  0.33727586  0.71942407 -1.2567178 ]

Approx. rating: 4.740, diff=0.260, 94.798%


**Calculating recommendations**

In [16]:
V = item_factors

# Predicted preference of the selected user for every item: one dot product
# of the user's factor vector `u` with each row of the item-factor matrix.
user_P = np.dot(V, u)
print('User preference shape:', user_P.shape)
print()

# Items the user has already rated are excluded from the recommendations.
user_items = set(ur.item_id for ur in ratings if ur.user_id == user_id)

# Rank item indices by predicted preference (highest first), translate the
# indices back to raw item ids, then drop the already-rated items.
user_ranking_idx = sorted(enumerate(user_P), key=lambda pair: pair[1], reverse=True)
user_ranking_raw = ((items_from_idx[item_idx], score) for item_idx, score in user_ranking_idx)
user_ranking = [(item_id, score) for item_id, score in user_ranking_raw if item_id not in user_items]

top10 = user_ranking[:10]

print('Top 10 items:\n')
# Loop variables are deliberately NOT named `k` or `p`: those are notebook
# globals (number of factors, predicted rating) that a plain `for` loop would
# silently overwrite, changing the result of re-running earlier cells.
for rank, (top_item_id, top_score) in enumerate(top10, start=1):
    print('[{}] {} {:,.2f}'.format(rank, top_item_id, top_score))

User preference shape: (9066,)

Top 10 items:

[1] 2622 6.54
[2] 1959 6.10
[3] 2594 5.94
[4] 2912 5.91
[5] 2287 5.91
[6] 1920 5.73
[7] 585 5.68
[8] 282 5.64
[9] 4016 5.51
[10] 2126 5.39
