## Recommendations with Spark ALS

In [1]:
import pyspark

In [2]:
# Reuse the already-running SparkContext when there is one, so this cell can be
# re-executed without restarting the kernel (constructing a second SparkContext
# in the same process raises ValueError). Otherwise start a local context using
# the same "local[*]" master as before.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setMaster("local[*]"))

In [3]:
# display the Spark version reported by the running context
sc.version

'2.2.0'

In [4]:
from pyspark.mllib.recommendation import ALS
from pyspark.mllib.recommendation import Rating

In [5]:
def expand_user(a, user):
    """Turn one user's row of the ratings matrix into a list of Rating records.

    a    -- sequence of this user's ratings, where the position is the item id
    user -- id stored in the `user` field of every Rating produced

    Entries equal to 0 are treated as "no rating" and produce no record.
    """
    user_ratings = []
    for item, ranking in enumerate(a):
        if ranking != 0:
            user_ratings.append(Rating(user, item, ranking))
    return user_ratings

In [6]:
def expand_all(a):
    """Expand a whole ratings matrix into one list of Rating records per user.

    a -- sequence of rows; the row index is used as the user id and each row
         is handed to expand_user.

    Returns a list of lists (one inner list per row), not a flat list.
    """
    per_user = []
    for user, items in enumerate(a):
        per_user.append(expand_user(items, user))
    return per_user

### Here we have ratings from eight users for six different movies: Titanic, Dirty Dancing, Die Hard, Terminator 2, Wayne's World, and Zoolander. In other words, there are two romantic films, two action films, and two comedies. Each row is a user and each column is a movie; a value of 0 means the user has not rated that movie.

### The ratings are constructed so that if a user has seen both movies in one of these pairs, their ratings for the two movies are similar.

### There is no evidence in this data that anyone likes all three film genres.

In [7]:
# Rows are users 0-7; columns are movies 0-5 in the order
# Titanic, Dirty Dancing, Die Hard, Terminator 2, Wayne's World, Zoolander.
# A 0 means "no rating" and is skipped by expand_user.
rawdata = [
    [5, 5, 0, 0, 0, 0],
    [0, 0, 5, 5, 0, 0],
    [0, 0, 0, 0, 5, 5],
    [0, 1, 5, 5, 5, 0],
    [1, 1, 5, 0, 5, 5],
    [5, 5, 0, 5, 1, 1],
    [5, 0, 0, 5, 0, 1],
    [5, 5, 5, 0, 1, 0],
]
# one list of Rating records per user
list_of_ratings = expand_all(rawdata)

In [8]:
# Flatten the per-user lists into a single list holding every non-zero rating,
# then distribute it as an RDD of Rating records.
ratings = []
for user_ratings in list_of_ratings:
    ratings.extend(user_ratings)
ratingsRDD = sc.parallelize(ratings)
# peek at the first few records
ratingsRDD.take(5)

[Rating(user=0, product=0, rating=5.0),
 Rating(user=0, product=1, rating=5.0),
 Rating(user=1, product=2, rating=5.0),
 Rating(user=1, product=3, rating=5.0),
 Rating(user=2, product=4, rating=5.0)]

In [9]:
# Training settings for the matrix factorization.
rank = 3            # length of each user / product feature vector
numIterations = 5   # passed as ALS.train's `iterations`
als_lambda = 0.1    # passed as ALS.train's `lambda_`
# The fixed seed keeps the run repeatable.
model = ALS.train(
    ratingsRDD,
    rank,
    iterations=numIterations,
    lambda_=als_lambda,
    nonnegative=True,
    seed=4242,
)
# ALS also offers a trainImplicit method for implicit ratings;
# it uses a different cost function.

In [10]:
# Pull the model's feature vector for each user back to the driver as
# (user id, features) pairs, then display them ordered by user id.
users = model.userFeatures().collect()
sorted(users, key=lambda id_and_features: id_and_features[0])

[(0, array('d', [0.0548248365521431, 2.1439249515533447, 0.1868879497051239])),
 (1, array('d', [1.116753339767456, 1.0875924825668335, 0.09992443770170212])),
 (2,
  array('d', [1.4131990671157837, 0.17979858815670013, 0.8607355952262878])),
 (3,
  array('d', [1.6533339023590088, 0.4600432813167572, 0.22208528220653534])),
 (4, array('d', [1.4229881763458252, 0.3441064953804016, 0.7850479483604431])),
 (5,
  array('d', [0.12839074432849884, 2.1138052940368652, 0.011434999294579029])),
 (6, array('d', [0.16750510036945343, 2.0847885608673096, 0.0])),
 (7, array('d', [0.08329737186431885, 2.131591796875, 0.13894794881343842]))]

In [11]:
# Same for the "products": collect the (product id, features) pairs
# and display them ordered by product id.
products = model.productFeatures().collect()
sorted(products, key=lambda id_and_features: id_and_features[0])

[(0,
  array('d', [0.14415070414543152, 2.2892513275146484, 0.10815300792455673])),
 (1,
  array('d', [0.01397976279258728, 2.252133369445801, 0.24288420379161835])),
 (2, array('d', [2.3235769271850586, 2.1165413856506348, 1.0717264413833618])),
 (3, array('d', [2.2422642707824707, 2.17822265625, 0.2330417037010193])),
 (4, array('d', [2.7039616107940674, 0.3365514874458313, 1.2686758041381836])),
 (5,
  array('d', [2.4558310508728027, 0.31472620368003845, 1.6110131740570068]))]

In [12]:
# Ask the model for the 3 products with the highest predicted rating for user 2.
model.recommendProducts(user=2, num=3)

[Rating(user=2, product=4, rating=4.9737419315999345),
 Rating(user=2, product=5, rating=4.913821860454357),
 Rating(user=2, product=2, rating=4.586700995228753)]

### Display the original matrix side-by-side with the reconstructed matrix. The values that were originally non-zero should be closely approximated, and the values that were zero (empty) now have predictions.

In [13]:
import sys
# Print every user's raw ratings next to the model's predicted rating for
# each movie, one user per line.
print(" original      reconstructed")
for user in range(len(rawdata)):
    # the column count is taken from the first row
    product_ids = range(len(rawdata[0]))
    original_cols = "".join("%d " % rawdata[user][product] for product in product_ids)
    reconstructed_cols = "".join(
        "%0.0f " % model.predict(user, product) for product in product_ids
    )
    # the trailing single space matches the original row terminator
    print(original_cols + "    " + reconstructed_cols + " ")

 original      reconstructed
5 5 0 0 0 0     5 5 5 5 1 1  
0 0 5 5 0 0     3 2 5 5 4 3  
0 0 0 0 5 5     1 1 5 4 5 5  
0 1 5 5 5 0     1 1 5 5 5 5  
1 1 5 0 5 5     1 1 5 4 5 5  
5 5 0 5 1 1     5 5 5 5 1 1  
5 0 0 5 0 1     5 5 5 5 1 1  
5 5 5 0 1 0     5 5 5 5 1 1  


In [14]:
# Three groups of columns per user:
#   original    - the raw ratings
#   errors      - for rated movies, the prediction when it does not round to
#                 the actual rating ("-" otherwise)
#   predictions - for unrated movies (0), the predicted rating ("-" otherwise)
print(" original         errors        predictions")
for user in range(len(rawdata)):
    original_cols = []
    error_cols = []
    prediction_cols = []
    for product in range(len(rawdata[0])):
        actual = rawdata[user][product]
        prediction = model.predict(user, product)
        original_cols.append("%d " % actual)
        if actual == 0:
            # no rating on record: nothing to compare, show the prediction
            error_cols.append("- ")
            prediction_cols.append("%0.0f " % prediction)
        else:
            prediction_cols.append("- ")
            if actual == round(prediction, 0):
                error_cols.append("- ")
            else:
                error_cols.append("%0.0f " % prediction)
    groups = ["".join(original_cols), "".join(error_cols), "".join(prediction_cols)]
    print("    ".join(groups) + " ")

 original         errors        predictions
5 5 0 0 0 0     - - - - - -     - - 5 5 1 1  
0 0 5 5 0 0     - - - - - -     3 2 - - 4 3  
0 0 0 0 5 5     - - - - - -     1 1 5 4 - -  
0 1 5 5 5 0     - - - - - -     1 - - - - 5  
1 1 5 0 5 5     - - - - - -     - - - 4 - -  
5 5 0 5 1 1     - - - - - -     - - 5 - - -  
5 0 0 5 0 1     - - - - - -     - 5 5 - 1 -  
5 5 5 0 1 0     - - - - - -     - - - 5 - 1  


### Compute the mean squared error of the reconstructed matrix over the entries that have known ratings. This can be used to decide if the rank is sufficiently large.

In [15]:
# Keep only the (user, product) pair of every known rating; these are the
# points at which the model is evaluated.
evalRDD = ratingsRDD.map(lambda known: (known.user, known.product))
evalRDD.take(5)

[(0, 0), (0, 1), (1, 2), (1, 3), (2, 4)]

In [16]:
# Predict a rating for every evaluated pair and key each prediction by its
# (user, product) pair so it can be joined with the actual rating.
predicted = model.predictAll(evalRDD)
predictions = predicted.map(lambda p: ((p.user, p.product), p.rating))
predictions.take(5)

[((0, 0), 4.936098574134385),
 ((0, 1), 4.8745634940532785),
 ((1, 2), 5.0038884557410155),
 ((1, 3), 4.896361260535194),
 ((2, 4), 4.9737419315999345)]

In [17]:
# Key the actual ratings the same way and join, giving
# ((user, product), (actual rating, predicted rating)) records.
actual_by_pair = ratingsRDD.map(lambda known: ((known.user, known.product), known.rating))
ratingsAndPreds = actual_by_pair.join(predictions)
ratingsAndPreds.take(5)

[((4, 2), (5.0, 4.876094776769769)),
 ((5, 1), (5.0, 4.765143692061115)),
 ((4, 5), (5.0, 4.867640466522756)),
 ((3, 1), (1.0, 1.1131330479573036)),
 ((4, 4), (5.0, 4.9594862914862325))]

In [18]:
# Mean of the squared (actual - predicted) differences over all joined pairs.
ratingsAndPreds.values().map(lambda pair: (pair[0] - pair[1])**2).mean()

0.013936872542062233

With a larger dataset we would separate the rating data into training and test sets, and see how well our predicted ratings match the actual data.

### Questions