## Collabrative Filtering Algorithm Pyspark

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, LongType
from pyspark.ml.recommendation import ALS
import sys
import codecs

In [2]:
spark = SparkSession.builder.appName("ALSExample").getOrCreate()
    
moviesSchema = StructType([ \
                     StructField("userID", IntegerType(), True), \
                     StructField("movieID", IntegerType(), True), \
                     StructField("rating", IntegerType(), True), \
                     StructField("timestamp", LongType(), True)])

In [3]:
def loadMovieNames():
    movieNames = {}
    # CHANGE THIS TO THE PATH TO YOUR u.ITEM FILE:
    with codecs.open("ml-100k/u.ITEM", "r", encoding='ISO-8859-1', errors='ignore') as f:
        for line in f:
            fields = line.split('|')
            movieNames[int(fields[0])] = fields[1]
    return movieNames

In [20]:
names = loadMovieNames()
    
ratings = spark.read.option("sep", "\t").schema(moviesSchema) \
    .csv("ml-100k/u.data")
    
ratings.show()

+------+-------+------+---------+
|userID|movieID|rating|timestamp|
+------+-------+------+---------+
|     0|    133|     1|881250949|
|     0|     50|     5|881250949|
|     0|    172|     5|881250949|
|   196|    242|     3|881250949|
|   186|    302|     3|891717742|
|    22|    377|     1|878887116|
|   244|     51|     2|880606923|
|   166|    346|     1|886397596|
|   298|    474|     4|884182806|
|   115|    265|     2|881171488|
|   253|    465|     5|891628467|
|   305|    451|     3|886324817|
|     6|     86|     3|883603013|
|    62|    257|     2|879372434|
|   286|   1014|     5|879781125|
|   200|    222|     5|876042340|
|   210|     40|     3|891035994|
|   224|     29|     3|888104457|
|   303|    785|     3|879485318|
|   122|    387|     5|879270459|
+------+-------+------+---------+
only showing top 20 rows



In [21]:
print("Training recommendation model...")

als = ALS().setMaxIter(5).setRegParam(0.01).setUserCol("userID").setItemCol("movieID") \
    .setRatingCol("rating")
    
model = als.fit(ratings)
model

Training recommendation model...


ALSModel: uid=ALS_1662081da2d1, rank=10

In [22]:
# Manually construct a dataframe of the user ID's we want recs for
userID = int(0)
userSchema = StructType([StructField("userID", IntegerType(), True)])
users = spark.createDataFrame([[userID,]], userSchema)

recommendations = model.recommendForUserSubset(users, 10).collect()

print("Top 10 recommendations for user ID " + str(userID))
for userRecs in recommendations:
    myRecs = userRecs[1]  #userRecs is (userID, [Row(movieId, rating), Row(movieID, rating)...])
    for rec in myRecs: #my Recs is just the column of recs for the user
        movie = rec[0] #For each rec in the list, extract the movie ID and rating
        rating = rec[1]
        movieName = names[movie]
        print(movieName + str(rating))

Top 10 recommendations for user ID 0
Kicking and Screaming (1995)8.128525733947754
Shall We Dance? (1937)8.030218124389648
Crooklyn (1994)7.691278457641602
Love in the Afternoon (1957)7.023828029632568
Hearts and Minds (1996)6.709479331970215
Truman Show, The (1998)6.520021438598633
Man in the Iron Mask, The (1998)6.430258750915527
Angel Baby (1995)6.333723068237305
Chairman of the Board (1998)6.140254497528076
Lost in Space (1998)6.133683204650879
