In [2]:
# Initialise findspark before any pyspark import in the cells below.
# NOTE(review): presumably this is what makes the local Spark installation
# importable from this kernel — confirm for the target environment.
import findspark
findspark.init()

In [3]:

# findspark is already imported and initialised in the previous cell, so it is
# not repeated here; only the Spark SQL entry point is imported.
from pyspark.sql import SparkSession

# Create the session (getOrCreate hands back an existing session if this
# kernel already has one, so re-running the cell is safe).
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .getOrCreate()
)

In [4]:
# Sanity check: print the session object to confirm it was created.
print(spark)

<pyspark.sql.session.SparkSession object at 0x7f67e0829d90>


In [5]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

In [6]:
# Read the ratings dataset.
# The location can be overridden with the TRAINING_CSV environment variable so
# the notebook is not tied to one machine's home directory; when the variable
# is unset the original path is used, so existing behaviour is unchanged.
import os

training_csv_path = os.environ.get(
    "TRAINING_CSV", "/home/fariz/serendipity-sac2018/training.csv"
)
# header=True takes column names from the first line; inferSchema=True lets
# Spark derive the column types from the data.
df = spark.read.csv(training_csv_path, header=True, inferSchema=True)

In [7]:
# Preview the loaded ratings (show() with no arguments prints only the first rows).
df.show()

+------+-------+------+-------------+
|userId|movieId|rating|    timestamp|
+------+-------+------+-------------+
|142882|  91658|   2.5|1515209647000|
|142882|   4344|   1.0|1515209646000|
|142882|  45720|   2.0|1515209643000|
|142882|   4734|   2.0|1515209641000|
|142882|  91542|   2.0|1515209637000|
|142882|   4446|   1.5|1515209635000|
|142882|   3354|   2.5|1515209632000|
|142882|   4701|   1.0|1515209631000|
|142882|  31696|   0.5|1515209629000|
|142882|  97921|   2.0|1515209625000|
|142882|   4299|   2.0|1515209623000|
|142882|   5388|   2.5|1515209621000|
|142882|   3301|   2.5|1515209613000|
|142882|   8972|   1.0|1515209611000|
|142882|   8950|   3.0|1515209608000|
|142882|  85414|   1.5|1515209606000|
|142851| 103339|   3.0|1515209598000|
|142882|  54272|   2.0|1515209595000|
|142882|  46976|   2.0|1515209593000|
|142882|  96079|   2.5|1515209590000|
+------+-------+------+-------------+
only showing top 20 rows



In [9]:
# Inspect the column types that inferSchema=True produced for the CSV.
df.schema

StructType(List(StructField(userId,IntegerType,true),StructField(movieId,IntegerType,true),StructField(rating,DoubleType,true),StructField(timestamp,LongType,true)))

In [10]:
# Hold out roughly 20% of the ratings for evaluation.
# A fixed seed keeps the split (and therefore the reported RMSE) repeatable
# across kernel restarts; without it every run evaluates on different rows.
(training, test) = df.randomSplit([0.8, 0.2], seed=42)

In [11]:
# Build the recommendation model using ALS on the training data.
# coldStartStrategy="drop" is set so that evaluation metrics do not come out
# as NaN. A fixed seed is passed so repeated runs start from the same
# initial state instead of a fresh random one each time.
als = ALS(
    maxIter=5,
    regParam=0.01,
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop",
    seed=42,
)
model = als.fit(training)

In [12]:
# Score the held-out split with the fitted model, then compute the RMSE
# between the "rating" column and the model's "prediction" column.
evaluator = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName="rmse",
)
predictions = model.transform(test)
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 0.838531089218


In [13]:
# Number of recommendations requested on each side.
top_n = 10

# Top-N movie recommendations for each user
userRecs = model.recommendForAllUsers(top_n)
# Top-N user recommendations for each movie
movieRecs = model.recommendForAllItems(top_n)

In [14]:
# Preview the per-user recommendations; the nested recommendations column is
# truncated in this default view.
userRecs.show()

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|100170|[[84838, 11.76872...|
|100274|[[167532, 13.6281...|
|100446|[[170113, 16.8581...|
|100800|[[166812, 11.2536...|
|100884|[[84838, 11.00514...|
|100986|[[153018, 11.0922...|
|101055|[[160826, 22.4140...|
|101094|[[150252, 9.81708...|
|101475|[[119165, 10.5947...|
|101627|[[170731, 17.7995...|
|101775|[[150667, 23.7395...|
|102119|[[84838, 13.57463...|
|102524|[[150252, 11.6173...|
|102594|[[158398, 15.6268...|
|102793|[[170113, 18.7712...|
|102798|[[80792, 16.80975...|
|102960|[[150252, 12.8992...|
|103011|[[176389, 15.6038...|
|103357|[[160826, 13.9831...|
|103747|[[143203, 20.6795...|
+------+--------------------+
only showing top 20 rows



In [15]:
# Preview the per-movie recommendations; the nested recommendations column is
# truncated in this default view.
movieRecs.show()

+-------+--------------------+
|movieId|     recommendations|
+-------+--------------------+
|    148|[[179335, 15.7472...|
|    463|[[181655, 20.0932...|
|    471|[[130169, 8.34912...|
|    496|[[114877, 30.6769...|
|    833|[[206080, 10.0441...|
|   1088|[[163855, 6.88318...|
|   1238|[[113105, 7.90653...|
|   1342|[[171721, 9.43546...|
|   1580|[[105788, 6.35815...|
|   1591|[[197150, 8.44356...|
|   1645|[[174864, 6.36123...|
|   1829|[[113105, 8.93149...|
|   1959|[[170369, 8.04614...|
|   2122|[[134307, 8.58994...|
|   2142|[[115510, 7.63863...|
|   2366|[[170369, 9.17588...|
|   2659|[[173669, 6.24899...|
|   2866|[[172551, 11.5307...|
|   3175|[[181823, 7.88067...|
|   3749|[[183396, 13.1389...|
+-------+--------------------+
only showing top 20 rows

