## Spark Initialization

In [1]:
# Bootstrap step: run findspark before the first `pyspark` import in this
# notebook. Presumably this locates the local Spark installation and makes
# `pyspark` importable -- confirm against the findspark docs for your setup.
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Obtain the SparkSession used by every later cell; getOrCreate() is called so
# an already-running session is reused instead of building a second one.
# NOTE(review): the app name mentions "Frequent Itemsets", but this notebook
# trains an ALS recommender -- consider renaming it.
spark = (
    SparkSession.builder
    .appName("Python Spark Frequent Itemsets Example")
    .getOrCreate()
)

print(spark)

<pyspark.sql.session.SparkSession object at 0x00000000062AA1D0>


In [3]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

## Load Dataset

In [4]:
# Read the ratings file as tab-separated text: the first line is treated as the
# header and column types are inferred from the data.
# NOTE(review): the path is absolute and machine-specific; anyone re-running
# this notebook elsewhere must point it at their own copy of the dataset.
ratings = (
    spark.read
    .options(header="true", delimiter="\t", inferSchema="true")
    .csv("D:/TC/6BigData/Dataset/hetrec2011-movielens-2k-v2/user_ratedmovies-timestamps.dat")
)
# Preview the first rows to sanity-check the parse.
ratings.show()

+------+-------+------+-------------+
|userID|movieID|rating|    timestamp|
+------+-------+------+-------------+
|    75|      3|   1.0|1162160236000|
|    75|     32|   4.5|1162160624000|
|    75|    110|   4.0|1162161008000|
|    75|    160|   2.0|1162160212000|
|    75|    163|   4.0|1162160970000|
|    75|    165|   4.5|1162160715000|
|    75|    173|   3.5|1162160257000|
|    75|    296|   5.0|1162160689000|
|    75|    353|   3.5|1162160220000|
|    75|    420|   2.0|1162160202000|
|    75|    589|   4.0|1162160901000|
|    75|    653|   3.0|1162160225000|
|    75|    832|   4.5|1162160269000|
|    75|    920|   0.5|1162160228000|
|    75|    996|   4.5|1162160777000|
|    75|   1036|   4.0|1162160685000|
|    75|   1127|   3.5|1162160932000|
|    75|   1215|   4.5|1162160936000|
|    75|   1233|   4.0|1162161005000|
|    75|   1304|   2.5|1162160216000|
+------+-------+------+-------------+
only showing top 20 rows



In [5]:
# Print the column names and the types that were inferred while reading the file.
ratings.printSchema()

root
 |-- userID: integer (nullable = true)
 |-- movieID: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: long (nullable = true)



In [6]:
# Number of rating rows in the loaded dataset.
ratings.count()

855598

## Create Model

In [7]:
# Randomly split the ratings into roughly 80% training and 20% test rows.
# A fixed seed is passed so the same partition (and therefore a comparable
# RMSE) is produced when the notebook is re-run on the same input.
(training, test) = ratings.randomSplit([0.8, 0.2], seed=42)

In [8]:
# Build the recommendation model using ALS on the training data.
# - coldStartStrategy="drop" removes predictions for users/movies that were not
#   seen during training, so the evaluation metric does not come out as NaN.
# - seed is fixed because ALS starts from randomly initialised factors; without
#   it, every fit yields a slightly different model.
als = ALS(userCol="userID", itemCol="movieID", ratingCol="rating",
          coldStartStrategy="drop", seed=42)
model = als.fit(training)

In [9]:
# Measure model quality as the root-mean-square error (RMSE) between the
# actual "rating" column and the model's "prediction" column on the test split.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction"
)

# Score the test rows with the fitted model, then reduce them to a single RMSE.
predictions = model.transform(test)
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 0.776208723821


In [10]:
# Generate the top 10 movie recommendations for each user.
# The result has one row per userID with a `recommendations` column; in the
# preview each entry appears as a [movieID, predicted score] pair.
userRecs = model.recommendForAllUsers(10)
# Preview only: show() prints the first 20 rows with long cells truncated.
userRecs.show()

+------+--------------------+
|userID|     recommendations|
+------+--------------------+
| 14570|[[61742, 4.413881...|
| 57380|[[61742, 5.337688...|
| 24171|[[2934, 5.08661],...|
| 47711|[[61742, 4.518031...|
| 64423|[[61742, 5.546025...|
| 16574|[[61742, 5.822988...|
| 32445|[[61742, 4.869812...|
| 53565|[[61742, 4.569808...|
| 59355|[[61742, 4.965732...|
| 12046|[[61742, 5.542124...|
| 15846|[[61742, 5.782717...|
| 41946|[[2934, 5.184189]...|
| 43527|[[61742, 5.523022...|
| 65867|[[61742, 4.458104...|
| 36538|[[61742, 5.519878...|
| 49308|[[61742, 5.361971...|
| 18979|[[130, 7.3697224]...|
| 62880|[[2934, 4.85392],...|
| 18161|[[61742, 5.44845]...|
| 69341|[[61742, 4.929091...|
+------+--------------------+
only showing top 20 rows



In [11]:
# Generate the top 10 user recommendations for each movie, i.e. the users
# with the highest predicted score for that movie.
# The result has one row per movieID with a `recommendations` column; in the
# preview each entry appears as a [userID, predicted score] pair.
movieRecs = model.recommendForAllItems(10)
# Preview only: show() prints the first 20 rows with long cells truncated.
movieRecs.show()

+-------+--------------------+
|movieID|     recommendations|
+-------+--------------------+
|   1580|[[55124, 4.817816...|
|   4900|[[37225, 4.47273]...|
|   5300|[[66190, 4.723829...|
|   6620|[[27297, 4.691498...|
|   7240|[[31156, 3.287021...|
|   7340|[[38854, 4.922385...|
|   7880|[[14614, 4.076333...|
|  30970|[[53192, 4.655610...|
|  32460|[[48038, 5.692684...|
|  54190|[[20832, 4.640445...|
|  57370|[[68943, 4.027431...|
|    471|[[48038, 4.736878...|
|   1591|[[26136, 3.737648...|
|   1342|[[27297, 4.295312...|
|   2122|[[48395, 3.623956...|
|   2142|[[27297, 4.495956...|
|   7982|[[48038, 5.155616...|
|   8592|[[11451, 3.675028...|
|  33722|[[20832, 4.766207...|
|  44022|[[20832, 4.544161...|
+-------+--------------------+
only showing top 20 rows

