In [3]:
from pyspark.sql import SparkSession

In [4]:
spark = SparkSession.builder.master("local[*]").appName("ML Job").getOrCreate()

## Movie Recommender

In [35]:
movies = spark.read.format("csv") \
.option('InferSchema','True') \
.option('Header','True') \
.load("movies.csv")

In [36]:
movies.show()

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
|      6|         Heat (1995)|Action|Crime|Thri...|
|      7|      Sabrina (1995)|      Comedy|Romance|
|      8| Tom and Huck (1995)|  Adventure|Children|
|      9| Sudden Death (1995)|              Action|
|     10|    GoldenEye (1995)|Action|Adventure|...|
|     11|American Presiden...|Comedy|Drama|Romance|
|     12|Dracula: Dead and...|       Comedy|Horror|
|     13|        Balto (1995)|Adventure|Animati...|
|     14|        Nixon (1995)|               Drama|
|     15|Cutthroat Island ...|Action|Adventure|...|
|     16|       Casino (1995)|         Crime|Drama|
|     17|Sen

In [37]:
ratings = spark.read.format("csv") \
.option('InferSchema','True') \
.option('Header','True') \
.load("ratings.csv")

In [10]:
ratings.show()

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
|     1|     47|   5.0|964983815|
|     1|     50|   5.0|964982931|
|     1|     70|   3.0|964982400|
|     1|    101|   5.0|964980868|
|     1|    110|   4.0|964982176|
|     1|    151|   5.0|964984041|
|     1|    157|   5.0|964984100|
|     1|    163|   5.0|964983650|
|     1|    216|   5.0|964981208|
|     1|    223|   3.0|964980985|
|     1|    231|   5.0|964981179|
|     1|    235|   4.0|964980908|
|     1|    260|   5.0|964981680|
|     1|    296|   3.0|964982967|
|     1|    316|   3.0|964982310|
|     1|    333|   5.0|964981179|
|     1|    349|   4.0|964982563|
+------+-------+------+---------+
only showing top 20 rows



In [38]:
from pyspark.sql.functions import *
user2 = movies.join(ratings, movies.movieId == ratings.movieId) \
.filter(ratings.userId == 2) \
.orderBy(desc("rating"))

user2.show(truncate=False)

print(user2.count())
print(movies.count())

+-------+----------------------------------------------------+-----------------------------------------------+------+-------+------+----------+
|movieId|title                                               |genres                                         |userId|movieId|rating|timestamp |
+-------+----------------------------------------------------+-----------------------------------------------+------+-------+------+----------+
|131724 |The Jinx: The Life and Deaths of Robert Durst (2015)|Documentary                                    |2     |131724 |5.0   |1445714851|
|60756  |Step Brothers (2008)                                |Comedy                                         |2     |60756  |5.0   |1445714980|
|89774  |Warrior (2011)                                      |Drama                                          |2     |89774  |5.0   |1445715189|
|106782 |Wolf of Wall Street, The (2013)                     |Comedy|Crime|Drama                             |2     |106782 |5.0   |1445

In [18]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import *
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [20]:
 (training, test) = ratings.randomSplit([0.8, 0.2])

In [40]:
als = ALS(maxIter=5, regParam=0.01, rank=5, userCol="userId", itemCol="movieId", ratingCol="rating",coldStartStrategy="drop")

In [39]:
dir(ALS)

['__class__',
 '__del__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__metaclass__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_call_java',
 '_copyValues',
 '_copy_params',
 '_create_from_java_class',
 '_create_model',
 '_create_params_from_java',
 '_dummy',
 '_empty_java_param_map',
 '_fit',
 '_fit_java',
 '_from_java',
 '_make_java_param_pair',
 '_new_java_array',
 '_new_java_obj',
 '_randomUID',
 '_resetUid',
 '_resolveParam',
 '_set',
 '_setDefault',
 '_shouldOwn',
 '_to_java',
 '_transfer_param_map_from_java',
 '_transfer_param_map_to_java',
 '_transfer_params_from_java',
 '_transfer_params_to_java',
 'alpha',
 'blockSize',
 'checkpointInterval',
 'clear',
 'coldStartStrategy',
 'copy',
 'explainParam',
 'expla

In [41]:
model = als.fit(training)

In [42]:
predictions = model.transform(test)

In [43]:
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")

In [44]:
rmse = evaluator.evaluate(predictions)

In [45]:
 print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 0.9931438185652565


In [46]:
dir(model)

['__class__',
 '__del__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__metaclass__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_call_java',
 '_copyValues',
 '_copy_params',
 '_create_from_java_class',
 '_create_params_from_java',
 '_defaultParamMap',
 '_dummy',
 '_empty_java_param_map',
 '_from_java',
 '_java_obj',
 '_make_java_param_pair',
 '_new_java_array',
 '_new_java_obj',
 '_paramMap',
 '_params',
 '_randomUID',
 '_resetUid',
 '_resolveParam',
 '_set',
 '_setDefault',
 '_shouldOwn',
 '_to_java',
 '_transfer_param_map_from_java',
 '_transfer_param_map_to_java',
 '_transfer_params_from_java',
 '_transfer_params_to_java',
 '_transform',
 'blockSize',
 'clear',
 'coldStartStrategy',
 'copy',
 'explainParam',
 'exp

In [47]:
userRecs = model.recommendForAllUsers(10)

In [48]:
userRecs.printSchema()

root
 |-- userId: integer (nullable = false)
 |-- recommendations: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- movieId: integer (nullable = true)
 |    |    |-- rating: float (nullable = true)



In [49]:
userRecs.show(truncate=False)

+------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|userId|recommendations                                                                                                                                                                                       |
+------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|471   |[[158872, 12.095217], [5485, 11.306278], [3786, 10.584253], [68073, 10.019034], [5034, 9.410655], [2851, 9.264359], [2771, 9.2458935], [3134, 8.865047], [104863, 8.688386], [1354, 8.615828]]        |
|463   |[[3134, 8.19841], [158872, 8.195681], [5485, 8.172865], [5650, 7.9868374], [2771, 7.5298457], [85774, 7.470315], [68073, 7.4603996], [6187, 7.252477], [8607, 7.

In [50]:
 userRec2 = userRecs.filter(userRecs.userId == 2) \
        .withColumn('ratings', explode(userRecs.recommendations)) \
        .select('userId', 'ratings.movieId', 'ratings.rating')

In [51]:
userRec2.join(movies, movies.movieId == userRec2.movieId) \
        .show(truncate=False)

+------+-------+---------+-------+--------------------------------------------------+--------------------------------+
|userId|movieId|rating   |movieId|title                                             |genres                          |
+------+-------+---------+-------+--------------------------------------------------+--------------------------------+
|2     |7982   |8.559359 |7982   |Tale of Two Sisters, A (Janghwa, Hongryeon) (2003)|Drama|Horror|Mystery|Thriller   |
|2     |70946  |8.168563 |70946  |Troll 2 (1990)                                    |Fantasy|Horror                  |
|2     |48322  |7.472438 |48322  |Jackass Number Two (2006)                         |Comedy|Documentary              |
|2     |5909   |7.3413954|5909   |Visitor Q (Bizita Q) (2001)                       |Comedy|Drama|Horror             |
|2     |5048   |7.3184037|5048   |Snow Dogs (2002)                                  |Adventure|Children|Comedy       |
|2     |6975   |7.270154 |6975   |Funny Games (1