In [6]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f

In [7]:
spark = SparkSession.builder.appName("Recommender_system").getOrCreate()



# use ratings dataset to create a Recommeder system



download_path = http://files.grouplens.org/datasets/movielens/ml-latest-small.zip

filename = ml-latest-small.zip
This dataset (ml-latest-small) describes 5-star rating and free-text tagging activity from [MovieLens](http://movielens.org), a movie recommendation service. It contains 100836 ratings and 3683 tag applications across 9742 movies. These data were created by 610 users between March 29, 1996 and September 24, 2018. This dataset was generated on September 26, 2018.


In [54]:
#download data and save it to file in folder

In [45]:
import requests

In [47]:
response = requests.get('http://files.grouplens.org/datasets/movielens/ml-latest-small.zip')

In [49]:
with open('data/ml-latest-small.zip', 'wb') as handle:
    for block in response.iter_content(1024):
        handle.write(block)

In [50]:
import zipfile
with zipfile.ZipFile('data/ml-latest-small.zip', 'r') as zip_ref:
    zip_ref.extractall("data/")

In [51]:
ratings = spark.read.csv(path= "data/ml-latest-small/ratings.csv", sep=",", header=True, quote ='"', schema ="userId INT, movieId INT, rating DOUBLE, timestamp INT").cache()

In [52]:
ratings.show(2)

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
+------+-------+------+---------+
only showing top 2 rows



In [10]:
ratings = ratings.withColumn("timestamp", f.to_timestamp(f.from_unixtime("timestamp"))).cache()

In [11]:
ratings.show(10)

+------+-------+------+-------------------+
|userId|movieId|rating|          timestamp|
+------+-------+------+-------------------+
|     1|      1|   4.0|2000-07-30 13:45:03|
|     1|      3|   4.0|2000-07-30 13:20:47|
|     1|      6|   4.0|2000-07-30 13:37:04|
|     1|     47|   5.0|2000-07-30 14:03:35|
|     1|     50|   5.0|2000-07-30 13:48:51|
|     1|     70|   3.0|2000-07-30 13:40:00|
|     1|    101|   5.0|2000-07-30 13:14:28|
|     1|    110|   4.0|2000-07-30 13:36:16|
|     1|    151|   5.0|2000-07-30 14:07:21|
|     1|    157|   5.0|2000-07-30 14:08:20|
+------+-------+------+-------------------+
only showing top 10 rows



In [12]:
ratings.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: timestamp (nullable = true)



In [13]:
ratings.summary().show()

+-------+------------------+----------------+------------------+
|summary|            userId|         movieId|            rating|
+-------+------------------+----------------+------------------+
|  count|            100836|          100836|            100836|
|   mean|326.12756356856676|19435.2957177992| 3.501556983616962|
| stddev| 182.6184914635004|35530.9871987003|1.0425292390606342|
|    min|                 1|               1|               0.5|
|    25%|               177|            1199|               3.0|
|    50%|               325|            2991|               3.5|
|    75%|               477|            8092|               4.0|
|    max|               610|          193609|               5.0|
+-------+------------------+----------------+------------------+



In [14]:
ratings = ratings.drop("timestamp")

# pyspark  matrix factorisation technique — Alternating Least Squares (ALS) to create a reccomender system

In [15]:
from pyspark.ml.recommendation import ALS

In [16]:
from pyspark.ml.evaluation import RegressionEvaluator

https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.ml.recommendation.ALS.html#pyspark.ml.recommendation.ALS

class pyspark.ml.recommendation.ALS(*, rank=10, maxIter=10, regParam=0.1, numUserBlocks=10, numItemBlocks=10, implicitPrefs=False, alpha=1.0, userCol='user', itemCol='item', seed=None, ratingCol='rating', nonnegative=False, checkpointInterval=10, intermediateStorageLevel='MEMORY_AND_DISK', finalStorageLevel='MEMORY_AND_DISK', coldStartStrategy='nan', blockSize=4096)



In [17]:
als =  ALS(
     userCol='userId', 
        itemCol='movieId', 
        
        ratingCol='rating'
        )



(training_data, validation_data) = ratings.randomSplit([8.0, 2.0])

In [18]:
evaluator = RegressionEvaluator(
                                metricName ='rmse', labelCol ="rating", predictionCol='prediction'
                                )

In [19]:
model = als.fit(training_data)

In [20]:
type(model)

pyspark.ml.recommendation.ALSModel

In [21]:
type(evaluator)

pyspark.ml.evaluation.RegressionEvaluator

In [22]:
predictons = model.transform(validation_data)

In [23]:
predictons.show(10)

+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|     1|    673|   3.0|  2.608757|
|     1|    333|   5.0| 4.0548954|
|     1|    157|   5.0| 3.0268917|
|     1|    367|   4.0| 3.8231585|
|     1|      1|   4.0|  4.594063|
|     1|    919|   5.0| 4.0097704|
|     1|    553|   5.0|  4.488731|
|     1|    780|   3.0|  3.885131|
|     1|    356|   4.0| 4.6681347|
|     1|     70|   3.0| 3.9938636|
+------+-------+------+----------+
only showing top 10 rows



In [24]:
rmse = evaluator.evaluate((predictons).dropna())

In [25]:
# RMSE is high but we can fine tune the model parameters

In [26]:
print(rmse)

0.8789006339623343


# user CrossValidator, ParamGridBuilder to fine tune hyper parameters of the model

In [27]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [28]:
parameter_grid = (
            ParamGridBuilder()
    .addGrid(als.rank, [1, 5, 10])
    .addGrid(als.maxIter, [20, 10])
    .addGrid(als.regParam,[0.5,0.1])
    .build()
)

In [29]:
type(parameter_grid)

list

In [30]:
from pprint import pprint

pprint(parameter_grid)

[{Param(parent='ALS_84b5c621ac3e', name='maxIter', doc='max number of iterations (>= 0).'): 20,
  Param(parent='ALS_84b5c621ac3e', name='rank', doc='rank of the factorization'): 1,
  Param(parent='ALS_84b5c621ac3e', name='regParam', doc='regularization parameter (>= 0).'): 0.5},
 {Param(parent='ALS_84b5c621ac3e', name='maxIter', doc='max number of iterations (>= 0).'): 20,
  Param(parent='ALS_84b5c621ac3e', name='rank', doc='rank of the factorization'): 1,
  Param(parent='ALS_84b5c621ac3e', name='regParam', doc='regularization parameter (>= 0).'): 0.1},
 {Param(parent='ALS_84b5c621ac3e', name='maxIter', doc='max number of iterations (>= 0).'): 10,
  Param(parent='ALS_84b5c621ac3e', name='rank', doc='rank of the factorization'): 1,
  Param(parent='ALS_84b5c621ac3e', name='regParam', doc='regularization parameter (>= 0).'): 0.5},
 {Param(parent='ALS_84b5c621ac3e', name='maxIter', doc='max number of iterations (>= 0).'): 10,
  Param(parent='ALS_84b5c621ac3e', name='rank', doc='rank of the

In [31]:
crossvalidator = CrossValidator(
    estimator = als
    ,estimatorParamMaps= parameter_grid
    ,evaluator = evaluator 
    ,numFolds = 3
)

In [32]:
crossval_model= crossvalidator.fit(training_data)

In [33]:
predictions = crossval_model.transform(validation_data)

In [34]:
rmse = evaluator.evaluate((predictons).dropna())
print(rmse)

0.8789006339623343


In [35]:
model = crossval_model.bestModel

# load movies dataset to get predictions for user for unseen movie


In [53]:
movies = spark.read.csv(path = 'data/ml-latest-small/movies.csv', sep = ',', header = True, quote='"', schema= "movieId INT, tittle STRING, genres STRING")

In [37]:
movies.show(4)

+-------+--------------------+--------------------+
|movieId|              tittle|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
+-------+--------------------+--------------------+
only showing top 4 rows



# predict top 5 movie recomendation for user with User_id 5

In [38]:
# find the movies to be rated
USER_ID = 150

movies_to_be_rated = (
                       ratings
                    #filter movies not rated by user
                .filter(f'userId != {USER_ID}')
                .select('movieId').distinct()
                #add the userid to the data
                .withColumn("userId", f.lit(USER_ID))

                )

In [39]:
movies_to_be_rated.show()

+-------+------+
|movieId|userId|
+-------+------+
|   1580|   150|
|   2366|   150|
|   3175|   150|
|   1088|   150|
|  32460|   150|
|  44022|   150|
|  96488|   150|
|   1238|   150|
|   1342|   150|
|   1591|   150|
|   1645|   150|
|   4519|   150|
|   2142|   150|
|    471|   150|
|   3997|   150|
|    833|   150|
|   3918|   150|
|   7982|   150|
|   1959|   150|
|  68135|   150|
+-------+------+
only showing top 20 rows



In [40]:
# predict the movies

In [41]:
user_movies_predicitons = model.transform(movies_to_be_rated)

In [42]:
user_movies_predicitons.show()

+-------+------+----------+
|movieId|userId|prediction|
+-------+------+----------+
|   1580|   150| 3.0021908|
|   2366|   150| 3.1366751|
|   3175|   150| 3.1219451|
|   1088|   150| 3.0233843|
|  32460|   150| 3.5273237|
|  44022|   150| 2.7526877|
|  96488|   150| 3.6443875|
|   1238|   150| 3.4903786|
|   1342|   150| 2.3784082|
|   1591|   150|  2.274304|
|   1645|   150| 2.8916616|
|   4519|   150| 2.8688335|
|   2142|   150| 2.5902288|
|    471|   150| 3.1445718|
|   3997|   150| 1.6652061|
|    833|   150| 1.5043699|
|   3918|   150| 2.6934934|
|   7982|   150| 3.6648328|
|   1959|   150|    3.0877|
|  68135|   150| 3.1314673|
+-------+------+----------+
only showing top 20 rows



# top 5 predcition

In [43]:
recomendation_for_users = (
      user_movies_predicitons
        .dropna()
        .orderBy('prediction', ascending = False)
        .limit(5)
        .join(movies, "movieId", how = 'inner')
        .select("userId", "movieId", "tittle",  f.col("prediction").alias("rec_ratings"))
)

In [44]:
recomendation_for_users.show(5, truncate=False)

+------+-------+-----------------------------------------+-----------+
|userId|movieId|tittle                                   |rec_ratings|
+------+-------+-----------------------------------------+-----------+
|150   |5746   |Galaxy of Terror (Quest) (1981)          |5.4795556  |
|150   |6835   |Alien Contamination (1980)               |5.4795556  |
|150   |99764  |It's Such a Beautiful Day (2012)         |5.442037   |
|150   |117192 |Doctor Who: The Time of the Doctor (2013)|5.558827   |
|150   |136850 |Villain (1971)                           |5.433422   |
+------+-------+-----------------------------------------+-----------+

