# Import libraries

In [None]:
!pip install pyspark

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/45/b0/9d6860891ab14a39d4bddf80ba26ce51c2f9dc4805e5c6978ac0472c120a/pyspark-3.1.1.tar.gz (212.3MB)
[K     |████████████████████████████████| 212.3MB 69kB/s 
[?25hCollecting py4j==0.10.9
[?25l  Downloading https://files.pythonhosted.org/packages/9e/b6/6a4fb90cd235dc8e265a6a2067f2a2c99f0d91787f06aca4bcf7c23f3f80/py4j-0.10.9-py2.py3-none-any.whl (198kB)
[K     |████████████████████████████████| 204kB 18.2MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.1.1-py2.py3-none-any.whl size=212767604 sha256=4fbf7839f5cb4f1a8106bfa424ebea63677f954bdbed732cab9acc996bf349ec
  Stored in directory: /root/.cache/pip/wheels/0b/90/c0/01de724414ef122bd05f056541fb6a0ecf47c7ca655f8b3c0f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.1.1


In [None]:
from pyspark.sql import SparkSession

# Create (or reuse) the notebook's Spark session, with
# spark.sql.crossJoin.enabled switched on.
spark = (
    SparkSession.builder
    .appName("Recommender System")
    .config("spark.sql.crossJoin.enabled", "true")
    .getOrCreate()
)

In [None]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import functions as f

# Load and verify data

In [None]:
from pyspark.sql.types import StringType, DoubleType, IntegerType, StructType, StructField
# Explicit schema for the ratings file: one nullable field per column.
schema = StructType([
    StructField(name='userId', dataType=IntegerType(), nullable=True),
    StructField(name='movieId', dataType=IntegerType(), nullable=True),
    StructField(name='rating', dataType=IntegerType(), nullable=True),
    StructField(name='timestamp', dataType=DoubleType(), nullable=True),
])


In [None]:
# Read the '::'-delimited ratings file (no header row) using the explicit schema.
data = spark.read.csv(
    '/content/ratings.dat',
    sep='::',
    header=False,
    schema=schema,
)

In [None]:
# Confirm the column names and types that were applied on load.
data.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: integer (nullable = true)
 |-- timestamp: double (nullable = true)



In [None]:
# Peek at the first three rows.
data.head(3)

[Row(userId=1, movieId=1193, rating=5, timestamp=978300760.0),
 Row(userId=1, movieId=661, rating=3, timestamp=978302109.0),
 Row(userId=1, movieId=914, rating=3, timestamp=978301968.0)]

In [None]:
# Print each field of the first row on its own line.
for item in data.head(1)[0]:
    print(item)

1
1193
5
978300760.0


In [None]:
# List the column names.
data.columns

['userId', 'movieId', 'rating', 'timestamp']

In [None]:
# Summary statistics (count, mean, stddev, min, max) for every column.
data.describe().show()

+-------+------------------+------------------+------------------+--------------------+
|summary|            userId|           movieId|            rating|           timestamp|
+-------+------------------+------------------+------------------+--------------------+
|  count|           1000209|           1000209|           1000209|             1000209|
|   mean| 3024.512347919285|1865.5398981612843| 3.581564453029317| 9.722436954046655E8|
| stddev|1728.4126948999626|1096.0406894572563|1.1171018453732653|1.2152558939921828E7|
|    min|                 1|                 1|                 1|        9.56703932E8|
|    max|              6040|              3952|                 5|        1.04645459E9|
+-------+------------------+------------------+------------------+--------------------+



# Train/test split

In [None]:
# Split roughly 70/30 into training and test sets. The fixed seed keeps the
# split the same from run to run, so the metrics computed later are comparable.
train_data,test_data = data.randomSplit([0.7,0.3], seed=42)

In [None]:
# Summary statistics of the training split, to compare against the full dataset.
train_data.describe().show()

+-------+------------------+------------------+------------------+--------------------+
|summary|            userId|           movieId|            rating|           timestamp|
+-------+------------------+------------------+------------------+--------------------+
|  count|            700645|            700645|            700645|              700645|
|   mean|3025.8179134939946|1864.4839369438162|3.5806563951787282| 9.722400821280506E8|
| stddev| 1727.604350903678| 1096.210770969995| 1.117081405963871|1.2154209165474277E7|
|    min|                 1|                 1|                 1|        9.56703932E8|
|    max|              6040|              3952|                 5|        1.04645459E9|
+-------+------------------+------------------+------------------+--------------------+



In [None]:
# Summary statistics of the test split, to compare against the full dataset.
test_data.describe().show()

+-------+-----------------+------------------+------------------+--------------------+
|summary|           userId|           movieId|            rating|           timestamp|
+-------+-----------------+------------------+------------------+--------------------+
|  count|           299564|            299564|            299564|              299564|
|   mean|3021.458783431921|1868.0096673832636| 3.583688293653443| 9.722521464341409E8|
| stddev|1730.300880946896|1095.6406411732778|1.1171486318358028|1.2148714467659233E7|
|    min|                1|                 1|                 1|        9.56703954E8|
|    max|             6040|              3952|                 5|        1.04645426E9|
+-------+-----------------+------------------+------------------+--------------------+



# Build Model

In [None]:
# ALS collaborative-filtering estimator over (userId, movieId, rating).
# - seed: fixes the random initialisation so repeated runs give the same model.
# - coldStartStrategy is left at its default, which yields NaN predictions for
#   users/movies unseen during training; the evaluation cells handle those
#   explicitly with na.fill / na.drop.
recommender = ALS(maxIter = 5, regParam = 0.01, userCol='userId', itemCol='movieId', ratingCol='rating', seed=42)

In [None]:
# Fit the ALS model on the training split.
model = recommender.fit(train_data)

# Evaluate Model

In [None]:
# Score the held-out ratings; the result carries a 'prediction' column.
pred_data = model.transform(test_data)

In [None]:
# RMSE of predicted vs. actual ratings on the raw predictions.
evaluator = RegressionEvaluator(
    metricName='rmse',
    labelCol='rating',
    predictionCol='prediction',
)
print(
    "Root-mean-square error = "
    + str(evaluator.evaluate(pred_data))
)

Root-mean-square error = nan


In [None]:
# Mean rating over the whole dataset (used later as a fill value).
avgRatings = (
    data
    .select('rating')
    .groupBy()
    .avg()
    .first()[0]
)
print ('The average rating in the dataset is: {}'.format(avgRatings))

The average rating in the dataset is: 3.581564453029317


In [None]:
# RMSE again, this time with missing values filled with the dataset's mean rating.
evaluator = RegressionEvaluator(
    metricName='rmse',
    labelCol='rating',
    predictionCol='prediction',
)
print ('The root mean squared error for our model is: {}'.format(
    evaluator.evaluate(pred_data.na.fill(avgRatings))
))

The root mean squared error for our model is: 0.9094308038080811


In [None]:
# RMSE again, this time discarding rows that contain missing values.
evaluator = RegressionEvaluator(
    metricName='rmse',
    labelCol='rating',
    predictionCol='prediction',
)
print ('The root mean squared error for our model is: {}'.format(
    evaluator.evaluate(pred_data.na.drop())
))

The root mean squared error for our model is: 0.9092248779785612


# Load movie and user details

In [None]:
# Explicit schema for the movies file: id, title and a '|'-separated genre string.
schema = StructType([
    StructField(name='movieId', dataType=IntegerType(), nullable=True),
    StructField(name='title', dataType=StringType(), nullable=True),
    StructField(name='genres', dataType=StringType(), nullable=True),
])

# Read the '::'-delimited movies file (no header row) and verify the schema.
movieDetails = spark.read.csv(
    'movies.dat',
    sep='::',
    header=False,
    schema=schema,
)
movieDetails.printSchema()

root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)



In [None]:
# Peek at the first three movies.
movieDetails.head(3)

[Row(movieId=1, title='Toy Story (1995)', genres="Animation|Children's|Comedy"),
 Row(movieId=2, title='Jumanji (1995)', genres="Adventure|Children's|Fantasy"),
 Row(movieId=3, title='Grumpier Old Men (1995)', genres='Comedy|Romance')]

In [None]:
# Explicit schema for the users file.
schema = StructType([StructField('UserID', IntegerType(), True),
                     StructField('Gender', StringType(), True),
                     StructField('Age', IntegerType(), True),
                     StructField('Occupation', IntegerType(), True),
                     # Zip codes are identifiers, not numbers: an integer type would
                     # strip leading zeros and cannot hold hyphenated ZIP+4 values.
                     StructField('Zipcode', StringType(), True)])

# Lookup from the numeric Occupation code to its human-readable label.
occupation ={
  0:  "other", 
  1:  "academic/educator",
  2:  "artist",
  3:  "clerical/admin",
  4:  "college/grad student",
  5:  "customer service",
  6:  "doctor/health care",
  7:  "executive/managerial",
  8:  "farmer",
  9:  "homemaker",
 10:  "K-12 student",
 11:  "lawyer",
 12:  "programmer",
 13:  "retired",
 14:  "sales/marketing",
 15:  "scientist",
 16:  "self-employed",
 17:  "technician/engineer",
 18:  "tradesman/craftsman",
 19:  "unemployed",
 20:  "writer"  
    
}

# Read the '::'-delimited users file (no header row) and verify the schema.
userDetails = spark.read.csv('users.dat',sep = '::', header = False, schema = schema)
userDetails.printSchema()

root
 |-- UserID: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Occupation: integer (nullable = true)
 |-- Zipcode: integer (nullable = true)



In [None]:
# Peek at the first three users.
userDetails.head(3)

[Row(UserID=1, Gender='F', Age=1, Occupation=10, Zipcode=48067),
 Row(UserID=2, Gender='M', Age=56, Occupation=16, Zipcode=70072),
 Row(UserID=3, Gender='M', Age=25, Occupation=15, Zipcode=55117)]

# Movie recommendations 

In [None]:

# Top 13 recommendations for every user; each row holds a userId and a list of
# recommendation entries. Show 20 rows without truncating the columns.
userRecs = model.recommendForAllUsers(13)
userRecs.show(20, False)

+------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|userId|recommendations                                                                                                                                                                                                                                   |
+------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|1580  |[{1574, 8.63148}, {1846, 8.37485}, {2482, 8.179921}, {2129, 7.9769564}, {283, 7.6768317}, {681, 7.6043105}, {1749, 7.541596}, {1310, 7.5154037}, {1904, 7.464633}, {152, 7.4041233}, {2063, 7.316945}, {2156, 7.267761}, {3344, 7.2536316}] 

In [None]:
# Flatten the per-user recommendation list into one row per recommendation,
# ordered by user.
userRecsExplode = (
    userRecs
    .select('userId', f.explode('recommendations'))
    .orderBy('userId')
)
userRecsExplode.show(20, False)

+------+-----------------+
|userId|col              |
+------+-----------------+
|1     |{831, 7.3106966} |
|1     |{1666, 7.8047237}|
|1     |{632, 8.264851}  |
|1     |{2305, 7.3872104}|
|1     |{2192, 8.426063} |
|1     |{96, 7.083954}   |
|1     |{2219, 6.9925656}|
|1     |{1930, 8.415944} |
|1     |{3490, 7.174725} |
|1     |{2426, 7.6490207}|
|1     |{2831, 7.599241} |
|1     |{561, 7.8688364} |
|1     |{2933, 7.769332} |
|2     |{3532, 5.899361} |
|2     |{561, 5.636262}  |
|2     |{1695, 5.7109065}|
|2     |{864, 6.7353077} |
|2     |{1780, 5.7233067}|
|2     |{3092, 6.0218434}|
|2     |{3492, 5.783537} |
+------+-----------------+
only showing top 20 rows



In [None]:

# Top 10 recommended users for every movie.
movieRecs = model.recommendForAllItems(10)

# Attach title and genres to each movie's recommendation list for display.
(
    movieRecs
    .join(movieDetails, movieRecs.movieId == movieDetails.movieId, "left")
    .select(
        movieRecs.movieId,
        movieDetails.title,
        movieDetails.genres,
        movieRecs.recommendations,
    )
    .show(20, False)
)

+-------+------------------------------------------+--------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|movieId|title                                     |genres                          |recommendations                                                                                                                                                                             |
+-------+------------------------------------------+--------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|1580   |Men in Black (1997)                       |Action|Adventure|Comedy|Sci-Fi  |[{283, 5.2937593}, {3897, 5.2598047}, {5072, 5.1444917}, {2339, 5.1137643}, {3902, 5.10570

In [None]:
# Flatten the per-movie recommendation list into one row per recommendation.
movieRecsExplode = movieRecs.select(movieRecs.movieId,f.explode(movieRecs.recommendations)).orderBy(movieRecs.movieId)

# Attach title and genres. The join does not keep the ordering of its inputs,
# so the final result is sorted explicitly: by movie, then by predicted rating
# (highest first) within each movie.
(
    movieRecsExplode
    .join(movieDetails, movieRecsExplode.movieId == movieDetails.movieId, "left")
    .select([
        movieRecsExplode.movieId,
        movieDetails.title,
        movieDetails.genres,
        movieRecsExplode.col.alias('recommendation'),
    ])
    .orderBy('movieId', f.col('recommendation.rating').desc())
    .show()
)

+-------+--------------------+--------------------+-----------------+
|movieId|               title|              genres|   recommendation|
+-------+--------------------+--------------------+-----------------+
|   1580| Men in Black (1997)|Action|Adventure|...| {283, 5.2937593}|
|   1580| Men in Black (1997)|Action|Adventure|...|{3897, 5.2598047}|
|   1580| Men in Black (1997)|Action|Adventure|...|{5072, 5.1444917}|
|   1580| Men in Black (1997)|Action|Adventure|...|{2339, 5.1137643}|
|   1580| Men in Black (1997)|Action|Adventure|...|{3902, 5.1057076}|
|   1580| Men in Black (1997)|Action|Adventure|...|{1341, 5.0690293}|
|   1580| Men in Black (1997)|Action|Adventure|...|{2363, 5.0325427}|
|   1580| Men in Black (1997)|Action|Adventure|...| {4068, 5.031277}|
|   1580| Men in Black (1997)|Action|Adventure|...|   {41, 5.015733}|
|   1580| Men in Black (1997)|Action|Adventure|...|{5218, 4.9958496}|
|    471|Hudsucker Proxy, ...|      Comedy|Romance|  {269, 7.903981}|
|    471|Hudsucker P

In [None]:

# Held-out ratings of a single user (id 5300).
singleUser = (
    test_data
    .filter(test_data.userId == 5300)
    .select('movieId', 'userId', 'rating')
)

# Show those ratings together with the movie title and genres.
(
    singleUser
    .join(movieDetails, singleUser.movieId == movieDetails.movieId, "left")
    .select(
        singleUser.userId,
        singleUser.movieId,
        singleUser.rating,
        movieDetails.title,
        movieDetails.genres,
    )
    .show(20, False)
)

+------+-------+------+-------------------------------------------------------------------------------+-----------------------------+
|userId|movieId|rating|title                                                                          |genres                       |
+------+-------+------+-------------------------------------------------------------------------------+-----------------------------+
|5300  |34     |4     |Babe (1995)                                                                    |Children's|Comedy|Drama      |
|5300  |125    |5     |Flirting With Disaster (1996)                                                  |Comedy                       |
|5300  |224    |4     |Don Juan DeMarco (1995)                                                        |Comedy|Drama|Romance         |
|5300  |235    |4     |Ed Wood (1994)                                                                 |Comedy|Drama                 |
|5300  |363    |5     |Wonderful, Horrible Life of Leni Riefen

In [None]:
# Top 10 recommendations for the user(s) present in singleUser,
# flattened to one row per recommendation.
userSubsetRecs = model.recommendForUserSubset(singleUser, 10)
userSubsetRecsExplode = userSubsetRecs.select(
    'userId',
    f.explode('recommendations'),
)
userSubsetRecsExplode.show()

+------+-----------------+
|userId|              col|
+------+-----------------+
|  5300|{2933, 6.2454147}|
|  5300|{3862, 6.0543795}|
|  5300| {557, 5.7669897}|
|  5300|{2129, 5.7041345}|
|  5300|{3416, 5.6975193}|
|  5300|  {1232, 5.48309}|
|  5300|{2963, 5.4634175}|
|  5300|{1534, 5.4455986}|
|  5300|{1076, 5.3692207}|
|  5300|{1138, 5.3632603}|
+------+-----------------+



In [None]:
# Predict ratings for the movies this user rated in the test split,
# highest prediction first.
recommendations = model.transform(singleUser)
userRecommendations = recommendations.orderBy(f.desc('prediction'))
userRecommendations.show(20, False)

+-------+------+------+----------+
|movieId|userId|rating|prediction|
+-------+------+------+----------+
|608    |5300  |5     |5.0221515 |
|1178   |5300  |5     |4.9657216 |
|2019   |5300  |5     |4.962606  |
|2997   |5300  |4     |4.9571986 |
|3077   |5300  |5     |4.950061  |
|1193   |5300  |5     |4.896202  |
|924    |5300  |5     |4.891986  |
|1147   |5300  |5     |4.868163  |
|1228   |5300  |5     |4.8566456 |
|3182   |5300  |5     |4.826264  |
|363    |5300  |5     |4.789949  |
|1136   |5300  |5     |4.7842155 |
|2395   |5300  |5     |4.738474  |
|1288   |5300  |5     |4.71871   |
|1080   |5300  |5     |4.691609  |
|2859   |5300  |4     |4.6670184 |
|3091   |5300  |4     |4.634818  |
|1214   |5300  |5     |4.6145015 |
|1617   |5300  |4     |4.601555  |
|2959   |5300  |4     |4.5337505 |
+-------+------+------+----------+
only showing top 20 rows



In [None]:
# Attach title and genres to the user's predicted ratings. The join does not
# keep the descending-prediction order of userRecommendations, so the result is
# re-sorted before display to list the highest predictions first.
(
    userRecommendations
    .join(movieDetails, userRecommendations.movieId == movieDetails.movieId, "left")
    .select([
        userRecommendations.userId,
        movieDetails.title,
        movieDetails.genres,
        userRecommendations.prediction,
    ])
    .orderBy('prediction', ascending=False)
    .show(20, False)
)

+------+-------------------------------------------------------------------------------+-----------------------------------+----------+
|userId|title                                                                          |genres                             |prediction|
+------+-------------------------------------------------------------------------------+-----------------------------------+----------+
|5300  |Hudsucker Proxy, The (1994)                                                    |Comedy|Romance                     |4.4039907 |
|5300  |Men in Black (1997)                                                            |Action|Adventure|Comedy|Sci-Fi     |3.615427  |
|5300  |Fear and Loathing in Las Vegas (1998)                                          |Comedy|Drama                       |4.2510557 |
|5300  |Groove (2000)                                                                  |Drama                              |2.8516483 |
|5300  |Babe (1995)                             

In [None]:

# Users who rated a single movie (id 1) in the test split.
singleMovie = (
    test_data
    .filter(test_data.movieId == 1)
    .select('movieId', 'userId')
)

# Show those users alongside the movie's title and genres.
(
    singleMovie
    .join(movieDetails, singleMovie.movieId == movieDetails.movieId, "left")
    .select(
        singleMovie.movieId,
        movieDetails.title,
        movieDetails.genres,
        singleMovie.userId,
    )
    .show(20, False)
)


+-------+----------------+---------------------------+------+
|movieId|title           |genres                     |userId|
+-------+----------------+---------------------------+------+
|1      |Toy Story (1995)|Animation|Children's|Comedy|1     |
|1      |Toy Story (1995)|Animation|Children's|Comedy|8     |
|1      |Toy Story (1995)|Animation|Children's|Comedy|18    |
|1      |Toy Story (1995)|Animation|Children's|Comedy|38    |
|1      |Toy Story (1995)|Animation|Children's|Comedy|49    |
|1      |Toy Story (1995)|Animation|Children's|Comedy|65    |
|1      |Toy Story (1995)|Animation|Children's|Comedy|90    |
|1      |Toy Story (1995)|Animation|Children's|Comedy|112   |
|1      |Toy Story (1995)|Animation|Children's|Comedy|119   |
|1      |Toy Story (1995)|Animation|Children's|Comedy|121   |
|1      |Toy Story (1995)|Animation|Children's|Comedy|134   |
|1      |Toy Story (1995)|Animation|Children's|Comedy|149   |
|1      |Toy Story (1995)|Animation|Children's|Comedy|151   |
|1      