# Initializing Spark

In [5]:
# Import findspark to read SPARK_HOME and HADOOP_HOME
import findspark
findspark.init()
# Import required library
from pyspark.sql import SparkSession

# Create Spark Session
spark = SparkSession.builder.appName("Simple data mining with Synthetic Financial Dataset").getOrCreate()
    

In [6]:
# Print Spark object ID
print(spark)

<pyspark.sql.session.SparkSession object at 0x000001A678767630>


# Loading Dataset 

The given dataset has a format of .dat and no headers. So before i load the dataset, first i convert it to a csv format and gave headers based on the dataset description

In [7]:
df = spark.read.csv("D:\Kuliah\Smt6\Big Data\Tugas\Tugas 4\jester_ratings.csv", header=True, inferSchema=True)

In [8]:
#Show Dataset

df.show()

+------+------+------+
|UserID|ItemID|Rating|
+------+------+------+
|     1|     5| 0.219|
|     1|     7|-9.281|
|     1|     8|-9.281|
|     1|    13|-6.781|
|     1|    15| 0.875|
|     1|    16|-9.656|
|     1|    17|-9.031|
|     1|    18|-7.469|
|     1|    19|-8.719|
|     1|    20|-9.156|
|     1|    21|-7.188|
|     1|    22|-8.781|
|     1|    23|-8.531|
|     1|    24|-7.906|
|     1|    25|-7.469|
|     1|    89| 9.812|
|     1|    50| 9.906|
|     1|   102|  0.75|
|     1|   103|  -5.0|
|     1|   104| 2.938|
+------+------+------+
only showing top 20 rows



In [9]:
#Count how many rows in dataset

df.count()

1761439

In [10]:
#Show Dataset Schema

df.schema

StructType(List(StructField(UserID,IntegerType,true),StructField(ItemID,IntegerType,true),StructField(Rating,DoubleType,true)))

# Recomender Model

In [11]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

In [14]:
(training, test) = df.randomSplit([0.8, 0.2])

# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
als = ALS(maxIter=5, regParam=0.01, userCol="UserID", itemCol="ItemID", ratingCol="Rating",
          coldStartStrategy="drop")
model = als.fit(training)

In [17]:
# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="Rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 4.532665840698821


In [18]:
# Generate top 10 movie recommendations for each user
userRecs = model.recommendForAllUsers(10)
userRecs.show()

+------+--------------------+
|UserID|     recommendations|
+------+--------------------+
|   148|[[116, 42.288284]...|
|   463|[[51, 37.017757],...|
|   471|[[31, 15.354328],...|
|   496|[[124, 16.072704]...|
|   833|[[52, 13.816869],...|
|  1088|[[52, 8.294235], ...|
|  1238|[[27, 7.6120276],...|
|  1342|[[120, 13.683072]...|
|  1580|[[20, 3.8503761],...|
|  1591|[[115, 12.142138]...|
|  1645|[[27, 4.1528196],...|
|  1829|[[5, 15.542175], ...|
|  1959|[[71, 28.089771],...|
|  2142|[[140, 13.44206],...|
|  2366|[[60, 4.511176], ...|
|  2659|[[121, 7.565319],...|
|  2866|[[25, 10.344862],...|
|  3175|[[89, 5.9849987],...|
|  3749|[[43, 14.191393],...|
|  3794|[[116, 12.546617]...|
+------+--------------------+
only showing top 20 rows



In [20]:
# Generate top 10 user recommendations for each movie
jokeRecs = model.recommendForAllItems(10)
jokeRecs.show()

+------+--------------------+
|ItemID|     recommendations|
+------+--------------------+
|   148|[[47876, 18.26961...|
|    31|[[58308, 37.98753...|
|    85|[[5662, 28.845434...|
|   137|[[28670, 16.21480...|
|    65|[[32602, 27.56709...|
|    53|[[6709, 19.833162...|
|   133|[[6669, 21.238832...|
|    78|[[7044, 24.031685...|
|   108|[[58141, 14.64467...|
|    34|[[1380, 25.779966...|
|   101|[[4809, 33.645332...|
|   115|[[20817, 34.09015...|
|   126|[[58141, 19.95779...|
|    81|[[25651, 27.24469...|
|    28|[[20607, 24.56235...|
|    76|[[47876, 14.89076...|
|    26|[[32602, 21.05184...|
|    27|[[48175, 52.7454]...|
|    44|[[4809, 37.000523...|
|   103|[[4809, 30.195704...|
+------+--------------------+
only showing top 20 rows



In [24]:
# Generate top 10 movie recommendations for a specified set of users
users = df.select(als.getUserCol()).distinct().limit(3)
userSubsetRecs = model.recommendForUserSubset(users, 10)
userSubsetRecs.show()

+------+--------------------+
|UserID|     recommendations|
+------+--------------------+
|   471|[[31, 15.354328],...|
|   463|[[51, 37.017757],...|
|   148|[[116, 42.288284]...|
+------+--------------------+



In [27]:
# Generate top 10 user recommendations for a specified set of movies
jokes = df.select(als.getItemCol()).distinct().limit(3)
jokeSubSetRecs = model.recommendForItemSubset(jokes, 10)
jokeSubSetRecs.show()

+------+--------------------+
|ItemID|     recommendations|
+------+--------------------+
|   148|[[47876, 18.26961...|
|    31|[[58308, 37.98753...|
|    85|[[5662, 28.845434...|
+------+--------------------+

