# Initializing Spark

In [5]:
# findspark reads SPARK_HOME / HADOOP_HOME so that the local Spark installation
# can be imported; init() therefore has to run before the pyspark import below.
import findspark
findspark.init()
# Import required library
from pyspark.sql import SparkSession

# Create (or reuse, via getOrCreate) the Spark session for this notebook.
# The application name is what identifies this job in the Spark UI and logs,
# so it describes the actual task: ALS recommendations on Jester joke ratings.
spark = SparkSession.builder.appName("Jester joke recommender with ALS").getOrCreate()
    

In [6]:
# Sanity check that the session exists: printing it shows the SparkSession
# object's default repr (class name and memory address).
print(spark)

<pyspark.sql.session.SparkSession object at 0x000001A678767630>


# Loading Dataset 

The given dataset is in `.dat` format and has no headers. Before loading it, I first converted it to CSV format and added headers based on the dataset description.

In [28]:
# Load the Jester ratings CSV (first row is the header; column types are inferred).
# The path is a raw string so the Windows backslashes are taken literally: in a
# normal string, sequences such as "\K" or "\T" are invalid escapes that newer
# Python versions warn about, and a folder starting with e.g. "t" or "n" would
# silently become a tab/newline.
df = spark.read.csv(r"D:\Kuliah\Smt6\Big Data\Tugas\Tugas 4\jester_ratings.csv", header=True, inferSchema=True)

In [29]:
# Preview the ratings DataFrame (columns: userId, jokeId, rating).
# With no arguments, show() prints only the first 20 rows.

df.show()

+------+------+------+
|userId|jokeId|rating|
+------+------+------+
|     1|     5| 0.219|
|     1|     7|-9.281|
|     1|     8|-9.281|
|     1|    13|-6.781|
|     1|    15| 0.875|
|     1|    16|-9.656|
|     1|    17|-9.031|
|     1|    18|-7.469|
|     1|    19|-8.719|
|     1|    20|-9.156|
|     1|    21|-7.188|
|     1|    22|-8.781|
|     1|    23|-8.531|
|     1|    24|-7.906|
|     1|    25|-7.469|
|     1|    89| 9.812|
|     1|    50| 9.906|
|     1|   102|  0.75|
|     1|   103|  -5.0|
|     1|   104| 2.938|
+------+------+------+
only showing top 20 rows



In [9]:
# Count the number of rating rows loaded from the CSV.

df.count()

1761439

In [30]:
# Inspect the schema inferred at load time (column names, data types, nullability)
# to confirm the id columns were read as integers and the rating as a double.

df.schema

StructType(List(StructField(userId,IntegerType,true),StructField(jokeId,IntegerType,true),StructField(rating,DoubleType,true)))

# Recommender Model

In [11]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

In [32]:
# One fixed seed for both the train/test split and the ALS factor initialisation.
# Without it, every run produces a different split and a different model, so the
# RMSE and the recommendation tables below could not be reproduced.
SEED = 42

# Hold out 20% of the ratings for evaluation.
(training, test) = df.randomSplit([0.8, 0.2], seed=SEED)

# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
# (users/jokes that appear only in the test split have no learned factors).
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="jokeId", ratingCol="rating",
          coldStartStrategy="drop", seed=SEED)
model = als.fit(training)

In [33]:
# Score the held-out ratings with the fitted model, then measure how far the
# predicted ratings are from the true ones using root-mean-square error.
predictions = model.transform(test)
evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="rating",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 4.544751221462755


In [34]:
# For every user, produce the 10 jokes with the highest predicted rating.
# Each row holds a userId and a list of (jokeId, predicted rating) pairs.
userRecs = model.recommendForAllUsers(numItems=10)
userRecs.show()

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|   148|[[52, 29.516022],...|
|   463|[[80, 21.846846],...|
|   471|[[80, 17.509552],...|
|   496|[[86, 15.044545],...|
|   833|[[100, 9.5948105]...|
|  1088|[[71, 9.199725], ...|
|  1238|[[31, 4.66116], [...|
|  1342|[[83, 13.7630005]...|
|  1580|[[80, 6.9020944],...|
|  1591|[[135, 14.675354]...|
|  1645|[[100, 24.258127]...|
|  1829|[[43, 26.288832],...|
|  1959|[[80, 32.162617],...|
|  2142|[[80, 18.455503],...|
|  2366|[[140, 8.488211],...|
|  2659|[[71, 7.5537996],...|
|  2866|[[100, 31.024834]...|
|  3175|[[31, 8.550562], ...|
|  3749|[[116, 37.991333]...|
|  3794|[[43, 19.145561],...|
+------+--------------------+
only showing top 20 rows



In [35]:
# For every joke, produce the 10 users predicted to rate it highest.
# Each row holds a jokeId and a list of (userId, predicted rating) pairs.
jokeRecs = model.recommendForAllItems(numUsers=10)
jokeRecs.show()

+------+--------------------+
|jokeId|     recommendations|
+------+--------------------+
|   148|[[3326, 20.734621...|
|    31|[[4294, 59.342907...|
|    85|[[61483, 27.85723...|
|   137|[[61377, 22.53096...|
|    65|[[8815, 26.413088...|
|    53|[[3305, 20.111027...|
|   133|[[57753, 18.95817...|
|    78|[[45596, 18.98806...|
|   108|[[8815, 15.663682...|
|    34|[[3292, 23.899088...|
|   101|[[59494, 30.35357...|
|   115|[[59500, 41.50966...|
|   126|[[51605, 17.99947...|
|    81|[[53222, 37.68750...|
|    28|[[3326, 24.297186...|
|    76|[[8815, 21.54119]...|
|    26|[[8815, 27.07493]...|
|    27|[[867, 49.093037]...|
|    44|[[52330, 33.64199...|
|   103|[[59494, 33.89000...|
+------+--------------------+
only showing top 20 rows



In [36]:
# Top 10 joke recommendations for a small sample of users:
# take three distinct user ids from the ratings and score only those.
users = (
    df.select(als.getUserCol())
    .distinct()
    .limit(3)
)
userSubsetRecs = model.recommendForUserSubset(users, numItems=10)
userSubsetRecs.show()

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|   471|[[80, 17.509552],...|
|   463|[[80, 21.846846],...|
|   148|[[52, 29.516022],...|
+------+--------------------+



In [37]:
# Top 10 user recommendations for a small sample of jokes:
# take three distinct joke ids from the ratings and score only those.
jokes = (
    df.select(als.getItemCol())
    .distinct()
    .limit(3)
)
jokeSubSetRecs = model.recommendForItemSubset(jokes, numUsers=10)
jokeSubSetRecs.show()

+------+--------------------+
|jokeId|     recommendations|
+------+--------------------+
|   148|[[3326, 20.734621...|
|    31|[[4294, 59.342907...|
|    85|[[61483, 27.85723...|
+------+--------------------+

