In [None]:
# When running on Linux, uncomment the two lines below so findspark can locate the local Spark installation.
#import findspark
#findspark.init('/home/pra/spark-2.4.3-bin-hadoop2.7')

In [61]:
#import required packages
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql.types import *
from pyspark.ml.feature import StringIndexer
from pyspark.sql import functions as F
from pyspark.sql.functions import log

In [62]:
# Start a Spark session named 'recommendation', or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('recommendation')
    .getOrCreate()
)

In [66]:
# Load the raw user/content interaction log. The first row holds the column
# names, and column types are inferred from the data.
interactions_df = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('users_interactions.csv')
)
# interactions_df.head(10)  # uncomment to peek at the first rows
interactions_df

DataFrame[timestamp: int, eventType: string, contentId: bigint, personId: bigint, sessionId: bigint, userAgent: string, userRegion: string, userCountry: string]

In [73]:
# personId and contentId are bigint identifiers. Re-encode each one with a
# StringIndexer into a compact numeric index column (personId1 / contentId1)
# that is used as the ALS user and item columns further down.
indexed1 = interactions_df
for id_col in ("personId", "contentId"):
    indexer = StringIndexer(inputCol=id_col, outputCol=id_col + "1")
    indexed1 = indexer.fit(indexed1).transform(indexed1)

In [21]:
# Weight each interaction type explicitly.
#
# The previous version derived these weights with a StringIndexer on
# eventType. StringIndexer numbers labels by descending frequency, so the
# weight of an event depended on how common it happened to be in this
# particular CSV, not on how strong a signal it is. The values below are the
# ones that approach produced for this dataset, now fixed so they do not
# shift when the data changes.
# VIEW is 1.0 here; the following cell raises it to 1.5.
EVENT_STRENGTH = {
    'VIEW': 1.0,
    'LIKE': 2.0,
    'BOOKMARK': 3.0,
    'COMMENT CREATED': 4.0,
    'FOLLOW': 5.0,
}

# Build a CASE WHEN expression with one branch per known event type.
# An event type missing from EVENT_STRENGTH gets a NULL strength.
strength_col = None
for event_name, weight in EVENT_STRENGTH.items():
    is_event = F.col('eventType') == event_name
    if strength_col is None:
        strength_col = F.when(is_event, weight)
    else:
        strength_col = strength_col.when(is_event, weight)

# Keep only the columns the rest of the notebook uses.
indexed1 = indexed1.select(
    'eventType',
    'personId1',
    'contentId1',
    strength_col.alias('eventStrength1'),
)
#indexed1.select(['eventType','eventStrength1']).distinct().show()
#indexed1.select(['eventType','eventStrength1']).distinct().show()

In [22]:
# Raise the weakest weight (1, which the table below shows belongs to VIEW)
# to 1.5; every other weight is left as it is.
bumped_strength = (
    F.when(F.col('eventStrength1') == 1, 1.5)
    .otherwise(F.col('eventStrength1'))
)
indexed1 = indexed1.withColumn('eventStrength1', bumped_strength)

# Show the resulting event type -> weight mapping.
indexed1.select('eventType', 'eventStrength1').distinct().show()

+---------------+--------------+
|      eventType|eventStrength1|
+---------------+--------------+
|         FOLLOW|           5.0|
|           VIEW|           1.5|
|COMMENT CREATED|           4.0|
|       BOOKMARK|           3.0|
|           LIKE|           2.0|
+---------------+--------------+



In [23]:
# Collapse all interactions of one person with one content item into a single
# score: sum the event weights, then take the natural log of that sum.
temp = (
    indexed1
    .groupBy('personId1', 'contentId1')
    .agg(F.log(F.sum('eventStrength1')).alias('eventStrength'))
)
temp.show()

+---------+----------+------------------+
|personId1|contentId1|     eventStrength|
+---------+----------+------------------+
|   1194.0|     104.0|0.4054651081081644|
|   1563.0|    1078.0|0.4054651081081644|
|    275.0|    1349.0|0.4054651081081644|
|    149.0|     187.0|0.4054651081081644|
|     86.0|      87.0|0.4054651081081644|
|     44.0|     101.0|0.4054651081081644|
|    112.0|     657.0| 2.772588722239781|
|     40.0|     654.0|2.1972245773362196|
|      6.0|     465.0|2.1972245773362196|
|    290.0|    2518.0|0.4054651081081644|
|     74.0|     834.0|0.4054651081081644|
|      4.0|     922.0| 1.791759469228055|
|    961.0|    1358.0|1.0986122886681098|
|    216.0|     265.0|2.0794415416798357|
|     16.0|    1784.0|0.4054651081081644|
|    111.0|    1852.0|0.4054651081081644|
|    526.0|      37.0|1.0986122886681098|
|     61.0|     554.0|0.4054651081081644|
|      1.0|     437.0|1.0986122886681098|
|     98.0|    1015.0|0.6931471805599453|
+---------+----------+------------

In [24]:
# 80/20 train/test split. The seed is fixed so the split, and every metric
# computed from it below, does not change from one run to the next.
training, test = temp.randomSplit([0.8, 0.2], seed=42)

In [80]:
# ALS hyperparameters.
maxIter = 10  # number of ALS iterations
reg = 0.2     # regularization parameter
rank = 14     # number of latent factors

# coldStartStrategy="drop" removes rows the model cannot score (users or items
# not seen during training) from the prediction output, so evaluation metrics
# are not turned into NaN.
# The seed is fixed so the factor initialisation, and therefore the fitted
# model, is reproducible across kernel restarts.
als = ALS(
    rank=rank,
    maxIter=maxIter,
    regParam=reg,
    userCol="personId1",
    itemCol="contentId1",
    ratingCol='eventStrength',
    coldStartStrategy="drop",
    seed=42,
)

rank = number of latent factors.
coldStartStrategy: in production the model will meet new users or items that have no rating history and on which it has not been trained (this is the “cold start problem”), so it cannot produce a prediction for them.
Currently the supported cold start strategies are “nan” and “drop”. Further strategies may be supported in future.

In [81]:
# Fit the ALS estimator configured above on the training split.
model = als.fit(dataset=training)

In [82]:
# Score the held-out split; the model appends a 'prediction' column.
predictions = model.transform(dataset=test)
predictions.show()

+---------+----------+------------------+----------+
|personId1|contentId1|     eventStrength|prediction|
+---------+----------+------------------+----------+
|     28.0|     148.0|0.4054651081081644|0.60210997|
|    235.0|     148.0|1.0986122886681098|  0.554196|
|      4.0|     148.0|0.4054651081081644| 0.6798901|
|     51.0|     148.0|0.4054651081081644| 0.7906908|
|    404.0|     148.0|1.5040773967762742| 1.0197791|
|     45.0|     148.0|0.4054651081081644|0.63822436|
|    167.0|     148.0| 1.252762968495368|0.61203074|
|     70.0|     148.0|0.4054651081081644| 0.5861717|
|    492.0|     148.0|0.4054651081081644|0.67537326|
|    466.0|     148.0|0.4054651081081644| 0.4815166|
|    124.0|     148.0|0.4054651081081644|0.89902216|
|    144.0|     148.0|0.4054651081081644|0.55490786|
|    934.0|     148.0|0.4054651081081644|  0.388651|
|   1088.0|     463.0|0.4054651081081644|0.48600632|
|     48.0|     463.0|0.6931471805599453| 0.7216538|
|      7.0|     463.0|1.0986122886681098| 0.63

In [83]:
# Performance: RMSE between the log-scaled eventStrength and the ALS prediction
# on the held-out split.
evaluator = RegressionEvaluator(
    labelCol="eventStrength",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 0.6097846749909398


In [78]:
# Top 10 recommended items for the users that appear in the test split.
model.recommendForUserSubset(dataset=test, numItems=10).show(truncate=False)

+---------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|personId1|recommendations                                                                                                                                                                                   |
+---------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|1580     |[[2646, 0.91587985], [2192, 0.89254314], [1004, 0.8718075], [131, 0.841737], [2240, 0.8103155], [1054, 0.78674114], [118, 0.7807522], [272, 0.7754787], [2368, 0.76627755], [2355, 0.7300656]]    |
|471      |[[2646, 1.8002559], [2192, 1.7940531], [131, 1.767715], [1004, 1.6687618], [2240, 1.5719346], [2368, 1.5493898], [1054, 1.5293586], [118, 1.4769846], [272, 1.456

In [39]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Grid of 2 regularization values x 2 ranks = 4 candidate ALS configurations.
paramGrid = (
    ParamGridBuilder()
    .addGrid(als.regParam, [0.2, 0.3])
    .addGrid(als.rank, [18, 20])
    .build()
)

# Score each candidate with RMSE against the column ALS is trained on.
# A bare RegressionEvaluator() looks for a column named "label", which this
# dataset does not have, so the label column is named explicitly here.
cv_evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="eventStrength",
    predictionCol="prediction",
)

# 2-fold cross-validation; the seed fixes the fold assignment so the averaged
# metrics are reproducible.
crossval = CrossValidator(
    estimator=als,
    estimatorParamMaps=paramGrid,
    evaluator=cv_evaluator,
    numFolds=2,
    seed=42,
)

In [47]:
# Add a "label" column that mirrors eventStrength; "label" is the column name
# a default-configured RegressionEvaluator reads.
training = training.withColumn("label", F.col("eventStrength"))


In [48]:
# Run the cross-validated grid search over the training split.
cvModel = crossval.fit(dataset=training)

In [54]:
# Score the held-out split with the cross-validated model. This is stored in
# `prediction`, separate from the earlier `predictions` of the single ALS fit.
prediction = cvModel.transform(dataset=test)

In [60]:
# Average metric across the folds, one value per candidate in paramGrid
# (four candidates, four values below).
# NOTE(review): presumably listed in paramGrid order and measured as RMSE
# (lower is better) — confirm against the CrossValidatorModel docs.
cvModel.avgMetrics

[0.6345230491163932,
 0.6343878187343432,
 0.6524047958498627,
 0.6523966634643439]

In [None]:
# Scratch values left in an unexecuted cell; their meaning is not recorded.
# Commented out because as code they are a SyntaxError under Restart & Run All.
# 52 48 3 0.03