In [1]:
import os
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
from pyspark.pandas import read_csv
import pyspark.pandas as ps
# means is for items, df is for collab

In [2]:
ps.set_option('compute.default_index_type', 'distributed')
path ='data\collab\collab_filter.csv'
df = read_csv(path)

In [3]:
df.head()

Unnamed: 0,userId,streamId,interactionTime,streamerId
0,1,33842865744,2.03125,3047137
1,1,33846768288,3.0625,3038334
2,1,33886469056,1.0,3047137
3,1,33887624992,2.03125,3046060
4,1,33890145056,3.0625,3050226


In [4]:
ratings = df.to_spark()

In [6]:
ratings.show()

+------+-----------+------------------+----------+
|userId|   streamId|   interactionTime|streamerId|
+------+-----------+------------------+----------+
|     1|33842865744|           2.03125|   3047137|
|     1|33846768288|            3.0625|   3038334|
|     1|33886469056|               1.0|   3047137|
|     1|33887624992|           2.03125|   3046060|
|     1|33890145056|            3.0625|   3050226|
|     1|33903958784|            3.0625|   3038141|
|     1|33929318864|15.437500000000002|   3050209|
|     1|33942837056|           2.03125|   3046060|
|     1|33955351648|           2.03125|   3050209|
|     1|34060922080|               1.0|   3047137|
|     1|34062621584|           2.03125|   3038188|
|     1|34077379792|           2.03125|   3047137|
|     1|34078096176|               1.0|   3038868|
|     1|34079135968|               1.0|   3050215|
|     1|34082259232|               1.0|   3038868|
|     1|34157036272|               1.0|   3047137|
|     1|34169481232|           

In [7]:
(training, test) = ratings.randomSplit([0.8,0.2], 38)

In [9]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS, ALSModel
iteration = 10
mapping = {'item_id':'streamerId','rating': 'interactionTime', 'user_id': 'userId'}
als = ALS(maxIter=iteration, regParam=.8, rank=15, nonnegative=True,
          userCol=mapping['user_id'], itemCol=mapping["item_id"], ratingCol=mapping["rating"], coldStartStrategy='NaN')

In [10]:
model = als.fit(training)

In [15]:
predictions = model.transform(test) # NEED TO CONVERT NANS TO SOMETHING USEFUL!!!!!!!!!!!!!
evaluator = RegressionEvaluator(metricName="rmse", labelCol=mapping["rating"],
                                predictionCol="prediction")

rmse = evaluator.evaluate(predictions.dropna())
print("Root-mean-square error = " + str(rmse))

# Save the model
model.save("models/als_explicit_collab")
print("Model successfully saved")

Root-mean-square error = 4.164839496155471
Model successfully saved


In [16]:
predictions.show() # NEED TO CONVERT NANS TO SOMETHING USEFUL!!!!!!!!!!!!!

+------+-----------+------------------+----------+----------+
|userId|   streamId|   interactionTime|streamerId|prediction|
+------+-----------+------------------+----------+----------+
|     1|33846768288|            3.0625|   3038334|  1.135493|
|     1|33929318864|15.437500000000002|   3050209| 1.6648835|
|     1|34169481232|               1.0|   3050209| 1.6648835|
|     2|33990610144|               1.0|   3049965|0.93534696|
|     3|34275598832|               1.0|   1815230|0.67058045|
|     3|33842237568|               1.0|        87|       NaN|
|     1|34263962048|           2.03125|   3050222| 1.5576028|
|     2|34126387600|           2.03125|   2496184| 0.7445727|
|     3|33914081152|           2.03125|   2695851| 1.8658056|
|     3|33927084576|            3.0625|   2695851| 1.8658056|
|     3|34362468768|               1.0|   2695851| 1.8658056|
|     2|34033405968|               1.0|   3049950| 1.6418666|
|     2|34101551216|            3.0625|   3049950| 1.6418666|
|     2|

In [21]:
# test if model can be loaded
kmeans = ALSModel.load("models/als_explicit_collab")

In [22]:
predictions = kmeans.transform(ratings) # predict on everything

In [23]:
predictions.show()

+------+-----------+------------------+----------+----------+
|userId|   streamId|   interactionTime|streamerId|prediction|
+------+-----------+------------------+----------+----------+
|     1|33842865744|           2.03125|   3047137| 1.5231787|
|     1|33846768288|            3.0625|   3038334|  1.135493|
|     1|33886469056|               1.0|   3047137| 1.5231787|
|     1|33887624992|           2.03125|   3046060| 1.4020773|
|     1|33890145056|            3.0625|   3050226| 1.8726895|
|     1|33903958784|            3.0625|   3038141| 1.1227242|
|     1|33929318864|15.437500000000002|   3050209| 1.6648835|
|     1|33942837056|           2.03125|   3046060| 1.4020773|
|     1|33955351648|           2.03125|   3050209| 1.6648835|
|     1|34060922080|               1.0|   3047137| 1.5231787|
|     1|34062621584|           2.03125|   3038188| 1.8833702|
|     1|34077379792|           2.03125|   3047137| 1.5231787|
|     1|34078096176|               1.0|   3038868| 1.4088013|
|     1|

In [24]:
predictions.coalesce(1).write.format('csv').option('header', 'true').save('data/collab_predictions')