In [1]:
# NOTE(review): `ALS` and the bare `read_csv` import are not referenced in the
# cells visible here (`ps.read_csv` is used instead) — confirm before removing.
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.ml.clustering import KMeansModel
import os
# Set before `pyspark.pandas` is imported below.
# NOTE(review): presumably this silences pandas-on-Spark's PyArrow timezone
# warning at import time — confirm against the pyspark.pandas docs.
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
from pyspark.pandas import read_csv
import pyspark.pandas as ps
# Use the 'distributed' default index type for pandas-on-Spark frames.
ps.set_option('compute.default_index_type', 'distributed')

In [2]:
# Reload the previously fitted models from their saved directories.
# NOTE(review): `ALS_model` is not referenced again in the cells visible here.
ALS_MODEL_DIR = "models/als_explicit_collab"
KMEANS_MODEL_DIR = 'models/kmeans'

ALS_model = ALSModel.load(ALS_MODEL_DIR)
kmeans = KMeansModel.load(KMEANS_MODEL_DIR)

In [3]:
# Relative path of the CSV read by the next cell as input to the stacking models.
path = 'data/merged_predictions/stacking_data.csv'

In [4]:
# Read the CSV into a pandas-on-Spark DataFrame (`ps` is pyspark.pandas).
df = ps.read_csv(path)

In [5]:
# Replace every NaN in the frame with the sentinel value -1 (no column subset is given).
# NOTE(review): any code that indexes by `item_preds` must treat -1 as
# "missing" rather than as a real cluster id.
df = df.fillna(-1)

In [6]:
from pyspark.ml.regression import LinearRegression, LinearRegressionModel

In [7]:
# Preview the first rows of the loaded frame (after the -1 fill).
df.head()

Unnamed: 0,userId,streamId,interactionTime,streamerId,collab_preds,avgInteractionTime,interactionCounts,item_preds,avgInteractionTimeScaled,interactionCountsScaled
0,1,33842865744,2.03125,3047137,1.523179,2.605432,5706,4,1.007278,22.695565
1,1,33846768288,3.0625,3038334,1.135493,2.03125,225,7,0.785295,0.894936
2,1,33886469056,1.0,3047137,1.523179,2.605432,5706,4,1.007278,22.695565
3,1,33887624992,2.03125,3046060,1.402077,2.40297,3859,2,0.929005,15.349139
4,1,33890145056,3.0625,3050226,1.872689,3.075806,1705,5,1.189128,6.781623


In [8]:
# Cluster centres of the loaded k-means model; later cells index this by `item_preds`.
centroids = kmeans.clusterCenters()

In [9]:
# Show the cluster centres (the saved output lists two-element arrays).
centroids

[array([2.74042171, 5.82940959]),
 array([2.6126197e+00, 4.2640000e+04]),
 array([3.54289647e+00, 3.56205556e+03]),
 array([3.28779769e+00, 1.31603333e+04]),
 array([3.44739713e+00, 6.62713514e+03]),
 array([   3.33713562, 1775.05050505]),
 array([2.79294734e+00, 2.73620000e+04]),
 array([  3.18987411, 230.62720403]),
 array([  3.12847051, 759.94425676])]

In [10]:
# Attach the coordinates of each row's k-means centroid as two numeric features.
#
# `item_preds` holds the cluster id used to index `centroids`.  Missing values
# were replaced by the sentinel -1 earlier in the notebook, and a plain
# `centroids[-1]` would silently return the *last* centroid because of Python's
# negative indexing.  Ids outside 0..len(centroids)-1 therefore keep the -1
# sentinel instead of borrowing another cluster's coordinates.
MISSING_CENTROID_VALUE = -1.0


def _centroid_coord(cluster_id, axis):
    """Return coordinate `axis` of the centroid belonging to `cluster_id`.

    Parameters:
        cluster_id: value taken from the `item_preds` column.
        axis: position of the wanted coordinate inside the centroid array.

    Returns:
        The coordinate as a float, or MISSING_CENTROID_VALUE when
        `cluster_id` is not a valid index into `centroids`.
    """
    cluster_idx = int(cluster_id)  # tolerate ids stored as floats (e.g. 4.0)
    if 0 <= cluster_idx < len(centroids):
        return float(centroids[cluster_idx][axis])
    return MISSING_CENTROID_VALUE


df['centroids_0'] = df['item_preds'].apply(lambda x: _centroid_coord(x, 0))
df['centroids_1'] = df['item_preds'].apply(lambda x: _centroid_coord(x, 1))

In [11]:
# Preview again to check the new `centroids_0` / `centroids_1` columns.
df.head()

Unnamed: 0,userId,streamId,interactionTime,streamerId,collab_preds,avgInteractionTime,interactionCounts,item_preds,avgInteractionTimeScaled,interactionCountsScaled,centroids_0,centroids_1
0,1,33842865744,2.03125,3047137,1.523179,2.605432,5706,4,1.007278,22.695565,3.447397,6627.135135
1,1,33846768288,3.0625,3038334,1.135493,2.03125,225,7,0.785295,0.894936,3.189874,230.627204
2,1,33886469056,1.0,3047137,1.523179,2.605432,5706,4,1.007278,22.695565,3.447397,6627.135135
3,1,33887624992,2.03125,3046060,1.402077,2.40297,3859,2,0.929005,15.349139,3.542896,3562.055556
4,1,33890145056,3.0625,3050226,1.872689,3.075806,1705,5,1.189128,6.781623,3.337136,1775.050505


In [12]:
# Convert the pandas-on-Spark frame to a Spark DataFrame for the pyspark.ml stages below.
# The train/test split is done later, on the assembled data.
spark_df = df.to_spark()

In [13]:
from pyspark.ml.feature import VectorAssembler
# Pack the stacking inputs (collaborative prediction plus the two centroid
# coordinates) into a single 'features' vector column.
feature_columns = ['collab_preds', 'centroids_0', 'centroids_1']
assemble = VectorAssembler(inputCols=feature_columns, outputCol='features')
assembled_data = assemble.transform(spark_df)

In [14]:
# List the columns to confirm that 'features' was appended.
assembled_data.columns

['userId',
 'streamId',
 'interactionTime',
 'streamerId',
 'collab_preds',
 'avgInteractionTime',
 'interactionCounts',
 'item_preds',
 'avgInteractionTimeScaled',
 'interactionCountsScaled',
 'centroids_0',
 'centroids_1',
 'features']

In [15]:
# Random 80/20 split of the assembled rows, using a fixed seed (38).
training, test = assembled_data.randomSplit(weights=[0.8, 0.2], seed=38)

In [26]:
# Linear-regression stacking model that predicts `interactionTime` from the
# assembled 'features' vector.  The regularisation values are fixed by hand;
# no hyperparameter tuning yet.
lr = LinearRegression(
    featuresCol='features',
    labelCol='interactionTime',
    regParam=0.3,
    elasticNetParam=0.8,
)
lrModel = lr.fit(training)

In [27]:
from pyspark.ml.evaluation import RegressionEvaluator
# Score the linear model on the held-out split and report its RMSE.
# `evaluator` is reused further down for the isotonic model.
evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol='interactionTime',
    metricName="rmse",
)
predictions = lrModel.transform(test)
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 3.6991871662072335


In [18]:
# Save the fitted linear model, overwriting any model already stored at this path.
lrModel.write().overwrite().save('models/lin_reg')

In [19]:
# Reload the model that was just saved to check that it can be loaded.
# NOTE(review): `model2` is not used afterwards in the cells visible here;
# the next cell predicts with `lrModel`.
model2 = LinearRegressionModel.load('models/lin_reg')

In [20]:
# Predict for every assembled row (training and test alike).
# This rebinds `predictions`, which previously held the test-split predictions.
predictions = lrModel.transform(assembled_data)

In [21]:
# Drop the assembled 'features' vector column before the CSV export in the next cell.
predictions=predictions.drop('features')

In [22]:
# Export the predictions as CSV with a header row.  coalesce(1) collapses the
# data to one partition first; existing output at the target is overwritten.
(
    predictions
    .coalesce(1)
    .write
    .format('csv')
    .mode("overwrite")
    .option('header', 'true')
    .save('data/final_lr_preds')
)

In [23]:
from pyspark.ml.regression import IsotonicRegression, IsotonicRegressionModel

In [24]:
# Isotonic-regression alternative to the linear stacker.  The estimator's
# defaults are written out: it reads the 'features' vector column and uses only
# the element at featureIndex 0 of that vector.
isotonic = IsotonicRegression(
    featuresCol='features',
    labelCol='interactionTime',
    featureIndex=0,
)
model = isotonic.fit(training)

In [25]:
# Score the isotonic model on the same held-out split with the same RMSE evaluator.
isotonic_predictions = model.transform(test)
rmse_i = evaluator.evaluate(isotonic_predictions)
print("Root-mean-square error = " + str(rmse_i))

Root-mean-square error = 3.682140982215678
