In [1]:
# Spark ML recommendation and clustering model classes.
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.ml.clustering import KMeansModel
import os
# Set before pyspark.pandas is imported on the following lines.
# NOTE(review): presumably this silences the pandas-on-Spark PyArrow timezone warning — confirm.
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
# NOTE(review): read_csv is imported directly, but the visible cells call ps.read_csv instead.
from pyspark.pandas import read_csv
import pyspark.pandas as ps
# Use the 'distributed' default index type for pandas-on-Spark frames.
ps.set_option('compute.default_index_type', 'distributed')

In [2]:
# Load previously fitted models from the relative 'models/' directory.
# NOTE(review): ALS_model is not referenced by any later cell visible here.
ALS_model = ALSModel.load("models/als_explicit_collab")
# The k-means model supplies the cluster centres used further down.
kmeans = KMeansModel.load('models/kmeans')

In [3]:
# CSV file that is loaded into `df` with ps.read_csv.
path = 'data/merged_predictions/stacking_data.csv'

In [4]:
# Read the CSV into a pandas-on-Spark DataFrame.
df = ps.read_csv(path)

In [10]:
# Replace NaN predictions with the sentinel -1.
# NOTE(review): fillna is called without a column subset, so NaNs in every
# column become -1, not only those in the prediction columns.
df = df.fillna(-1) # replace predicted NaNs with -1

In [47]:
from pyspark.ml.regression import LinearRegression, LinearRegressionModel

In [16]:
# Preview the first rows of the frame after the NaN fill.
df.head()

Unnamed: 0,userId,streamId,interactionTime,streamerId,collab_preds,avgInteractionTime,interactionCounts,item_preds,avgInteractionTimeScaled,interactionCountsScaled
0,1,33842865744,2.03125,3047137,1.523179,2.605432,5706,1,1.007278,22.695565
1,1,33846768288,3.0625,3038334,1.135493,2.03125,225,0,0.785295,0.894936
2,1,33886469056,1.0,3047137,1.523179,2.605432,5706,1,1.007278,22.695565
3,1,33887624992,2.03125,3046060,1.402077,2.40297,3859,3,0.929005,15.349139
4,1,33890145056,3.0625,3050226,1.872689,3.075806,1705,4,1.189128,6.781623


In [29]:
# Cluster centres of the loaded k-means model: a list with one array per cluster.
centroids = kmeans.clusterCenters()

In [35]:
# Display the cluster centres for inspection.
centroids

[array([2.75919179, 9.92815865]),
 array([3.68205921e+00, 7.65566667e+03]),
 array([3.04801522e+00, 1.23880000e+04]),
 array([3.26974429e+00, 3.69220000e+03]),
 array([   3.26772161, 1196.73770492])]

In [33]:
def _centroid_lookup(component: int):
    """Build a function that maps a cluster id to one coordinate of its k-means centre.

    Args:
        component: Index of the coordinate to read from each centre array
            (0 or 1 for the two coordinates used by this notebook).

    Returns:
        A callable taking a cluster id and returning that centre's coordinate
        as a plain Python float. The callable carries a ``-> float`` return
        annotation so pandas-on-Spark can take the result type from the hint
        instead of inferring it.

    Raises:
        ValueError: (from the returned callable) if the cluster id is outside
            ``0 .. len(centroids) - 1``.
    """
    n_clusters = len(centroids)

    def lookup(cluster_id) -> float:
        cluster_id = int(cluster_id)
        # NaNs in the frame were replaced with -1 earlier. A negative id would
        # silently index from the end of the list (-1 -> last centre), so
        # reject anything outside the valid range instead of guessing.
        if not 0 <= cluster_id < n_clusters:
            raise ValueError(
                f"item_preds value {cluster_id} is not a valid cluster id "
                f"(expected 0..{n_clusters - 1})"
            )
        # Convert the numpy scalar to a built-in float.
        return float(centroids[cluster_id][component])

    return lookup


# Attach the coordinates of each row's assigned cluster centre as two columns.
df['centroids_0'] = df['item_preds'].apply(_centroid_lookup(0))
df['centroids_1'] = df['item_preds'].apply(_centroid_lookup(1))

In [34]:
# Preview the frame again to check the centroids_0 / centroids_1 columns.
df.head()

Unnamed: 0,userId,streamId,interactionTime,streamerId,collab_preds,avgInteractionTime,interactionCounts,item_preds,avgInteractionTimeScaled,interactionCountsScaled,centroids_0,centroids_1
0,1,33842865744,2.03125,3047137,1.523179,2.605432,5706,1,1.007278,22.695565,3.682059,7655.666667
1,1,33846768288,3.0625,3038334,1.135493,2.03125,225,0,0.785295,0.894936,2.759192,9.928159
2,1,33886469056,1.0,3047137,1.523179,2.605432,5706,1,1.007278,22.695565,3.682059,7655.666667
3,1,33887624992,2.03125,3046060,1.402077,2.40297,3859,3,0.929005,15.349139,3.269744,3692.2
4,1,33890145056,3.0625,3050226,1.872689,3.075806,1705,4,1.189128,6.781623,3.267722,1196.737705


In [36]:
# Convert the pandas-on-Spark frame to a Spark DataFrame for the pyspark.ml stages.
spark_df = df.to_spark()
# The train/test split is performed later, on the assembled feature frame.

In [37]:
from pyspark.ml.feature import VectorAssembler

# Pack the collab_preds column and the two centroid coordinates into a single
# 'features' vector column appended to the Spark frame.
assemble = VectorAssembler(
    outputCol='features',
    inputCols=['collab_preds', 'centroids_0', 'centroids_1'],
)
assembled_data = assemble.transform(spark_df)

In [38]:
# List the columns to confirm that 'features' was appended.
assembled_data.columns

['userId',
 'streamId',
 'interactionTime',
 'streamerId',
 'collab_preds',
 'avgInteractionTime',
 'interactionCounts',
 'item_preds',
 'avgInteractionTimeScaled',
 'interactionCountsScaled',
 'centroids_0',
 'centroids_1',
 'features']

In [39]:
# 80/20 train/test split of the assembled rows, seeded with 38.
training, test = assembled_data.randomSplit([0.8, 0.2], seed=38)

In [41]:
# Regularised linear regression from the assembled 'features' vector to the
# interactionTime label. The hyper-parameters are hand-picked; no tuning yet.
lr = LinearRegression(
    labelCol='interactionTime',
    featuresCol='features',
    elasticNetParam=0.8,
    regParam=0.3,
    maxIter=10,
)
lrModel = lr.fit(training)

In [45]:
from pyspark.ml.evaluation import RegressionEvaluator

# Score the test split and report its RMSE against the interactionTime label.
predictions = lrModel.transform(test)
evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol='interactionTime',
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 3.6991871865671713


In [46]:
# Persist the fitted regression model. write().overwrite() replaces an existing
# 'models/lin_reg' directory, so re-running the notebook does not fail because
# a previous run already saved the model (plain save() refuses to write to an
# existing path).
lrModel.write().overwrite().save('models/lin_reg')

In [49]:
# Round-trip check: reload the model from the path it was saved to.
# NOTE(review): model2 is not used by the later visible cells, which keep using lrModel.
model2 = LinearRegressionModel.load('models/lin_reg')# sanity check: the saved model loads back

In [53]:
# Score every assembled row (training and test splits alike) with the fitted model.
# This rebinds `predictions`, which previously held the test-split predictions.
predictions = lrModel.transform(assembled_data)

In [57]:
# Drop the assembled 'features' vector column before the CSV export.
# NOTE(review): presumably because the CSV writer cannot serialise vector columns — confirm.
predictions=predictions.drop('features')

In [59]:
# Export the predictions as a headered CSV; coalesce(1) collapses the frame to a
# single partition first so one part file is written. mode('overwrite') replaces
# output left by a previous run; the default save mode raises an error when the
# target directory already exists, which breaks a full notebook re-run.
predictions.coalesce(1).write.format('csv').option('header', 'true').mode('overwrite').save('data/final_lr_preds')