In [3]:
import os
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
#os.environ['ARROW_PRE_0_15_IPC_FORMAT'] = '1'

from pyspark.pandas import read_csv
import pyspark.pandas as ps
# means is for items, df is for collab
ps.set_option('compute.default_index_type', 'distributed')
path ='data/item/means.csv'

In [4]:
df = read_csv(path)

In [5]:
df.head()

Unnamed: 0,streamerId,avgInteractionTime,interactionCounts
0,2987163,3.338468,71
1,2751748,2.243566,34
2,3048233,2.03125,21
3,3051309,1.709515,1093
4,1950,1.0,1


In [15]:
from pyspark.ml.feature import VectorAssembler
spark_df =df.to_spark() 

In [21]:
assemble=VectorAssembler(inputCols=[
'avgInteractionTime', 'interactionCounts'], outputCol='features')
assembled_data=assemble.transform(spark_df)

In [29]:
assembled_data.show(3)

+----------+------------------+-----------------+--------------------+
|streamerId|avgInteractionTime|interactionCounts|            features|
+----------+------------------+-----------------+--------------------+
|   2987163| 3.338468309859155|               71|[3.33846830985915...|
|   2751748|2.2435661764705883|               34|[2.24356617647058...|
|   3048233|           2.03125|               21|      [2.03125,21.0]|
+----------+------------------+-----------------+--------------------+
only showing top 3 rows



In [30]:
from pyspark.ml.feature import StandardScaler
scale=StandardScaler(inputCol='features',outputCol='standardized')
data_scale=scale.fit(assembled_data)
data_scale_output=data_scale.transform(assembled_data)

In [31]:
import numpy as np
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
cost = []
for k in range(5,15):
    print(k)
    kmeans = KMeans().setK(k).setSeed(38).setFeaturesCol("features")
    model = kmeans.fit(data_scale_output.select('streamerId', 'features').sample(False,0.1, seed=38))
    predictions = model.transform(data_scale_output)
    evaluator = ClusteringEvaluator()
    silhouette = evaluator.evaluate(predictions)
    print("Silhouette with squared euclidean distance = " + str(silhouette))
    cost.append(silhouette)

5
Silhouette with squared euclidean distance = 0.9952402756959156
6
Silhouette with squared euclidean distance = 0.9892085265877981
7
Silhouette with squared euclidean distance = 0.9892615567140653
8
Silhouette with squared euclidean distance = 0.9892305018294876
9
Silhouette with squared euclidean distance = 0.9871092278870074
10
Silhouette with squared euclidean distance = 0.9716866639855457
11
Silhouette with squared euclidean distance = 0.9578553211992997
12
Silhouette with squared euclidean distance = 0.9184192059600309
13
Silhouette with squared euclidean distance = 0.9132974344095116
14
Silhouette with squared euclidean distance = 0.9085563512128815


In [37]:
kmeans = KMeans().setK(np.argmax(cost)+5).setSeed(38).setFeaturesCol("features") # use highest silhouette value
model = kmeans.fit(data_scale_output.select('streamerId', 'features').sample(False,0.1, seed=38))
predictions = model.transform(data_scale_output)
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
print(f"Silhouette with squared euclidean distance for k = {np.argmax(cost)+5}: " + str(silhouette))

Silhouette with squared euclidean distance for k = 5: 0.9952402756959156


In [38]:
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

Cluster Centers: 
[2.75919179 9.92815865]
[3.68205921e+00 7.65566667e+03]
[3.04801522e+00 1.23880000e+04]
[3.26974429e+00 3.69220000e+03]
[   3.26772161 1196.73770492]


In [40]:
predictions.show()

+----------+------------------+-----------------+--------------------+--------------------+----------+
|streamerId|avgInteractionTime|interactionCounts|            features|        standardized|prediction|
+----------+------------------+-----------------+--------------------+--------------------+----------+
|   2987163| 3.338468309859155|               71|[3.33846830985915...|[1.29067469396726...|         0|
|   2751748|2.2435661764705883|               34|[2.24356617647058...|[0.86737803670619...|         0|
|   3048233|           2.03125|               21|      [2.03125,21.0]|[0.78529514998799...|         0|
|   3051309|1.7095150960658738|             1093|[1.70951509606587...|[0.66091023447226...|         4|
|      1950|               1.0|                1|           [1.0,1.0]|[0.38660684307101...|         0|
|   2352113|         1.7734375|                4|     [1.7734375,4.0]|[0.68562307325874...|         0|
|      2529|          2.546875|                2|      [2.546875,2.0]|[0.

In [47]:
predictions_pandas = predictions.to_pandas_on_spark()

In [48]:
predictions_pandas.head()

Unnamed: 0,streamerId,avgInteractionTime,interactionCounts,features,standardized,prediction
0,2987163,3.338468,71,"[3.338468309859155, 71.0]","[1.290674693967263, 0.28240187874208855]",0
1,2751748,2.243566,34,"[2.2435661764705883, 34.0]","[0.867378036706194, 0.13523470249621142]",0
2,3048233,2.03125,21,"[2.03125, 21.0]","[0.785295149987992, 0.08352731624766]",0
3,3051309,1.709515,1093,"[1.7095150960658738, 1093.0]","[0.6609102344722644, 4.347397936128209]",4
4,1950,1.0,1,"[1.0, 1.0]","[0.38660684307101145, 0.003977491249888571]",0


In [52]:
temp = ps.DataFrame(predictions_pandas["standardized"].to_list(), columns=['avgInteractionTimeScaled', 'interactionCountsScaled'])

In [56]:
temp.head()

Unnamed: 0,avgInteractionTimeScaled,interactionCountsScaled
0,1.290675,0.282402
1,0.867378,0.135235
2,0.785295,0.083527
3,0.66091,4.347398
4,0.386607,0.003977


In [60]:
final = predictions_pandas.join(temp, how='outer')

In [62]:
final= final.drop(columns=['features','standardized'])

In [63]:
final.head()

Unnamed: 0,streamerId,avgInteractionTime,interactionCounts,prediction,avgInteractionTimeScaled,interactionCountsScaled
0,2987163,3.338468,71,0,1.290675,0.282402
7,3040638,3.506663,469,0,1.3557,1.865443
19,3050662,5.46875,63,0,2.114256,0.250582
22,3027657,2.075948,323,0,0.802576,1.28473
25,17971,20.078125,6,0,7.762341,0.023865


In [64]:
final.to_spark().coalesce(1).write.format('csv').option('header', 'true').save('data/item_predictions')

In [65]:
model.save('models/kmeans')

In [66]:
from pyspark.ml.clustering import KMeansModel
model2 = KMeansModel.load('models/kmeans')