In [1]:
from pyspark.sql import SparkSession

# Create the Spark session used by every cell below, or reuse one that
# already exists under the same application name.
spark = (
    SparkSession.builder
    .appName('clustering')
    .getOrCreate()
)

24/12/13 13:40:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [6]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans

## 데이터 생성

In [2]:
# Build a tiny ratings dataset: each tuple is (user_id, item_id, rating).
columns = ['user_id', 'item_id', 'rating']

data = [
    (0, 0, 4.0),  # user 0 rated item 0 with 4.0
    (0, 1, 2.0),
    (1, 1, 3.0),
    (1, 2, 1.0),
    (2, 0, 5.0),
    (2, 2, 4.0),
]

# Column names are passed as the schema; Spark infers the column types
# from the Python values.
df = spark.createDataFrame(data, schema=columns)
df.show()

[Stage 0:>                                                          (0 + 1) / 1]

+-------+-------+------+
|user_id|item_id|rating|
+-------+-------+------+
|      0|      0|   4.0|
|      0|      1|   2.0|
|      1|      1|   3.0|
|      1|      2|   1.0|
|      2|      0|   5.0|
|      2|      2|   4.0|
+-------+-------+------+



                                                                                

In [4]:
# Reshape the long ratings table into a user x item matrix: one row per
# user_id, one column per distinct item_id, each cell the mean rating.
# Cells left null by the pivot (user/item pairs with no rating) become 0.
user_item_matrix = (
    df.groupBy('user_id')
    .pivot('item_id')
    .avg('rating')
    .na.fill(0)
)
user_item_matrix.show()

                                                                                

+-------+---+---+---+
|user_id|  0|  1|  2|
+-------+---+---+---+
|      0|4.0|2.0|0.0|
|      1|0.0|3.0|1.0|
|      2|5.0|0.0|4.0|
+-------+---+---+---+



In [9]:
# Pack the per-item rating columns into a single vector column ('features').
# The item columns are read from the pivoted frame instead of being
# hard-coded, so this cell keeps working when items are added to or removed
# from the ratings data. 'user_id' is the grouping key, not a feature.
item_cols = [col for col in user_item_matrix.columns if col != 'user_id']

assembler = VectorAssembler(inputCols=item_cols, outputCol='features')
user_features = assembler.transform(user_item_matrix)
user_features.show()

                                                                                

+-------+---+---+---+-------------+
|user_id|  0|  1|  2|     features|
+-------+---+---+---+-------------+
|      0|4.0|2.0|0.0|[4.0,2.0,0.0]|
|      1|0.0|3.0|1.0|[0.0,3.0,1.0]|
|      2|5.0|0.0|4.0|[5.0,0.0,4.0]|
+-------+---+---+---+-------------+



## 모델링

In [13]:
# Configure KMeans: two clusters, a fixed seed so initialisation is
# repeatable, reading the 'features' vector column and writing the assigned
# cluster index to a new 'cluster' column.
kmeans = KMeans(
    featuresCol='features',
    predictionCol='cluster',
    k=2,
    seed=1,
)

# Fit the model on the assembled user feature vectors.
model = kmeans.fit(user_features)

# Assign each user to a cluster and display the result.
clusters = model.transform(user_features)
clusters.show()

24/12/13 14:15:34 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
24/12/13 14:15:34 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
                                                                                

+-------+---+---+---+-------------+-------+
|user_id|  0|  1|  2|     features|cluster|
+-------+---+---+---+-------------+-------+
|      0|4.0|2.0|0.0|[4.0,2.0,0.0]|      0|
|      1|0.0|3.0|1.0|[0.0,3.0,1.0]|      0|
|      2|5.0|0.0|4.0|[5.0,0.0,4.0]|      1|
+-------+---+---+---+-------------+-------+



In [14]:
# Shut down the Spark session now that the analysis is finished.
spark.stop()