In [2]:
from pyspark.sql import SparkSession 

MAX_MEMORY = '4g'

spark = SparkSession.builder.appName('ALS')\
                    .config('spark.executor.memory', MAX_MEMORY)\
                    .config('spark.driver.memory', MAX_MEMORY)\
                    .getOrCreate()

24/12/13 14:47:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/13 14:47:21 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [14]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

## 데이터 로드

In [3]:
rating_df = spark.read.format('csv')\
            .option('header','true')\
            .option('inferSchema', 'true')\
            .load('data/ratings.csv')

                                                                                

In [4]:
rating_df.show()

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|    296|   5.0|1147880044|
|     1|    306|   3.5|1147868817|
|     1|    307|   5.0|1147868828|
|     1|    665|   5.0|1147878820|
|     1|    899|   3.5|1147868510|
|     1|   1088|   4.0|1147868495|
|     1|   1175|   3.5|1147868826|
|     1|   1217|   3.5|1147878326|
|     1|   1237|   5.0|1147868839|
|     1|   1250|   4.0|1147868414|
|     1|   1260|   3.5|1147877857|
|     1|   1653|   4.0|1147868097|
|     1|   2011|   2.5|1147868079|
|     1|   2012|   2.5|1147868068|
|     1|   2068|   2.5|1147869044|
|     1|   2161|   3.5|1147868609|
|     1|   2351|   4.5|1147877957|
|     1|   2573|   4.0|1147878923|
|     1|   2632|   5.0|1147878248|
|     1|   2692|   5.0|1147869100|
+------+-------+------+----------+
only showing top 20 rows



## 필요없는 열 제거

In [5]:
rating_df = rating_df.select('userId','movieId','rating')

## 데이터 분할

In [6]:
train_ratio = 0.8
test_ratio = 0.2

train, test = rating_df.randomSplit([train_ratio, test_ratio], seed=42)

## 모델링

In [13]:
als = ALS(
    maxIter=5,
    regParam=0.1,
    userCol='userId',
    itemCol='movieId',
    ratingCol='rating',
    coldStartStrategy='drop'
)

model = als.fit(train)

                                                                                

In [15]:
predictions = model.transform(test)
predictions.show(5)



+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
| 26480|    148|   2.0| 1.9709086|
|151614|    148|   1.0|  2.753184|
| 28229|    148|   1.0| 2.3868246|
|  6491|    148|   4.0| 2.5359988|
| 14831|    148|   3.0| 2.7542315|
+------+-------+------+----------+
only showing top 5 rows





In [16]:
predictions.describe().show()



+-------+-----------------+------------------+------------------+------------------+
|summary|           userId|           movieId|            rating|        prediction|
+-------+-----------------+------------------+------------------+------------------+
|  count|          4998109|           4998109|           4998109|           4998109|
|   mean|81199.14127382977|21297.130436531097|3.5341648211353536| 3.431768943902224|
| stddev|46789.49919203727| 39038.46249004317|1.0609230261741123|0.6466686092617803|
|    min|                1|                 1|               0.5|         -1.640657|
|    max|           162541|            209147|               5.0|         6.9244533|
+-------+-----------------+------------------+------------------+------------------+





In [17]:
predictions = model.transform(test)
predictions.show(5)



+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
| 26480|    148|   2.0| 1.9709086|
|151614|    148|   1.0|  2.753184|
| 28229|    148|   1.0| 2.3868246|
|  6491|    148|   4.0| 2.5359988|
| 14831|    148|   3.0| 2.7542315|
+------+-------+------+----------+
only showing top 5 rows





## 평가

In [19]:
evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating', predictionCol='prediction')
rmse = evaluator.evaluate(predictions)
rmse

                                                                                

0.8067772158837905

## 모델 활용

In [21]:
from pyspark.sql.types import IntegerType

user_list = [65, 78, 81]

user_df = spark.createDataFrame(user_list, IntegerType()).toDF('userId')
user_df.show()

+------+
|userId|
+------+
|    65|
|    78|
|    81|
+------+



In [23]:
user_recommend_movies = model.recommendForUserSubset(user_df, 3)
user_recommend_movies.show()

                                                                                

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|    65|[{177209, 6.79347...|
|    78|[{177209, 6.93555...|
|    81|[{200930, 4.80531...|
+------+--------------------+



In [25]:
movie_df = spark.read.format('csv')\
            .option('header','true')\
            .option('inferSchema', 'true')\
            .load('data/movies.csv')
movie_df.show(3)

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
+-------+--------------------+--------------------+
only showing top 3 rows



In [26]:
# 65번 사용자 추천 영화 목록
movies_list = user_recommend_movies.collect()[0].recommendations
movies_list

                                                                                

[Row(movieId=177209, rating=6.793472766876221),
 Row(movieId=194434, rating=6.480366230010986),
 Row(movieId=202231, rating=6.444126129150391)]

In [27]:
rec_df = spark.createDataFrame(movies_list)
rec_df.show()

+-------+-----------------+
|movieId|           rating|
+-------+-----------------+
| 177209|6.793472766876221|
| 194434|6.480366230010986|
| 202231|6.444126129150391|
+-------+-----------------+



In [29]:
# 영화 정보와 추천 영화 목록 조인
rec_df.createOrReplaceTempView('recommend')
movie_df.createOrReplaceTempView('movies')

query = '''
select *
from movies
inner join recommend
on movies.movieId = recommend.movieId
order by rating desc
'''

spark.sql(query).show()

+-------+-----------------+------------------+-------+-----------------+
|movieId|            title|            genres|movieId|           rating|
+-------+-----------------+------------------+-------+-----------------+
| 177209|   Acı Aşk (2009)|             Drama| 177209|6.793472766876221|
| 194434|Adrenaline (1990)|(no genres listed)| 194434|6.480366230010986|
| 202231|    Foster (2018)|       Documentary| 202231|6.444126129150391|
+-------+-----------------+------------------+-------+-----------------+



In [30]:
spark.stop()