In [1]:
from pyspark.sql import SparkSession

In [2]:
# OutOfMemory 오류가 나면 각종 설정을 추가적으로 해줄 수 있다.
MAX_MEMORY = '5g'
spark = SparkSession.builder.appName("movie-recommendation")\
    .config("spark.executor.memory", MAX_MEMORY)\
    .config("spark.driver.memory", MAX_MEMORY)\
    .getOrCreate()

In [6]:
directory="C:\\Users\\daesi\\Downloads\\빅데이터 소스코드\\소스코드\\study_spark\\data\\ml-25m"
filename = "ratings.csv"

In [7]:
ratings_df = spark.read.csv(f"file:///{directory}\\{filename}", inferSchema=True, header=True)
ratings_df.show()

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|    296|   5.0|1147880044|
|     1|    306|   3.5|1147868817|
|     1|    307|   5.0|1147868828|
|     1|    665|   5.0|1147878820|
|     1|    899|   3.5|1147868510|
|     1|   1088|   4.0|1147868495|
|     1|   1175|   3.5|1147868826|
|     1|   1217|   3.5|1147878326|
|     1|   1237|   5.0|1147868839|
|     1|   1250|   4.0|1147868414|
|     1|   1260|   3.5|1147877857|
|     1|   1653|   4.0|1147868097|
|     1|   2011|   2.5|1147868079|
|     1|   2012|   2.5|1147868068|
|     1|   2068|   2.5|1147869044|
|     1|   2161|   3.5|1147868609|
|     1|   2351|   4.5|1147877957|
|     1|   2573|   4.0|1147878923|
|     1|   2632|   5.0|1147878248|
|     1|   2692|   5.0|1147869100|
+------+-------+------+----------+
only showing top 20 rows



In [8]:
ratings_df = ratings_df.select(["userid", "movieId", "rating"])
ratings_df.printSchema()

root
 |-- userid: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)



In [9]:
ratings_df.select('rating').describe().show()

+-------+------------------+
|summary|            rating|
+-------+------------------+
|  count|          25000095|
|   mean| 3.533854451353085|
| stddev|1.0607439611423508|
|    min|               0.5|
|    max|               5.0|
+-------+------------------+



In [10]:
train_ratio = 0.8
test_ratio  = 0.2

train_df, test_df = ratings_df.randomSplit([0.8, 0.2])

# ALS 추천 알고리즘 가져오기

In [13]:
from pyspark.ml.recommendation import ALS

In [14]:
als = ALS(
    maxIter=5,
    regParam=0.1,
    userCol = "userid",
    itemCol = "movieId",
    ratingCol = "rating",
    coldStartStrategy="drop"
)

In [15]:
#학습하기
model = als.fit(train_df)

In [16]:
#예측하기
predictions = model.transform(test_df)
predictions.show()

+------+-------+------+----------+
|userid|movieId|rating|prediction|
+------+-------+------+----------+
|   101|   8638|   5.0| 3.4953117|
|   243|   1580|   3.0| 2.6439457|
|   322|   1645|   4.0| 3.5498655|
|   332|   1580|   4.0| 3.2362347|
|   368|   1580|   3.5| 3.6040554|
|   375|   1580|   2.5| 3.5270433|
|   497|   2366|   4.0| 3.9810052|
|   501|   1580|   5.0|   3.96489|
|   501|   1645|   4.0| 3.6354952|
|   513|  44022|   5.0| 4.2891164|
|   597|   1580|   4.0| 3.7775984|
|   597|   1591|   2.0| 2.5916934|
|   597|   1645|   5.0| 3.4550447|
|   597|   3175|   5.0| 3.9443662|
|   606|  36525|   2.5| 4.2140217|
|   606| 160563|   4.0|  4.080393|
|   626|   1088|   4.0| 3.3628745|
|   626|   2366|   3.0| 3.2461371|
|   626|   6658|   3.5|  2.864083|
|   756|  44022|   3.5| 3.2319312|
+------+-------+------+----------+
only showing top 20 rows



In [17]:
predictions.select("rating", "prediction").describe().show()

+-------+------------------+------------------+
|summary|            rating|        prediction|
+-------+------------------+------------------+
|  count|           4992305|           4992305|
|   mean| 3.534352468448943| 3.423928911872502|
| stddev|1.0605168816397743|0.6426701150440502|
|    min|               0.5|        -1.4200428|
|    max|               5.0|          6.374601|
+-------+------------------+------------------+



In [18]:
# 영화의 평점 예측, 즉 회귀를 진행 했기 때문에 RegressionEvaluator로 평가 진행
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating', predictionCol='prediction')

In [19]:
# 평가 결과 확인하기

rmse = evaluator.evaluate(predictions)
print(rmse)

0.8083967430557135


In [20]:
# 각 user 에게 top3 아이템을 추천 = {item 번호, 예측된 점수}
model.recommendForAllUsers(3).show()



+------+--------------------+
|userid|     recommendations|
+------+--------------------+
|    26|[{177209, 5.68791...|
|    27|[{179063, 6.47533...|
|    28|[{177209, 7.73377...|
|    31|[{203086, 3.90486...|
|    34|[{177209, 5.62623...|
|    44|[{177209, 6.86821...|
|    53|[{194334, 6.58236...|
|    65|[{127252, 6.34869...|
|    76|[{177209, 6.28936...|
|    78|[{177209, 6.99564...|
|    81|[{158755, 4.75545...|
|    85|[{177209, 5.66690...|
|   101|[{177209, 5.19097...|
|   103|[{177209, 6.38743...|
|   108|[{177209, 5.36059...|
|   115|[{203086, 6.59715...|
|   126|[{177209, 6.47607...|
|   133|[{177209, 5.52077...|
|   137|[{203086, 5.97566...|
|   148|[{177209, 5.83982...|
+------+--------------------+
only showing top 20 rows



In [21]:
# 각 movie에 어울리는 top 3 user를 추천 
model.recommendForAllItems(3).show()

+-------+--------------------+
|movieId|     recommendations|
+-------+--------------------+
|     12|[{87426, 5.2856},...|
|     26|[{105801, 5.24974...|
|     27|[{87426, 5.498817...|
|     28|[{105801, 5.48115...|
|     31|[{87426, 5.230362...|
|     34|[{128562, 5.58630...|
|     44|[{87426, 5.081454...|
|     53|[{37339, 5.234616...|
|     65|[{55247, 5.097826...|
|     76|[{87426, 5.310548...|
|     78|[{142811, 4.89411...|
|     81|[{142811, 4.90820...|
|     85|[{67565, 4.989594...|
|    101|[{87816, 5.111314...|
|    103|[{142811, 5.21307...|
|    108|[{142811, 5.20111...|
|    115|[{96740, 5.854862...|
|    126|[{87426, 4.821117...|
|    133|[{33115, 5.563246...|
|    137|[{67565, 5.053014...|
+-------+--------------------+
only showing top 20 rows



In [22]:
from pyspark.sql.types import IntegerType

user_list = [65, 78, 81]
users_df = spark.createDataFrame(user_list, IntegerType()).toDF("userId")

users_df.show()

+------+
|userId|
+------+
|    65|
|    78|
|    81|
+------+



In [23]:
# 데이터 프레임으로 예측 할 때는 recommendForUserSubset
user_recs = model.recommendForUserSubset(users_df, 5) # 각 user에 대해 top 5 추천
user_recs.show()

+------+--------------------+
|userid|     recommendations|
+------+--------------------+
|    65|[{127252, 6.34869...|
|    78|[{177209, 6.99564...|
|    81|[{158755, 4.75545...|
+------+--------------------+



In [24]:
movies_list = user_recs.collect()[0].recommendations
movies_list

[Row(movieId=127252, rating=6.348696231842041),
 Row(movieId=159761, rating=6.030430793762207),
 Row(movieId=177209, rating=5.985671043395996),
 Row(movieId=169606, rating=5.9849934577941895),
 Row(movieId=127843, rating=5.9390177726745605)]

In [25]:
recs_df = spark.createDataFrame(movies_list)
recs_df.show()

+-------+------------------+
|movieId|            rating|
+-------+------------------+
| 127252| 6.348696231842041|
| 159761| 6.030430793762207|
| 177209| 5.985671043395996|
| 169606|5.9849934577941895|
| 127843|5.9390177726745605|
+-------+------------------+



In [26]:
movies_file = "movies.csv"

In [27]:
movies_df = spark.read.csv(f"file:///{directory}\\{movies_file}", inferSchema=True, header=True)
movies_df.show()

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
|      6|         Heat (1995)|Action|Crime|Thri...|
|      7|      Sabrina (1995)|      Comedy|Romance|
|      8| Tom and Huck (1995)|  Adventure|Children|
|      9| Sudden Death (1995)|              Action|
|     10|    GoldenEye (1995)|Action|Adventure|...|
|     11|American Presiden...|Comedy|Drama|Romance|
|     12|Dracula: Dead and...|       Comedy|Horror|
|     13|        Balto (1995)|Adventure|Animati...|
|     14|        Nixon (1995)|               Drama|
|     15|Cutthroat Island ...|Action|Adventure|...|
|     16|       Casino (1995)|         Crime|Drama|
|     17|Sen

In [28]:
#sql사용하기 위함
recs_df.createOrReplaceTempView("recommendations")
movies_df.createOrReplaceTempView("movies")

In [29]:
query = """
    
    SELECT * 
    
    FROM movies
    JOIN recommendations ON movies.movieId = recommendations.movieId
    
    ORDER BY rating desc
    
"""

recommended_movies = spark.sql(query)
recommended_movies.show()

+-------+--------------------+--------------------+-------+------------------+
|movieId|               title|              genres|movieId|            rating|
+-------+--------------------+--------------------+-------+------------------+
| 127252|The Veil of Twili...|Crime|Fantasy|Mys...| 127252| 6.348696231842041|
| 159761|         Loot (1970)|        Comedy|Crime| 159761| 6.030430793762207|
| 177209|      Acı Aşk (2009)|               Drama| 177209| 5.985671043395996|
| 169606|Dara O'Briain Cro...|              Comedy| 169606|5.9849934577941895|
| 127843|   Antarctica (1983)|     Adventure|Drama| 127843|5.9390177726745605|
+-------+--------------------+--------------------+-------+------------------+



In [30]:
#유저별로 영화추천 > 함수화 시켜보기
def get_recommendations(user_id, num_recs):
    user_df = spark.createDataFrame([user_id], IntegerType()).toDF("userId")
    user_recs_df = model.recommendForUserSubset(user_df, num_recs)
    
    recs_list = user_recs_df.collect()[0].recommendations
    recs_df = spark.createDataFrame(recs_list)
    
    recommended_movies = recs_df.join(movies_df, "movieId")
    return recommended_movies

In [31]:
recs = get_recommendations(456, 10) #456유저 10개 추천
recs.toPandas()

Unnamed: 0,movieId,rating,title,genres
0,177209,6.991477,Acı Aşk (2009),Drama
1,159761,6.676736,Loot (1970),Comedy|Crime
2,203086,6.568202,Truth and Justice (2019),Drama
3,203882,6.42856,Dead in the Water (2006),Horror
4,144202,6.252572,Catch That Girl (2002),Action|Children
5,183947,6.149256,NOFX Backstage Passport 2,(no genres listed)
6,143571,6.116813,Private Practices: The Story of a Sex Surrogat...,Documentary
7,174627,5.961359,An Alternative Reality: The Football Manager D...,Documentary
8,127843,5.960376,Antarctica (1983),Adventure|Drama
9,187951,5.936113,Father of Lights (2012),Documentary


In [32]:
spark.stop()