# Surprise를 이용한 개인화 영화 추천 시스템 구축
- 잠재요인 협업 필터링
- 특이값 분해(SVD: Singular Value Decomposition) 활용

In [1]:
import pandas as pd
ratings = pd.read_csv('data/ml-latest/ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
ratings.shape

(100836, 4)

In [6]:
# 사용자 수, 영화 수
len(ratings.userId.value_counts()), len(ratings.movieId.value_counts())

(610, 9724)

In [2]:
from surprise import SVD, Reader
from surprise.dataset import DatasetAutoFolds

reader = Reader(line_format='user item rating timestamp', sep=',', rating_scale=(0.5,5))
data_folds = DatasetAutoFolds(ratings_file='data/ml-latest/ratings_noh.csv', reader=reader)

In [3]:
# 전체 데이터를 학습 데이터로 사용
trainset = data_folds.build_full_trainset()

In [7]:
# 모델 생성 및 학습
model = SVD(n_epochs=20, n_factors=50, random_state=2022)
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x27b215743a0>

- 사용자 Id: 9, 영화 Id:42 (Dead Presidents (1995))

In [8]:
# 영화 정보
mdf = pd.read_csv('data/ml-latest/movies.csv')
mdf.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [10]:
# 사용자 9번이 42번 영화를 보았는지 확인
movieIds = ratings[ratings.userId == 9]['movieId']
movieIds[movieIds == 42].count()

0

In [11]:
# 사용자 9번의 42번 영화에 대한 예상 평점
uid = str(9)
mid = str(42)

pred = model.predict(uid, mid, verbose=True)

user: 9          item: 42         r_ui = None   est = 3.25   {'was_impossible': False}


In [12]:
pred

Prediction(uid='9', iid='42', r_ui=None, est=3.2499243773395383, details={'was_impossible': False})

### 사용자 9번이 보지 않은 영화중에서 예상점수가 가장 높은 Top 10

In [18]:
# 보지 않은 영화 리스트 도출
seen_movies = ratings[ratings.userId == 9]['movieId'].tolist()
total_movies = mdf.movieId.tolist()
unseen_movies = [movie for movie in total_movies if movie not in seen_movies]
print(f'평점매긴 영화수: {len(seen_movies)}, 추천대상 영화수: {len(unseen_movies)}')

평점매긴 영화수: 46, 추천대상 영화수: 9696


In [19]:
predictions = [model.predict(str(9), str(mid)) for mid in unseen_movies]
predictions[:5]

[Prediction(uid='9', iid='1', r_ui=None, est=3.7029223474247126, details={'was_impossible': False}),
 Prediction(uid='9', iid='2', r_ui=None, est=3.2274451421980412, details={'was_impossible': False}),
 Prediction(uid='9', iid='3', r_ui=None, est=3.034251311512212, details={'was_impossible': False}),
 Prediction(uid='9', iid='4', r_ui=None, est=2.661778597408914, details={'was_impossible': False}),
 Prediction(uid='9', iid='5', r_ui=None, est=2.689490348191407, details={'was_impossible': False})]

In [20]:
def sortkey_est(pred):
    return pred.est

In [26]:
predictions.sort(key=sortkey_est, reverse=True)
predictions[:5]

[Prediction(uid='9', iid='318', r_ui=None, est=4.070330794979969, details={'was_impossible': False}),
 Prediction(uid='9', iid='1217', r_ui=None, est=4.063731956995097, details={'was_impossible': False}),
 Prediction(uid='9', iid='1261', r_ui=None, est=4.051908410348554, details={'was_impossible': False}),
 Prediction(uid='9', iid='1204', r_ui=None, est=4.0227662213503805, details={'was_impossible': False}),
 Prediction(uid='9', iid='3275', r_ui=None, est=4.011500870494226, details={'was_impossible': False})]

In [33]:
top_predictions = predictions[:10]
top_movie_ids = [int(pred.iid) for pred in top_predictions]
top_movie_ratings = [pred.est for pred in top_predictions]
top_movie_titles = mdf[mdf.movieId.isin(top_movie_ids)]['title']
top_df = pd.DataFrame({
    '영화명': top_movie_titles,
    '예상 평점': top_movie_ratings
})
top_df.index.rename('movieId', inplace=True)
top_df

Unnamed: 0_level_0,영화명,예상 평점
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
46,"Usual Suspects, The (1995)",4.070331
277,"Shawshank Redemption, The (1994)",4.063732
680,"Philadelphia Story, The (1940)",4.051908
906,Lawrence of Arabia (1962),4.022766
918,Ran (1985),4.011501
960,Evil Dead II (Dead by Dawn) (1987),3.999696
1258,Boogie Nights (1997),3.985348
2462,"Boondock Saints, The (2000)",3.979885
3622,"Amelie (Fabuleux destin d'Amélie Poulain, Le) ...",3.978625
9071,Spotlight (2015),3.978415
