# 잠재요인 협업 필터링 - SVD 활용

In [1]:
import pandas as pd
from google.colab import files
# Colab-only step: upload the CSV inputs (movies.csv, ratings.csv, ratings_noh.csv)
# so the later cells can read them by bare filename.
up = files.upload()

Saving movies.csv to movies.csv
Saving ratings.csv to ratings.csv
Saving ratings_noh.csv to ratings_noh.csv


In [2]:
# Load the uploaded ratings table and preview its first rows.
ratings = pd.read_csv('ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [3]:
# (number of rating rows, number of columns)
ratings.shape

(100836, 4)

In [4]:
# Number of distinct users and number of distinct movies in the ratings data
ratings.userId.nunique(), ratings.movieId.nunique()

(610, 9724)

In [5]:
!pip install scikit-surprise > /dev/null

In [6]:
from surprise import SVD, Reader
from surprise.dataset import DatasetAutoFolds

# Describe the layout of the ratings file: four comma-separated fields per line,
# with ratings on a 0.5-5 scale.
# NOTE(review): no header lines are skipped, so ratings_noh.csv is presumably the
# header-less copy of ratings.csv — confirm.
reader = Reader(
    line_format='user item rating timestamp',
    sep=',',
    rating_scale=(0.5, 5),
)
data_folds = DatasetAutoFolds('ratings_noh.csv', reader=reader)

In [7]:
# Use the entire dataset as training data
trainset = data_folds.build_full_trainset()

In [8]:
# Create the SVD model (50 latent factors, 20 training epochs, fixed seed) and train it
model = SVD(
    n_factors=50,
    n_epochs=20,
    random_state=2022,
)
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f89eb3fc710>

- 사용자 Id: 9, 영화 Id: 42 (Dead Presidents (1995))

In [9]:
# Movie metadata. Show the last five of the first forty rows, which include
# movieId 42 (Dead Presidents), the movie used in the prediction example below.
mdf = pd.read_csv('movies.csv')
mdf.head(40).tail(5)

Unnamed: 0,movieId,title,genres
35,39,Clueless (1995),Comedy|Romance
36,40,"Cry, the Beloved Country (1995)",Drama
37,41,Richard III (1995),Drama|War
38,42,Dead Presidents (1995),Action|Crime|Drama
39,43,Restoration (1995),Drama


In [10]:
# Check whether user 9 has already rated movie 42 (a count of 0 means no)
movieIds = ratings.loc[ratings.userId == 9, 'movieId']
movieIds.loc[movieIds == 42].count()

0

In [11]:
# Predicted rating of user 9 for movie 42.
# The ids are passed as strings: the dataset was read from a file, and Surprise
# keeps raw ids read from a file as strings.
uid = '9'
mid = '42'
pred = model.predict(uid, mid, verbose=True)

user: 9          item: 42         r_ui = None   est = 3.25   {'was_impossible': False}


In [12]:
# Show the full Prediction object (the unrounded estimate and the details dict)
pred

Prediction(uid='9', iid='42', r_ui=None, est=3.249924377339538, details={'was_impossible': False})

- 사용자 9번이 보지 않은 영화 중에서 예상 평점이 가장 높은 Top 10

In [13]:
# Split the movie catalogue into movies user 9 has already rated and movies not yet rated.
seen_movies = ratings[ratings.userId == 9]['movieId'].tolist()
total_movies = mdf.movieId.tolist()
# Test membership against a set: checking against the list would rescan it
# once for every movie in the catalogue.
seen_lookup = set(seen_movies)
unseen_movies = [movie for movie in total_movies if movie not in seen_lookup]
len(seen_movies), len(unseen_movies)

(46, 9696)

In [14]:
# Estimate user 9's rating for every movie they have not rated yet.
# A comprehension keeps the loop variables local, so the notebook-level `pred`
# and `mid` from the single-prediction cell are not overwritten by this cell.
uid = str(9)
predictions = [model.predict(uid, str(movie_id)) for movie_id in unseen_movies]
predictions[:5]

[Prediction(uid='9', iid='1', r_ui=None, est=3.702922347424712, details={'was_impossible': False}),
 Prediction(uid='9', iid='2', r_ui=None, est=3.2274451421980412, details={'was_impossible': False}),
 Prediction(uid='9', iid='3', r_ui=None, est=3.0342513115122123, details={'was_impossible': False}),
 Prediction(uid='9', iid='4', r_ui=None, est=2.661778597408914, details={'was_impossible': False}),
 Prediction(uid='9', iid='5', r_ui=None, est=2.689490348191407, details={'was_impossible': False})]

In [15]:
def sortkey_est(pred):
    """Sort key for predictions: return the estimated rating stored in ``pred.est``."""
    estimate = pred.est
    return estimate

In [16]:
# Order the predictions from highest to lowest estimated rating and preview the top five
predictions = sorted(predictions, key=sortkey_est, reverse=True)
predictions[:5]

[Prediction(uid='9', iid='318', r_ui=None, est=4.070330794979969, details={'was_impossible': False}),
 Prediction(uid='9', iid='1217', r_ui=None, est=4.063731956995097, details={'was_impossible': False}),
 Prediction(uid='9', iid='1261', r_ui=None, est=4.051908410348554, details={'was_impossible': False}),
 Prediction(uid='9', iid='1204', r_ui=None, est=4.0227662213503805, details={'was_impossible': False}),
 Prediction(uid='9', iid='3275', r_ui=None, est=4.011500870494226, details={'was_impossible': False})]

In [24]:
# Take the ten best predictions (the list is already sorted by estimate, descending)
# and collect each movie's id, estimated rating and title.
top_preds = predictions[:10]
top_movie_ids = [int(p.iid) for p in top_preds]  # iid is a string id; convert to int to match mdf.movieId
top_movie_ratings = [p.est for p in top_preds]
top_movie_titles = [
    mdf.loc[mdf.movieId == movie_id, 'title'].iloc[0]
    for movie_id in top_movie_ids
]

In [25]:
# Assemble the recommendation table: one row per movie, indexed by movieId,
# with the title and the estimated rating as columns.
top_df = pd.DataFrame(
    {
        '영화명': top_movie_titles,
        '예상평점': top_movie_ratings,
    },
    index=pd.Index(top_movie_ids, name='movieId'),
)
top_df

Unnamed: 0_level_0,영화명,예상평점
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
318,"Shawshank Redemption, The (1994)",4.070331
1217,Ran (1985),4.063732
1261,Evil Dead II (Dead by Dawn) (1987),4.051908
1204,Lawrence of Arabia (1962),4.022766
3275,"Boondock Saints, The (2000)",4.011501
4973,"Amelie (Fabuleux destin d'Amélie Poulain, Le) ...",3.999696
142488,Spotlight (2015),3.985348
1673,Boogie Nights (1997),3.979885
50,"Usual Suspects, The (1995)",3.978625
898,"Philadelphia Story, The (1940)",3.978415
