In [209]:
import surprise

In [210]:
print(surprise.__version__)

1.1.1


In [211]:
from surprise import SVD
  ## 고유값분해(행렬을 더 낮은 차원으로 분해)해서 다시 원본으로 살리면서 잠재적인 값을 추론하는 방법
from surprise import Dataset, Reader  ## SVD를 사용하기 위한 데이터셋 만들어주는 클래스
                                ## 사용자, 아이템, 평점(rating)
from surprise import accuracy  ## RMSE, MAE, cross-validation(k-fold) 
from surprise.model_selection import train_test_split  ## 훈련/검증 데이터 분류

In [212]:
## 1. 데이터셋을 만들어주자(사용자, 아이템, 평점)  검증/검증 데이터 분류
data = Dataset.load_builtin('ml-100k')
data

<surprise.dataset.DatasetAutoFolds at 0x1635fbf7d00>

In [213]:
trainset, testset = train_test_split(data, test_size=0.25, random_state=0)

In [214]:
trainset

<surprise.trainset.Trainset at 0x163638dbf40>

In [215]:
testset[:5]

[('120', '282', 4.0),
 ('882', '291', 4.0),
 ('535', '507', 5.0),
 ('697', '244', 5.0),
 ('751', '385', 4.0)]

In [216]:
## 2. SVD() 모델 선정
algo = SVD()

In [217]:
## 3. 훈련용 데이터로 fit
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x16364fa1730>

In [218]:
## 4. 검증용 데이터로 예측
predictions = algo.test(testset)[:5]
predictions

[Prediction(uid='120', iid='282', r_ui=4.0, est=3.2031059826490904, details={'was_impossible': False}),
 Prediction(uid='882', iid='291', r_ui=4.0, est=3.9428359568684477, details={'was_impossible': False}),
 Prediction(uid='535', iid='507', r_ui=5.0, est=4.1204142434378035, details={'was_impossible': False}),
 Prediction(uid='697', iid='244', r_ui=5.0, est=3.456077226214724, details={'was_impossible': False}),
 Prediction(uid='751', iid='385', r_ui=4.0, est=3.483383091592268, details={'was_impossible': False})]

In [219]:
## 5. 정확도 계산
[(pred.uid, pred.iid, pred.est) for pred in predictions]

[('120', '282', 3.2031059826490904),
 ('882', '291', 3.9428359568684477),
 ('535', '507', 4.1204142434378035),
 ('697', '244', 3.456077226214724),
 ('751', '385', 3.483383091592268)]

In [220]:
## 6. 사용자, 아이템정보 주고 예측실행

In [221]:
uid = str(196)
iid = str(302)
pred = algo.predict(uid, iid)
pred

Prediction(uid='196', iid='302', r_ui=None, est=4.053344888107689, details={'was_impossible': False})

In [222]:
import pandas as pd

In [223]:
## csv를 SVD하기 위한 Dataset를 주기 위해서는
## 1) df <--- csv, 2) DataSet <---- df
ratings = pd.read_csv('data/ratings.csv')
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [224]:
reader = Reader(rating_scale=(0.5, 5.0))

In [225]:
## SVD에서 사용할 수 있는 데이터 셋으로 만들어주자.
data = Dataset.load_from_df(ratings[['userId','movieId', 'rating']], reader )
data

<surprise.dataset.DatasetAutoFolds at 0x163650559a0>

In [226]:
## 데이터 분류, 객체생성, 훈련시키고, 검증해보고, 검증결과 프린트
trainset, testset = train_test_split(data, test_size=0.25, random_state=0)

In [227]:
from surprise.dataset import DatasetAutoFolds

reader = Reader(line_format='user item rating timestamp', sep=',', rating_scale=(0.5, 5))
# DatasetAutoFolds 클래스를 ratings_noh.csv 파일 기반으로 생성. 
data_folds = DatasetAutoFolds(ratings_file='data/ratings_noh.csv', reader=reader)

#전체 데이터를 학습데이터로 생성함. 
trainset = data_folds.build_full_trainset()

In [258]:
algo = SVD(n_factors=20, n_epochs=30, random_state=1)
algo.fit(trainset)
predictions = algo.test(testset)
accuracy.rmse(predictions)

RMSE: 1.0388


1.0387802469854106

In [259]:
from surprise.model_selection import cross_validate

In [260]:
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8722  0.8656  0.8668  0.8671  0.8643  0.8672  0.0027  
MAE (testset)     0.6680  0.6609  0.6670  0.6653  0.6609  0.6644  0.0030  
Fit time          3.06    3.06    3.10    3.50    3.33    3.21    0.18    
Test time         0.33    0.12    0.12    0.33    0.13    0.21    0.10    


{'test_rmse': array([0.87219341, 0.86560825, 0.86684125, 0.86713133, 0.8642525 ]),
 'test_mae': array([0.66795193, 0.66092227, 0.66700644, 0.66526307, 0.66091084]),
 'fit_time': (3.0603573322296143,
  3.0599873065948486,
  3.103024959564209,
  3.503995656967163,
  3.3319947719573975),
 'test_time': (0.3299858570098877,
  0.12399601936340332,
  0.12200713157653809,
  0.3340001106262207,
  0.12900280952453613)}

In [261]:
movies = pd.read_csv('data/movies.csv')
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [262]:
movieIds = ratings[ratings['userId']==9]['movieId']
if movieIds[movieIds==42].count() == 0:
    print('사용자 아이디 9는 영화 아이디 42의 평점 없음')

print(movies[movies['movieId']==42])

사용자 아이디 9는 영화 아이디 42의 평점 없음
    movieId                   title              genres
38       42  Dead Presidents (1995)  Action|Crime|Drama


In [263]:
uid = str(9)
iid = str(42)
pred = algo.predict(uid , iid, verbose=True)

user: 9          item: 42         r_ui = None   est = 3.50   {'was_impossible': False}


In [264]:
## 내가 안본 영화 리스트를 구해서, 이중에서 추천하려고 함.
def get_unseen_surprise(movies, ratings, userId):
    ## 전체 영화리스트
    total_movies = movies['movieId'].tolist()
    
    ## 내가 본 영화 리스트
    seen_movies = ratings[ratings['userId'] == userId]['movieId'].tolist()
    
    ## 추천 대상이 되는 영화 리스트 = 전체 영화리스트 - 내가 본 영화 리스트
    unseen_movies = [movie for movie in total_movies if movie not in seen_movies]
    
    print('전체 영화 리스트 갯수 >> ', len(total_movies))
    print('내가 본 영화 리스트 갯수 >> ', len(seen_movies))
    print('내가 안본 영화 리스트 갯수 >> ', len(unseen_movies))
    
    return unseen_movies

In [265]:
unseen_moives = get_unseen_surprise(movies, ratings, 9)

전체 영화 리스트 갯수 >>  9742
내가 본 영화 리스트 갯수 >>  46
내가 안본 영화 리스트 갯수 >>  9696


In [266]:
## 안본 영화중에서 평점 예측이 높게 나온 5개를 리스트업하는 함수.
def recomm_movie_by_surprise(algo, userId, unseen_movies, top_n =10):
    ## 안본 영화리스트를 하나씩 꺼낸다음 평점을 예측하세요.
    predictions = [algo.predict(str(userId), str(movieId)) for movieId in unseen_movies]
    
    # sortkey_est 함수는 list 객체의 sort() 함수의 키 값으로 사용되어 정렬 수행.
    def sortkey_est(one):
        return one.est
    
    # sortkey_est( ) 반환값의 내림 차순으로 정렬 수행하고 top_n개의 최상위 값 추출.
    predictions.sort(key= sortkey_est, reverse=True)
    top_predictions = predictions[:top_n]
    print(len(predictions))
#     print(top_predictions)
#     print(predictions[:5])
    
    top_movie_ids = [ int(pred.iid) for pred in top_predictions]
    top_movie_rating = [ pred.est for pred in top_predictions]
    top_movie_titles = movies[movies.movieId.isin(top_movie_ids)]['title']
    top_movie_preds = [ (id, title, rating) for id, title, rating in zip(top_movie_ids, top_movie_titles, top_movie_rating)]
    
    for top_movie in top_movie_preds:
        print(top_movie[1], ":", top_movie[2])

In [267]:
recomm_movie_by_surprise(algo, 9, unseen_moives)

9696
Toy Story (1995) : 3.501437974934609
Jumanji (1995) : 3.501437974934609
Grumpier Old Men (1995) : 3.501437974934609
Waiting to Exhale (1995) : 3.501437974934609
Father of the Bride Part II (1995) : 3.501437974934609
Heat (1995) : 3.501437974934609
Sabrina (1995) : 3.501437974934609
Tom and Huck (1995) : 3.501437974934609
Sudden Death (1995) : 3.501437974934609
GoldenEye (1995) : 3.501437974934609
