# Surprise

## 기초

### 데이터불러오기

In [7]:
import pandas as pd
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import SVD
from surprise import accuracy

In [8]:
df = pd.read_csv('./필요 데이터/ml-latest-small/ratings.csv')
display(df.head())

# surprise 데이터로 변환
reader = Reader(rating_scale = (0.5, 5))
data = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader = reader)
data

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


<surprise.dataset.DatasetAutoFolds at 0x244b35ef8b0>

### 학습용/검증용 데이터 나누기

In [10]:
trainset, testset = train_test_split(data, test_size = 0.25, random_state = 0)

### 추천을 위한 예측 알고리즘으로 학습/예측/평가

#### 학습

In [13]:
algo = SVD(random_state = 0)
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x244b88aca60>

#### 예측

In [21]:
pred_all = algo.test(testset)
pred_all[:5]

[Prediction(uid=63, iid=2000, r_ui=3.0, est=3.8949096605218374, details={'was_impossible': False}),
 Prediction(uid=31, iid=788, r_ui=2.0, est=3.3976098704249553, details={'was_impossible': False}),
 Prediction(uid=159, iid=6373, r_ui=4.0, est=2.843880098254117, details={'was_impossible': False}),
 Prediction(uid=105, iid=81564, r_ui=3.0, est=4.001264995957212, details={'was_impossible': False}),
 Prediction(uid=394, iid=480, r_ui=3.0, est=3.2445349452393972, details={'was_impossible': False})]

In [18]:
uij = str(testset[0][0])
iij = str(testset[0][1])

pred = algo.predict(uij, iij)
pred

Prediction(uid='63', iid='2000', r_ui=None, est=3.5002644558160445, details={'was_impossible': False})

In [20]:
pred.est

3.5002644558160445

#### 평가

In [22]:
accuracy.rmse(pred_all)

RMSE: 0.8692


0.8691518972016722

#### 하이퍼파라미터 튜닝

In [31]:
# 교차검증 활용

from surprise.model_selection import cross_validate

## 추천 알고리즘 지정
algo = SVD(random_state = 0)

## 교차 검증
cross_validate(algo, data, cv = 5, measures = ['RMSE', 'MAE'], verbose = True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8785  0.8758  0.8690  0.8689  0.8808  0.8746  0.0049  
MAE (testset)     0.6748  0.6732  0.6685  0.6663  0.6764  0.6718  0.0038  
Fit time          3.84    3.49    3.58    3.88    3.87    3.73    0.16    
Test time         0.77    0.75    0.82    0.73    0.72    0.76    0.04    


{'test_rmse': array([0.87849279, 0.87576235, 0.86904492, 0.86889099, 0.88077967]),
 'test_mae': array([0.67477762, 0.67316262, 0.66851386, 0.66633319, 0.67638842]),
 'fit_time': (3.835999011993408,
  3.488994598388672,
  3.577996253967285,
  3.8790056705474854,
  3.870992660522461),
 'test_time': (0.769000768661499,
  0.7540028095245361,
  0.8209991455078125,
  0.7349927425384521,
  0.715003252029419)}

In [38]:
# gridsearchCV 활용

from surprise.model_selection import GridSearchCV

params = {'n_epochs': [20, 40, 60], 'n_factors': [50, 100, 200]}

grid_clf = GridSearchCV(SVD, param_grid = params, cv = 5, measures = ['rmse', 'mae'])
grid_clf.fit(data)

print(grid_clf.best_score['rmse'])
print(grid_clf.best_score['mae'])
print(grid_clf.best_params['rmse'])
print(grid_clf.best_params['mae'])

0.870663067882038
0.6691296694225844
{'n_epochs': 20, 'n_factors': 50}
{'n_epochs': 20, 'n_factors': 50}


## 실습: Surprise를 이용한 개인화 영화 추천시스템 구축
- 잠재요인 협업 필터링 기반의 개인화된 영화 추천 구현
- 특정 사용자가 아직 평점을 매기지 않은 영화 중에서 개인 취향에 가장 적절한 영화 추천하기

In [48]:
from surprise import Dataset, Reader
from surprise import SVD
from surprise.model_selection import train_test_split, GridSearchCV
from surprise.accuracy import rmse, mae

### 데이터 불러오기

In [42]:
import pandas as pd
import numpy as np

In [43]:
rating_df = pd.read_csv('./필요 데이터/ml-latest-small/ratings.csv')
rating_df.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224


In [56]:
np.min(rating_df['rating']), np.max(rating_df['rating'])

(0.5, 5.0)

In [65]:
reader = Reader(rating_scale = (0.5, 5))
rating_ = Dataset.load_from_df(rating_df[['userId','movieId', 'rating']], reader = reader)
rating_

<surprise.dataset.DatasetAutoFolds at 0x244c0cf48e0>

### 데이터 다시 불러오기 및 데이터셋 전체를 학습용 데이터로 분류
- surprise 라이브러리는 학습 전에 trainset 클래스객체로 변환하지 않으면 fit()을 통한 학습이 불가능함
- 데이터 전체를 학습용 데이터로 변환하기 위해서는 DatasetAutoFolds 클래스를 통해 객체를 생성한 뒤, build_full_trainset() 메서드를 호출하면 됨

In [66]:
from surprise.dataset import DatasetAutoFolds

In [73]:
# 데이터셋 불러오기
rating_df = pd.read_csv('./필요 데이터/ml-latest-small/ratings.csv')

# 헤더 없이 저장
rating_df.to_csv('./필요 데이터/ml-latest-small/ratings_noh.csv', index =  False, header = False)

In [82]:
reader = Reader(line_format = 'user item rating timestamp', 
                sep = ',', rating_scale = (0.5,5))

fold_data = DatasetAutoFolds('./필요 데이터/ml-latest-small/ratings_noh.csv', reader = reader)
trainset = fold_data.build_full_trainset()
trainset

<surprise.trainset.Trainset at 0x244c3195d90>

### 학습

In [89]:
algo = SVD(n_factors = 50, n_epochs = 20, random_state = 0)

algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x244c3238be0>

### 테스트

In [87]:
userId = 9

# userid가 9인 사람의 평점
rating_df[rating_df['userId'] == userId][['movieId']].head()

Unnamed: 0,movieId
1073,41
1074,187
1075,223
1076,371
1077,627


In [90]:
# movieid가 42일 때 예측 평점
algo.predict(uid = '9', iid = '42')

Prediction(uid='9', iid='42', r_ui=None, est=3.130146490888994, details={'was_impossible': False})

### 추천 시스템

In [92]:
rating_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [94]:
movies_df = pd.read_csv('./필요 데이터/ml-latest-small/movies.csv')
movies_df.head(3)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


In [103]:
# 사용자가 보지 않은 영화id 리스트 반환
def get_unseen_surprise(ratings_df, movies_df, userId):
    seen_movies_idxes = rating_df[ratings_df['userId'] == userId]['movieId']
    
    all_movies_idxes = movies_df['movieId'].tolist()
    unseen_movies_idxes = [idx for idx in all_movies_idxes if idx not in seen_movies_idxes]
    
    return unseen_movies_idxes

In [175]:
# 추천시스템

def recommend_unseen_movies_by_surprise(algo, userId, movies_df, unseen_movies, top_n = 20):
    # 보지 않은 영화들의 예측 평점 리스트 확인
    preds = [algo.predict(uid = str(userId), iid = str(unseen_movie)) for unseen_movie in unseen_movies]
    
    # 예측 평점을 top_n개 순으로 내림차순 정렬
    sorted_movie_preds = sorted(preds, reverse = True, 
                                key = lambda x: x.est)[:top_n]
    
    # 반환
    movie_id = [int(i.iid) for i in sorted_movie_preds]
    print(movie_id)
    movie_title = [movies_df[movies_df['movieId'] == id_]['title'].iloc[0] for id_ in movie_id]
    movie_preds = [float(i.est) for i in sorted_movie_preds]
    
    recommend_df = pd.DataFrame(movie_preds, index = movie_title, columns = ['예측 평점'])
    return recommend_df
    

In [176]:
unseen_movies = get_unseen_surprise(rating_df, movies_df, 9)
recommend_unseen_movies_by_surprise(algo, 9, movies_df, unseen_movies, top_n = 20)

[858, 260, 296, 1196, 1198, 50, 1210, 4993, 1213, 1242, 593, 47, 1233, 56782, 741, 318, 2324, 6787, 58559, 1291]


Unnamed: 0,예측 평점
"Godfather, The (1972)",4.306302
Star Wars: Episode IV - A New Hope (1977),4.281664
Pulp Fiction (1994),4.278153
Star Wars: Episode V - The Empire Strikes Back (1980),4.226074
Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981),4.205267
"Usual Suspects, The (1995)",4.19181
Star Wars: Episode VI - Return of the Jedi (1983),4.122016
"Lord of the Rings: The Fellowship of the Ring, The (2001)",4.118003
Goodfellas (1990),4.10801
Glory (1989),4.083465
