# Surprise

## 기초

### 데이터불러오기

In [7]:
import pandas as pd
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import SVD
from surprise import accuracy

In [8]:
df = pd.read_csv('./필요 데이터/ml-latest-small/ratings.csv')
display(df.head())

# surprise 데이터로 변환
reader = Reader(rating_scale = (0.5, 5))
data = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader = reader)
data

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


<surprise.dataset.DatasetAutoFolds at 0x244b35ef8b0>

### 학습용/검증용 데이터 나누기

In [10]:
trainset, testset = train_test_split(data, test_size = 0.25, random_state = 0)

### 추천을 위한 예측 알고리즘으로 학습/예측/평가

#### 학습

In [13]:
algo = SVD(random_state = 0)
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x244b88aca60>

#### 예측

In [21]:
pred_all = algo.test(testset)
pred_all[:5]

[Prediction(uid=63, iid=2000, r_ui=3.0, est=3.8949096605218374, details={'was_impossible': False}),
 Prediction(uid=31, iid=788, r_ui=2.0, est=3.3976098704249553, details={'was_impossible': False}),
 Prediction(uid=159, iid=6373, r_ui=4.0, est=2.843880098254117, details={'was_impossible': False}),
 Prediction(uid=105, iid=81564, r_ui=3.0, est=4.001264995957212, details={'was_impossible': False}),
 Prediction(uid=394, iid=480, r_ui=3.0, est=3.2445349452393972, details={'was_impossible': False})]

In [18]:
uij = str(testset[0][0])
iij = str(testset[0][1])

pred = algo.predict(uij, iij)
pred

Prediction(uid='63', iid='2000', r_ui=None, est=3.5002644558160445, details={'was_impossible': False})

In [20]:
pred.est

3.5002644558160445

#### 평가

In [22]:
accuracy.rmse(pred_all)

RMSE: 0.8692


0.8691518972016722

#### 하이퍼파라미터 튜닝

In [31]:
# 교차검증 활용

from surprise.model_selection import cross_validate

## 추천 알고리즘 지정
algo = SVD(random_state = 0)

## 교차 검증
cross_validate(algo, data, cv = 5, measures = ['RMSE', 'MAE'], verbose = True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8785  0.8758  0.8690  0.8689  0.8808  0.8746  0.0049  
MAE (testset)     0.6748  0.6732  0.6685  0.6663  0.6764  0.6718  0.0038  
Fit time          3.84    3.49    3.58    3.88    3.87    3.73    0.16    
Test time         0.77    0.75    0.82    0.73    0.72    0.76    0.04    


{'test_rmse': array([0.87849279, 0.87576235, 0.86904492, 0.86889099, 0.88077967]),
 'test_mae': array([0.67477762, 0.67316262, 0.66851386, 0.66633319, 0.67638842]),
 'fit_time': (3.835999011993408,
  3.488994598388672,
  3.577996253967285,
  3.8790056705474854,
  3.870992660522461),
 'test_time': (0.769000768661499,
  0.7540028095245361,
  0.8209991455078125,
  0.7349927425384521,
  0.715003252029419)}

In [38]:
# gridsearchCV 활용

from surprise.model_selection import GridSearchCV

params = {'n_epochs': [20, 40, 60], 'n_factors': [50, 100, 200]}

grid_clf = GridSearchCV(SVD, param_grid = params, cv = 5, measures = ['rmse', 'mae'])
grid_clf.fit(data)

print(grid_clf.best_score['rmse'])
print(grid_clf.best_score['mae'])
print(grid_clf.best_params['rmse'])
print(grid_clf.best_params['mae'])

0.870663067882038
0.6691296694225844
{'n_epochs': 20, 'n_factors': 50}
{'n_epochs': 20, 'n_factors': 50}


## 실습: 