## 3. Collaborative Filtering (협업 필터링 : 사용자 리뷰 기반)

In [1]:
!pip install scikit-surprise

Collecting scikit-surprise
  Using cached scikit-surprise-1.1.1.tar.gz (11.8 MB)
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py): started
  Building wheel for scikit-surprise (setup.py): finished with status 'done'
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp39-cp39-win_amd64.whl size=726231 sha256=b1e598624fe52b3640e760c42897febaed746f1c3eda4c2bb5da0cfc12279077
  Stored in directory: c:\users\chan_lee\appdata\local\pip\cache\wheels\6b\10\c9\7f607c8cb522ef378844f41e63b30d7181a6495d2c1ae514e9
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.1


In [2]:
# Show the installed scikit-surprise version so the results below can be tied
# to a specific library release.
import surprise
surprise.__version__

'1.1.1'

In [5]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from surprise import Reader, Dataset, SVD
from surprise.model_selection import KFold, cross_validate

In [6]:
# Load the user/movie ratings table from ratings_small.csv (path is relative to the
# notebook's working directory) and preview the first rows.
ratings = pd.read_csv('ratings_small.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [7]:
# Smallest rating present in the dataset (lower end of the rating scale).
ratings.loc[:, 'rating'].min()

0.5

In [8]:
# Largest rating present in the dataset (upper end of the rating scale).
ratings.loc[:, 'rating'].max()

5.0

In [10]:
# The rating scale matches the observed minimum (0.5) and maximum (5.0) of
# ratings['rating'] checked in the cells above.
reader = Reader(rating_scale=(0.5, 5))

In [14]:
# Surprise requires a DataFrame holding only the user id, item id and rating
# columns (in that order), so the timestamp column is left out here.
data = Dataset.load_from_df(
    df=ratings.loc[:, ['userId', 'movieId', 'rating']],
    reader=reader,
)

data

<surprise.dataset.DatasetAutoFolds at 0x1acb1f14bb0>

In [15]:
# SVD matrix-factorization recommender from Surprise. random_state=0 pins the seed
# so repeated runs start from the same state (presumably the random factor
# initialization — confirm against the Surprise SVD docs).
svd = SVD(random_state=0)

In [17]:
# Evaluate the SVD model with 5-fold cross-validation, reporting
# RMSE (Root Mean Squared Error) and MAE (Mean Absolute Error) on each held-out fold.
# A seeded KFold is passed instead of a bare `cv=5` so that the fold assignment,
# and therefore the reported scores, are repeatable from run to run.
cross_validate(
    svd,
    data,
    measures=['RMSE', 'MAE'],
    cv=KFold(n_splits=5, random_state=0),
    verbose=True,
)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8971  0.8940  0.8990  0.8986  0.8950  0.8967  0.0020  
MAE (testset)     0.6892  0.6924  0.6890  0.6926  0.6890  0.6904  0.0017  
Fit time          3.81    3.76    3.72    3.71    3.74    3.75    0.04    
Test time         0.11    0.10    0.15    0.12    0.11    0.12    0.02    


{'test_rmse': array([0.89706182, 0.89400962, 0.89904533, 0.89855042, 0.89502597]),
 'test_mae': array([0.68920342, 0.69243747, 0.68895574, 0.6925668 , 0.688966  ]),
 'fit_time': (3.814973831176758,
  3.761998176574707,
  3.7179980278015137,
  3.709000587463379,
  3.7400224208831787),
 'test_time': (0.10799956321716309,
  0.10400009155273438,
  0.14800119400024414,
  0.11500167846679688,
  0.10897612571716309)}

### 교차 검증 (K-Fold 교차 검증)
example)  

100개 데이터

cv=5   
5개로 나누어서 진행하라는 것  

- A : 1-20
- B : 21-40
- C : 41-60
- D : 61-80
- E : 81-100

ABCD (train set) E (test set)  
ABCE (train set) D (test set)  
ABDE (train set) C (test set)  
ACDE (train set) B (test set)  
BCDE (train set) A (test set)  

학습-테스트를 총 5번 수행  
5번의 평가 결과(RMSE, MAE 등)를 평균하여 최종 성능으로 사용  

In [18]:
# Build a training set from `data` via build_full_trainset() and fit the SVD model
# on it; the fitted `svd` object is what the predict calls below use.
trainset = data.build_full_trainset()
svd.fit(trainset) # train the model

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1acb1f5ca00>

In [20]:
# All ratings given by user 1, for comparison with the predictions that follow.
ratings.loc[ratings['userId'].eq(1)]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


In [22]:
# Arguments are the user id and the item (movie) id being rated;
# `est` in the returned Prediction is the predicted rating.
svd.predict(uid=1, iid=302)

Prediction(uid=1, iid=302, r_ui=None, est=2.7142061734434044, details={'was_impossible': False})

In [23]:
# r_ui (default None) is the actual rating: here we state that the user really gave
# the item 3 points so it can be compared with `est`. The estimate itself is the
# same as in the call without r_ui.
svd.predict(uid=1, iid=302, r_ui=3)

Prediction(uid=1, iid=302, r_ui=3, est=2.7142061734434044, details={'was_impossible': False})

In [25]:
# User 1 actually rated movie 1029 with 3.0 (see that user's ratings above);
# what rating does the model predict for the same pair?
svd.predict(uid=1, iid=1029, r_ui=3)

Prediction(uid=1, iid=1029, r_ui=3, est=2.8814455446761933, details={'was_impossible': False})

In [26]:
# All ratings given by user 100, for comparison with the prediction that follows.
ratings.loc[ratings['userId'].eq(100)]

Unnamed: 0,userId,movieId,rating,timestamp
15273,100,1,4.0,854193977
15274,100,3,4.0,854194024
15275,100,6,3.0,854194023
15276,100,7,3.0,854194024
15277,100,25,4.0,854193977
15278,100,32,5.0,854193977
15279,100,52,3.0,854194056
15280,100,62,3.0,854193977
15281,100,86,3.0,854194208
15282,100,88,2.0,854194208


In [27]:
# Predicted rating for userId = 100 on movieId = 1029.
svd.predict(uid=100, iid=1029)

Prediction(uid=100, iid=1029, r_ui=None, est=3.7705476478414846, details={'was_impossible': False})