In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [None]:
# !pip3 install surprise # 매번 인스톨

In [2]:
ratings = pd.read_csv("./data/ml-latest-small/ratings.csv")

In [3]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
# 불필요한 컬럼 제거
ratings = ratings.drop('timestamp', axis=1)
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [5]:
# sample 데이터를 이용해 Item-user 매트릭스 그려보기
ratings.set_index(['movieId', 'userId']).unstack()['rating']

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,,,4.0,,4.5,,,,...,4.0,,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,,,,,,4.0,,4.0,,,...,,4.0,,5.0,3.5,,,2.0,,
3,4.0,,,,,5.0,,,,,...,,,,,,,,2.0,,
4,,,,,,3.0,,,,,...,,,,,,,,,,
5,,,,,,5.0,,,,,...,,,,3.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,,,,,,,,,,,...,,,,,,,,,,
193583,,,,,,,,,,,...,,,,,,,,,,
193585,,,,,,,,,,,...,,,,,,,,,,
193587,,,,,,,,,,,...,,,,,,,,,,


In [6]:
# item-item CF를 사용할 때 연산 수가 너무 커져서 azure notebook으로는 실행이 안됨
sample = ratings.sample(frac=.3, random_state=42)

## 1. 서프라이즈 패키지용 데이터셋 세팅

In [7]:
from surprise import Reader, Dataset

In [8]:
# 서프라이즈 패키지에게 rating의 최솟값, 최댓값 알려줌
reader = Reader(rating_scale=(0,5))
data = Dataset.load_from_df(sample, reader)

## 2. train, test 데이터 셋 세팅_surprise

In [9]:
from surprise.model_selection import train_test_split

In [10]:
train, test = train_test_split(data, test_size=0.1, random_state=42)

surprise의 모델에 돌리기 위해서는 특별한 형태의 데이터로 변환이 필요

계산의 용이성을 위해 user_id, item_id도 임의의 값으로 변환됨

- 기존의 user_id과 item_id는 `raw_uid`, `raw_iid`
- 서프라이즈 train_set 내부의 uid와 iid는 `inner_uid`, `inner_iid`

In [11]:
train

<surprise.trainset.Trainset at 0x7fc9c2b50c18>

In [12]:
# 서프라이즈 trainset 내부 데이터 확인(inner_uid, inner_iid)
train.ur

defaultdict(list,
            {0: [(0, 4.0),
              (411, 3.0),
              (461, 3.0),
              (2705, 4.0),
              (984, 5.0),
              (4, 5.0),
              (1, 3.0),
              (209, 4.0),
              (428, 3.0),
              (305, 3.0),
              (3, 3.0),
              (1045, 4.0),
              (65, 3.0),
              (255, 4.0),
              (3931, 3.0),
              (430, 3.0),
              (2226, 4.0),
              (1584, 3.0),
              (352, 3.0),
              (1214, 4.0),
              (256, 4.0)],
             1: [(1, 4.0),
              (710, 4.0),
              (871, 5.0),
              (981, 4.0),
              (226, 2.0),
              (674, 4.5),
              (569, 2.5),
              (394, 4.5),
              (1264, 4.5),
              (307, 4.5),
              (2256, 5.0),
              (695, 4.0),
              (250, 4.0),
              (464, 4.5),
              (1526, 5.0),
              (416, 2.5),
              (

In [13]:
# 원본데이터에는 없는 유저아이디 0번과 무비아이디 0번
print(len(ratings[ratings['userId']==0]))
print(len(ratings[ratings['movieId']==0]))

0
0


In [14]:
raw_uid_0 = train.to_raw_uid(0)
raw_uid_0

476

In [15]:
raw_iid_0 = train.to_raw_iid(0)
raw_iid_0

2

In [16]:
# 실제 userId와 movieId
ratings[(ratings['userId']==raw_uid_0) & (ratings['movieId']==raw_iid_0)]

Unnamed: 0,userId,movieId,rating
75356,476,2,4.0


test 데이터 확인 raw_iid, raw_uid

In [17]:
# test 데이터 확인 (raw_iid, raw_uid)
test[:5]

[(474, 1704, 3.5),
 (307, 54272, 4.5),
 (275, 1186, 5.0),
 (600, 2291, 4.5),
 (351, 480, 4.0)]

In [41]:
ratings[(ratings['userId']==test[0][0]) & (ratings['movieId']==test[0][1])]

Unnamed: 0,userId,movieId,rating
73577,474,1704,3.5


In [42]:
ratings[(ratings['userId']==test[1][0]) & (ratings['movieId']==test[1][1])]

Unnamed: 0,userId,movieId,rating
47651,307,54272,4.5


<br>


<br>

# 3. KNNWithMeans item-item 모델링

In [20]:
from surprise.prediction_algorithms.knns import KNNWithMeans

**KNNWithMeans 모델 파라미터**

1. k : 타겟과 유사도가 높은 유저들 중 몇명까지 계산에 사용할 것인지(최댓값).
2. min_k : 타겟과 유사도가 높은 유저들이 많지 않다면 최소 몇명은 그래도 있어야 하는지(최솟값). 비슷한 유저의 수가 min_k보다 작을 경우 전체 데이터의 평균값이 반환.
3. sim_options : 유사도 관련한 모든 옵션
    - name : 어떤 유사도를 사용할 것인지 이름으로 지정.
    - min_support : 유저 간에 공통되는 아이템이 최소 몇개 이상이어야 하는지 지정. 두 유저의 공통 아이템이 min_support 값 이하일 경우 전체의 평균값이 반환.
    - user_based : True일 경우 User-based(User-user) CF, False일 경우 Item-based(Item-item) CF 방식 사용.
4. verbose : 모델 트레이닝 과정에서 반환되는 중간 결과물 들을 프린트해주는 명령(True일 경우 프린트, False일 경우 아무것도 보여주지 않음)


In [21]:
sim_options = {'name': 'pearson',
              'min_support': 0,
              'user_based': False}

model = KNNWithMeans(k=20, min_k=5, sim_options=sim_options, verbose=False)

In [22]:
# trainset 피팅
model.fit(train)

<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7fc9c69aeef0>

In [23]:
# testset으로 학습된 모델로 test셋 결과 예측
predictions = model.test(test)

# 4. Userid, Itemid, 실제값, 예측값, 연산 결과 확인

KNNBasic에서는 타겟 아이템과 유사한 아이템의 갯수가 min_k보다 적을 경우 전체 평점의 평균(global_mean)을 적용하고 있으나 KNNWithMeans에서는 아이템의 평균 평점을 적용

### CASE 1. 유사 아이템의 갯수가 충분히 많을 때(> min_k)

In [27]:
prediction_df = pd.DataFrame(predictions)

In [28]:
prediction_df.head()

Unnamed: 0,uid,iid,r_ui,est,details
0,474,1704,3.5,4.189198,"{'actual_k': 20, 'was_impossible': False}"
1,307,54272,4.5,3.231363,"{'actual_k': 6, 'was_impossible': False}"
2,275,1186,5.0,2.5,"{'actual_k': 1, 'was_impossible': False}"
3,600,2291,4.5,3.522651,"{'actual_k': 20, 'was_impossible': False}"
4,351,480,4.0,3.894863,"{'actual_k': 12, 'was_impossible': False}"


### CASE 2. 유사 아이템이 있으나 충분히 많지 않을 때(< min_k)

In [29]:
train.to_inner_iid(1186)

826

In [30]:
# train의 딕셔너리 key=inner_item_id / value=[inner_user_id, ratings]
train.ir

defaultdict(list,
            {0: [(0, 4.0),
              (42, 5.0),
              (105, 2.5),
              (23, 3.0),
              (431, 4.5),
              (410, 4.0),
              (359, 4.0),
              (398, 3.0),
              (385, 2.5),
              (63, 3.5),
              (69, 5.0),
              (180, 3.0),
              (154, 4.5),
              (333, 5.0),
              (216, 4.5),
              (40, 2.0),
              (357, 5.0),
              (46, 4.0),
              (289, 3.0),
              (65, 3.5),
              (139, 2.5),
              (131, 3.5),
              (58, 4.0),
              (381, 3.5),
              (73, 3.5),
              (207, 3.0),
              (12, 2.5)],
             1: [(1, 4.0),
              (69, 3.0),
              (182, 4.0),
              (130, 4.0),
              (132, 4.0),
              (501, 3.0),
              (30, 3.5),
              (79, 3.5),
              (18, 4.0),
              (475, 4.0),
              (44, 5.0),
      

In [31]:
# inner_item_id가 826인 영화의 평균 평점 
# == min_k보다 작은 수의 데이터를 가진 826번 아이템의 예측평점

pd.DataFrame(train.ir[826])[1].mean()

2.5

### CASE 3. 유사 아이템이 없을 때

In [45]:
prediction_df['details'].astype('str').unique()

array(["{'actual_k': 20, 'was_impossible': False}",
       "{'actual_k': 6, 'was_impossible': False}",
       "{'actual_k': 1, 'was_impossible': False}",
       "{'actual_k': 12, 'was_impossible': False}",
       "{'actual_k': 3, 'was_impossible': False}",
       "{'was_impossible': True, 'reason': 'User and/or item is unkown.'}",
       "{'actual_k': 0, 'was_impossible': False}",
       "{'actual_k': 9, 'was_impossible': False}",
       "{'actual_k': 10, 'was_impossible': False}",
       "{'actual_k': 18, 'was_impossible': False}",
       "{'actual_k': 5, 'was_impossible': False}",
       "{'actual_k': 2, 'was_impossible': False}",
       "{'actual_k': 4, 'was_impossible': False}",
       "{'actual_k': 13, 'was_impossible': False}",
       "{'actual_k': 17, 'was_impossible': False}",
       "{'actual_k': 8, 'was_impossible': False}",
       "{'actual_k': 7, 'was_impossible': False}",
       "{'actual_k': 15, 'was_impossible': False}",
       "{'actual_k': 16, 'was_impossible': False}"

In [47]:
prediction_df[prediction_df['details'].astype('str') == "{'was_impossible': True, 'reason': 'User and/or item is unkown.'}"].head()

Unnamed: 0,uid,iid,r_ui,est,details
8,460,89039,4.5,3.49438,"{'was_impossible': True, 'reason': 'User and/o..."
17,474,5279,3.5,3.49438,"{'was_impossible': True, 'reason': 'User and/o..."
28,202,3372,4.0,3.49438,"{'was_impossible': True, 'reason': 'User and/o..."
30,66,7766,4.0,3.49438,"{'was_impossible': True, 'reason': 'User and/o..."
41,599,7704,2.0,3.49438,"{'was_impossible': True, 'reason': 'User and/o..."


In [48]:
# 'was_impossible':True인 경우 train의 global_mean 값으로 예측값을 지정
train.global_mean

3.494380165289256

**샘플 데이터 프레임에서 확인**

movieId 89039인 데이터가 하나밖에 없는데 testset에만 들어있음

In [50]:
sample[sample['userId'] == 460]

Unnamed: 0,userId,movieId,rating
71491,460,34405,5.0
71454,460,3114,4.0
71508,460,68954,4.5
71501,460,51255,4.5
71502,460,54259,4.0
71457,460,3615,3.0
71499,460,48780,4.5
71452,460,2762,4.5
71451,460,2747,3.0
71459,460,3831,4.5


In [51]:
sample[sample['movieId'] == 89039]

Unnamed: 0,userId,movieId,rating
71516,460,89039,4.5


**surprise 매서드로 확인**

movieId 89039인 데이터가 하나밖에 없는데 testset에만 들어있음

In [52]:
# 441 유저가 train에 있는지 확인
train.knows_user(uid=train.to_inner_uid(460))

True

In [53]:
# 4402 아이템이 train에 있는지 확인 --> 없음
train.knows_item(iid=train.to_inner_iid(89039))

ValueError: Item 89039 is not part of the trainset.

# 5. KNN 확인하기

타겟 유저와 유사도가 높은 k명의 유저들 확인

In [54]:
prediction_df.head()

Unnamed: 0,uid,iid,r_ui,est,details
0,474,1704,3.5,4.189198,"{'actual_k': 20, 'was_impossible': False}"
1,307,54272,4.5,3.231363,"{'actual_k': 6, 'was_impossible': False}"
2,275,1186,5.0,2.5,"{'actual_k': 1, 'was_impossible': False}"
3,600,2291,4.5,3.522651,"{'actual_k': 20, 'was_impossible': False}"
4,351,480,4.0,3.894863,"{'actual_k': 12, 'was_impossible': False}"


In [110]:
# item 54272와 유사도가 높은 iid 확인
# get_neighbors() 함수는 inner_id를 결과로 반환하므로 원본 id를 알기 위해서는 변환 필요

iid = train.to_inner_iid(54272)
k = 20

In [111]:
iid

174

In [116]:
[inner_iid for inner_iid in model.get_neighbors(iid, k)]

[11,
 14,
 68,
 93,
 100,
 188,
 232,
 235,
 243,
 363,
 424,
 437,
 472,
 479,
 506,
 511,
 529,
 548,
 569,
 574]

In [117]:
sim_df = pd.DataFrame(model.sim)
sim_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5884,5885,5886,5887,5888,5889,5890,5891,5892,5893
0,1.000000,-0.645497,0.0,0.717137,0.920737,0.0,0.0,-1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.645497,1.000000,0.0,0.643268,0.606200,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.000000,1.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.717137,0.643268,0.0,1.000000,0.695852,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.920737,0.606200,0.0,0.695852,1.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5889,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
5890,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5891,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5892,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [119]:
# 유사도가 높은 아이템 순으로 정렬(inner_iid)
# item 54272의 actual_k가 6이라는 뜻은 이 리스트 중에서 min_surpport가 3이상인 item의 갯수가 6개라는 뜻.
pd.Series(sim_df.iloc[iid].sort_values(ascending=False))[:50]

424     1.0
931     1.0
3185    1.0
235     1.0
232     1.0
594     1.0
972     1.0
967     1.0
3540    1.0
4230    1.0
188     1.0
2087    1.0
1694    1.0
4238    1.0
174     1.0
1576    1.0
3830    1.0
2747    1.0
1375    1.0
649     1.0
1667    1.0
1394    1.0
2121    1.0
1475    1.0
2856    1.0
687     1.0
2322    1.0
880     1.0
100     1.0
1621    1.0
93      1.0
4316    1.0
68      1.0
717     1.0
585     1.0
1051    1.0
3349    1.0
437     1.0
1123    1.0
1161    1.0
472     1.0
363     1.0
479     1.0
1958    1.0
11      1.0
1966    1.0
506     1.0
14      1.0
243     1.0
511     1.0
Name: 174, dtype: float64

### optional. 유사도가 높게 나온 영화들은 어떤 영화들인지 확인해보기
주로 비슷한 시기(ex 2000년대)에 개봉한 영화들의 유사도가 높음

In [123]:
train.to_raw_iid(931)

6377

In [102]:
movies = pd.read_csv("./data/ml-latest-small/movies.csv")

In [120]:
movies[movies['movieId'] == 54272]

Unnamed: 0,movieId,title,genres
6530,54272,"Simpsons Movie, The (2007)",Animation|Comedy


In [122]:
movies[movies['movieId'] == 4226]

Unnamed: 0,movieId,title,genres
3141,4226,Memento (2000),Mystery|Thriller


In [124]:
movies[movies['movieId'] == 6377]

Unnamed: 0,movieId,title,genres
4360,6377,Finding Nemo (2003),Adventure|Animation|Children|Comedy


# 6. RMSE 확인

In [25]:
# RMSE 확인
from surprise.accuracy import rmse

In [26]:
rmse(predictions)

RMSE: 1.0042


1.0041646813557552