In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [2]:
import math

# sample 데이터 셋팅

In [3]:
# # Sample data from our own lecture (alternative dataset, kept commented out;
# # the Stanford lecture example in the next cell is the one that is active).
# raw_sample = pd.DataFrame([[0,1,3.5],[0,2,4.5],[0,3,5],[0,4,4], [0,6,1]
#                            ,[1,0,4.5],[1,2,5],[1,3,5], [1,6,0.5]
#                            ,[2,1,4],[2,2,2],[2,3,1], [2,5,4.5]
#                            ,[3,1,3.5],[3,4,3], [3,5,4], [3,6,2]
#                            , [4,3,5],[4,4,4.5]], columns=['userId', 'movieId', 'rating'])

# raw_sample

In [4]:
# Example from the Stanford lecture: one [userId, movieId, rating] row per rating.
stanford_rows = [
    [0, 0, 4], [0, 3, 5], [0, 4, 1],
    [1, 0, 5], [1, 1, 5], [1, 2, 4],
    [2, 3, 2], [2, 4, 4], [2, 5, 5],
    [3, 1, 3], [3, 6, 3],
]
raw_sample = pd.DataFrame(stanford_rows, columns=['userId', 'movieId', 'rating'])

raw_sample

Unnamed: 0,userId,movieId,rating
0,0,0,4
1,0,3,5
2,0,4,1
3,1,0,5
4,1,1,5
5,1,2,4
6,2,3,2
7,2,4,4
8,2,5,5
9,3,1,3
10,3,6,3


In [5]:
# Item-by-user rating matrix: one row per movieId, one column per userId.
# (movie, user) pairs without a rating show up as NaN.
ui_metrix = raw_sample.pivot(index='movieId', columns='userId', values='rating')
ui_metrix

userId,0,1,2,3
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,4.0,5.0,,
1,,5.0,,3.0
2,,4.0,,
3,5.0,,2.0,
4,1.0,,4.0,
5,,,5.0,
6,,,,3.0


## 코사인 유사도

매뉴얼로(직접 수식으로) 코사인 유사도 구하기

In [15]:
def cosine_m(A_ratings, B_ratings):
    """Cosine similarity of two rating vectors, treating missing ratings as 0.

    Parameters
    ----------
    A_ratings, B_ratings : pandas.Series
        Rating vectors of equal length; NaN marks an unrated item. Entries are
        paired by position, not by index label.

    Returns
    -------
    float
        ``dot(A, B) / (||A|| * ||B||)`` computed after replacing NaN with 0.
        NaN when either vector is all zeros (e.g. nothing rated), because the
        similarity is undefined in that case.
    """
    # Unrated entries become 0, so they add nothing to the dot product or norms.
    a = np.asarray(A_ratings.fillna(0), dtype=float)
    b = np.asarray(B_ratings.fillna(0), dtype=float)

    # Denominator: product of the Euclidean norms, computed vectorised rather
    # than by looping over the elements in Python.
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        # Return the undefined result explicitly instead of relying on a 0/0
        # division whose RuntimeWarning may be silenced.
        return np.nan

    # Numerator: dot product of the two zero-filled vectors.
    return np.dot(a, b) / norm_product

In [16]:
# Cosine similarity between the rating columns of user 0 and user 1.
cosine_m(ui_metrix[0], ui_metrix[1])

0.3798685881987931

In [17]:
# Cosine similarity between the rating columns of user 0 and user 2.
cosine_m(ui_metrix[0], ui_metrix[2])

0.3220305943597653

## pearson 유사도

매뉴얼로(직접 수식으로) 피어슨 유사도 구하기

In [27]:
def pearson_m(A_ratings, B_ratings):
    """Centred-cosine (Pearson-style) similarity of two rating vectors.

    Each vector is centred on the mean of its own observed ratings
    (``Series.mean`` skips NaN). The remaining NaN entries are then set to 0
    and the cosine of the two centred vectors is returned. Every item takes
    part in the result, not only the items both users rated.

    Parameters
    ----------
    A_ratings, B_ratings : pandas.Series
        Rating vectors of equal length; NaN marks an unrated item. Entries are
        paired by position, not by index label.

    Returns
    -------
    float
        Cosine of the mean-centred, zero-filled vectors. NaN when either
        centred vector is all zeros (no ratings, or all ratings identical),
        because the similarity is undefined in that case.
    """
    # Centre on each user's own mean first; unrated entries stay NaN here and
    # become 0 (i.e. "average") only afterwards.
    a = np.asarray((A_ratings - A_ratings.mean()).fillna(0), dtype=float)
    b = np.asarray((B_ratings - B_ratings.mean()).fillna(0), dtype=float)

    # Denominator: product of the Euclidean norms, computed vectorised rather
    # than by looping over the elements in Python.
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        # Return the undefined result explicitly instead of relying on a 0/0
        # division whose RuntimeWarning may be silenced.
        return np.nan

    # Numerator: dot product of the centred vectors.
    return np.dot(a, b) / norm_product

In [29]:
# Centred-cosine similarity between the rating columns of user 0 and user 1.
pearson_m(ui_metrix[0], ui_metrix[1])

0.09245003270420475

In [30]:
# Centred-cosine similarity between the rating columns of user 0 and user 2.
pearson_m(ui_metrix[0], ui_metrix[2])

-0.5590852462516898

<br>

## 코사인 유사도 라이브러리로 재현하기

서프라이즈 모델에 적용해서 확인

In [6]:
from surprise import Reader, Dataset

# Tell the surprise package the minimum and maximum rating value,
# then wrap the ratings frame as a surprise Dataset.
rating_bounds = (0, 5)
reader = Reader(rating_scale=rating_bounds)
data = Dataset.load_from_df(raw_sample, reader)

In [38]:
from surprise.prediction_algorithms.knns import KNNBasic

# Model setup: user-based KNN with cosine similarity.
sim_options = dict(name='cosine', min_support=1, user_based=True)

model = KNNBasic(k=2, min_k=0, sim_options=sim_options)

# Train on every rating in the dataset (no train/test split).
full_trainset = data.build_full_trainset()
model.fit(full_trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7f3acb321160>

In [39]:
# Request 4 nearest neighbours of id 1 (arguments are (inner id, k) per the
# surprise docs); only 3 come back because the data holds just 4 users.
model.get_neighbors(1,4)

[0, 3, 2]

In [40]:
# User-by-user cosine similarity matrix held by the fitted model (4 x 4 here).
# NOTE(review): the values differ from cosine_m, presumably because surprise
# only uses items both users rated (users 0 and 1 share one movie -> 1.0).
model.sim

array([[1.        , 1.        , 0.61394061, 0.        ],
       [1.        , 1.        , 0.        , 1.        ],
       [0.61394061, 0.        , 1.        , 0.        ],
       [0.        , 1.        , 0.        , 1.        ]])

라이브러리로 코사인 유사도 구하기

In [41]:
from surprise.similarities import cosine

In [46]:
# Call surprise's cosine directly; the third argument is presumably min_support
# (0 here, versus 1 in sim_options). User pairs that model.sim reports as 0
# come out as NaN in this call.
cosine(model.n_x, model.yr, 0)

array([[1.        , 1.        , 0.61394061,        nan],
       [1.        , 1.        ,        nan, 1.        ],
       [0.61394061,        nan, 1.        ,        nan],
       [       nan, 1.        ,        nan, 1.        ]])

In [12]:
from scipy.spatial.distance import cosine

In [13]:
# Rating column of user 0, indexed by movieId (NaN = movie not rated).
ui_metrix[0]

movieId
0    4.0
1    NaN
2    NaN
3    5.0
4    1.0
5    NaN
6    NaN
Name: 0, dtype: float64

In [14]:
# `cosine` here is scipy's cosine *distance*, so 1 - distance gives the similarity.
# Missing ratings are zero-filled first, as in cosine_m.
ratings_filled = ui_metrix.fillna(0)
1 - cosine(ratings_filled[0], ratings_filled[1])

0.3798685881987932

<br>

## 피어슨 유사도 라이브러리로 재현하기

In [34]:
# Show the movie-by-user rating matrix again for reference.
ui_metrix

userId,0,1,2,3
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,4.0,5.0,,
1,,5.0,,3.0
2,,4.0,,
3,5.0,,2.0,
4,1.0,,4.0,
5,,,5.0,
6,,,,3.0


서프라이즈 모델에 적용해서 확인

In [31]:
# Model setup: user-based KNN with Pearson similarity.
sim_options = dict(name='pearson', min_support=0, user_based=True)

model = KNNBasic(k=2, min_k=0, sim_options=sim_options)

# Train on every rating in the dataset (no train/test split).
full_trainset = data.build_full_trainset()
model.fit(full_trainset)

Computing the pearson similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7f3acb3217b8>

In [32]:
# Request 4 nearest neighbours of id 1 under the Pearson model (arguments are
# (inner id, k) per the surprise docs); only 3 other users exist.
model.get_neighbors(1, 4)

[0, 2, 3]

In [33]:
# User-by-user Pearson similarity matrix held by the fitted model (4 x 4 here).
model.sim

array([[ 1.,  0., -1.,  0.],
       [ 0.,  1.,  0.,  0.],
       [-1.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  1.]])

라이브러리로 피어슨 유사도 구하기

In [36]:
from scipy.stats import pearsonr
from numpy import corrcoef
from surprise.similarities import pearson

In [47]:
# Ratings grouped per item as stored by the model: 7 keys (one per movie),
# each mapping to a list of (user id, rating) pairs — see the output below.
model.yr

defaultdict(list,
            {0: [(0, 4.0), (1, 5.0)],
             1: [(0, 5.0), (2, 2.0)],
             2: [(0, 1.0), (2, 4.0)],
             3: [(1, 5.0), (3, 3.0)],
             4: [(1, 4.0)],
             5: [(2, 5.0)],
             6: [(3, 3.0)]})

In [37]:
# Call surprise's pearson directly; the third argument is presumably min_support
# (1 here). The result is identical to the model.sim matrix shown earlier.
pearson(model.n_x, model.yr, 1)

array([[ 1.,  0., -1.,  0.],
       [ 0.,  1.,  0.,  0.],
       [-1.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  1.]])

In [25]:
# pearsonr on the raw columns, which still contain NaN for unrated movies.
# The ValueError this raises is the point of the cell, so catch and print it
# instead of letting it abort a "Restart & Run All".
try:
    print(pearsonr(ui_metrix[1], ui_metrix[0]))
except ValueError as err:
    print("ValueError:", err)

ValueError: array must not contain infs or NaNs

In [48]:
# Same call on zero-filled ratings so pearsonr accepts the input.
ratings_filled = ui_metrix.fillna(0)
pearsonr(ratings_filled[1], ratings_filled[0])

(0.0, 1.0000000000000002)

In [26]:
# numpy.corrcoef on the zero-filled ratings of users 1 and 0
# (returns the 2 x 2 correlation matrix).
ratings_filled = ui_metrix.fillna(0)
corrcoef(ratings_filled[1], ratings_filled[0])

array([[1.00000000e+00, 2.73688515e-17],
       [2.73688515e-17, 1.00000000e+00]])