In [1]:
import pandas as pd
import numpy as np

In [2]:
# Toy rating matrix: one row per movie (index 0-6), one column per user.
# NaN marks a movie the user has not rated.
ex = pd.DataFrame({
    '조':     [np.nan, 3.5, 4.5, 5.0, 4.0, np.nan, 1.0],
    '에이미': [4.5, np.nan, 5.0, 5.0, np.nan, 4.0, 0.5],
    '베스':   [np.nan, 4.0, 2.0, 1.0, np.nan, 4.5, np.nan],
    '메그':   [np.nan, 3.5, np.nan, np.nan, 3.0, 4.0, 2.0],
    '마미':   [np.nan, np.nan, np.nan, 5.0, 4.5, np.nan, np.nan],
})

In [3]:
ex

Unnamed: 0,조,에이미,베스,메그,마미
0,,4.5,,,
1,3.5,,4.0,3.5,
2,4.5,5.0,2.0,,
3,5.0,5.0,1.0,,5.0
4,4.0,,,3.0,4.5
5,,4.0,4.5,4.0,
6,1.0,0.5,,2.0,


# 유사도 구하기

## Cosine Similarity

In [4]:
# Index labels of the movies that 조 has rated (non-null entries only).
jo_idx = ex['조'].dropna().index
jo_idx

Int64Index([1, 2, 3, 4, 6], dtype='int64')

In [5]:
# Index labels of the movies that 에이미 has rated (non-null entries only).
amy_idx = ex['에이미'].dropna().index
amy_idx

Int64Index([0, 2, 3, 5, 6], dtype='int64')

In [6]:
# Movies rated by both users: keep the labels of jo_idx that also occur in amy_idx.
jo_amy_idx = jo_idx[jo_idx.isin(amy_idx)]
jo_amy_idx

Int64Index([2, 3, 6], dtype='int64')

In [7]:
# Ratings of the two users restricted to the movies they have both rated.
jo_amy = ex[['조', '에이미']].loc[jo_amy_idx]
jo_amy

Unnamed: 0,조,에이미
2,4.5,5.0
3,5.0,5.0
6,1.0,0.5


In [8]:
# Numerator of the cosine similarity: dot product of the two rating vectors.
jo_amy['조'].dot(jo_amy['에이미'])

48.0

In [9]:
import math

In [10]:
# Denominator of the cosine similarity: product of the two users' L2 norms
# over the movies they have both rated.
jo_norm, amy_norm = (math.sqrt(sum(jo_amy[name] ** 2)) for name in ('조', '에이미'))
jo_norm * amy_norm

48.20853140264698

In [11]:
# Cosine similarity = dot product / (product of the L2 norms).
cs_jo_amy = (
    jo_amy['조'].dot(jo_amy['에이미'])
    / (math.sqrt(sum(jo_amy['조'] ** 2)) * math.sqrt(sum(jo_amy['에이미'] ** 2)))
)
cs_jo_amy

0.9956743879852865

In [12]:
def cos_sim(u, v, data):
    """Cosine similarity between two users over the items both have rated.

    Parameters
    ----------
    u, v : column labels of ``data``
        The two users to compare. ``u == v`` is allowed.
    data : pandas.DataFrame
        Rating matrix with one column per user; NaN marks a missing rating.

    Returns
    -------
    float
        dot(u, v) / (||u|| * ||v||) computed only on the rows where both
        users have a rating. NaN when the similarity is undefined, i.e. the
        users share no rated item or one of the restricted vectors is all
        zeros.
    """
    # Keep only the rows in which both users have left a rating.
    common = data[u].notnull() & data[v].notnull()

    # Select each column on its own. Selecting [u, v] together produces
    # duplicate column labels when u == v, which made the old version fail.
    u_ratings = data.loc[common, u].to_numpy(dtype=float)
    v_ratings = data.loc[common, v].to_numpy(dtype=float)

    # Numerator: dot product of the two rating vectors.
    numerator = np.dot(u_ratings, v_ratings)

    # Denominator: product of the two L2 norms.
    denominator = math.sqrt(sum(u_ratings ** 2)) * math.sqrt(sum(v_ratings ** 2))

    # Undefined similarity: report NaN explicitly instead of dividing 0 by 0.
    if denominator == 0:
        return float('nan')

    return numerator / denominator

# Note: the similarity of a user with themselves is supported (≈ 1.0).

In [13]:
cos_sim('조', '에이미', ex)

0.9956743879852865

In [14]:
cos_sim('에이미', '베스', ex)

0.808372034730701

In [15]:
cos_sim('에이미', '메그', ex)

0.9429903335828895

In [16]:
cos_sim('에이미', '마미', ex)

1.0

## Cosine Similarity with Surprise

In [17]:
from surprise import Reader, Dataset

In [18]:
ex

Unnamed: 0,조,에이미,베스,메그,마미
0,,4.5,,,
1,3.5,,4.0,3.5,
2,4.5,5.0,2.0,,
3,5.0,5.0,1.0,,5.0
4,4.0,,,3.0,4.5
5,,4.0,4.5,4.0,
6,1.0,0.5,,2.0,


In [19]:
# Reshape the wide rating matrix into long (user, item, rating) records.
# Naming the axes first lets stack()/reset_index() produce the final column
# names directly; stack() drops the unrated (NaN) cells.
ex2 = (
    ex.rename_axis(index='item', columns='user')
      .stack()
      .rename('rating')
      .reset_index()
)[['user', 'item', 'rating']]
ex2

Unnamed: 0,user,item,rating
0,에이미,0,4.5
1,조,1,3.5
2,베스,1,4.0
3,메그,1,3.5
4,조,2,4.5
5,에이미,2,5.0
6,베스,2,2.0
7,조,3,5.0
8,에이미,3,5.0
9,베스,3,1.0


In [20]:
# Tell the Surprise package the minimum and maximum of the rating scale.
reader = Reader(rating_scale=(0,5))
# Wrap the long-format frame (columns: user, item, rating) as a Surprise dataset.
# NOTE(review): load_from_df presumably reads the columns positionally in that order - confirm against the Surprise docs.
data = Dataset.load_from_df(ex2, reader)
# Build the training set from the whole dataset (no hold-out split is made here).
trainset = data.build_full_trainset()

In [21]:
from surprise.prediction_algorithms.knns import KNNBasic

In [22]:
# Model setup: user-based kNN using cosine similarity.
# min_support=0 — presumably no minimum number of common items is required
# for a similarity to be computed; confirm against the Surprise docs.
sim_options = {'name': 'cosine',
               'min_support': 0,
               'user_based': True}

# k / min_k: neighbour-count settings passed to KNNBasic (here 2 and 1).
model = KNNBasic(k=2, min_k=1, sim_options=sim_options, verbose=True)

# Fitting computes the similarity matrix (see the log lines printed below).
model.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7fa295903400>

In [23]:
pd.DataFrame(model.sim)
# Row/column positions map to users as: 0: 에이미, 1: 조, 2: 베스, 3: 메그, 4: 마미

Unnamed: 0,0,1,2,3,4
0,1.0,0.995674,0.808372,0.94299,1.0
1,0.995674,1.0,0.805776,0.965908,0.998314
2,0.808372,0.805776,1.0,0.999969,1.0
3,0.94299,0.965908,0.999969,1.0,1.0
4,1.0,0.998314,1.0,1.0,1.0


## Pearson Correlation Coefficient (Centered Cosine Similarity)

In [24]:
ex

Unnamed: 0,조,에이미,베스,메그,마미
0,,4.5,,,
1,3.5,,4.0,3.5,
2,4.5,5.0,2.0,,
3,5.0,5.0,1.0,,5.0
4,4.0,,,3.0,4.5
5,,4.0,4.5,4.0,
6,1.0,0.5,,2.0,


In [26]:
ex['조'].mean()

3.6

In [27]:
def pear_sim(u, v, data):
    """Pearson correlation (centered cosine similarity) between two users.

    Each user's ratings are centered on that user's mean over ALL of their
    rated items (not only the commonly rated ones); the cosine similarity of
    the centered ratings is then taken over the items both users have rated.

    Parameters
    ----------
    u, v : column labels of ``data``
        The two users to compare. ``u == v`` is allowed.
    data : pandas.DataFrame
        Rating matrix with one column per user; NaN marks a missing rating.

    Returns
    -------
    float
        The centered cosine similarity, or NaN when it is undefined (no
        commonly rated item, or one centered vector is all zeros).
    """
    # Per-user mean over every item the user has rated (NaN is skipped).
    u_mean = data[u].mean()
    v_mean = data[v].mean()

    # Keep only the rows in which both users have left a rating.
    common = data[u].notnull() & data[v].notnull()

    # Select each column on its own. Selecting [u, v] together produces
    # duplicate column labels when u == v, which made the old version fail.
    u_dev = data.loc[common, u].to_numpy(dtype=float) - u_mean
    v_dev = data.loc[common, v].to_numpy(dtype=float) - v_mean

    # Numerator: dot product of the centered rating vectors.
    numerator = np.dot(u_dev, v_dev)

    # Denominator: product of the L2 norms of the centered vectors.
    denominator = math.sqrt(sum(u_dev ** 2)) * math.sqrt(sum(v_dev ** 2))

    # Undefined similarity: report NaN explicitly instead of dividing 0 by 0.
    if denominator == 0:
        return float('nan')

    return numerator / denominator

In [28]:
pear_sim('조', '에이미', ex)

0.9899189410789128

In [29]:
pear_sim('에이미', '베스', ex)

-0.6617343335120788

In [30]:
pear_sim('에이미', '메그', ex)

0.8250468906287833

In [31]:
pear_sim('에이미', '마미', ex)

1.0

## Pearson Correlation Coefficient with Surprise
Surprise의 `pearson` 유사도는 각 유저의 전체 평균 대신, 두 유저가 공통으로 평점을 매긴 아이템의 평점만 가지고 구한 평균을 사용한다. 그래서 위에서 직접 구현한 `pear_sim`의 결과와 값이 다르게 나온다.

In [37]:
# Model setup: user-based kNN using Pearson similarity.
# This rebinds the `sim_options` and `model` names used in the cosine section.
sim_options = {'name': 'pearson',
               'min_support': 0,
               'user_based': True}

# k / min_k: neighbour-count settings passed to KNNBasic (here 2 and 1).
model = KNNBasic(k=2, min_k=1, sim_options=sim_options, verbose=True)

# Fitting computes the similarity matrix (see the log lines printed below).
model.fit(trainset)

Computing the pearson similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7fa295da6fd0>

In [38]:
pd.DataFrame(model.sim)
# Row/column positions map to users as: 0: 에이미, 1: 조, 2: 베스, 3: 메그, 4: 마미

Unnamed: 0,0,1,2,3,4
0,1.0,0.993399,-0.960769,1.0,0.0
1,0.993399,1.0,-1.0,0.882498,1.0
2,-0.960769,-1.0,1.0,1.0,0.0
3,1.0,0.882498,1.0,1.0,0.0
4,0.0,1.0,0.0,0.0,1.0


# 예상 평점 구하기

## Cosine Similarity + kNN Basic

In [41]:
ex

Unnamed: 0,조,에이미,베스,메그,마미
0,,4.5,,,
1,3.5,,4.0,3.5,
2,4.5,5.0,2.0,,
3,5.0,5.0,1.0,,5.0
4,4.0,,,3.0,4.5
5,,4.0,4.5,4.0,
6,1.0,0.5,,2.0,


In [106]:
# Cosine similarity between 에이미 and every other user, as (user, similarity) pairs.
amy_cos = []

for user in ['조', '베스', '메그', '마미']:
    amy_cos.append((user, cos_sim('에이미', user, ex)))

# Most similar users first (the sort is stable, so ties keep their listing order).
amy_cos.sort(key=lambda pair: pair[1], reverse=True)

amy_cos_df = pd.DataFrame(amy_cos, columns=['user', 'cos_sim'])
amy_cos_df

Unnamed: 0,user,cos_sim
0,마미,1.0
1,조,0.995674
2,메그,0.94299
3,베스,0.808372


In [116]:
# Predicted rating of movie 4 for 에이미 (k=2)

movieId = 4
k = 2

# Attach each neighbour's rating of the target movie to the similarity table.
# amy_cos_df is sorted by descending similarity, so after dropping the
# neighbours who have NOT rated the movie, head(k) yields the k nearest users
# that can actually contribute. Taking the top-k rows first would let a NaN
# rating turn the whole prediction into NaN.
neighbors = (
    amy_cos_df
    .assign(rating=ex.loc[movieId, amy_cos_df['user'].tolist()].to_numpy())
    .dropna(subset=['rating'])
    .head(k)
)

# Similarity-weighted average of the neighbours' ratings.
numerator = (neighbors['rating'] * neighbors['cos_sim']).sum()
denominator = neighbors['cos_sim'].sum()

# NaN if no neighbour has rated the movie (0 / 0).
numerator / denominator

4.250541873469034