# 1. Similarity

In [13]:
from scipy import spatial

vector_1 = np.array([1,2,3,4,5])
vector_2 = np.array([5,4,3,2,1])
vector_3 = np.array([2,3,4,5,6])
vector_4 = np.array([11,19,28,32,47])

# 2. Euclidean Distance Similarity

In [22]:
np.sqrt(sum((vector_1 - vector_2) ** 2))

6.324555320336759

In [33]:
np.linalg.norm(vector_1 - vector_2)

6.324555320336759

# 3. Cosine Similarity
 - Recommend 에서 많이 사용 (영화를 많이보는 사용자, 비슷한 장르의 영화를 보는 사용자)

In [19]:
# 분자 - 벡터의 내적
num1 = sum(vector_1 * vector_2) # np.dot(vector_1, vector_2)

# 분모 - 벡터의 크기
num2 = np.sqrt(sum(vector_1 ** 2) * sum(vector_2 ** 2))

num1 / num2

0.6363636363636364

In [28]:
print(1 - spatial.distance.cosine(vector_1, vector_2)), 
print(1 - spatial.distance.cosine(vector_1, vector_3)), 
print(1 - spatial.distance.cosine(vector_1, vector_4))

0.6363636363636364
0.9949366763261821
0.9971083087048903


# 5. Recommend System

In [38]:
columns = ['article_1', 'article_2', 'article_3', 'article_4', 'article_5']
index = ['user_1', 'user_2', 'user_3', 'user_4']

data = np.array([
    [5,3,0,0,2],
    [2,0,0,1,4],
    [0,0,4,3,1],
    [4,0,4,5,0],
])

sample_df = pd.DataFrame(data, columns=columns, index=index)
sample_df

Unnamed: 0,article_1,article_2,article_3,article_4,article_5
user_1,5,3,0,0,2
user_2,2,0,0,1,4
user_3,0,0,4,3,1
user_4,4,0,4,5,0


In [50]:
def cosine_similarity(vector_1, vector_2):
#     print(list(vector_1), list(vector_2))
    idx = vector_1.nonzero()[0]
#     print(idx)
    vector_1 = np.array(vector_1)[idx]
    vector_2 = np.array(vector_2)[idx]
#     print(vector_1, vector_2)
    
    return 1 - spatial.distance.cosine(vector_1, vector_2)

In [51]:
# test code - cosine_similarity
cosine_similarity(sample_df.loc['user_1'], sample_df.loc['user_2'])

0.6529286250990105

In [52]:
def similarity_matrix(sample_df, similarity_func):
    index = sample_df.index
    df = sample_df.T
    
    matrix = []
    for idx_1, value_1 in df.items():
        row = []
        for idx_2, value_2 in df.items():
            row.append(similarity_func(value_1, value_2))
        matrix.append(row)
    
    return pd.DataFrame(matrix, index=index, columns=index)

In [54]:
sm_df = similarity_matrix(sample_df, cosine_similarity)
sm_df

Unnamed: 0,user_1,user_2,user_3,user_4
user_1,1.0,0.652929,0.324443,0.811107
user_2,0.729397,1.0,0.483046,0.443039
user_3,0.196116,0.332956,1.0,0.949474
user_4,0.529813,0.770054,0.82121,1.0


In [68]:
user, closer_count = "user_1", 2
ms_df = sm_df.drop(user)
ms_df = ms_df.sort_values(user, ascending=False)
ms_df = ms_df[:closer_count]
# sample_df.loc[ms_df.index]
ms_df

Unnamed: 0,user_1,user_2,user_3,user_4
user_2,0.729397,1.0,0.483046,0.443039
user_4,0.529813,0.770054,0.82121,1.0


In [70]:
mean = np.zeros(len(sample_df.columns))
for ms_user, sms_value in ms_df[user].items():
    mean += sample_df.loc[ms_user]
mean /= len(ms_df[user])
mean

article_1    3.0
article_2    0.0
article_3    2.0
article_4    3.0
article_5    2.0
Name: user_2, dtype: float64

In [74]:
pred_df = pd.DataFrame(columns=sample_df.columns)
pred_df.loc["user"] = sample_df.loc[user]
pred_df.loc["mean"] = mean

pred_df

Unnamed: 0,article_1,article_2,article_3,article_4,article_5
user,5,3,0,0,2
mean,3,0,2,3,2


In [87]:
def mean_score(sample_df, sm_df, target, closer_count):
    
    ms_df = sm_df.drop(user)
    ms_df = ms_df.sort_values(user, ascending=False)
    ms_df = ms_df[:closer_count]
    
    ms_df = sample_df.loc[ms_df.index]
    
    pred_df = pd.DataFrame(columns=sample_df.columns)
    pred_df.loc['user'] = sample_df.loc[target]
    pred_df.loc['mean'] = ms_df.mean()
    
    return pred_df

In [88]:
# test code - mean_score
mean_score(sample_df, sm_df, "user_1", 2)

Unnamed: 0,article_1,article_2,article_3,article_4,article_5
user,5,3,0,0,2
mean,3,0,2,3,2


In [96]:
# recommend
recommed_df = pred_df.T
recommed_df = recommed_df[recommed_df['user'] == 0]
recommed_df = recommed_df.sort_values('mean', ascending=False)
print(list(recommed_df.index))
recommed_df

['article_4', 'article_3']


Unnamed: 0,user,mean
article_4,0,3
article_3,0,2


# MSE

In [106]:
def mse(value, pred):
    idx = value.nonzero()[0]
    value, pred = np.array(value)[idx], np.array(pred)[idx]
    return sum((value - pred) ** 2) / len(idx)

In [107]:
mse(pred_df.loc['user'], pred_df.loc['mean'])

4.333333333333333

In [110]:
def evaluate(df, sm_df, closer_count, algorithm):
    
    users = df.index
    evaluate_list = []
    
    for target in users:
        pred_df = mean_score(sample_df, sm_df, target, closer_count)
        evaluate_list.append(algorithm(pred_df.loc['user'], pred_df.loc['mean']))
        
    return np.average(evaluate_list)

In [111]:
evaluate(sample_df, sm_df, 2, mse)

3.0

# RMSE

In [115]:
def rmse(value, pred):
    idx = value.nonzero()[0]
    value, pred = np.array(value)[idx], np.array(pred)[idx]
    return np.sqrt(sum((value - pred) ** 2) / len(idx))

In [116]:
# test code -rmse
rmse(pred_df.loc['user'], pred_df.loc['mean'])

2.0816659994661326

In [117]:
evaluate(sample_df, sm_df, 2, rmse)

1.7091905158349232

# MAE

In [120]:
def mae(value, pred):
    idx = value.nonzero()[0]
    value, pred = np.array(value)[idx], np.array(pred)[idx]
    return sum(np.abs(value - pred)) / len(idx)

In [121]:
mae(pred_df.loc['user'], pred_df.loc['mean'])

1.6666666666666667

In [122]:
evaluate(sample_df, sm_df, 2, mae)

1.5000000000000002