In [1]:
import pandas as pd
import numpy as np
import math as mt
from numpy import dot
from numpy.linalg import norm
from scipy import stats

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sn

# Apply seaborn's theme first, then override individual rcParams on top of it.
sn.set()
plt.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'figure.figsize': (12, 6),
    'font.size': 12,
})

# Font used so that Korean (Hangul) labels can be drawn in the figures
matplotlib.rc('font', family='AppleGothic')  # MacOS
# matplotlib.rc('font', family='Malgun Gothic')  # Windows
# Turn off the Unicode minus sign for axis tick labels.
plt.rcParams['axes.unicode_minus'] = False

In [2]:
def euclidean_distance(A, B):
    """Return the Euclidean (L2) distance between two equal-length sequences.

    Parameters
    ----------
    A, B : sequence of numbers
        Sized, iterable sequences such as lists or 1-D NumPy arrays.

    Returns
    -------
    float
        ``sqrt(sum((a - b) ** 2))`` over paired elements; ``0.0`` when both
        sequences are empty.

    Raises
    ------
    ValueError
        If ``A`` and ``B`` do not have the same length. Without this check a
        longer ``B`` would be silently truncated to ``len(A)``.
    """
    if len(A) != len(B):
        raise ValueError(
            "A and B must have the same length, got %d and %d" % (len(A), len(B))
        )

    # Accumulate left to right starting from 0, then take the square root.
    squared_total = sum((a - b) ** 2 for a, b in zip(A, B))
    return mt.sqrt(squared_total)

def cosine_similarity(A,B):
    """Return the cosine of the angle between vectors ``A`` and ``B``.

    Computed as ``dot(A, B) / (norm(A) * norm(B))``. No special handling is
    done for zero-norm inputs; the division is performed as-is.
    """
    numerator = dot(A, B)
    denominator = norm(A) * norm(B)
    return numerator / denominator

def sumDiffer(A,B):
    """Return the root-mean-square difference between two equal-length sequences.

    Parameters
    ----------
    A, B : sequence of numbers
        Sized, iterable sequences such as lists or 1-D NumPy arrays.

    Returns
    -------
    float
        ``sqrt(sum((a - b) ** 2) / len(A))``.

    Raises
    ------
    ValueError
        If ``A`` and ``B`` do not have the same length. Without this check a
        longer ``B`` would be silently truncated to ``len(A)``.
    ZeroDivisionError
        If both sequences are empty (the mean over zero elements is undefined).
    """
    length = len(A)
    if length != len(B):
        raise ValueError(
            "A and B must have the same length, got %d and %d" % (length, len(B))
        )

    # Accumulate left to right starting from 0, then average and take the root.
    squared_total = sum((a - b) ** 2 for a, b in zip(A, B))
    return mt.sqrt(squared_total / length)

def improved_similarity(A, B, w):
    """Return the cosine similarity of ``A`` and ``B`` scaled by ``w ** sumDiffer(A, B)``.

    Parameters
    ----------
    A, B : sequence of numbers
        Vectors passed unchanged to ``cosine_similarity`` and ``sumDiffer``.
    w : number
        Base of the scaling factor. For ``0 < w < 1`` the factor shrinks
        toward 0 as ``sumDiffer(A, B)`` grows.

    Returns
    -------
    ``cosine_similarity(A, B) * w ** sumDiffer(A, B)``
    """
    cos_sim = cosine_similarity(A, B)
    sum_diff = sumDiffer(A, B)

    # Use `**` rather than the in-place `**=`: for a mutable `w` such as a
    # NumPy array, `**=` would overwrite the caller's object.
    penalty = w ** sum_diff

    return cos_sim * penalty

In [4]:
# Fixed seed so that "Restart Kernel -> Run All" regenerates the same data
# and therefore the same similarity table.
rng = np.random.default_rng(42)
datas = np.round(rng.random((20, 16)) * 500)

# The first row is the reference vector; every other row is compared against it.
A = datas[0]
B_values = datas[1:]
imp_weight = 0.99

# Compute every row first and build the frame in a single construction,
# instead of enlarging it one row at a time with `.loc`.
# Column labels (Korean): Euclidean distance, cosine similarity,
# Pearson correlation coefficient, improved cosine similarity.
sim_df = pd.DataFrame(
    [
        [
            euclidean_distance(A, B),
            cosine_similarity(A, B),
            stats.pearsonr(A, B)[0],
            improved_similarity(A, B, imp_weight),
        ]
        for B in B_values
    ],
    columns=['유클리디안 거리', '코사인 유사도', '피어슨 상관계수','향상된 코사인 유사도'],
)

sim_df

Unnamed: 0,유클리디안 거리,코사인 유사도,피어슨 상관계수,향상된 코사인 유사도
0,811.022811,0.71119,0.086695,0.092682
1,763.311208,0.74842,0.081245,0.109956
2,1020.776665,0.578166,-0.522292,0.044482
3,824.405847,0.717915,-0.206299,0.090465
4,612.919244,0.844775,0.479573,0.181102
5,978.351675,0.607361,-0.504427,0.051984
6,683.479334,0.804841,0.259037,0.14451
7,750.315267,0.804407,0.07973,0.122104
8,795.553895,0.7555,-0.073532,0.102359
9,711.980337,0.826185,-0.024018,0.13809


In [5]:
# Rank the rows by Euclidean distance, smallest distance first.
sim_df.sort_values(by="유클리디안 거리", ascending=True)

Unnamed: 0,유클리디안 거리,코사인 유사도,피어슨 상관계수,향상된 코사인 유사도
16,556.239157,0.872051,0.603298,0.215562
14,565.520999,0.867625,0.388581,0.209524
4,612.919244,0.844775,0.479573,0.181102
6,683.479334,0.804841,0.259037,0.14451
9,711.980337,0.826185,-0.024018,0.13809
10,747.511204,0.777976,0.059099,0.118927
7,750.315267,0.804407,0.07973,0.122104
12,759.399763,0.743888,-0.069114,0.11037
1,763.311208,0.74842,0.081245,0.109956
15,790.657954,0.785402,0.054026,0.107727


In [7]:
# Rank the rows by cosine similarity, largest similarity first.
sim_df.sort_values(by="코사인 유사도", ascending=False)

Unnamed: 0,유클리디안 거리,코사인 유사도,피어슨 상관계수,향상된 코사인 유사도
16,556.239157,0.872051,0.603298,0.215562
14,565.520999,0.867625,0.388581,0.209524
4,612.919244,0.844775,0.479573,0.181102
9,711.980337,0.826185,-0.024018,0.13809
6,683.479334,0.804841,0.259037,0.14451
7,750.315267,0.804407,0.07973,0.122104
15,790.657954,0.785402,0.054026,0.107727
10,747.511204,0.777976,0.059099,0.118927
18,920.334178,0.766661,-0.240892,0.075916
8,795.553895,0.7555,-0.073532,0.102359


In [8]:
# Rank the rows by Pearson correlation coefficient, largest value first.
sim_df.sort_values(by="피어슨 상관계수", ascending=False)

Unnamed: 0,유클리디안 거리,코사인 유사도,피어슨 상관계수,향상된 코사인 유사도
16,556.239157,0.872051,0.603298,0.215562
4,612.919244,0.844775,0.479573,0.181102
14,565.520999,0.867625,0.388581,0.209524
6,683.479334,0.804841,0.259037,0.14451
0,811.022811,0.71119,0.086695,0.092682
1,763.311208,0.74842,0.081245,0.109956
7,750.315267,0.804407,0.07973,0.122104
10,747.511204,0.777976,0.059099,0.118927
15,790.657954,0.785402,0.054026,0.107727
9,711.980337,0.826185,-0.024018,0.13809


In [9]:
# Rank the rows by the improved cosine similarity, largest value first.
sim_df.sort_values(by="향상된 코사인 유사도", ascending=False)

Unnamed: 0,유클리디안 거리,코사인 유사도,피어슨 상관계수,향상된 코사인 유사도
16,556.239157,0.872051,0.603298,0.215562
14,565.520999,0.867625,0.388581,0.209524
4,612.919244,0.844775,0.479573,0.181102
6,683.479334,0.804841,0.259037,0.14451
9,711.980337,0.826185,-0.024018,0.13809
7,750.315267,0.804407,0.07973,0.122104
10,747.511204,0.777976,0.059099,0.118927
12,759.399763,0.743888,-0.069114,0.11037
1,763.311208,0.74842,0.081245,0.109956
15,790.657954,0.785402,0.054026,0.107727


In [20]:
# Standard deviation of each metric column; 80% of it is the threshold used
# below to decide whether two neighbouring ranks differ "a lot".
euc_dis_std = sim_df['유클리디안 거리'].std()
cos_sim_std = sim_df['코사인 유사도'].std()
corr_std = sim_df['피어슨 상관계수'].std()

length = len(sim_df)

# Order rows by Euclidean distance, then count neighbouring ranks whose
# cosine similarity differs by more than 80% of the cosine-similarity std.
# One vectorized adjacent difference replaces the per-row `.iloc` lookups.
sort_euc_based = sim_df.sort_values(by=["유클리디안 거리"])
cos_sim_gaps = np.abs(np.diff(np.asarray(sort_euc_based['코사인 유사도'], dtype=float)))
err_cos_sim_std = int((cos_sim_gaps > (cos_sim_std * 0.8)).sum())

print("[distance based] 각 순위 사이, 표준 편차의 80%를 넘어서게 차이난 개수 :", err_cos_sim_std)
print("\n#############################################")

# Order rows by cosine similarity, then count neighbouring ranks whose
# Euclidean distance differs by more than 80% of the distance std.
sort_cos_based = sim_df.sort_values(by=["코사인 유사도"])
euc_dis_gaps = np.abs(np.diff(np.asarray(sort_cos_based['유클리디안 거리'], dtype=float)))
err_euc_dis_std = int((euc_dis_gaps > (euc_dis_std * 0.8)).sum())
err_corr_std = 0  # not updated in this cell; kept in case another cell reads it

print("[cosine similarity] 각 순위 사이, 표준 편차의 80%를 넘어서게 차이난 개수 :", err_euc_dis_std)

[distance based] 각 순위 사이, 표준 편차의 80%를 넘어서게 차이난 개수 : 3

#############################################
[cosine similarity] 각 순위 사이, 표준 편차의 80%를 넘어서게 차이난 개수 : 5


In [22]:
# Order rows by Pearson correlation coefficient, then count neighbouring
# ranks whose Euclidean distance / cosine similarity differ by more than
# 80% of that column's standard deviation (euc_dis_std / cos_sim_std are
# computed in an earlier cell).
# The frame is named for what it holds (it is sorted by Pearson, not by the
# improved cosine similarity).
sort_corr_based = sim_df.sort_values(by=["피어슨 상관계수"])

# One vectorized adjacent difference per column replaces the per-row
# `.iloc` lookups.
euc_dis_gaps = np.abs(np.diff(np.asarray(sort_corr_based['유클리디안 거리'], dtype=float)))
cos_sim_gaps = np.abs(np.diff(np.asarray(sort_corr_based['코사인 유사도'], dtype=float)))

err_euc_dis_std = int((euc_dis_gaps > (euc_dis_std * 0.8)).sum())
err_cos_sim_std = int((cos_sim_gaps > (cos_sim_std * 0.8)).sum())

print("[pearson : distance based] 각 순위 사이, 표준 편차의 80%를 넘어서게 차이난 개수 :", err_euc_dis_std)
print("[pearson : cosine similarity] 각 순위 사이, 표준 편차의 80%를 넘어서게 차이난 개수 :", err_cos_sim_std)

[pearson : distance based] 각 순위 사이, 표준 편차의 80%를 넘어서게 차이난 개수 : 2
[pearson : cosine similarity] 각 순위 사이, 표준 편차의 80%를 넘어서게 차이난 개수 : 5


In [23]:
# Order rows by the improved cosine similarity, then count neighbouring
# ranks whose Euclidean distance / cosine similarity differ by more than
# 80% of that column's standard deviation (euc_dis_std / cos_sim_std are
# computed in an earlier cell).
imp_cos_based = sim_df.sort_values(by=["향상된 코사인 유사도"])

# One vectorized adjacent difference per column replaces the per-row
# `.iloc` lookups.
euc_dis_gaps = np.abs(np.diff(np.asarray(imp_cos_based['유클리디안 거리'], dtype=float)))
cos_sim_gaps = np.abs(np.diff(np.asarray(imp_cos_based['코사인 유사도'], dtype=float)))

err_euc_dis_std = int((euc_dis_gaps > (euc_dis_std * 0.8)).sum())
err_cos_sim_std = int((cos_sim_gaps > (cos_sim_std * 0.8)).sum())

print("[imp : distance based] 각 순위 사이, 표준 편차의 80%를 넘어서게 차이난 개수 :", err_euc_dis_std)
print("[imp : cosine similarity] 각 순위 사이, 표준 편차의 80%를 넘어서게 차이난 개수 :", err_cos_sim_std)

[imp : distance based] 각 순위 사이, 표준 편차의 80%를 넘어서게 차이난 개수 : 0
[imp : cosine similarity] 각 순위 사이, 표준 편차의 80%를 넘어서게 차이난 개수 : 3
