In [1]:
import pandas as pd
import numpy as np
import math as mt
from numpy import dot
from numpy.linalg import norm
from scipy import stats
from sklearn.decomposition import TruncatedSVD 

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sn
import IPython
import IPython.display

# Apply seaborn's default theme first, then override individual matplotlib
# defaults (label/tick font sizes, figure size, base font size) for this notebook.
sn.set()
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['font.size'] = 12

# Korean text output: select a font family for Hangul labels.
# The active line targets macOS; swap in the commented line below on Windows.
matplotlib.rc('font', family='AppleGothic')  # MacOS
# matplotlib.rc('font', family='Malgun Gothic')  # Windows
# NOTE(review): presumably disabled because the selected Korean font may lack
# the Unicode minus glyph, which would break negative tick labels - confirm.
plt.rcParams['axes.unicode_minus'] = False

In [2]:
def euclidean_distance(A, B):
    """Return the Euclidean (L2) distance between two indexable sequences.

    Positions ``0 .. len(A) - 1`` are read from both ``A`` and ``B``; the
    squared element-wise differences are summed and the square root of the
    total is returned.
    """
    squared_gaps = ((A[pos] - B[pos]) ** 2 for pos in range(len(A)))
    return mt.sqrt(sum(squared_gaps))

def cosine_similarity(A,B):
    """Return the cosine of the angle between vectors ``A`` and ``B``.

    Computed as the dot product divided by the product of the two vector
    norms. No special handling is applied when either norm is zero.
    """
    inner_product = dot(A, B)
    magnitude_product = norm(A) * norm(B)
    return inner_product / magnitude_product

def sumDiffer(A,B):
    """Return the root-mean-square difference between ``A`` and ``B``.

    The squared element-wise differences over positions ``0 .. len(A) - 1``
    are summed, divided by ``len(A)``, and the square root of that mean is
    returned. An empty ``A`` therefore raises ``ZeroDivisionError``.
    """
    size = len(A)
    squared_total = sum((A[pos] - B[pos]) ** 2 for pos in range(size))
    return mt.sqrt(squared_total / size)

def improved_similarity(A, B, w):
    """Return the cosine similarity of ``A`` and ``B`` scaled by a distance factor.

    The factor is ``w`` raised to the power of ``sumDiffer(A, B)`` (the
    root-mean-square difference of the two vectors), so for ``0 < w < 1`` the
    result shrinks as the vectors move apart in magnitude.

    Args:
        A: first vector.
        B: second vector; read at the same positions as ``A``.
        w: base of the distance factor (the notebook passes ``0.99``).

    Returns:
        ``cosine_similarity(A, B) * w ** sumDiffer(A, B)``.
    """
    cos_sim = cosine_similarity(A, B)
    sum_diff = sumDiffer(A, B)
    # Bind the power to a new name instead of using ``w **= sum_diff``: the
    # augmented form operates in place on mutable arguments, so a NumPy array
    # passed as ``w`` would be modified in the caller (or raise a casting
    # error for integer dtypes).
    distance_factor = w ** sum_diff

    return cos_sim * distance_factor

## An improved collaborative recommendation algorithm based on optimized user similarity — fit test
- 랜덤하게 (101, 32) 크기의 행렬을 생성(기준 벡터 1개 + 비교 벡터 100개)하고, 이 실험을 100회 반복
- 첫 번째 행을 기준 벡터로 정하고, 나머지 벡터들과의 유사도를 계산해 `sort_values`로 순위 매기기
---
1. Euclidean Distance
    - 거리기반 유사도 측정값을 오름차순 정렬하고, 각 순위 사이의 방향기반(Cosine Similarity) 지수가 표준편차의 n% 를 넘어가면 잘못된 측정으로 Count
2. Cosine Similarity
    - 방향기반 유사도 측정값을 내림차순 정렬하고, 각 순위 사이의 거리기반(Euclidean Distance) 지수가 표준편차의 n% 를 넘어가면 잘못된 측정으로 Count
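3. Improved Cosine Similarity (imp cos)
    - 거리+방향 기반 유사도 측정값을 내림차순 정렬하고, 각 순위 사이의 방향기반과 거리기반 지수가 표준편차의 n%를 넘어가면 잘못된 측정으로 Count
4. Pearson Correlation
    - 아래 코드에서는 피어슨 상관계수 기준 내림차순 정렬에 대해서도 동일한 방식으로 거리기반·방향기반 지수의 오류 횟수를 Count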

In [35]:
def _count_rank_gaps(ranked_df, thresholds):
    """Count adjacent-rank gaps that exceed each metric's threshold.

    Args:
        ranked_df: similarity table already sorted by the ranking metric.
        thresholds: iterable of ``(column, limit)`` pairs.

    Returns:
        list[int]: for each pair, in order, the number of neighbouring rows
        whose absolute difference in ``column`` is strictly greater than
        ``limit``.
    """
    counts = []
    for column, limit in thresholds:
        # diff() yields NaN for the first row and NaN > limit is False, so
        # exactly the len - 1 neighbouring pairs are compared.
        gaps = ranked_df[column].diff().abs()
        counts.append(int((gaps > limit).sum()))
    return counts


def test_func(test_per, n_tests=100, n_candidates=100, n_features=32,
              imp_weight=0.99, seed=None):
    """Run the similarity "fit test" on random data and summarise error counts.

    Each trial draws a random integer-valued matrix in ``[0, 500]``, uses its
    first row as the reference vector, and scores every other row with four
    measures: Euclidean distance, cosine similarity, Pearson correlation and
    the improved (distance-weighted) cosine similarity. The rows are ranked by
    each measure in turn (distance ascending, the others descending). For each
    ranking, a pair of neighbouring ranks counts as an error for a metric when
    the absolute gap in that metric exceeds ``test_per`` percent of the
    metric's standard deviation across the trial. Gaps are counted for the
    distance, cosine and Pearson columns only.

    Finally the mean error counts (rounded) of the distance-, cosine- and
    Pearson-based rankings are each printed next to those of the improved
    ranking.

    Args:
        test_per: threshold as a percentage of each metric's standard deviation.
        n_tests: number of random trials (rows of the returned frame).
        n_candidates: number of vectors compared against the reference vector.
        n_features: length of every vector.
        imp_weight: base ``w`` forwarded to ``improved_similarity``.
        seed: optional seed for a private ``RandomState``; when ``None`` the
            global NumPy random state is used, as before.

    Returns:
        pandas.DataFrame: one row per trial (``'test 1'`` ... ``'test N'``) and
        twelve columns, i.e. the dis/cos/corr error counts for each of the four
        rankings.

    Raises:
        ValueError: if ``n_tests`` is smaller than 1.
    """
    if n_tests < 1:
        raise ValueError("n_tests must be at least 1")

    fit_cols = ['execute 거리기반, dis_err','execute 거리기반, cos_err', 'execute 거리기반, corr_err',
                'execute 방향기반, dis_err', 'execute 방향기반, cos_err', 'execute 방향기반, corr_err',
                'execute 상관관계 기반, dis_err', 'execute 상관관계 기반, cos_err', 'execute 상관관계 기반, corr_err',
                'execute 거리+방향기반, dis_err', 'execute 거리+방향기반, cos_err', 'execute 거리+방향기반, corr_err']
    cols = ['유클리디안 거리', '코사인 유사도', '피어슨 상관계수','향상된 코사인 유사도']

    # (sort column, ascending) per ranking, in the same order as the groups of
    # three columns in fit_cols. Distance ranks smallest-first; the similarity
    # measures rank largest-first.
    rankings = [(cols[0], True), (cols[1], False), (cols[2], False), (cols[3], False)]

    # A private RandomState makes a run reproducible when a seed is supplied
    # without disturbing callers that rely on the global random state.
    rng = np.random if seed is None else np.random.RandomState(seed)
    scale = test_per / 100

    records = []
    for _ in range(n_tests):
        datas = np.round(rng.rand(n_candidates + 1, n_features) * 500)
        A = datas[0]

        # Build the similarity table in one go rather than growing it row by row.
        sim_df = pd.DataFrame(
            [
                [
                    euclidean_distance(A, B),
                    cosine_similarity(A, B),
                    stats.pearsonr(A, B)[0],
                    improved_similarity(A, B, imp_weight),
                ]
                for B in datas[1:]
            ],
            columns=cols,
        )

        # Per-metric error thresholds: test_per % of the standard deviation.
        thresholds = [(col, sim_df[col].std() * scale) for col in cols[:3]]

        count_arr = []
        for sort_col, ascending in rankings:
            ranked = sim_df.sort_values(by=[sort_col], ascending=ascending)
            count_arr.extend(_count_rank_gaps(ranked, thresholds))
        records.append(count_arr)

    fit_df = pd.DataFrame(
        records,
        columns=fit_cols,
        index=['test {}'.format(test_idx) for test_idx in range(1, n_tests + 1)],
    )

    # Rounded mean error count per column; the last three belong to the
    # improved (distance + direction) ranking used as the comparison target.
    means = [round(fit_df[col].mean()) for col in fit_cols]
    baselines = [
        ("거리기반", "거리기반 =================>", 0),
        ("방향기반", "방향기반 =================>", 3),
        ("상관계수 기반", "상관계수 기반 =================>", 6),
    ]
    for label, arrow, start in baselines:
        print("compare {}% {} mean error count, {}% 거리+방향 기반 mean error count".format(test_per, label, test_per))
        print("{} dis:{} cos:{} corr:{}".format(arrow, *means[start:start + 3]))
        print("거리 + 방향 기반 ==========> dis:{} cos:{} corr:{}\n".format(*means[9:12]))

    return fit_df

In [36]:
# Run the fit test with the error threshold at 110% of each metric's standard deviation.
test_func(110)

compare 110% 거리기반 mean error count, 110% 거리+방향 기반 mean error count

compare 110% 방향기반 mean error count, 110% 거리+방향 기반 mean error count

compare 110% 상관계수 기반 mean error count, 110% 거리+방향 기반 mean error count



Unnamed: 0,"execute 거리기반, dis_err","execute 거리기반, cos_err","execute 거리기반, corr_err","execute 방향기반, dis_err","execute 방향기반, cos_err","execute 방향기반, corr_err","execute 상관관계 기반, dis_err","execute 상관관계 기반, cos_err","execute 상관관계 기반, corr_err","execute 거리+방향기반, dis_err","execute 거리+방향기반, cos_err","execute 거리+방향기반, corr_err"
test 1,0,9,7,6,0,11,6,10,0,0,2,4
test 2,0,1,11,1,0,15,5,18,0,0,0,10
test 3,0,18,9,21,0,9,6,3,0,0,9,5
test 4,0,0,12,0,0,15,10,12,0,0,0,14
test 5,0,2,10,4,0,12,5,13,0,0,1,9
...,...,...,...,...,...,...,...,...,...,...,...,...
test 96,1,4,6,4,0,10,3,8,0,1,0,5
test 97,0,3,4,1,0,5,1,2,0,0,1,4
test 98,0,4,4,6,0,5,2,2,0,0,2,2
test 99,1,1,13,1,1,21,13,23,1,1,1,13


In [37]:
# Run the fit test with the error threshold at 90% of each metric's standard deviation.
test_func(90)

compare 90% 거리기반 mean error count, 90% 거리+방향 기반 mean error count

compare 90% 방향기반 mean error count, 90% 거리+방향 기반 mean error count

compare 90% 상관계수 기반 mean error count, 90% 거리+방향 기반 mean error count



Unnamed: 0,"execute 거리기반, dis_err","execute 거리기반, cos_err","execute 거리기반, corr_err","execute 방향기반, dis_err","execute 방향기반, cos_err","execute 방향기반, corr_err","execute 상관관계 기반, dis_err","execute 상관관계 기반, cos_err","execute 상관관계 기반, corr_err","execute 거리+방향기반, dis_err","execute 거리+방향기반, cos_err","execute 거리+방향기반, corr_err"
test 1,0,3,15,3,0,20,12,25,1,0,1,18
test 2,0,8,15,7,1,24,18,19,0,0,5,13
test 3,0,17,18,17,0,12,24,12,0,0,9,15
test 4,0,2,20,2,0,27,22,26,0,0,0,19
test 5,0,20,16,22,0,24,15,25,0,0,9,16
...,...,...,...,...,...,...,...,...,...,...,...,...
test 96,1,12,9,9,0,19,7,15,0,1,4,10
test 97,0,0,16,0,0,18,18,20,0,0,0,17
test 98,0,20,20,21,0,20,12,22,0,0,14,12
test 99,0,1,25,0,0,27,30,33,0,0,0,29


In [38]:
# Run the fit test with the error threshold at 80% of each metric's standard deviation.
test_func(80)

compare 80% 거리기반 mean error count, 80% 거리+방향 기반 mean error count

compare 80% 방향기반 mean error count, 80% 거리+방향 기반 mean error count

compare 80% 상관계수 기반 mean error count, 80% 거리+방향 기반 mean error count



Unnamed: 0,"execute 거리기반, dis_err","execute 거리기반, cos_err","execute 거리기반, corr_err","execute 방향기반, dis_err","execute 방향기반, cos_err","execute 방향기반, corr_err","execute 상관관계 기반, dis_err","execute 상관관계 기반, cos_err","execute 상관관계 기반, corr_err","execute 거리+방향기반, dis_err","execute 거리+방향기반, cos_err","execute 거리+방향기반, corr_err"
test 1,1,22,33,27,2,25,24,32,2,2,16,21
test 2,0,8,18,9,0,18,24,27,0,0,5,17
test 3,0,28,27,27,0,16,14,12,0,0,15,10
test 4,0,20,16,26,1,22,23,29,0,0,12,18
test 5,1,10,17,9,1,24,16,17,0,1,3,18
...,...,...,...,...,...,...,...,...,...,...,...,...
test 96,1,5,19,5,0,14,13,14,0,1,2,13
test 97,0,30,27,28,0,12,18,12,0,0,18,13
test 98,0,3,22,10,0,25,20,25,0,0,2,18
test 99,0,16,11,16,0,18,14,21,0,0,8,9


In [39]:
# Run the fit test with the error threshold at 50% of each metric's standard deviation.
test_func(50)

compare 50% 거리기반 mean error count, 50% 거리+방향 기반 mean error count

compare 50% 방향기반 mean error count, 50% 거리+방향 기반 mean error count

compare 50% 상관계수 기반 mean error count, 50% 거리+방향 기반 mean error count



Unnamed: 0,"execute 거리기반, dis_err","execute 거리기반, cos_err","execute 거리기반, corr_err","execute 방향기반, dis_err","execute 방향기반, cos_err","execute 방향기반, corr_err","execute 상관관계 기반, dis_err","execute 상관관계 기반, cos_err","execute 상관관계 기반, corr_err","execute 거리+방향기반, dis_err","execute 거리+방향기반, cos_err","execute 거리+방향기반, corr_err"
test 1,0,40,37,41,0,50,32,46,1,1,25,37
test 2,1,26,43,25,1,44,46,45,1,1,13,38
test 3,0,42,32,46,1,23,33,25,0,2,26,19
test 4,1,53,51,60,0,28,53,34,0,3,41,36
test 5,0,28,18,26,1,34,27,42,0,0,14,20
...,...,...,...,...,...,...,...,...,...,...,...,...
test 96,1,30,43,33,1,47,40,38,0,1,23,37
test 97,1,5,59,5,1,62,46,60,0,1,3,50
test 98,1,27,43,23,1,53,38,52,1,1,15,41
test 99,1,13,26,10,1,36,29,36,0,1,7,27


In [40]:
# Run the fit test with the error threshold at 25% of each metric's standard deviation.
test_func(25)

compare 25% 거리기반 mean error count, 25% 거리+방향 기반 mean error count

compare 25% 방향기반 mean error count, 25% 거리+방향 기반 mean error count

compare 25% 상관계수 기반 mean error count, 25% 거리+방향 기반 mean error count



Unnamed: 0,"execute 거리기반, dis_err","execute 거리기반, cos_err","execute 거리기반, corr_err","execute 방향기반, dis_err","execute 방향기반, cos_err","execute 방향기반, corr_err","execute 상관관계 기반, dis_err","execute 상관관계 기반, cos_err","execute 상관관계 기반, corr_err","execute 거리+방향기반, dis_err","execute 거리+방향기반, cos_err","execute 거리+방향기반, corr_err"
test 1,1,67,59,78,3,67,74,73,0,5,61,68
test 2,2,58,66,66,2,74,67,76,3,14,58,64
test 3,3,44,76,34,4,75,75,80,4,3,29,80
test 4,2,56,59,54,4,62,57,70,3,9,43,66
test 5,2,60,63,70,2,63,61,62,3,9,45,62
...,...,...,...,...,...,...,...,...,...,...,...,...
test 96,4,69,66,76,1,71,57,68,3,17,62,63
test 97,2,56,58,58,3,63,60,75,0,2,42,58
test 98,5,42,74,40,3,77,75,77,3,5,25,73
test 99,5,63,70,67,5,72,64,59,4,18,55,64


In [41]:
# Run the fit test with the error threshold at 10% of each metric's standard deviation.
test_func(10)

compare 10% 거리기반 mean error count, 10% 거리+방향 기반 mean error count

compare 10% 방향기반 mean error count, 10% 거리+방향 기반 mean error count

compare 10% 상관계수 기반 mean error count, 10% 거리+방향 기반 mean error count



Unnamed: 0,"execute 거리기반, dis_err","execute 거리기반, cos_err","execute 거리기반, corr_err","execute 방향기반, dis_err","execute 방향기반, cos_err","execute 방향기반, corr_err","execute 상관관계 기반, dis_err","execute 상관관계 기반, cos_err","execute 상관관계 기반, corr_err","execute 거리+방향기반, dis_err","execute 거리+방향기반, cos_err","execute 거리+방향기반, corr_err"
test 1,13,83,91,82,13,90,90,92,8,51,79,91
test 2,10,88,83,80,13,87,90,92,14,58,79,85
test 3,11,86,90,89,8,91,84,84,11,65,88,86
test 4,13,88,83,89,10,86,85,85,17,58,86,82
test 5,12,85,88,84,12,82,92,91,12,53,81,87
...,...,...,...,...,...,...,...,...,...,...,...,...
test 96,13,83,85,88,8,89,79,87,13,39,75,85
test 97,10,84,86,76,8,87,85,91,8,38,75,90
test 98,11,89,84,88,8,81,84,86,9,60,81,83
test 99,10,78,93,77,10,90,86,90,10,38,75,89


In [42]:
# Run the fit test with the error threshold at 5% of each metric's standard deviation.
test_func(5)

compare 5% 거리기반 mean error count, 5% 거리+방향 기반 mean error count

compare 5% 방향기반 mean error count, 5% 거리+방향 기반 mean error count

compare 5% 상관계수 기반 mean error count, 5% 거리+방향 기반 mean error count



Unnamed: 0,"execute 거리기반, dis_err","execute 거리기반, cos_err","execute 거리기반, corr_err","execute 방향기반, dis_err","execute 방향기반, cos_err","execute 방향기반, corr_err","execute 상관관계 기반, dis_err","execute 상관관계 기반, cos_err","execute 상관관계 기반, corr_err","execute 거리+방향기반, dis_err","execute 거리+방향기반, cos_err","execute 거리+방향기반, corr_err"
test 1,28,94,87,93,29,89,88,95,31,80,94,90
test 2,29,78,87,79,25,89,91,93,29,49,72,92
test 3,29,92,96,96,26,97,96,88,31,81,89,96
test 4,26,83,94,91,29,96,96,94,25,52,81,95
test 5,28,93,93,96,29,93,96,91,25,83,93,92
...,...,...,...,...,...,...,...,...,...,...,...,...
test 96,32,93,91,95,31,87,92,92,26,85,97,96
test 97,28,92,92,91,25,95,95,93,30,62,89,92
test 98,28,89,89,86,30,92,94,97,27,65,88,89
test 99,25,90,91,88,29,92,91,96,22,65,87,92
