In [1]:
import pandas as pd

# Raw shop dataset, downloaded as a CSV straight from GitHub.
CSV_URL = 'https://github.com/TaewoongKong/code_sharing/blob/master/cau_shops_kakao_edit.csv?raw=True'
df = pd.read_csv(CSV_URL)


In [None]:
# Keep only the columns used for the similarity computation.
# .copy() makes df_cosin an independent frame: later cells assign into it
# (sentinel values, new columns), which on a plain slice of `df` raises
# SettingWithCopyWarning and has ambiguous view/copy semantics.
df_cosin = df[[
    '카카오지도_상호명',
    '상권업종중분류명',
    '상권업종소분류명',
    '카카오지도_카테고리',
    '카카오지도_별점_점수',
    '카카오지도_별점_평가수',
    '카카오지도_블로그_리뷰',
    '카카오지도_블로그_리뷰수',
]].copy()

In [None]:
# Short English aliases, listed in the same order as the columns that were
# selected into df_cosin.
cosin_column_names = [
    'name',        # shop name (카카오지도_상호명)
    'cate_1',      # 상권업종중분류명
    'cate_2',      # 상권업종소분류명
    'cate_3',      # 카카오지도_카테고리
    'star_point',  # star-rating score
    'star_qty',    # number of star ratings
    'review_txt',  # review text
    'review_qty',  # number of reviews - popular places have more, but some may be negative
]
df_cosin.columns = cosin_column_names

In [None]:
# Rows whose rating count is zero get the sentinel score -1.
has_no_ratings = df_cosin['star_qty'].eq(0)
df_cosin.loc[has_no_ratings, 'star_point'] = -1

In [None]:
# Merge the three category levels into one text field for vectorizing.
# They are joined with a space so the last word of one level and the first
# word of the next remain separate tokens (plain `+` glued them together).
# na_rep='' lets a missing level contribute nothing instead of turning the
# whole combined value into NaN.
df_cosin['cate_mix'] = df_cosin['cate_1'].str.cat(
    [df_cosin['cate_2'], df_cosin['cate_3']],
    sep=' ',
    na_rep='',
)

In [None]:
# Replace the ">" and "/" characters (presumably category separators) with
# spaces so the words around them become separate tokens.
# regex=False pins literal replacement: the default of `regex` differs between
# pandas versions, and single-character patterns emit a FutureWarning on 1.x.
df_cosin['cate_mix'] = (
    df_cosin['cate_mix']
    .str.replace(">", " ", regex=False)
    .str.replace("/", " ", regex=False)
)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# Weights of the extra terms added to the review similarity in the combined score.
CATE_SIMI_WEIGHT = 0.3     # contribution of category similarity
STAR_QTY_WEIGHT = 0.001    # contribution of each star rating a candidate place has


# --- Review-text similarity ---
# Uni- and bi-gram counts; a term must occur in at least 2 reviews to be kept.
count_vect_review = CountVectorizer(min_df=2, ngram_range=(1, 2))
place_review = count_vect_review.fit_transform(df_cosin['review_txt'])
place_simi_review = cosine_similarity(place_review, place_review)
# argsort is ascending; reversing each row puts the most similar place first.
place_simi_review_sorted_ind = place_simi_review.argsort()[:, ::-1]

# --- Category similarity ---
# min_df=1 keeps every term, exactly as the previous min_df=0 did, but is also
# accepted by scikit-learn releases that require an integer min_df to be >= 1.
count_vect_category = CountVectorizer(min_df=1, ngram_range=(1, 2))
place_category = count_vect_category.fit_transform(df_cosin['cate_mix'])
place_simi_cate = cosine_similarity(place_category, place_category)
place_simi_cate_sorted_ind = place_simi_cate.argsort()[:, ::-1]

# --- Combined score: review similarity + weighted category similarity
#     + a bonus proportional to the candidate's number of star ratings ---
# The star counts are shaped (1, n) so they broadcast across rows: column j of
# every row receives place j's bonus. This yields the same matrix as repeating
# the row n times, without needing numpy's np.repeat (numpy was never imported
# in this notebook, so the old expression raised NameError).
star_qty_bonus = df_cosin['star_qty'].to_numpy().reshape(1, -1) * STAR_QTY_WEIGHT
place_simi_co = (
    place_simi_review
    + place_simi_cate * CATE_SIMI_WEIGHT
    + star_qty_bonus
)

place_simi_co_sorted_ind = place_simi_co.argsort()[:, ::-1]


def find_simi_place(df, sorted_ind, place_name, top_n=10):
    """Return the rows of ``df`` ranked most similar to ``place_name``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'name'`` column; row ``i`` must correspond to row ``i``
        of ``sorted_ind``.
    sorted_ind : numpy.ndarray
        2-D array in which row ``i`` lists row positions of ``df`` ordered
        from most to least similar to row ``i``.
    place_name : str
        Value to look up in ``df['name']``.
    top_n : int, default 10
        Number of ranked positions to take per matching row.

    Returns
    -------
    pandas.DataFrame
        The selected rows of ``df`` in ranking order. If several rows share
        ``place_name`` their rankings are concatenated; if none match, the
        result is empty.
    """
    # Use positional row numbers taken from the boolean mask rather than index
    # labels: sorted_ind and iloc are both positional, so labels only worked
    # when df happened to carry a default 0..n-1 index.
    place_positions = (df['name'] == place_name).to_numpy().nonzero()[0]
    similar_positions = sorted_ind[place_positions, :top_n].reshape(-1)
    return df.iloc[similar_positions]


def compare_algo(place_df, place_name, num):
    """Compare the review-only ranking with the combined ranking, side by side.

    Uses the notebook-level rankings ``place_simi_review_sorted_ind`` ("before")
    and ``place_simi_co_sorted_ind`` ("after") together with ``find_simi_place``.

    Parameters
    ----------
    place_df : pandas.DataFrame
        Frame with ``'name'`` and ``'cate_mix'`` columns, aligned row-by-row
        with the ranking arrays.
    place_name : str
        Name of the place to find neighbours for.
    num : int
        Number of ranked rows to fetch per ranking; at most ``num - 1``
        neighbours are shown because the queried place is left out.

    Returns
    -------
    pandas.DataFrame
        Columns ``before_name``, ``before_cate``, ``after_name``, ``after_cate``.
    """

    def _neighbours(sorted_ind, prefix):
        # One lookup per ranking (previously done twice, once per column).
        hits = find_simi_place(place_df, sorted_ind, place_name, num)
        # Drop the queried place by name instead of assuming it is the first
        # hit: it is not guaranteed to rank first (e.g. an all-zero review
        # vector has zero self-similarity, ties are ordered arbitrarily, and
        # the star-count bonus can push another place above it).
        others = hits.loc[hits['name'] != place_name, ['name', 'cate_mix']]
        others = others.iloc[:max(num - 1, 0)].reset_index(drop=True)
        return others.rename(columns={'name': prefix + '_name',
                                      'cate_mix': prefix + '_cate'})

    before = _neighbours(place_simi_review_sorted_ind, 'before')
    after = _neighbours(place_simi_co_sorted_ind, 'after')
    # concat aligns on the fresh 0..k index, so rankings of different length
    # are padded with NaN rather than raising.
    return pd.concat([before, after], axis=1)



In [None]:
compare_algo(df_cosin, "중앙돼지마을", 15)