In [1]:
import pandas as pd
import numpy as np

from ast import literal_eval                                  # 문자열 파싱 라이브러리
from sklearn.feature_extraction.text import CountVectorizer   # 
from sklearn.metrics.pairwise import cosine_similarity        # 코사인 유사도 라이브러리

import warnings

warnings.filterwarnings('ignore')

# Origin Test - 데이터 전처리

In [2]:
# Load the raw inputs (beer data): the tagged beer-feature table and the ratings table.
beer_load, rating = (
    pd.read_csv(csv_path, encoding='utf-8')
    for csv_path in ('맥주이름_특징_tag_완료_최종.csv', '맥주.csv')
)

In [3]:
# beer_rating: one row per beer with its mean rating ('평점평균') and the number
# of ratings it received ('평가횟수').
#
# A single groupby replaces the two per-beer loops. Those loops wrote through
# chained indexing (beer_rating[col][mask] = value), which does not update the
# frame under pandas Copy-on-Write, and they rescanned `rating` twice per beer.
# 'mean' skips missing ratings and 'size' counts every row of the beer, matching
# the original mean()/len() semantics. Rows whose beer name is missing are left
# out, because groupby drops NaN keys.
beer_rating = (
    rating.groupby('맥주')['평점']
    .agg(['mean', 'size'])
    .rename(columns={'mean': '평점평균', 'size': '평가횟수'})
    .rename_axis('맥주이름')
    .reset_index()
)

In [4]:
# Combine the feature table with the per-beer rating summary.
# pd.merge without `on` joins on the columns the two frames have in common.
beer_data = pd.merge(beer_load, beer_rating)

# Keep only the six tag features used for content-based similarity.
feature_columns = ['Aroma', 'Flavor', 'Balance', 'Season', 'Paring Food', 'Body']
beer_df = pd.DataFrame(beer_data, columns=feature_columns)

# Each feature cell holds the string form of a list; parse it into a real list.
for feature in feature_columns:
    beer_df[feature] = beer_df[feature].apply(literal_eval)

In [5]:
# Long catalogue names (ogname) and the shortened names (name) that replace
# them; the two lists are aligned by position.
ogname = [
    'Sapporo Premium Beer / Draft Beer',
    'Guinness Original 4.2% (Ireland/UK)',
    'Franziskaner Hefe-Weissbier / Weissbier Naturtrub',
    'Bavaria Pilsener / Premium Beer',
    'Leffe Brune / Bruin / Brown',
]
name = [
    'Sapporo Premium Beer',
    'Guinness Original 4.2%',
    'Franziskaner Hefe-Weissbier',
    'Bavaria Pilsener',
    'Leffe Brune',
]

# Swap each long name for its short counterpart; every other name is kept as is.
short_name_by_original = dict(zip(ogname, name))
beer_data['맥주이름'] = beer_data['맥주이름'].map(
    lambda beer: short_name_by_original.get(beer, beer)
)


In [6]:
# Persist the merged beer table and the parsed feature table to CSV.
# NOTE(review): to_csv writes the row index as an unnamed first column by
# default, so reading these files back without index_col yields an extra
# 'Unnamed: 0' column. List-valued feature cells are written as their string
# form and must be parsed with literal_eval again after reloading.
beer_data.to_csv('맥주_cbf_data.csv', encoding='utf-8')
beer_df.to_csv('맥주_cbf_feature.csv', encoding='utf-8')

# 실제 추천 코드

In [7]:
# Reload the prepared beer table and feature table from the CSV files.
beer_data, beer_df = (
    pd.read_csv(csv_path, encoding='utf-8')
    for csv_path in ('맥주_cbf_data.csv', '맥주_cbf_feature.csv')
)

In [None]:
# Beer to look up, and the feature column on which similarity is measured.
beer_name = 'Leffe Brune'

something = 'Body'

# The feature cells come back from the CSV as strings; turn them into lists again.
for feature in ('Aroma', 'Flavor', 'Balance', 'Season', 'Paring Food', 'Body'):
    beer_df[feature] = beer_df[feature].apply(literal_eval)

In [None]:
def find_sim_beer_ver1(beer_data, sorted_ind, beer_name, top_n=10):
    """Return the rows of ``beer_data`` ranked closest to ``beer_name``.

    Args:
        beer_data: DataFrame with a '맥주이름' (beer name) column. Row ``i`` of
            ``beer_data`` must correspond to row ``i`` of ``sorted_ind``.
        sorted_ind: 2-D integer array. Row ``i`` holds row positions of
            ``beer_data``; callers pass them ordered from most to least
            similar to beer ``i``.
        beer_name: Exact name to match against the '맥주이름' column.
        top_n: Number of leading entries to take from each matching row of
            ``sorted_ind``.

    Returns:
        DataFrame with the selected rows of ``beer_data`` in ranked order.
        The queried beer is part of the result whenever it appears among the
        first ``top_n`` entries. The result is empty when no row matches; if
        several rows match, their rankings are concatenated.
    """
    # Work with row *positions*, not index labels: sorted_ind and .iloc are
    # both positional, so labels only happen to work on a default RangeIndex.
    name_matches = (beer_data['맥주이름'] == beer_name).to_numpy()
    query_positions = np.flatnonzero(name_matches)

    similar_positions = sorted_ind[query_positions, :top_n].reshape(-1)

    return beer_data.iloc[similar_positions]

In [None]:
# Pre-processing for CountVectorizer: flatten each beer's tag list into a
# single space-separated string.
beer_df[something + '_literal'] = beer_df[something].apply(lambda tags: ' '.join(tags))

# Fit CountVectorizer on unigrams and bigrams (ngram_range: 1 <= n <= 2).
# min_df=1 keeps every term that occurs in at least one beer, which is the same
# vocabulary min_df=0 produced; an integer 0 is rejected by the parameter
# validation of recent scikit-learn releases.
count_vect = CountVectorizer(min_df=1, ngram_range=(1, 2))
something_mat = count_vect.fit_transform(beer_df[something + '_literal'])

# Cosine similarity between every pair of beers (the original note mentions 77 beers).
something_sim = cosine_similarity(something_mat, something_mat)

# Ranking: for each beer (row), the positions of all beers ordered from most
# to least similar, so an earlier position in the row means higher similarity.
something_sim_sorted_ind = something_sim.argsort()[:, ::-1]

similar_beers = find_sim_beer_ver1(beer_data, something_sim_sorted_ind, beer_name, 3)
similar_beers

# Real Test

In [None]:
# Show the distinct values of the beer-name column, i.e. the names a lookup
# on '맥주이름' can match.
beer_data['맥주이름'].unique()

In [None]:
def find_sim_beer_ver1(beer_data, sorted_ind, beer_name, top_n=4):
    """Return the rows of ``beer_data`` ranked closest to ``beer_name``.

    Args:
        beer_data: DataFrame with a '맥주이름' (beer name) column. Row ``i`` of
            ``beer_data`` must correspond to row ``i`` of ``sorted_ind``.
        sorted_ind: 2-D integer array. Row ``i`` holds row positions of
            ``beer_data``; callers pass them ordered from most to least
            similar to beer ``i``.
        beer_name: Exact name to match against the '맥주이름' column.
        top_n: Number of leading entries to take from each matching row of
            ``sorted_ind``.

    Returns:
        DataFrame with the selected rows of ``beer_data`` in ranked order.
        The queried beer is part of the result whenever it appears among the
        first ``top_n`` entries. The result is empty when no row matches; if
        several rows match, their rankings are concatenated.
    """
    # Resolve the query to row *positions* rather than index labels, because
    # both sorted_ind and .iloc are positional.
    name_matches = (beer_data['맥주이름'] == beer_name).to_numpy()
    query_positions = np.flatnonzero(name_matches)

    similar_positions = sorted_ind[query_positions, :top_n].reshape(-1)

    return beer_data.iloc[similar_positions]

In [None]:
# Reload the prepared beer table and feature table.
beer_data = pd.read_csv('맥주_cbf_data.csv', encoding='utf-8')
beer_df = pd.read_csv('맥주_cbf_feature.csv', encoding='utf-8')

# Beer to find recommendations for.
beer_name = 'Leffe Brune'

# The feature cells are stored as strings in the CSV; parse them back into lists.
for feature in ('Aroma', 'Flavor', 'Balance', 'Season', 'Paring Food', 'Body'):
    beer_df[feature] = beer_df[feature].apply(literal_eval)

# Feature column used to measure similarity. Alternatives tried:
# 'Aroma', 'Flavor', 'Balance', 'Paring Food'.
something = 'Body'

# Pre-processing for CountVectorizer: one space-separated string per beer.
beer_df[something + '_literal'] = beer_df[something].apply(lambda tags: ' '.join(tags))

# Fit CountVectorizer on unigrams and bigrams (ngram_range: 1 <= n <= 2).
# min_df=1 keeps every term that occurs in at least one beer, which is the same
# vocabulary min_df=0 produced; an integer 0 is rejected by the parameter
# validation of recent scikit-learn releases.
count_vect = CountVectorizer(min_df=1, ngram_range=(1, 2))
something_mat = count_vect.fit_transform(beer_df[something + '_literal'])

# Cosine similarity between every pair of beers (the original note mentions 77 beers).
something_sim = cosine_similarity(something_mat, something_mat)

# Reverse the ascending argsort so each row lists beer positions from most to
# least similar.
something_sim_sorted_ind = something_sim.argsort()[:, ::-1]

# Recommend the 3 beers most similar to beer_name, excluding the beer itself.
# Take the full ranking, drop the queried beer, then truncate. The previous
# "top 4 minus self" approach returned 4 names whenever similarity ties pushed
# the queried beer out of the first 4 positions.
ranked_beers = find_sim_beer_ver1(
    beer_data, something_sim_sorted_ind, beer_name, len(beer_data)
)

similar_beers = ranked_beers[ranked_beers['맥주이름'] != beer_name].head(3)

result = similar_beers['맥주이름'].tolist()