### 데이터 전처리

In [1]:
# 데이터 불러오기 (각자 저장 경로로 수정)

review = pd.read_csv('./Dataset/review_final.csv', encoding='utf-8')
review.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis='columns', inplace=True)

In [2]:
# 나이, 가격에서 '세', '원', ',' 삭제

review['age'] = review['age'].replace('[\세]', '', regex=True).str.strip()
review['price'] = review['price'].replace('[\원]', '', regex=True).str.strip()
review['price'] = review['price'].replace(',', '', regex=True).str.strip()

In [3]:
# 성별, 피부타입 라벨링

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# 남성 = 0, 여성 = 1
le.fit(review['gender'])
review['n_gender'] = le.transform(review['gender'])

# 건성 = 0, 민감성 = 1, 복합성 = 2, 중성 = 3, 지성 = 4
le.fit(review['skin'])
review['n_skin'] = le.transform(review['skin'])

In [4]:
# price에 오류 데이터가 몇개 있음

review['price'][review['price'].str.contains('g')]

# 오류 데이터 삭제

mask = review['price'].isin(['수퍼슬림 0.08g', '6파이 1.5g', '3.3g'])

review = review[~mask]

In [5]:
# 가격(price), 나이(age) 숫자로 변환 필요!

review['age'] = pd.to_numeric(review['age'])
review['price'] = pd.to_numeric(review['price'])
review.tail(2)

Unnamed: 0,item,company,size,price,id,gender,age,skin,score,review,n_gender,n_skin
211051,[단종] 아임 쿠션 쉐딩,아임미미 (I'M MEME),6ml,18000.0,슈뷔두바,여성,23.0,민감성,2.0,장점-촉촉하니 정말 좋음-저는 쌩얼같은 화장을 선호해서 자연스러워서 괜찮은듯 (거의...,1,1
211052,[단종] 아임 쿠션 쉐딩,아임미미 (I'M MEME),6ml,18000.0,도꾸리,여성,32.0,복합성,2.0,손에 안묻히고 쓸 수있는거때문에 사봤는데색상이 저한텐 안어울리네요전 별로였어요,1,2


In [6]:
review.dtypes

item         object
company      object
size         object
price       float64
id           object
gender       object
age         float64
skin         object
score       float64
review       object
n_gender      int64
n_skin        int64
dtype: object

In [7]:
# item data 불러오기
category = pd.read_csv('./Dataset/glowpick_json_item_rename.csv')
category_lv2 = pd.read_csv('./Dataset/glowpick_json_lv2_rename.csv')
item = pd.read_csv('./Dataset/glowpick_items_data_rename.csv')
item_lv2 = pd.read_csv('./Dataset/glowpick_items_data_lv2_rename.csv')

In [8]:
# NAME column 공백 제거
item['NAME'] = item['NAME'].str.strip()
item_lv2['NAME'] = item_lv2['NAME'].str.strip()
category['name'] = category['name'].str.strip()
category_lv2['name'] = category_lv2['name'].str.strip()

In [9]:
# data merge
df = item.merge(category, left_on='NAME', right_on='name', how='outer')
df2 = item_lv2.merge(category_lv2, left_on='NAME',
                     right_on='name', how='outer')

df.tail(2)

Unnamed: 0,ID,RANK,BRAND,NAME,VOLUME,PRICE,RATE,category,name,keywords
18709,88940,99,에이프릴스킨 (APRILSKIN),턴 업 블리치,40g,4500,2.57,헤어메이크업,턴 업 블리치,['염색모발용']
18710,23473,100,더페이스샵 (THE FACE SHOP),스타일리스트 실키 헤어 컬러 크림,130g,6000,3.28,헤어메이크업,스타일리스트 실키 헤어 컬러 크림,"['모발보호', '염색제']"


In [10]:
df2.tail(2)

Unnamed: 0,ID,RANK,BRAND,NAME,VOLUME,PRICE,RATE,category,name,keywords
6766,51361,99,모에타 (moeta),팝 데빌 컬러 트리트먼트,30ml,15000,3.38,헤어메이크업,팝 데빌 컬러 트리트먼트,"['모발보호', '염색모발용']"
6767,87273,100,토소웅 (TOSOWOONG),매직 헤어쿠션,26g,39600,3.5,헤어메이크업,매직 헤어쿠션,"['헤어라이너', '헤어라인']"


In [11]:
# 데이터 중복 확인
df2.duplicated('ID').sum()

252

In [12]:
df.duplicated('ID').sum()

958

In [13]:
# ID를 기준으로 중복 제거 & 데이터 프레임 합치기

df = df.drop_duplicates('ID')
df2 = df2.drop_duplicates('ID')
dfs = pd.concat([df, df2])
dfs = dfs.drop_duplicates(['ID', 'NAME'])

# name column 중복 삭제

dfs = dfs.drop(['name'], axis=1)

In [None]:
# dfs 저장

dfs.to_csv('./Dataset/item_dfs_final.csv')

In [14]:
# item dataframe 불러오기

item_dfs = pd.read_csv('./Dataset/item_dfs_final.csv', index_col=0)
item_dfs.tail(2)

Unnamed: 0,ID,RANK,BRAND,NAME,VOLUME,PRICE,RATE,category,keywords
6458,120146,-,드림웍스 (DREAMWORKS),뽀송 헤어 핑거 패드,2ea*10,6500,0.0,드라이샴푸,"['두피건강', '두피피지관리', '드라이샴푸']"
6459,10798,-,잇츠스킨 (It'S SKIN),센스 붐붐 헤어 드라이 샴푸,50ml,8000,0.0,드라이샴푸,"['두피피지관리', '드라이샴푸']"


In [15]:
# RATE 타입 변경

item_dfs['RATE'] = item_dfs['RATE'].astype(int)
item_dfs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19421 entries, 0 to 6459
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   ID        19421 non-null  int64 
 1   RANK      19421 non-null  object
 2   BRAND     19421 non-null  object
 3   NAME      19421 non-null  object
 4   VOLUME    19421 non-null  object
 5   PRICE     19421 non-null  int64 
 6   RATE      19421 non-null  int64 
 7   category  19421 non-null  object
 8   keywords  19421 non-null  object
dtypes: int64(3), object(6)
memory usage: 1.5+ MB


In [16]:
re.findall('[가-힣]+', item_dfs['keywords'][0])

['수분공급', '유수분조절', '저자극', '피부진정']

In [17]:
item_dfs['keywords'] = [re.findall('[가-힣]+', item)
                        for item in item_dfs['keywords']]

In [18]:
# tag column 생성 (띄어쓰기로 구분된 하나의 문장)

item_dfs['tag'] = item_dfs['keywords'].apply(lambda x: (' ').join(x))
item_dfs.tail(2)

Unnamed: 0,ID,RANK,BRAND,NAME,VOLUME,PRICE,RATE,category,keywords,tag
6458,120146,-,드림웍스 (DREAMWORKS),뽀송 헤어 핑거 패드,2ea*10,6500,0,드라이샴푸,"[두피건강, 두피피지관리, 드라이샴푸]",두피건강 두피피지관리 드라이샴푸
6459,10798,-,잇츠스킨 (It'S SKIN),센스 붐붐 헤어 드라이 샴푸,50ml,8000,0,드라이샴푸,"[두피피지관리, 드라이샴푸]",두피피지관리 드라이샴푸


In [19]:
# category unique 값 확인

category_df = pd.DataFrame(item_dfs['category'].unique(), columns=['tag'])
category_df

Unnamed: 0,tag
0,스킨/토너
1,로션/에멀젼
2,에센스/세럼
3,크림
4,미스트
...,...
62,바디미스트
63,바디슬리밍
64,바디메이크업
65,헤어토닉


In [20]:
# tag unique 값 확인

tag = item_dfs.drop_duplicates('tag')
tag_df = pd.DataFrame(set(list(tag['keywords'].sum())), columns=['tag'])
tag_df

Unnamed: 0,tag
0,광택부여
1,모공관리
2,풋스크럽
3,고데기
4,키즈클렌징
...,...
380,샤워퍼프
381,색이상
382,풋스프레이
383,컬러마스카라


----
## 4-3 아이템(태그)기반

 ### 아이템 기반 협업 필터링을 이용한 추천시스템 
----- 
### 1. 제품명 입력

 #### Tfidf Vectorizer , Linear Kernel

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

tf1 = TfidfVectorizer(analyzer='word', ngram_range=(1, 2),
                      min_df=0)
tfidf_matrix1 = tf1.fit_transform(item_dfs['tag'])
cosine_sim1 = linear_kernel(tfidf_matrix1, tfidf_matrix1)

titles = item_dfs['NAME']
indices = pd.Series(item_dfs.index, index=item_dfs['NAME'])

In [22]:
def tags_recommendation(product_name):

    product_name = input('제품명을 입력하세요 : ')

    print('\n')
    print('선택한 제품의 정보 : ')

    display(item_dfs[item_dfs['NAME'] == product_name])

    idx = indices[product_name]
    sim_scores = list(enumerate(cosine_sim1[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    cosmetics_indices = [i[0] for i in sim_scores]

    print('\n')
    print('추천 화장품 : ')

    recommend = pd.DataFrame(titles.iloc[cosmetics_indices])
    recommend['sim'] = sim_scores
    recommend = recommend.merge(
        item_dfs[['NAME', 'BRAND', 'keywords']], left_on='NAME', right_on='NAME', how='inner')

    return recommend


reco = tags_recommendation(product_name='product_name')

reco['similarity'] = [r for i in range(
    len(reco)) for r in reco['sim'][i]][1::2]
reco[['NAME', 'BRAND', 'keywords', 'similarity']]

제품명을 입력하세요 : 마데카소사이드 시카 겔


선택한 제품의 정보 : 


Unnamed: 0,ID,RANK,BRAND,NAME,VOLUME,PRICE,RATE,category,keywords,tag
1715,98199,81,어퓨 (Apieu),마데카소사이드 시카 겔,50ml,14000,3,크림,"[미백, 주름개선, 피부보호]",미백 주름개선 피부보호




추천 화장품 : 


Unnamed: 0,NAME,BRAND,keywords,similarity
0,포바하 모이스처 시카 밤 스틱,포바하 (povaha),"[수분공급, 피부보호, 피부진정]",1.0
1,레이저 리쥬버네이션 크림,셀퓨전씨 (Cell Fusion C),"[보습, 수분공급, 피부보호, 피부진정]",0.944619
2,부리치 베이비 바디 버터,더바디샵 (THE BODY SHOP),"[보습, 수분공급, 피부보호, 피부진정]",0.944619
3,레이디모먼트 바디스크럽,지오마 (GEOMAR),"[보습, 수분공급, 피부보호, 피부진정]",0.944619
4,유칼립투스+무화과 풋버터크림,프리맨 (FREEMAN),"[수분공급, 피부보호]",0.802851
5,미스트 앤 픽스,메이크업포에버 (MAKE UP FOR EVER),"[수분공급, 피부보호]",0.802851
6,쉴드업 페이스 픽서,자트인사이트 (SAAT INSIGHT),"[수분공급, 피부보호]",0.802851
7,리제떼 퓨어 선 롤러 [SPF50+/PA+++],리오엘리 (Lioele),"[이상, 보습, 수분공급, 피부보호, 피부진정]",0.796541
8,화이트닝 선블럭,뜨엘 (DDELL),"[이상, 미백, 수분공급, 피부보호, 피부진정]",0.745729
9,데일리 모이스쳐 테라피 페이셜 크림,피지오겔 (PHYSIOGEL),"[보습, 수분공급, 피부보호]",0.736826


----
### 2. 태그 입력

#### Count Vectorizer, Cosine Similarity 

In [23]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

count_vect = CountVectorizer(min_df=0, ngram_range=(1, 2))
tag_mat = count_vect.fit_transform(item_dfs['tag'])
tag_sim = cosine_similarity(tag_mat, tag_mat)

print('tag matrix shape : ', tag_mat.shape)
print('tag similarity shape: ', tag_sim.shape)
print('tag similarity : ')
pd.DataFrame(tag_sim).head(10)

tag matrix shape :  (19421, 3776)
tag similarity shape:  (19421, 19421)
tag similarity : 


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19411,19412,19413,19414,19415,19416,19417,19418,19419,19420
0,1.0,0.436436,0.218218,0.433555,0.218218,0.338062,0.251976,0.218218,0.218218,0.218218,...,0.0,0.125988,0.0,0.0,0.169031,0.0,0.0,0.0,0.0,0.0
1,0.436436,1.0,0.0,0.132453,0.333333,0.258199,0.3849,0.333333,0.333333,0.333333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.218218,0.0,1.0,0.264906,0.0,0.0,0.0,0.0,0.333333,0.0,...,0.0,0.19245,0.0,0.0,0.258199,0.0,0.0,0.0,0.0,0.0
3,0.433555,0.132453,0.264906,1.0,0.264906,0.205196,0.229416,0.132453,0.132453,0.0,...,0.0,0.076472,0.0,0.0,0.102598,0.0,0.0,0.0,0.0,0.0
4,0.218218,0.333333,0.0,0.264906,1.0,0.0,0.57735,0.333333,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.338062,0.258199,0.0,0.205196,0.0,1.0,0.149071,0.0,0.258199,0.258199,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.251976,0.3849,0.0,0.229416,0.57735,0.149071,1.0,0.19245,0.19245,0.19245,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.218218,0.333333,0.0,0.132453,0.333333,0.0,0.19245,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.218218,0.333333,0.333333,0.132453,0.0,0.258199,0.19245,0.0,1.0,0.333333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.218218,0.333333,0.0,0.0,0.0,0.258199,0.19245,0.0,0.333333,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
tag_sim_sorted_ind = tag_sim.argsort()[:, ::-1]
print(tag_sim_sorted_ind[:1])

[[   0 1217  536 ... 9695 9694 9710]]


In [25]:
def find_sim_tag(item_dfs, sorted_ind, select_tag, top_n = 10):
    
    select_tag = input('태그를 입력하세요 : ')
    print('\n')
    
    # 하나의 문장으로 되어 있는 tag 들을 찾는 함수 사용
    P_NAME = item_dfs[item_dfs['tag'].str.contains(select_tag)]
    
    P_INDEX = P_NAME.index.values
    
    similar_indexes = sorted_ind[P_INDEX, :top_n]
    similar_indexes = similar_indexes.reshape(-1)
  
    print('추천 화장품 정보 : ')
    similar_names = pd.DataFrame(item_dfs.iloc[similar_indexes][:top_n])
    similar_names = similar_names[['NAME', 'BRAND','VOLUME', 'PRICE', 'category', 'tag']]
    
    return similar_names

find_sim_tag(item_dfs, tag_sim_sorted_ind, select_tag='select_tag', top_n=10)

태그를 입력하세요 : 스팟


추천 화장품 정보 : 


Unnamed: 0,NAME,BRAND,VOLUME,PRICE,category,tag
822,AC 컨트롤 밸런싱 에멀전,로쥬키스 (ROJUKISS),140ml,23000,로션/에멀젼,보습 수분공급 유수분조절 저자극 트러블케어 피부진정 피지조절
795,아크시스 하이드라 로션,SNP (에스앤피),150ml,20000,로션/에멀젼,보습 수분공급 유수분조절 저자극 트러블케어 피부진정 피지조절
2470,트러블 릴리빙 타임 크림,리듀어 (reduire),75g,28000,크림,수분공급 유수분조절 저자극 트러블케어 피부진정 피지조절
755,컨트롤에이 티트리먼트 모이스처라이저,닥터자르트 (Dr.Jart),50ml,24000,로션/에멀젼,모공관리 미백 보습 수분공급 유수분조절 저자극 트러블케어 피부진정 피지조절
640,프리 페이스 젠틀 밀크,에스쁘아 (espoir),50ml,25000,로션/에멀젼,각질관리 보습 수분공급 유수분조절 저자극 트러블케어 피부진정
791,7무 AC DEW 카밍 모이스쳐라이저,듀이트리 (DEWYTREE),120ml,22000,로션/에멀젼,보습 수분공급 유수분조절 저자극 피부진정 피지조절
812,닥터토니 AC컨트롤 에멀전,토니모리 (TONYMOLY),130ml,12500,로션/에멀젼,보습 수분공급 유수분조절 트러블케어 피부진정 피지조절
815,아크 컨트롤 그린티 로션,보나쥬르 (BONAJOUR),150ml,6900,로션/에멀젼,보습 수분공급 유수분조절 저자극 피부진정 피지조절
744,닥터솔루션 피큐어 튜닝 에멀전,케어존 (CAREZONE),170ml,23000,로션/에멀젼,보습 수분공급 유수분조절 저자극 트러블케어 피지조절
738,토니 랩 에이씨 컨트롤 에멀전,토니모리 (TONYMOLY),160ml,12500,로션/에멀젼,보습 수분공급 저자극 트러블케어 피부진정 피지조절
