In [None]:
import ast
import itertools
import json
import re
from collections import Counter

import pandas as pd
from gensim.models import Phrases
from konlpy.tag import Kkma
from konlpy.tag import Mecab

In [None]:
# Mecab tagger instance; later cells call t.pos() on each review text.
t = Mecab()

# Review table, read from 'rdf.xlsx' relative to the working directory.
rdf = pd.read_excel('rdf.xlsx')

# Each 'photo' cell is a string holding a Python literal (later cells call
# len() on the parsed value — presumably a list; confirm against the sheet).
# ast.literal_eval only accepts literals, so spreadsheet content can never be
# executed as code the way it could with eval().
rdf['photo'] = rdf['photo'].map(ast.literal_eval)

# 데이터 전처리

In [None]:
def preprocessing(txt):
    """Clean a single raw review string.

    Steps, in order: remove the "내용" label (the literal
    newline-내용-newline sequence), turn remaining newlines and carriage
    returns into spaces, remove every parenthesised span that starts with a
    YYYY-MM-DD date and ends with "구매평)", then strip surrounding whitespace.

    Args:
        txt: Review text. Any value that is not a ``str`` (for example None,
            or the float NaN pandas uses for empty cells) is treated as an
            empty review.

    Returns:
        The cleaned text, or '' for non-string input.
    """
    if not isinstance(txt, str):
        return ''
    txt = txt.replace('\n내용\n', '')
    txt = txt.replace('\n', ' ')
    txt = txt.replace('\r', ' ')
    # Raw string: '\(' in a plain literal is an invalid escape sequence.
    # Lazy '.*?' so two markers in one review are removed one by one instead
    # of deleting all the text that sits between them.
    txt = re.sub(r'\([0-9]{4}-[0-9]{2}-[0-9]{2}.*?구매평\)', '', txt)
    return txt.strip()

# Run preprocessing() over every review and store the result back in 'text'.
rdf['text'] = rdf['text'].apply(preprocessing)

# mecab을 사용하여 빈도가 높은 표현 확인

In [None]:

# POS-tag every review and encode each token as "<surface>_<tag>", joined by
# single spaces into one string per review.
documents = [
    ' '.join('{}_{}'.format(w, p) for w, p in t.pos(txt))
    for txt in rdf['text'].values
]

sentence_stream = [doc.split(" ") for doc in documents]

# Collocation models: bigrams first, then trigrams built on the bigram output.
# NOTE(review): a bytes delimiter is the gensim 3.x convention; gensim 4
# documents a str delimiter — confirm against the installed version.
bigram = Phrases(sentence_stream, min_count=1, delimiter=b' ')
trigram = Phrases(bigram[sentence_stream], min_count=1, delimiter=b' ')

# A merged phrase is recognised by how many delimiter spaces it contains:
# exactly one for a bigram, exactly two for a trigram.
b_li = Counter()
t_li = Counter()
for sent in sentence_stream:
    b_li.update(b for b in bigram[sent] if b.count(' ') == 1)
    t_li.update(tok for tok in trigram[bigram[sent]] if tok.count(' ') == 2)

# Most frequent expressions first. A cell only displays its last expression,
# so both rankings are bound to names and shown through one final expression.
bigram_ranking = b_li.most_common()
trigram_ranking = t_li.most_common()
{'bigram': bigram_ranking, 'trigram': trigram_ranking}

# 상품 정보가 있는 리뷰

In [None]:
# Keep only the reviews that have a product name, then collect the distinct
# names (pandas returns them in order of first appearance).
has_name = rdf['name'].notna()
hasprd = rdf.loc[has_name]
unique_names = hasprd['name'].unique()

In [None]:
# Build one summary record per product name (every name in unique_names).
# NOTE(review): every record here is tagged type "single", and a later cell
# rebinds res_li to a fresh list — confirm this cell is still needed.
res_li = []
for name in unique_names:
    # .copy() so the date rewrite below targets an independent frame rather
    # than a filtered slice of hasprd (avoids SettingWithCopyWarning).
    sub = hasprd[hasprd['name'] == name].copy()
    # Dates become 'YYYY-MM-DD' strings; res_li is later dumped with json,
    # which cannot serialise Timestamp values.
    sub['date'] = sub['date'].dt.strftime("%Y-%m-%d")

    # Rows whose writer is exactly '네이****' versus all other rows.
    nsub = sub[sub['writer'] == '네이****']
    msub = sub[sub['writer'] != '네이****']

    # Reviews whose parsed 'photo' value is non-empty.
    psub = sub[sub['photo'].map(len) > 0]

    rating_avg = round(sub['rating'].mean(), 2)

    # Reviews grouped by rating; keys 1..5 always exist, possibly with [].
    rating_datas = {
        score: sub[sub['rating'] == score].to_dict('records')
        for score in range(1, 6)
    }

    res = {
        'type': "single",
        'name': name,
        't_count': len(sub),
        'n_count': len(nsub),
        'o_count': len(msub),
        'p_count': len(psub),
        'rating_avg': rating_avg,
        'rating_data': rating_datas,
        'photo_review': psub.to_dict('records'),
    }
    res_li.append(res)

## 단품 상품 리뷰만 필터링

In [None]:
# A product name containing any of these substrings is treated as a
# set / special item; every other name is a single product.
# NOTE(review): the 'off' check is case-sensitive — confirm that product
# names never spell it 'OFF' or 'Off'.
set_keywords = ('세트', 'off', '에디션', '스페셜', '콜렉션', '컬렉션')
singleprd = [
    name for name in unique_names
    if not any(keyword in name for keyword in set_keywords)
]

# One summary record per single product; starts res_li from scratch.
res_li = []
for name in singleprd:
    # .copy() so the date rewrite below targets an independent frame rather
    # than a filtered slice of hasprd (avoids SettingWithCopyWarning).
    sub = hasprd[hasprd['name'] == name].copy()
    # Dates become 'YYYY-MM-DD' strings; res_li is later dumped with json,
    # which cannot serialise Timestamp values.
    sub['date'] = sub['date'].dt.strftime("%Y-%m-%d")

    # Rows whose writer is exactly '네이****' versus all other rows.
    nsub = sub[sub['writer'] == '네이****']
    msub = sub[sub['writer'] != '네이****']

    # Reviews whose parsed 'photo' value is non-empty.
    psub = sub[sub['photo'].map(len) > 0]

    rating_avg = round(sub['rating'].mean(), 2)

    # Reviews grouped by rating; keys 1..5 always exist, possibly with [].
    rating_datas = {
        score: sub[sub['rating'] == score].to_dict('records')
        for score in range(1, 6)
    }

    res = {
        'type': "single",
        'name': name,
        't_count': len(sub),
        'n_count': len(nsub),
        'o_count': len(msub),
        'p_count': len(psub),
        'rating_avg': rating_avg,
        'rating_data': rating_datas,
        'photo_review': psub.to_dict('records'),
    }
    res_li.append(res)

In [None]:
# Write the summaries as a JavaScript assignment: "var datas = [...]".
# Explicit UTF-8: ensure_ascii=False emits Korean text verbatim, and the
# platform default encoding is not guaranteed to be UTF-8.
with open('review.js', 'wt', encoding='utf-8') as f:
    f.write('var datas = ')
    dt = json.dumps(res_li, ensure_ascii=False, indent=2)
    f.write(dt)

## 세트상품 리뷰만 필터링

In [None]:
# A product name containing any of these substrings is treated as a
# set / special item.
# NOTE(review): the 'off' check is case-sensitive — confirm that product
# names never spell it 'OFF' or 'Off'.
set_keywords = ('세트', 'off', '에디션', '스페셜', '콜렉션', '컬렉션')
setprd = [
    name for name in unique_names
    if any(keyword in name for keyword in set_keywords)
]

# One summary record per set product, collected separately first.
set_res_li = []
for name in setprd:
    # .copy() so the date rewrite below targets an independent frame rather
    # than a filtered slice of hasprd (avoids SettingWithCopyWarning).
    sub = hasprd[hasprd['name'] == name].copy()
    # Dates become 'YYYY-MM-DD' strings; res_li is later dumped with json,
    # which cannot serialise Timestamp values.
    sub['date'] = sub['date'].dt.strftime("%Y-%m-%d")

    # Rows whose writer is exactly '네이****' versus all other rows.
    nsub = sub[sub['writer'] == '네이****']
    msub = sub[sub['writer'] != '네이****']

    # Reviews whose parsed 'photo' value is non-empty.
    psub = sub[sub['photo'].map(len) > 0]

    rating_avg = round(sub['rating'].mean(), 2)

    # Reviews grouped by rating; keys 1..5 always exist, possibly with [].
    rating_datas = {
        score: sub[sub['rating'] == score].to_dict('records')
        for score in range(1, 6)
    }

    res = {
        'type': "set",
        'name': name,
        't_count': len(sub),
        'n_count': len(nsub),
        'o_count': len(msub),
        'p_count': len(psub),
        'rating_avg': rating_avg,
        'rating_data': rating_datas,
        'photo_review': psub.to_dict('records'),
    }
    set_res_li.append(res)

# Replace any "set" records already present in res_li instead of appending to
# them, so running this cell more than once does not duplicate entries.
res_li = [r for r in res_li if r['type'] != 'set'] + set_res_li
len(res_li)

In [None]:
# Write the summaries as a JavaScript assignment: "var datas = [...]".
# Explicit UTF-8: ensure_ascii=False emits Korean text verbatim, and the
# platform default encoding is not guaranteed to be UTF-8.
with open('review.js', 'wt', encoding='utf-8') as f:
    f.write('var datas = ')
    dt = json.dumps(res_li, ensure_ascii=False, indent=2)
    f.write(dt)