In [49]:
import pandas as pd
import numpy as np
import json

import gensim
from gensim import corpora, models
from gensim.models import CoherenceModel

import time
import pickle
from tqdm import tqdm

import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

from konlpy.tag import Twitter, Okt

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [33]:
with open('review_jsn_306.json','r') as fp:
    review_jsn_306 = json.load(fp)

In [44]:
home_review_dict = {}
for home_id in review_jsn_306.keys():
    home_review_dict[home_id] = {}
    for review_data in review_jsn_306[home_id]:
        home_review_dict[home_id][str(review_data['id'])] = review_data['comments']
        

home_review_dict 는 {home_id :{review_id: 'review_content'}} 구조


In [46]:
home_review_dict['8213750']['57403381']

'한국에서 비앤비를 이용해본건 처음이었는데 기대이상으로 너무 좋았어요 :D \n방은 사진이랑 거의 같았고 크리스마스 분위기에 맞게 설치된 조명과 인원에 맞게 준비해주신 룸슬리퍼, 마스크팩, 직접 손으로 쓴 카드까지!! 감동의 서비스였어요♡.♡\n여행객들뿐만 아니라 모임이나 파티할 장소가 필요하신 분들에게도 좋은 선택이 될 것 같아요. 서울역 바로앞이라 위치도 정말 좋고 아쉬운 점이 하나도 없는 숙소였네요 :)'

### 무작정 LDA
전처리 x, 생각 x


__gensim.models.ldamodel documentation__ : https://radimrehurek.com/gensim/models/ldamodel.html

In [52]:
okt = Okt() # Twitter 형태소분석기와 같음

In [53]:
okt.tagset

{'Adjective': '형용사',
 'Adverb': '부사',
 'Alpha': '알파벳',
 'Conjunction': '접속사',
 'Determiner': '관형사',
 'Eomi': '어미',
 'Exclamation': '감탄사',
 'Foreign': '외국어, 한자 및 기타기호',
 'Hashtag': '트위터 해쉬태그',
 'Josa': '조사',
 'KoreanParticle': '(ex: ㅋㅋ)',
 'Noun': '명사',
 'Number': '숫자',
 'PreEomi': '선어말어미',
 'Punctuation': '구두점',
 'ScreenName': '트위터 아이디',
 'Suffix': '접미사',
 'Unknown': '미등록어',
 'Verb': '동사'}

In [54]:
corpus = []
for home_id in home_review_dict.keys():
    for re_id, comment in home_review_dict[home_id].items():
        corpus.append(comment)

In [65]:
corpus[:3]

['한국에서 비앤비를 이용해본건 처음이었는데 기대이상으로 너무 좋았어요 :D \n방은 사진이랑 거의 같았고 크리스마스 분위기에 맞게 설치된 조명과 인원에 맞게 준비해주신 룸슬리퍼, 마스크팩, 직접 손으로 쓴 카드까지!! 감동의 서비스였어요♡.♡\n여행객들뿐만 아니라 모임이나 파티할 장소가 필요하신 분들에게도 좋은 선택이 될 것 같아요. 서울역 바로앞이라 위치도 정말 좋고 아쉬운 점이 하나도 없는 숙소였네요 :)',
 '서울역근처에 위치해서 이동하기도 편했고 무엇보다 침구가 깨끗하고 편해서 맘에 들었어요! 또 수시로 호스트와 연락할수있어서 편했어요^^ ',
 'This spot hits my 3 C’s of traveling: comfortable, clean, and conveniently located. The reality matches the pictures, maybe even exceeds them. It’s quiet and homey and I only wish we could have stayed longer. \n\nForeign travelers might want to be aware that the towels you get are very much Korean sized (face/hand towels as opposed to full body sized), and that you might want clarification on how to use the heater if you don’t know some Korean.']

In [66]:
okt.nouns(corpus[1]) # nouns 를 뽑아줌

['서울역', '근처', '위치', '이동', '무엇', '침구', '맘', '또', '호스트', '연락']

In [68]:
corpus_nouns = [okt.nouns(comment) for comment in corpus] 

In [69]:
dic = corpora.Dictionary(corpus_nouns)

In [70]:
type(dic)

gensim.corpora.dictionary.Dictionary

In [71]:
len(dic) #출현 단어 개수

5537

In [73]:
corpus_vector = [dic.doc2bow(doc) for doc in corpus_nouns] #단어별 출현 횟수 vector 구성

In [75]:
corpus_vector[1]

[(14, 1),
 (22, 1),
 (39, 1),
 (40, 1),
 (41, 1),
 (42, 1),
 (43, 1),
 (44, 1),
 (45, 1),
 (46, 1)]

In [76]:
%%time
ldamodel = gensim.models.ldamodel.LdaModel(
                        corpus_vector, 
                        num_topics=5, 
                        id2word = dic, 
                        passes=10
                        ) #LDA model 생성

CPU times: user 30.3 s, sys: 0 ns, total: 30.3 s
Wall time: 30.3 s


In [77]:
ldamodel.show_topics(num_words=10) #토픽을 구성하는 주요단어들

[(0,
  '0.056*"숙소" + 0.047*"위치" + 0.036*"정말" + 0.032*"호스트" + 0.027*"다음" + 0.026*"또" + 0.024*"이용" + 0.016*"시설" + 0.012*"아주" + 0.011*"가격"'),
 (1,
  '0.018*"곳" + 0.013*"옥" + 0.012*"수" + 0.011*"느낌" + 0.010*"공간" + 0.010*"소음" + 0.009*"골목" + 0.009*"것" + 0.008*"집" + 0.007*"프로젝트"'),
 (2,
  '0.024*"친구" + 0.023*"것" + 0.019*"숙소" + 0.018*"사진" + 0.017*"집" + 0.017*"정말" + 0.015*"수" + 0.014*"분위기" + 0.012*"호스트" + 0.012*"곳"'),
 (3,
  '0.035*"숙소" + 0.026*"바로" + 0.024*"위치" + 0.020*"호스트" + 0.019*"수" + 0.018*"편의점" + 0.015*"근처" + 0.015*"앞" + 0.013*"하나" + 0.012*"역"'),
 (4,
  '0.022*"화장실" + 0.021*"침대" + 0.021*"조금" + 0.020*"좀" + 0.018*"점" + 0.016*"수건" + 0.014*"다만" + 0.012*"방" + 0.011*"것" + 0.009*"명"')]

In [78]:
# Compute Perplexity 
print('\nPerplexity: ', ldamodel.log_perplexity(corpus_vector))  # a measure of how good the model is. lower the better.


Perplexity:  -6.989099434176124


In [79]:
# Compute Coherence Score 
coherence_model_lda = CoherenceModel(model=ldamodel, texts=corpus_nouns, dictionary=dic, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.5527338880557034


In [81]:
# Visualize the topics

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(ldamodel, corpus_vector, dic)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


> ![2018-11-04 18-08-01](https://user-images.githubusercontent.com/38183218/47962210-a5ea2a00-e05c-11e8-8292-bd8f2cf58cb6.png)