### 대구 전주 Airbnb reviews

In [1]:
import json
import re

In [2]:
file_lst = ['translated_daegu.json','translated_jeonju.json']

In [3]:
review = {}
for file in file_lst:
    with open(file,'r',encoding='utf-8') as fp:
        review.update(json.load(fp))

In [4]:
review_en={}

for home_id,rev_lsts in review.items():
    review_en[home_id] = []
    for rev_lst in rev_lsts:
        try:
            if str(rev_lst['language'][-2:]) == 'en':
                review_en[home_id].append([rev_lst['rating'],rev_lst['t_comments']])
        except:
            pass

### Data Preprocessing Step
##### 1. Tokenization
##### 2. Stop word elimination
##### 3. Stemming : 단어를 기본형으로 바꾸어준다. 복수형은 단수형으로, 과거형은 현재형으로 바꾸는 과정
##### 4. Representation

In [5]:
import nltk
from nltk import sent_tokenize
from nltk.stem import WordNetLemmatizer
import tqdm

### Tokenization

In [6]:
# 리뷰별 문장단위 tokenizing
for home_id,rev_lst in tqdm.tqdm(review_en.items()) :
    for rev in rev_lst:
        rev[1] = sent_tokenize(rev[1])

100%|█████████████████████████████████████████████████████████████████████████████| 2626/2626 [00:04<00:00, 585.12it/s]


In [7]:
# 길이가 2보다 작은 문장 없애기
for home_id,rev_lst in tqdm.tqdm(review_en.items()) :
    for rev in rev_lst:
        r = []
        for s in rev[1] : 
            if len(s) > 2 :
                r.append(s)
        rev[1] = r        

100%|███████████████████████████████████████████████████████████████████████████| 2626/2626 [00:00<00:00, 38640.09it/s]


###  Stop words elimination

In [8]:
# 소문자와 대문자가 아닌 것은 공백으로 대체
for home_id,rev_lst in review_en.items(): 
    for rev in rev_lst:
        for sentence in rev[1]:
            sentence = re.sub('[^a-zA-Z\s]', '', sentence)

In [9]:
from nltk.corpus import stopwords
stopWords =list(stopwords.words('english'))
print(stopWords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [10]:
add_stopwords = ['airbnb','im','youre','hongdae', 'seoul', 'daegu', 'jeju' ,'busan', 'korea','gangneung','itaewon','myeongdong','john','gangnam','also','even','dont','namsan','incheon','good','great']
# additional_stopwords = []
stopWords = set(stopWords + add_stopwords)

In [11]:
# 문장별 단어단위 tokenizing 후 pos_tag
for home_id,rev_lst in tqdm.tqdm(review_en.items()):
    for rev in rev_lst:
        tmp_lst = []
        for sentence in rev[1]:
            token = nltk.regexp_tokenize(sentence.lower() ,"[\w']+")
            no_stopword_token = [w for w in token if not w in stopWords]
            sent_pos = nltk.pos_tag(no_stopword_token)
            tmp_lst.append(sent_pos)
        rev[1] = tmp_lst

100%|██████████████████████████████████████████████████████████████████████████████| 2626/2626 [15:45<00:00,  2.78it/s]


### 명사, 형용사, 동사, 부사 lemmatization

In [12]:
lemm = WordNetLemmatizer()

def njvr_lemmantizer(sent):
    global lemm
    
    lemm_sent = []
    for word_pos in sent:
        word, pos = word_pos
        if pos[0] == 'N':
            lemm_sent.append(lemm.lemmatize(word,pos='n').lower() +'_N')
        elif pos[0] == 'J':
            lemm_sent.append(lemm.lemmatize(word,pos='a').lower() +'_J')
        elif pos[0] == 'V':
            lemm_sent.append(lemm.lemmatize(word,pos='v').lower() +'_V')
        elif pos[0] == 'R':
            lemm_sent.append(lemm.lemmatize(word,pos='r').lower() +'_R')
        else:pass
    return lemm_sent

In [13]:
for listing_id, rev_lst in tqdm.tqdm(review_en.items()):
    for rev in rev_lst:
        rev_tmp = []
        for sent in rev[1]:
            rev_tmp.append(njvr_lemmantizer(sent))
        rev[1] = rev_tmp

100%|█████████████████████████████████████████████████████████████████████████████| 2626/2626 [00:05<00:00, 497.69it/s]


In [14]:
with open('review_postag_daegujeonju.json','w',encoding='utf-8') as fp:
        json.dump(review_en,fp)

### LDA 

In [15]:
import pandas as pd
import numpy as np
import time

In [16]:
import gensim
from gensim import corpora, models
from gensim.models import CoherenceModel



In [17]:
review = review_en

In [18]:
texts = []
for home_id,rev_lst in review.items():
    for rev in rev_lst:
        for sentence in rev[1] :
            texts.append(sentence)

In [19]:
texts[:2]

[['host_N', 'guide_V', 'well_R', 'check_V', 'check_V', 'problem_N'],
 ['accommodation_N', 'see_V', 'screen_N']]

In [20]:
dic = corpora.Dictionary(texts)

In [21]:
corpus = [dic.doc2bow(text) for text in texts]

In [22]:
len(corpus)

135134

In [23]:
# corpus: the corpus used to train the topic model.
# id2word: a dictionary that maps word numerical identifiers to word strings 
# alpha: the topic distribution. If we set it to 'auto', then we tell the algorithm to determine the topic proportions automatically.
# num_topics: how many topics the algorithm should find.
# passes: how many times the algorithm should go through the corpus. If you are using a large corpus , then this can be set to 1
# model.num_topics: the number of topics.
# model.show_topic(topic_number, topn) returns the probability distribution over words in one topic. for instance [(0.06, 'food'), (0.04, 'cook'), (0.03, 'fry'), ...]
# model.alpha[topic_number]: the prominence of a topic.

#%%time
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 6, id2word = dic, passes=30 ,iterations = 3000,alpha='symmetric', eta='auto') #LDA model 생성

  diff = np.log(self.expElogbeta)


In [24]:
ldamodel.save('lda_DaeguJunju.lda')

#### Airnb category : 정확성 / 의사소통 / 청결도 / 위치 / 체크인 / 가치

In [25]:
ldamodel.show_topics(num_words=10) #토픽을 구성하는 주요단어들

[(0,
  '0.084*"host_N" + 0.078*"kind_N" + 0.033*"take_V" + 0.021*"make_V" + 0.020*"rest_N" + 0.019*"day_N" + 0.017*"nice_J" + 0.017*"give_V" + 0.017*"thanks_N" + 0.012*"check_V"'),
 (1,
  '0.097*"room_N" + 0.062*"clean_J" + 0.038*"nice_J" + 0.029*"well_R" + 0.022*"comfortable_J" + 0.022*"bathroom_N" + 0.020*"clean_N" + 0.017*"small_J" + 0.017*"really_R" + 0.016*"warm_J"'),
 (2,
  '0.043*"quiet_J" + 0.038*"house_N" + 0.030*"night_N" + 0.028*"little_J" + 0.020*"floor_N" + 0.017*"beautiful_J" + 0.014*"place_N" + 0.013*"hanok_N" + 0.013*"lot_N" + 0.012*"morning_N"'),
 (3,
  '0.063*"accommodation_N" + 0.048*"location_N" + 0.044*"thank_N" + 0.028*"much_J" + 0.021*"price_N" + 0.021*"well_R" + 0.019*"like_V" + 0.018*"enjoy_V" + 0.017*"best_J" + 0.015*"satisfy_V"'),
 (4,
  '0.102*"village_N" + 0.055*"hanok_J" + 0.050*"hanok_N" + 0.028*"locate_V" + 0.025*"location_N" + 0.020*"close_R" + 0.019*"convenient_N" + 0.017*"walk_V" + 0.016*"restaurant_N" + 0.015*"walk_N"'),
 (5,
  '0.061*"go_V" + 0.059*

In [3]:
ldamodel = gensim.models.ldamodel.LdaModel.load('lda_jeju.lda')

In [26]:
import pyLDAvis
import pyLDAvis.gensim 
import matplotlib.pyplot as plt

In [27]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(ldamodel, corpus, dic)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [30]:
vis

In [29]:
pyLDAvis.save_html(vis,'LDA_DaeguJunju.html')