### 제주 Airbnb reviews

In [1]:
import json
import re

In [2]:
file_lst = ['translated_jeju.json','translated_seogwipo.json']

In [3]:
review = {}
for file in file_lst:
    with open(file,'r',encoding='utf-8') as fp:
        review.update(json.load(fp))

In [4]:
review_en={}

for home_id,rev_lsts in review.items():
    review_en[home_id] = []
    for rev_lst in rev_lsts:
        try:
            if str(rev_lst['language'][-2:]) == 'en':
                review_en[home_id].append([rev_lst['rating'],rev_lst['t_comments']])
        except:
            pass

In [5]:
len(review_en)

9976

### Data Preprocessing Step
##### 1. Tokenization
##### 2. Stop word elimination
##### 3. Stemming : 단어를 기본형으로 바꾸어준다. 복수형은 단수형으로, 과거형은 현재형으로 바꾸는 과정
##### 4. Representation

In [6]:
import nltk
from nltk import sent_tokenize
from nltk.stem import WordNetLemmatizer
import tqdm

### Tokenization

In [7]:
# 리뷰별 문장단위 tokenizing
for home_id,rev_lst in tqdm.tqdm(review_en.items()) :
    for rev in rev_lst:
        rev[1] = sent_tokenize(rev[1])

100%|█████████████████████████████████████████████████████████████████████████████| 9976/9976 [00:14<00:00, 689.42it/s]


In [8]:
# 길이가 2보다 작은 문장 없애기
for home_id,rev_lst in tqdm.tqdm(review_en.items()) :
    for rev in rev_lst:
        r = []
        for s in rev[1] : 
            if len(s) > 2 :
                r.append(s)
        rev[1] = r        

100%|███████████████████████████████████████████████████████████████████████████| 9976/9976 [00:00<00:00, 43824.24it/s]


###  Stop words elimination

In [9]:
# 소문자와 대문자가 아닌 것은 공백으로 대체
for home_id,rev_lst in review_en.items(): 
    for rev in rev_lst:
        for sentence in rev[1]:
            sentence = re.sub('[^a-zA-Z\s]', '', sentence)

In [10]:
from nltk.corpus import stopwords
stopWords =list(stopwords.words('english'))
print(stopWords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [11]:
add_stopwords = ['airbnb','im','youre','hongdae', 'seoul', 'daegu', 'jeju' ,'busan', 'korea','gangneung','itaewon','myeongdong','john','gangnam','also','even','dont','namsan','incheon']
# additional_stopwords = []
stopWords = set(stopWords + add_stopwords)

In [12]:
# 문장별 단어단위 tokenizing 후 pos_tag
for home_id,rev_lst in tqdm.tqdm(review_en.items()):
    for rev in rev_lst:
        tmp_lst = []
        for sentence in rev[1]:
            token = nltk.regexp_tokenize(sentence.lower() ,"[\w']+")
            no_stopword_token = [w for w in token if not w in stopWords]
            sent_pos = nltk.pos_tag(no_stopword_token)
            tmp_lst.append(sent_pos)
        rev[1] = tmp_lst

100%|██████████████████████████████████████████████████████████████████████████████| 9976/9976 [55:10<00:00,  3.01it/s]


### 명사, 형용사, 동사, 부사 lemmatization

In [13]:
lemm = WordNetLemmatizer()

def njvr_lemmantizer(sent):
    global lemm
    
    lemm_sent = []
    for word_pos in sent:
        word, pos = word_pos
        if pos[0] == 'N':
            lemm_sent.append(lemm.lemmatize(word,pos='n').lower() +'_N')
        elif pos[0] == 'J':
            lemm_sent.append(lemm.lemmatize(word,pos='a').lower() +'_J')
        elif pos[0] == 'V':
            lemm_sent.append(lemm.lemmatize(word,pos='v').lower() +'_V')
        elif pos[0] == 'R':
            lemm_sent.append(lemm.lemmatize(word,pos='r').lower() +'_R')
        else:pass
    return lemm_sent

In [14]:
for listing_id, rev_lst in tqdm.tqdm(review_en.items()):
    for rev in rev_lst:
        rev_tmp = []
        for sent in rev[1]:
            rev_tmp.append(njvr_lemmantizer(sent))
        rev[1] = rev_tmp

100%|█████████████████████████████████████████████████████████████████████████████| 9976/9976 [00:14<00:00, 670.53it/s]


In [15]:
with open('review_postag_jeju.json','w',encoding='utf-8') as fp:
        json.dump(review_en,fp)

### LDA 

In [2]:
import pandas as pd
import numpy as np
import time

In [3]:
import gensim
from gensim import corpora, models
from gensim.models import CoherenceModel



In [4]:
review = {}
with open('review_postag_jeju.json','r',encoding='utf-8') as fp:
    review.update(json.load(fp))

In [5]:
texts = []
for home_id,rev_lst in review.items():
    for rev in rev_lst:
        for sentence in rev[1] :
            texts.append(sentence)

In [6]:
texts[:2]

[['disappointed_J', 'different_J', 'expect_V', 'picture_N'],
 ['couch_J', 'tv_N', 'furniture_N', 'etc_N']]

In [7]:
dic = corpora.Dictionary(texts)

In [8]:
corpus = [dic.doc2bow(text) for text in texts]

In [9]:
len(corpus)

468090

In [10]:
# corpus: the corpus used to train the topic model.
# id2word: a dictionary that maps word numerical identifiers to word strings 
# alpha: the topic distribution. If we set it to 'auto', then we tell the algorithm to determine the topic proportions automatically.
# num_topics: how many topics the algorithm should find.
# passes: how many times the algorithm should go through the corpus. If you are using a large corpus , then this can be set to 1
# model.num_topics: the number of topics.
# model.show_topic(topic_number, topn) returns the probability distribution over words in one topic. for instance [(0.06, 'food'), (0.04, 'cook'), (0.03, 'fry'), ...]
# model.alpha[topic_number]: the prominence of a topic.

#%%time
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 6, id2word = dic, passes = 30 ,iterations = 3000,alpha='symmetric', eta='auto') #LDA model 생성

  diff = np.log(self.expElogbeta)


In [11]:
ldamodel.save('lda_jeju.lda')

#### Airnb category : 정확성 / 의사소통 / 청결도 / 위치 / 체크인 / 가치

In [12]:
ldamodel.show_topics(num_words=10) #토픽을 구성하는 주요단어들

[(0,
  '0.025*"restaurant_N" + 0.023*"location_N" + 0.020*"convenient_N" + 0.018*"airport_N" + 0.017*"minute_N" + 0.017*"car_N" + 0.016*"station_N" + 0.015*"close_R" + 0.015*"bus_N" + 0.014*"walk_V"'),
 (1,
  '0.058*"room_N" + 0.029*"well_R" + 0.019*"clean_J" + 0.018*"big_J" + 0.016*"people_N" + 0.016*"bathroom_N" + 0.013*"towel_N" + 0.013*"floor_N" + 0.012*"good_J" + 0.012*"bed_N"'),
 (2,
  '0.054*"time_N" + 0.041*"go_V" + 0.038*"stay_V" + 0.034*"trip_N" + 0.032*"good_J" + 0.030*"next_J" + 0.028*"come_V" + 0.027*"day_N" + 0.026*"family_N" + 0.022*"island_N"'),
 (3,
  '0.028*"breakfast_N" + 0.023*"morning_N" + 0.022*"sea_N" + 0.022*"see_V" + 0.019*"view_N" + 0.018*"delicious_J" + 0.018*"house_N" + 0.017*"love_V" + 0.015*"nice_J" + 0.013*"pretty_R"'),
 (4,
  '0.070*"host_N" + 0.049*"kind_N" + 0.028*"thank_N" + 0.016*"give_V" + 0.015*"much_J" + 0.015*"make_V" + 0.014*"help_V" + 0.014*"day_N" + 0.012*"owner_N" + 0.011*"helpful_J"'),
 (5,
  '0.087*"place_N" + 0.050*"nice_J" + 0.042*"clean_

In [3]:
ldamodel = gensim.models.ldamodel.LdaModel.load('lda_jeju.lda')

In [13]:
import pyLDAvis
import pyLDAvis.gensim 
import matplotlib.pyplot as plt

In [14]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(ldamodel, corpus, dic)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [15]:
pyLDAvis.save_html(vis,'LDA_jeju.html')