### 서울 Airbnb reviews

In [1]:
import json
import re

In [2]:
review = {}
with open('translated_seoul.json','r',encoding='utf-8') as fp:
    review.update(json.load(fp))

In [3]:
review_en={}

for home_id,rev_lsts in review.items():
    review_en[home_id] = []
    for rev_lst in rev_lsts:
        try:
            if str(rev_lst['language'][-2:]) == 'en':
                review_en[home_id].append([rev_lst['rating'],rev_lst['t_comments']])
        except:
            pass

In [5]:
len(review_en)

14608

### Data Preprocessing Step
##### 1. Tokenization
##### 2. Stop word elimination
##### 3. Stemming : 단어를 기본형으로 바꾸어준다. 복수형은 단수형으로, 과거형은 현재형으로 바꾸는 과정
##### 4. Representation

In [4]:
import nltk
from nltk import sent_tokenize
from nltk.stem import WordNetLemmatizer
import tqdm

### Tokenization

In [5]:
# 리뷰별 문장단위 tokenizing
for home_id,rev_lst in tqdm.tqdm(review_en.items()) :
    for rev in rev_lst:
        rev[1] = sent_tokenize(rev[1])

100%|███████████████████████████████████████████████████████████████████████████| 14608/14608 [00:34<00:00, 419.88it/s]


In [6]:
# 길이가 2보다 작은 문장 없애기
for home_id,rev_lst in tqdm.tqdm(review_en.items()) :
    for rev in rev_lst:
        r = []
        for s in rev[1] : 
            if len(s) > 2 :
                r.append(s)
        rev[1] = r        

100%|█████████████████████████████████████████████████████████████████████████| 14608/14608 [00:00<00:00, 26987.24it/s]


###  Stop words elimination

In [7]:
# 소문자와 대문자가 아닌 것은 공백으로 대체
for home_id,rev_lst in review_en.items(): 
    for rev in rev_lst:
        for sentence in rev[1]:
            sentence = re.sub('[^a-zA-Z\s]', '', sentence)

In [8]:
from nltk.corpus import stopwords
stopWords =list(stopwords.words('english'))
print(stopWords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [9]:
add_stopwords = ['airbnb','im','youre','hongdae', 'seoul', 'daegu', 'jeju' ,'busan', 'korea','gangneung',
                 'itaewon','myeongdong','john','gangnam','also','even','dont','namsan','incheon','good','great','little','well','nice']
# additional_stopwords = []
stopWords = set(stopWords + add_stopwords)

In [10]:
# 문장별 단어단위 tokenizing 후 pos_tag
for home_id,rev_lst in tqdm.tqdm(review_en.items()):
    for rev in rev_lst:
        tmp_lst = []
        for sentence in rev[1]:
            token = nltk.regexp_tokenize(sentence.lower() ,"[\w']+")
            no_stopword_token = [w for w in token if not w in stopWords]
            sent_pos = nltk.pos_tag(no_stopword_token)
            tmp_lst.append(sent_pos)
        rev[1] = tmp_lst

100%|██████████████████████████████████████████████████████████████████████████| 14608/14608 [2:10:56<00:00,  1.86it/s]


### 명사, 형용사, 동사, 부사 lemmatization

In [11]:
lemm = WordNetLemmatizer()

def njvr_lemmantizer(sent):
    global lemm
    
    lemm_sent = []
    for word_pos in sent:
        word, pos = word_pos
        if pos[0] == 'N':
            lemm_sent.append(lemm.lemmatize(word,pos='n').lower() +'_N')
        elif pos[0] == 'J':
            lemm_sent.append(lemm.lemmatize(word,pos='a').lower() +'_J')
        elif pos[0] == 'V':
            lemm_sent.append(lemm.lemmatize(word,pos='v').lower() +'_V')
        elif pos[0] == 'R':
            lemm_sent.append(lemm.lemmatize(word,pos='r').lower() +'_R')
        else:pass
    return lemm_sent

In [12]:
for listing_id, rev_lst in tqdm.tqdm(review_en.items()):
    for rev in rev_lst:
        rev_tmp = []
        for sent in rev[1]:
            rev_tmp.append(njvr_lemmantizer(sent))
        rev[1] = rev_tmp

100%|███████████████████████████████████████████████████████████████████████████| 14608/14608 [00:29<00:00, 502.09it/s]


In [13]:
with open('review_postag_seoul2.json','w',encoding='utf-8') as fp:
        json.dump(review_en,fp)

### LDA 

In [14]:
import pandas as pd
import numpy as np
import time

In [15]:
import gensim
from gensim import corpora, models
from gensim.models import CoherenceModel



In [16]:
review = review_en

In [17]:
texts = []
for home_id,rev_lst in review.items():
    for rev in rev_lst:
        for sentence in rev[1] :
            texts.append(sentence)

In [18]:
texts[:2]

[['september_V',
  'accompany_V',
  'son_N',
  'south_J',
  'go_V',
  'son_N',
  'watch_V',
  'photo_J',
  'reservation_N',
  'second_J',
  'house_N',
  'leave_V',
  'gate_J',
  'national_J',
  'university_N',
  "people's_N",
  'republic_J',
  'china_V',
  'national_J',
  'university_N',
  "people's_N",
  'republic_J',
  'china_N',
  'midnight_N',
  'midnight_N',
  'midnight_N',
  'arrive_V',
  'midnight_R',
  'think_N',
  'hostess_N',
  'hostess_N',
  'wait_V',
  'hospitable_J',
  'host_N',
  'keep_V',
  'second_J',
  'floor_N',
  'clean_J',
  'quiet_J',
  'neat_N',
  'elegant_J',
  'good_J',
  'bed_N',
  'see_V',
  'internet_J',
  'next_J',
  'day_N',
  'bright_J',
  'noise_N',
  'city_N',
  'quiet_J',
  'pastoral_J',
  'nature_N',
  'countryside_N',
  'beautiful_J',
  'view_N',
  'villa_N',
  'green_J',
  'grass_N',
  'small_J',
  'hill_N',
  'surround_V',
  'hill_N',
  'speak_J',
  'english_J',
  'fluently_R',
  'ardent_J',
  'host_N',
  'speak_V',
  'korean_J',
  'english_J',
  't

In [19]:
dic = corpora.Dictionary(texts)

In [20]:
corpus = [dic.doc2bow(text) for text in texts]

In [21]:
len(corpus)

1124076

In [22]:
# corpus: the corpus used to train the topic model.
# id2word: a dictionary that maps word numerical identifiers to word strings 
# alpha: the topic distribution. If we set it to 'auto', then we tell the algorithm to determine the topic proportions automatically.
# num_topics: how many topics the algorithm should find.
# passes: how many times the algorithm should go through the corpus. If you are using a large corpus , then this can be set to 1
# model.num_topics: the number of topics.
# model.show_topic(topic_number, topn) returns the probability distribution over words in one topic. for instance [(0.06, 'food'), (0.04, 'cook'), (0.03, 'fry'), ...]
# model.alpha[topic_number]: the prominence of a topic.

#%%time
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 6, id2word = dic, passes=30 ,iterations = 3000,alpha='symmetric', eta='auto') #LDA model 생성

  diff = np.log(self.expElogbeta)


In [23]:
ldamodel.save('lda_seoul.lda')

#### Airnb category : 정확성 / 의사소통 / 청결도 / 위치 / 체크인 / 가치

In [24]:
ldamodel.show_topics(num_words=10) #토픽을 구성하는 주요단어들

[(0,
  '0.061*"clean_J" + 0.052*"room_N" + 0.048*"location_N" + 0.041*"comfortable_J" + 0.037*"place_N" + 0.036*"apartment_N" + 0.028*"house_N" + 0.023*"clean_N" + 0.018*"quiet_J" + 0.015*"really_R"'),
 (1,
  '0.039*"everything_N" + 0.030*"room_N" + 0.023*"provide_V" + 0.023*"need_V" + 0.016*"thanks_N" + 0.016*"big_J" + 0.016*"bed_N" + 0.015*"question_N" + 0.014*"give_V" + 0.014*"always_R"'),
 (2,
  '0.039*"recommend_V" + 0.032*"place_N" + 0.026*"home_N" + 0.022*"highly_R" + 0.018*"house_N" + 0.017*"stay_V" + 0.015*"experience_N" + 0.014*"korean_J" + 0.013*"make_V" + 0.013*"family_N"'),
 (3,
  '0.056*"time_N" + 0.034*"stay_V" + 0.033*"go_V" + 0.029*"come_V" + 0.026*"next_J" + 0.023*"back_R" + 0.021*"day_N" + 0.021*"thank_N" + 0.020*"night_N" + 0.019*"definitely_R"'),
 (4,
  '0.067*"station_N" + 0.027*"location_N" + 0.025*"restaurant_N" + 0.025*"easy_J" + 0.025*"convenient_N" + 0.023*"walk_V" + 0.023*"subway_N" + 0.021*"minute_N" + 0.017*"area_N" + 0.017*"store_N"'),
 (5,
  '0.084*"host

In [3]:
ldamodel = gensim.models.ldamodel.LdaModel.load('lda_seoul.lda')

In [25]:
import pyLDAvis
import pyLDAvis.gensim 
import matplotlib.pyplot as plt

In [26]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(ldamodel, corpus, dic)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [28]:
vis

In [27]:
pyLDAvis.save_html(vis,'LDA_seoul.html')