In [1]:
import gensim
from gensim import corpora, models
from gensim.models import CoherenceModel



In [2]:
import json
import re
import tqdm
import nltk
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk import sent_tokenize
from nltk.stem.porter import *
stemmer = PorterStemmer()

In [3]:
import numpy as np

In [4]:
review = {}
file = 'translated_gangneung.json'
with open(file,'r',encoding='utf-8') as fp:
    review.update(json.load(fp))

In [5]:
review_postag = {}
file = 'review_postag_gangneung.json'
with open(file,'r',encoding='utf-8') as fp:
    review_postag.update(json.load(fp))

In [6]:
review_en={}

for home_id,rev_lsts in review.items():
    review_en[home_id] = []
    for rev_lst in rev_lsts:
        try:
            if str(rev_lst['language'][-2:]) == 'en':
                review_en[home_id].append([rev_lst['rating'],rev_lst['t_comments']])
        except:
            pass

In [7]:
del review

In [8]:
review_en['21565183'][0]

[0,
 'The host canceled this reservation 5 days before arrival. This is an automated posting.']

In [9]:
review_postag['21565183'][0]

[0,
 [['host', 'cancel', 'reservation', 'day', 'arrival'], ['automate', 'post']]]

## Preprocess 

In [8]:
reviews = []
ratings = []
for id, rev_lst in review_en.items() :
    for rev in rev_lst :
        reviews.append(rev[1])
        ratings.append(rev[0])

In [11]:
print( len(reviews) )
print( len(ratings) )
print( reviews[:2] )
print( ratings[:2] )

17064
17064
['The host canceled this reservation 5 days before arrival. This is an automated posting.', 'The location is pretty far from the station.Except that there are no convenience facilities around here, I am satisfied and the host was very kind.']
[0, 4]


In [9]:
review_actual = []

for r in reviews :
    sentences = nltk.sent_tokenize(r)
    review_actual.append(sentences)

In [10]:
review_sent = []

for id,rev_lst in review_postag.items() :
    for rev in rev_lst :
        review_sent.append(rev[1])

In [11]:
only_sent = []

for id,rev_lst in review_postag.items() :
    for rev in rev_lst :
        for s in rev[1] :
            only_sent.append(s)

In [15]:
review_postag['21565183'][0]

[0,
 [['host', 'cancel', 'reservation', 'day', 'arrival'], ['automate', 'post']]]

In [16]:
review_actual[0]

['The host canceled this reservation 5 days before arrival.',
 'This is an automated posting.']

In [17]:
review_sent[0]

[['host', 'cancel', 'reservation', 'day', 'arrival'], ['automate', 'post']]

In [18]:
only_sent[0]

['host', 'cancel', 'reservation', 'day', 'arrival']

In [12]:
stopWords =list(stopwords.words('english'))
add_stopwords = ['airbnb','im','youre','hongdae', 'seoul', 'daegu', 'jeju' ,'busan', 'korea','gangneung','daejeon','yeosu',
                 'itaewon','myeongdong','john','gangnam','also','even','dont','namsan','incheon','jeonju','good','great','well','gwangju',"'s","...","'ve","``","''","'m",'--',"'ll","'d"]
# additional_stopwords = []
stopWords = set(stopWords + add_stopwords)

In [13]:
words = []
for s in only_sent :
    words += s
freq = FreqDist(words)
vocab = []
for k,v in freq.items():
    if v > 5 :
        vocab.append(k)

vocab_dict = dict(zip(vocab, range(len(vocab))))

In [None]:
len

In [84]:
len(vocab_dict)

2589

In [85]:
vocab_dict

{'host': 0,
 'cancel': 1,
 'reservation': 2,
 'day': 3,
 'arrival': 4,
 'automate': 5,
 'post': 6,
 'location': 7,
 'pretty': 8,
 'far': 9,
 'station': 10,
 'convenience': 11,
 'facility': 12,
 'satisfied': 13,
 'kind': 14,
 'say': 15,
 'first': 16,
 'time': 17,
 'bit': 18,
 'experience': 19,
 'normally': 20,
 'offer': 21,
 'people': 22,
 'lot': 23,
 'expectation': 24,
 'company': 25,
 'wait': 26,
 'service': 27,
 'answer': 28,
 'question': 29,
 'information': 30,
 'check': 31,
 'know': 32,
 'pool': 33,
 'close': 34,
 'something': 35,
 'solve': 36,
 'communication': 37,
 'flat': 38,
 'convenient': 39,
 'bathroom': 40,
 'really': 41,
 'equipment': 42,
 'make': 43,
 'food': 44,
 'space': 45,
 'unfortunately': 46,
 'bed': 47,
 'felt': 48,
 'spring': 49,
 'dog': 50,
 'neighborhood': 51,
 'noisy': 52,
 'last': 53,
 'house': 54,
 'poor': 55,
 'guy': 56,
 'especially': 57,
 'leave': 58,
 'owner': 59,
 'cute': 60,
 'need': 61,
 'eat': 62,
 'problem': 63,
 'bark': 64,
 'reason': 65,
 'bad': 66,

## Aspect segmentation

### get aspect terms

In [14]:
ldamodel = gensim.models.ldamodel.LdaModel.load('lda_sea.lda')

In [15]:
ldamodel.show_topics(num_words=5)

[(0,
  '0.064*"host" + 0.050*"kind" + 0.041*"trip" + 0.027*"day" + 0.025*"make"'),
 (1, '0.084*"go" + 0.066*"time" + 0.050*"stay" + 0.047*"want" + 0.046*"come"'),
 (2,
  '0.083*"clean" + 0.049*"room" + 0.040*"nice" + 0.036*"view" + 0.024*"house"'),
 (3,
  '0.045*"thank" + 0.032*"lot" + 0.020*"check" + 0.018*"much" + 0.017*"take"'),
 (4,
  '0.053*"location" + 0.040*"take" + 0.038*"close" + 0.033*"station" + 0.032*"walk"')]

In [50]:
ntopic = 5
aspect_terms = []
for i in range(ntopic) :
    kw = []
    kws_lst = ldamodel.show_topic(i, 10)
    for i in range(len(kws_lst)) :
        kws = kws_lst[i][0]
        kw.append(kws)
    aspect_terms.append(kw)

In [17]:
aspect_terms

[['host',
  'kind',
  'trip',
  'day',
  'make',
  'family',
  'stay',
  'home',
  'thanks',
  'nice'],
 ['go',
  'time',
  'stay',
  'want',
  'come',
  'use',
  'place',
  'next',
  'recommend',
  'visit'],
 ['clean',
  'room',
  'nice',
  'view',
  'house',
  'neat',
  'bed',
  'comfortable',
  'really',
  'accommodation'],
 ['thank',
  'lot',
  'check',
  'much',
  'take',
  'morning',
  'price',
  'breakfast',
  'care',
  'give'],
 ['location',
  'take',
  'close',
  'station',
  'walk',
  'rest',
  'quiet',
  'convenient',
  'best',
  'restaurant']]

In [None]:
# 문장으로 나눠져 있는것 /리뷰로 나눠져있는 것

In [51]:
def chi_sq(a,b,c,d,e):
    # b = 모든 문장에서 w가 나오는 횟수
    c1 = a # aspect Ai에 속하는 문장에서 w가 나오는 횟수
    c2 = b - a
    c3 = c - e # w를 포함하지 않은 aspect Ai의 문장 수 
    c4 = d - e
    c =  d # 리뷰 전체의 문장 수
    print(c1,c2,c3,c4,c)
    return (c * ((c1*c4 - c2*c3)**2)) / ((c1+c3) * (c2+c4) * (c1+c2) * (c3+c4))

def chi_sq_mat():
    global aspect_words, aspect_sent, num_words
    asp_rank = np.zeros(aspect_words.shape)
    for i in range(len(aspect_terms)):
        for j in range(len(vocab)):
            print(vocab[j])
            # aspect_sent = aspect i로 라벨링된 senetence 갯수
            asp_rank[i][j] = chi_sq(aspect_words[i][j], num_words[j], aspect_sent[i], len(only_sent),sents_freq[i][j])
            
    return asp_rank

In [52]:
chi_sq_mat()

host


NameError: name 'sents_freq' is not defined

In [28]:
label_text = ['Host','Re','Cleanliness','Service','Location']

#ALGORITHM

review_labels = []
k = len(aspect_terms) # number of topic = 5
v = len(vocab) # 2589개
aspect_words = np.zeros((k,v))
aspect_sent = np.zeros(k)
num_words = np.zeros(v)

for r in review_sent:  # 17064개의 리뷰에 대해
    labels = []
    for s in r:
        count = np.zeros(len(aspect_terms))
        i = 0
        for a in aspect_terms:
            for w in s:
                if w in vocab_dict :
                    num_words[vocab_dict[w]] += 1
                    if w in a:
                        count[i] += 1   # count : 문장 하나에서 aspect 별 단어의 freq
            i = i + 1

        if max(count) > 0:
            la = np.where(np.max(count) == count)[0].tolist() # 문장이 어떤 aspect 인지 labeling
            labels.append(la)
            for i in la:
                aspect_sent[i] += 1
                for w in s:
                    if w in vocab_dict :
                        aspect_words[i][vocab_dict[w]] += 1
        else:
            labels.append([])
    review_labels.append(labels)

In [29]:
p = 20
aspect_w_rank = chi_sq_mat()

aspect_terms = []
for na in aspect_w_rank:  # na = aspect i에 대한 단어 행렬 , shape = (1,단어수)
    new_terms = []
    x = np.argsort(na)[::-1][:p] # np.argsort = 자료를 정리하는 것이 아니라 순서를 알 수 있슴
    for k,v in vocab_dict.items():
        if vocab_dict[k] in x:
            new_terms.append(k)
    aspect_terms.append(new_terms)

In [29]:
import pandas as pd
from pandas import DataFrame

In [30]:
pd.DataFrame(aspect_w_rank)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2579,2580,2581,2582,2583,2584,2585,2586,2587,2588
0,1054.608368,13.54626,86.576314,435.122272,44.73882,319.779714,365.73939,3293.303429,1752.9421,531.198667,...,6.970116,9.415464,9.415464,6.970116,10.064654,2.686356,2.850836,4.891761,2.754837,4.514151
1,9237.707181,382.196941,1087.595741,2273.278017,440.617441,362.593337,444.819139,3863.623553,1894.958806,568.347876,...,5.939652,2.581992,10.676051,8.13551,7.260782,8.599657,14.237457,5.939652,13.419937,9.891177
2,13516.378064,581.598735,1711.785931,4105.208534,703.334611,544.056265,627.729724,4991.47764,849.60251,833.255367,...,10.46882,16.018972,2.89655,13.096895,10.848406,15.839242,21.362714,8.134748,18.287549,15.747229
3,3311.315604,159.81176,499.035088,979.797555,169.441979,159.069517,181.271055,1759.929212,853.754876,257.126547,...,2.655858,4.683578,4.683578,4.683578,6.245965,7.80895,2.475471,2.655858,3.161278,3.39597
4,6465.382473,273.37627,872.694943,1936.191999,333.424439,259.667562,316.482078,198.51876,1369.691178,163.419199,...,7.645546,5.340841,7.645546,1.968302,7.839319,12.747452,7.839319,7.645546,9.097509,8.920657


In [116]:
aspect_w_rank[0]

array([1054.60836789,   13.54625974,   86.57631404, ...,    4.89176124,
          2.75483743,    4.51415105])

In [117]:
na = aspect_w_rank[0]
np.argsort(na)[::-1][:5]

array([156, 148, 102, 177,  78], dtype=int64)

In [118]:
np.argsort(na)[::-1]

array([ 156,  148,  102, ..., 2000, 2010, 2574], dtype=int64)

In [100]:
count

array([0., 0., 3., 0., 0.])

In [105]:
np.where(np.max(count) == count)[0].tolist()

[2]

In [95]:
aspect_sent

array([12495., 13729., 18209.,  7063., 10623.])

In [97]:
num_words

array([19685.,  1090.,  3380., ...,    30.,    45.,    35.])

In [98]:
review_labels

[[[0], []],
 [[4]],
 [[1],
  [3, 4],
  [],
  [],
  [3],
  [4],
  [],
  [0, 2],
  [0, 2],
  [],
  [0, 1, 2],
  [0],
  [1],
  [3]],
 [[0], [2]],
 [[2], [0]],
 [[4]],
 [[0, 2], [2], [2], [2, 3], [4], [], [0], [1, 2], [4]],
 [[4]],
 [[0, 2], []],
 [[], [4], [0]],
 [[0, 4], [4], [2], []],
 [[0, 1, 2, 4]],
 [[3], [1, 3]],
 [[0, 2]],
 [[2, 4]],
 [[0]],
 [[2], [1], [0], [4]],
 [[2], [3], [], [4], [1]],
 [[2], [3]],
 [[0, 1, 2], [3]],
 [[2], [1]],
 [[]],
 [[1], [2, 3], [2]],
 [[0, 1], [], [0], [1]],
 [[0], [0], [2, 4], [4], [4], [4], [0], [1]],
 [[0], [1], []],
 [[4], [4], [1]],
 [[0], [1, 2], [1], [0], [0]],
 [[1]],
 [[0], [1, 2], [2], [0]],
 [[2], [0], [1, 3], [1]],
 [[2], [2], [0]],
 [[4], [2], [2]],
 [[0], []],
 [[0, 2, 4], [1, 4], [1], [3], [0]],
 [[2], [4]],
 [],
 [[1]],
 [[2], [], []],
 [[1, 2], [0], [1]],
 [[4], [0]],
 [[0]],
 [[1, 2], [3], [0, 1, 2, 3], [], [2], [1]],
 [[2], [0]],
 [[0]],
 [[2], [0, 2], [], [], [2], [1], [3], [3], [0, 1, 2], [4], [1, 2, 3, 4], [3]],
 [[]],
 [[0, 2, 4]]

In [44]:
ntopic = 5
aspect_terms = []
for i in range(ntopic) :
    kw = []
    kws_lst = ldamodel.show_topic(i, 1000)
    for i in range(len(kws_lst)) :
        kws = kws_lst[i][0]
        kw.append(kws)
    print(kw[:10])    
    aspect_terms.append(kw)

['host', 'kind', 'trip', 'day', 'make', 'family', 'stay', 'home', 'thanks', 'nice']
['go', 'time', 'stay', 'want', 'come', 'use', 'place', 'next', 'recommend', 'visit']
['clean', 'room', 'nice', 'view', 'house', 'neat', 'bed', 'comfortable', 'really', 'accommodation']
['thank', 'lot', 'check', 'much', 'take', 'morning', 'price', 'breakfast', 'care', 'give']
['location', 'take', 'close', 'station', 'walk', 'rest', 'quiet', 'convenient', 'best', 'restaurant']


In [45]:
I = 3
p = 100

In [None]:
# for iter in range(I) : 
review_labels = []
k = len(aspect_terms) # number of topic = 5
v = len(vocab) # 2589개
aspect_words = np.zeros((k,v))
sents_freq = np.zeros((k,v))
aspect_sent = np.zeros(k)
num_words = np.zeros(v)

for r in review_sent:  # 17064개의 리뷰에 대해
    labels = []
    for s in r:
        count = np.zeros(len(aspect_terms))
        i = 0
        for a in aspect_terms:
            for w in s:
                if w in vocab_dict :
                    num_words[vocab_dict[w]] += 1
                    if w in a:
                        count[i] += 1   # count : 문장 하나에서 aspect 별 단어의 freq
            i = i + 1

        if max(count) > 0:
            la = np.where(np.max(count) == count)[0].tolist() # 문장이 어떤 aspect 인지 labeling
            labels.append(la)
            for i in la:
                aspect_sent[i] += 1
                for w in s:
                    if w in vocab_dict :
                        aspect_words[i][vocab_dict[w]] += 1
                for w in list(set(s)) :
                    if w in vocab_dict :
                        sents_freq[i][vocab_dict[w]] += 1                          
        else:
            labels.append([])
    review_labels.append(labels)

aspect_w_rank = chi_sq_mat()

aspect_terms = []
for na in aspect_w_rank:  # na = aspect i에 대한 단어 행렬 , shape = (1,단어수)
    new_terms = []
    x = np.argsort(na)[::-1][:p] # np.argsort = 자료를 정리하는 것이 아니라 순서를 알 수 있슴
    for k,v in vocab_dict.items():
        if vocab_dict[k] in x:
            new_terms.append(k)
    print(new_terms[:10])
    aspect_terms.append(new_terms)
    #print(aspect_terms)
#     print('-'*10 + 'iteration :' + str(iter+1))
#     print('-'*100)

In [142]:
import sys

In [34]:
for iter in range(3) :
    ex = []
    for i in range(5) :
        ex.append(i+1)
    print(ex)

[1, 2, 3, 4, 5]
[1, 2, 3, 4, 5]
[1, 2, 3, 4, 5]


In [29]:
len(aspect_terms)

5