In [90]:
import os
import glob
import json
import nltk
from nltk import sent_tokenize
from nltk.stem import WordNetLemmatizer

import tqdm

In [99]:
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('tagsets')
# nltk.download('stopword')
# nltk.download('maxent_treebank_pos_tagger')
# nltk.download('averaged_perceptron_tagger')

In [2]:
os.getcwd()

'/home/chankoo/GitHub/BOAZ-projects/airbnb-NLP'

In [3]:
os.chdir('./../airbnb-data')

In [9]:
file_lst = glob.glob('review_*_*.json')
file_lst

['review_jeju_143957.json',
 'review_jeonju_13647.json',
 'review_seoul_326990.json',
 'review_seogwipo_51728.json',
 'review_busan_70028.json',
 'review_daegu_33689.json']

In [10]:
review = {}
for file in file_lst:
    with open(file,'r',encoding='utf-8') as fp:
        review.update(json.load(fp))

In [11]:
len(review)

28853

In [39]:
review_en = {}
for home_id,rev_lst in review.items():
    review_en[home_id] = []
    for rev in rev_lst:
        if rev['language']=='en':
            review_en[home_id].append([rev['rating'],rev['comments']])
        else:
            continue

In [35]:
review_en['10008511']

[[4,
  "It's a great experience staying at Hanuel's house. It's clean, kitchen is equipped with basic needs. Finding this place isn't hard as well, and it's really a good point that it's close to the airport.\r\n\r\nThere's some really good restaurant around the house. We had a good bbq duck meal and it's awesome!\r\n\r\nWould recommend this to anyone who like to visit Jeju!"]]

In [36]:
len(review_en)

28853

In [45]:
sent_tokenize(review_en['10008511'][0][1])

["It's a great experience staying at Hanuel's house.",
 "It's clean, kitchen is equipped with basic needs.",
 "Finding this place isn't hard as well, and it's really a good point that it's close to the airport.",
 "There's some really good restaurant around the house.",
 "We had a good bbq duck meal and it's awesome!",
 'Would recommend this to anyone who like to visit Jeju!']

In [40]:
%%time
# 리뷰별 문장단위 tokenizing
for home_id,rev_lst in review_en.items():
    for rev in rev_lst:
        rev[1] = sent_tokenize(rev[1])

CPU times: user 29.3 s, sys: 114 ms, total: 29.4 s
Wall time: 29.4 s


In [57]:
%%time 
# 문장별 단어단위 tokenizing
for home_id,rev_lst in review_en.items(): 
    for rev in rev_lst:
        tmp_lst = []
        for sent in rev[1]:
            word_lst = nltk.word_tokenize(sent)
            tmp_lst.append(word_lst)
        rev[1] = tmp_lst

CPU times: user 2min 9s, sys: 538 ms, total: 2min 9s
Wall time: 2min 10s


In [61]:
nltk.pos_tag(review_en['10008511'][0][1][0])

[('It', 'PRP'),
 ("'s", 'VBZ'),
 ('a', 'DT'),
 ('great', 'JJ'),
 ('experience', 'NN'),
 ('staying', 'VBG'),
 ('at', 'IN'),
 ('Hanuel', 'NNP'),
 ("'s", 'POS'),
 ('house', 'NN'),
 ('.', '.')]

In [66]:
for home_id,rev_lst in tqdm.tqdm(review_en.items()):
    for rev in rev_lst:
        tmp_lst = []
        for sent_bow in rev[1]:
            sent_pos = nltk.pos_tag(sent_bow)
            tmp_lst.append(sent_pos)
        rev[1] = tmp_lst

100%|██████████| 28853/28853 [08:32<00:00, 56.35it/s] 


### 데이터 구조를 보자

- review_en == {home_id: [rev0, rev1, ...]}
    - rev0 == [rating, [sent0, sent1, ...]]
        - sent0 == [(word0,POS0),(word1,POS1), ...]
    

In [88]:
review_en['10022410'][0] # rev0의 예시

[4,
 [[('Kyungsoon', 'NNP'),
   ('’', 'NNP'),
   ('s', 'JJ'),
   ('place', 'NN'),
   ('is', 'VBZ'),
   ('located', 'VBN'),
   ('at', 'IN'),
   ('a', 'DT'),
   ('quiet', 'JJ'),
   ('and', 'CC'),
   ('peaceful', 'JJ'),
   ('neighborhood', 'NN'),
   ('not', 'RB'),
   ('far', 'RB'),
   ('from', 'IN'),
   ('the', 'DT'),
   ('city', 'NN'),
   ('.', '.')],
  [('Recommended', 'VBN'),
   ('to', 'TO'),
   ('those', 'DT'),
   ('who', 'WP'),
   ('have', 'VBP'),
   ('rent', 'VBN'),
   ('a', 'DT'),
   ('car', 'NN'),
   (',', ','),
   ('the', 'DT'),
   ('parking', 'VBG'),
   ('area', 'NN'),
   ('is', 'VBZ'),
   ('spacious', 'JJ'),
   ('.', '.')],
  [('The', 'DT'),
   ('house', 'NN'),
   ('is', 'VBZ'),
   ('clean', 'JJ'),
   ('and', 'CC'),
   ('comfortable', 'JJ'),
   ('.', '.')],
  [('We', 'PRP'),
   ('are', 'VBP'),
   ('a', 'DT'),
   ('group', 'NN'),
   ('of', 'IN'),
   ('4', 'CD'),
   (',', ','),
   ('the', 'DT'),
   ('house', 'NN'),
   ('is', 'VBZ'),
   ('just', 'RB'),
   ('nice', 'JJ'),
   ('for'

unpacking과 zip 함수 이용해 rating과 review를 각각을 뽑아 리스트로 만들 수도 있다

In [81]:
rating_lst, rev_lst = zip(*review_en['10022410'])

In [89]:
rating_lst[:5]

(4, 5, 5, 5, 5)

In [85]:
rev_lst[0]

[[('Kyungsoon', 'NNP'),
  ('’', 'NNP'),
  ('s', 'JJ'),
  ('place', 'NN'),
  ('is', 'VBZ'),
  ('located', 'VBN'),
  ('at', 'IN'),
  ('a', 'DT'),
  ('quiet', 'JJ'),
  ('and', 'CC'),
  ('peaceful', 'JJ'),
  ('neighborhood', 'NN'),
  ('not', 'RB'),
  ('far', 'RB'),
  ('from', 'IN'),
  ('the', 'DT'),
  ('city', 'NN'),
  ('.', '.')],
 [('Recommended', 'VBN'),
  ('to', 'TO'),
  ('those', 'DT'),
  ('who', 'WP'),
  ('have', 'VBP'),
  ('rent', 'VBN'),
  ('a', 'DT'),
  ('car', 'NN'),
  (',', ','),
  ('the', 'DT'),
  ('parking', 'VBG'),
  ('area', 'NN'),
  ('is', 'VBZ'),
  ('spacious', 'JJ'),
  ('.', '.')],
 [('The', 'DT'),
  ('house', 'NN'),
  ('is', 'VBZ'),
  ('clean', 'JJ'),
  ('and', 'CC'),
  ('comfortable', 'JJ'),
  ('.', '.')],
 [('We', 'PRP'),
  ('are', 'VBP'),
  ('a', 'DT'),
  ('group', 'NN'),
  ('of', 'IN'),
  ('4', 'CD'),
  (',', ','),
  ('the', 'DT'),
  ('house', 'NN'),
  ('is', 'VBZ'),
  ('just', 'RB'),
  ('nice', 'JJ'),
  ('for', 'IN'),
  ('us', 'PRP'),
  ('.', '.')],
 [('The', 'DT'),

In [101]:
# nltk.help.upenn_tagset()

In [95]:
for sent in rev_lst[0]:
    tmp_sent = [lm.lemmatize(word[0],pos=word[1]) for word in sent]
    sent = tmp_sent
        

KeyError: 'NNP'

In [91]:
lm = WordNetLemmatizer()

In [None]:
lm.lemmatize()

In [None]:
for home_id,rev_lst in tqdm.tqdm(review_en.items()):
    for rev in rev_lst:
        tmp_lst = []
        for sent_bow in rev[1]:
            sent_pos = nltk.pos_tag(sent_bow)
            tmp_lst.append(sent_pos)
        rev[1] = tmp_lst