숙박 예약 사이트인 여기어때에서
평점과 리뷰 텍스트를 가져와서 분석하기

In [1]:
%python

# 필요한 library import 하기
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')




In [2]:
%python
# Load the CSV of ratings and review text (crawled from Yeogi-Eottae, per the
# original note) and preview the first rows.
df = pd.read_csv('hotel_review.csv')
df.head()

In [3]:
%python

# Inspect the (rows, columns) shape; the original author's note says the
# data set has roughly 32,000 rows.
df.shape

In [4]:
%python

# Check for missing values: count of nulls per column.
df.isnull().sum()

In [5]:
%python
# 텍스트 데이터 전처리하기
# 형태소 분석기(konlpy), 정규 표현식(re) 이용


from konlpy.tag import Okt
from collections import Counter
import re

def apply_regular_expression(review):
    """Return ``review`` with everything except spaces and Hangul removed.

    Hangul here means the jamo range ㄱ-ㅣ and the syllable range 가-힣.
    Every other character is deleted outright (not replaced by a space).
    """
    return re.sub('[^ ㄱ-ㅣ 가-힣]', '', review)



In [6]:
%python

# Build the corpus: one string containing every review.
# Reviews are joined with a single space so the last word of one review and
# the first word of the next are not fused into one bogus token. Missing
# reviews are dropped and values coerced to str so join() cannot fail on NaN.
corpus = " ".join(df['review'].dropna().astype(str))
corpus



In [7]:
%python

# Apply the Hangul-only regular expression to the corpus and display the result.
apply_regular_expression(corpus)

In [8]:
%python

# Noun frequencies over the whole corpus (show the 15 most common).

# No module-level `okt` exists anywhere above this cell (the only other one is
# a local inside text_cleaning), so create the tagger here to avoid a NameError.
okt = Okt()
nouns = okt.nouns(apply_regular_expression(corpus))
counter = Counter(nouns)
counter.most_common(15)


In [9]:
%python

# Load the Korean stopword list as a flat list of strings.
#
# DataFrame.values.tolist() would give a list of one-element row lists
# ([['word'], ...]), so a later `noun not in stopwords` test against plain
# strings would never match. Selecting the single column gives ['word', ...].
# header=None keeps the first line as data instead of using it as a column
# name (assumes the file is one stopword per line with no header - confirm).
stopwords = pd.read_csv(
    "https://raw.githubusercontent.com/yoonkt200/FastCampusDataset/master/korean_stopwords.txt",
    header=None,
)[0].tolist()



In [10]:
%python
# Simply drop single-character nouns such as 또, 좀, 방.

noun_words = Counter({word: freq for word, freq in counter.items() if len(word) > 1})
noun_words.most_common(15)

# Extra hand-picked stopwords, added to the end of the loaded stopword list.
add_stopwords = ['남자친구', '방도', '기회', '리뷰', '숙소', '여행','별로','그냥','정도','최고','아주','자주','모두','마음','가성','전체','굿굿','다만','마음','제외','살짝','실망','최악','느낌','보통','호텔','그거','체적','매우','방문','공간','이용','항상','만원','사용','제대로','합리','이틀','짱짱','감사','조금','의사','걱정','서울','애용','다음','실망','최악','느낌','보통','호텔','그거','체적','매우','방문','공간','이용','항상','환경','무엇','비도','이번','모든','의향','정비','이하','이도','여의도','굿굿굿','강주','추천','만족','감동','확인']
stopwords.extend(add_stopwords)



In [11]:
%python
# BOW 벡터 생성하기
from sklearn.feature_extraction.text import CountVectorizer

# Compiled once: matches every character that is NOT a space or Hangul.
_HANGUL_ONLY = re.compile('[^ ㄱ-ㅣ 가-힣]')
# Shared Okt tagger, created lazily on the first text_cleaning() call.
_okt_tagger = None


def text_cleaning(review):
    """Tokenise one review into the nouns used as bag-of-words features.

    Steps: delete every character that is not a space or Hangul, extract
    nouns with Okt, drop one-character nouns, then drop any noun found in
    the module-level ``stopwords`` list.

    Args:
        review: Raw review text.

    Returns:
        The remaining nouns, in the order Okt returned them.
    """
    global _okt_tagger
    if _okt_tagger is None:
        # This function is used as the vectorizer's tokenizer, so it runs
        # once per review; reuse one tagger instead of building one per call.
        _okt_tagger = Okt()
    hangul_text = _HANGUL_ONLY.sub('', review)
    return [
        noun
        for noun in _okt_tagger.nouns(hangul_text)
        if len(noun) > 1 and noun not in stopwords
    ]

# Build the bag-of-words matrix, using text_cleaning as the tokenizer
# (passed directly; wrapping it in a lambda adds nothing).
vect = CountVectorizer(tokenizer=text_cleaning)
bow_vect = vect.fit_transform(df['review'].tolist())
# get_feature_names() was removed in scikit-learn 1.2. Prefer
# get_feature_names_out() and fall back only on releases that lack it.
if hasattr(vect, 'get_feature_names_out'):
    word_list = vect.get_feature_names_out().tolist()
else:
    word_list = vect.get_feature_names()
# Sum each column on the sparse matrix itself rather than densifying the
# whole reviews-by-vocabulary matrix just to add it up.
count_list = np.asarray(bow_vect.sum(axis=0)).ravel()





In [12]:
%python

# Number of times each word appears.
count_list

In [13]:
%python

# Per-review occurrence count of each word, shown as a dense array.
bow_vect.toarray()

In [14]:
%python

# Map each word to its occurrence count and display the mapping.

word_count_dict = {term: total for term, total in zip(word_list, count_list)}
word_count_dict


In [15]:
%python
# Convert the bag-of-words counts to TF-IDF weights.
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_vectorizer = TfidfTransformer()
tf_idf_vect = tfidf_vectorizer.fit_transform(bow_vect)

# Print the word weights of a single row (the first one) as a sample.
print(tf_idf_vect[0])



In [16]:
%python
# Densify the first TF-IDF row once, then print its shape and its values.
first_review_tfidf = tf_idf_vect[0].toarray()
print(first_review_tfidf.shape)
print(first_review_tfidf)




In [17]:
%python
# Swap the keys and values of the vectorizer's vocabulary_ mapping.
invert_index_vectorizer = dict((idx, term) for term, idx in vect.vocabulary_.items())
# Print only the first 100 characters of the mapping's text form.
preview = str(invert_index_vectorizer)[:100]
print(preview + '...')




In [18]:
%python
# 감성분류하기 Logistic Regression


# df['review'].hist()

In [19]:
%python
# Star rating -> sentiment label:
#   1-3 stars -> negative (0)
#   4-5 stars -> positive (1)

def star_pos_neg(star):
    """Return 1 when ``star`` is greater than 3, otherwise 0."""
    return 1 if star > 3 else 0
    
# Store the 0/1 sentiment label for each star rating in column 'y'.
df['y'] = df['star'].apply(star_pos_neg)




In [20]:
%python
# Preview the first rows of the DataFrame.
df.head()

In [21]:
%python
# Count how many rows fall under each value of 'y'.
df["y"].value_counts()

In [22]:
%python
# Split the data into a training set and a test set.

from sklearn.model_selection import train_test_split

x = tf_idf_vect  # features
y = df['y']  # labels
# Hold out 30% of the rows for testing; random_state fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state=1)

x_train.shape, y_train.shape

In [23]:
%python
# Shapes of the test features and test labels.
x_test.shape, y_test.shape

In [24]:
%python
# Train the Logistic Regression model.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Fit on the training set.
lr = LogisticRegression(random_state = 0)
lr.fit(x_train, y_train)

# Predict on the test set.
y_pred = lr.predict(x_test)




In [25]:
%python
# Classification results on the test set.

for template, metric in (
    ('accuracy: %.2f', accuracy_score),
    ('precision: %.2f', precision_score),
    ('recall: %.2f', recall_score),
    ('F1: %.2f', f1_score),
):
    print(template % metric(y_test, y_pred))



In [26]:
%python
# Confusion matrix of the test-set predictions, drawn as an annotated heatmap.

from sklearn.metrics import confusion_matrix

confu = confusion_matrix(y_true = y_test, y_pred = y_pred)

plt.figure(figsize=(4, 3))
# fmt='.10g' is the number format used for the cell annotations.
sns.heatmap(confu, annot=True, annot_kws={'size':15}, cmap='OrRd', fmt='.10g')
plt.title('Confusion Matrix')
plt.show()



In [27]:
%python
# Adjusting the class imbalance: start by checking the label counts.
df['y'].value_counts()

In [28]:
%python
# Draw the same number of positive and negative reviews at random.
# The target is 1267 rows per class, capped at the size of the smaller class
# so sample() cannot raise when a class has fewer rows than that. The result
# is unchanged whenever both classes have at least 1267 rows.
SAMPLES_PER_CLASS = 1267
n_per_class = min(SAMPLES_PER_CLASS, int(df['y'].value_counts().min()))
positive_random_idx = df[df['y'] == 1].sample(n_per_class, random_state=12).index.tolist()
negative_random_idx = df[df['y'] == 0].sample(n_per_class, random_state=12).index.tolist()



In [29]:
%python
# Combine the sampled positive and negative row indices into one subset.
random_idx = positive_random_idx + negative_random_idx
# NOTE(review): random_idx holds DataFrame index labels, but the sparse matrix
# is indexed by position. This assumes df still has its default 0..n-1 index - confirm.
x = tf_idf_vect[random_idx]
y = df['y'][random_idx]
# Hold out 25% of the subset for testing.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=1)



In [30]:
%python
# Shapes of the training features and training labels.
x_train.shape, y_train.shape

In [31]:
%python
# Shapes of the test features and test labels.
x_test.shape, y_test.shape

In [32]:
%python
# Retrain: fit a second model (lr2) on the current training split and predict
# on the current test split.
lr2 = LogisticRegression(random_state = 0)
lr2.fit(x_train, y_train)
y_pred = lr2.predict(x_test)



In [33]:
%python
# Test-set results.

for template, metric in (
    ('accuracy: %.2f', accuracy_score),
    ('precision: %.2f', precision_score),
    ('recall: %.2f', recall_score),
    ('F1: %.2f', f1_score),
):
    print(template % metric(y_test, y_pred))


In [34]:
%python
# Confusion matrix of the test-set predictions, drawn as an annotated heatmap.

from sklearn.metrics import confusion_matrix

confu = confusion_matrix(y_true = y_test, y_pred = y_pred)

plt.figure(figsize=(4, 3))
# fmt='.10g' is the number format used for the cell annotations.
sns.heatmap(confu, annot=True, annot_kws={'size':15}, cmap='OrRd', fmt='.10g')
plt.title('Confusion Matrix')
plt.show()

# Original author's observation: positive and negative reviews are now both
# predicted about equally well.

In [35]:
%python

# Positive/negative keyword analysis: inspect the fitted coefficients of lr2.
lr2.coef_

In [36]:
%python
# Bar chart of the first row of lr2.coef_, one bar per coefficient.
coefficients = lr2.coef_[0]
plt.figure(figsize=(10, 8))
plt.bar(range(len(coefficients)), coefficients)


In [37]:
%python
# Rank (coefficient, index) pairs to find the top positive and negative keywords.

weights = lr2.coef_[0]
# Descending order: largest coefficients first.
coef_pos_index = sorted(zip(weights, range(len(weights))), reverse=True)
# Ascending order: smallest coefficients first.
coef_neg_index = sorted(zip(weights, range(len(weights))))

# Swap the keys and values of the vectorizer's vocabulary_ mapping.
invert_index_vectorizer = dict((idx, term) for term, idx in vect.vocabulary_.items())
invert_index_vectorizer



In [38]:
%python
# Print the positive keywords: the first 20 entries of coef_pos_index.

for weight, term_idx in coef_pos_index[:20]:
    print(invert_index_vectorizer[term_idx], weight)



In [39]:
%python
# Print the negative keywords: the first 20 entries of coef_neg_index.

for weight, term_idx in coef_neg_index[:20]:
    print(invert_index_vectorizer[term_idx], weight)


