<a href="https://colab.research.google.com/github/gks4478/section2_project/blob/main/%EC%84%B9%EC%85%982_%ED%94%84%EB%A1%9C%EC%A0%9D%ED%8A%B8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 0. 파일 불러오기

In [32]:
import pandas as pd
df = pd.read_csv('mussorie_reviews.csv')

## column 설명
1. Review : 사용자가 남긴 리뷰
2. Date of stay : 사용 일자
3. Rating : 별점

# 1. 라이브러리

In [2]:
# nltk 설치
!pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [16]:
# nltk 다운로드
nltk.download('treebank')
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [90]:
import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# 2. 전처리

In [33]:
# 1. 결측치 삭제
df.dropna(inplace = True)

In [34]:
# 2. 중복값 삭제
df.drop_duplicates(inplace = True)

In [35]:
# 3. 필요없는 열 삭제
df.drop('Date of stay', axis = 1, inplace = True)

In [36]:
# 4. Rating에서 평점만 가져오기
df['Rating'] = df['Rating'].apply(lambda x: re.findall('\d', x)[0])

In [37]:
# 5. 인덱스 리셋
df = df.reset_index()

# 3. 특성공학

In [38]:
# 1. 평점의 5점은 positive 나머지는 negative로 설정한다.
df['Sentiment'] = df['Rating'].apply(lambda x: 'positive' if x == '5' else 'negative')

In [39]:
# 2. 필요없는 열은 삭제 한다.
df.drop(['index', 'Rating'], axis = 1, inplace = True)

In [40]:
df.Sentiment.value_counts(normalize = True)

positive    0.761175
negative    0.238825
Name: Sentiment, dtype: float64

In [42]:
# X, y train, test로 나눈다.
target = 'Sentiment'
feature = 'Review'

X = df[feature]
y = df[target]

x_train, x_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = 42, stratify = y)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size = 0.2, random_state= 42, stratify = y_train)

print('train:', x_train.shape, y_train.shape)
print('test:', x_test.shape, y_test.shape)
print('val:', x_val.shape, y_val.shape)

train: (8188,) (8188,)
test: (2560,) (2560,)
val: (2048,) (2048,)


In [46]:
# 2. 주요 키워드만 담겨있는 열을 생성한다. : 구두점, 불용어, 3자리 이하의 단어를 없앤다.
# TF-IDF 를 이용한다.

tfidf = TfidfVectorizer(stop_words = 'english')
x_train_vector = tfidf.fit_transform(x_train)
x_test_vector = tfidf.transform(x_test)
x_val_vector = tfidf.transform(x_val)

# 4. 모델 선정

## 1) 기준 모델

In [47]:
base = y_train.mode()[0]
baseline = [base] * len(y_train)
baseline_acc = accuracy_score(y_train, baseline)
print('기준모델 정확도(최빈값):', baseline_acc.round(3))

기준모델 정확도(최빈값): 0.761


## 2) SVC (Support Vector Claasifier)

In [49]:
svc = SVC(kernel = 'linear')
svc.fit(x_train_vector, y_train)

In [69]:
print('svc 정확도:', svc.score(x_val_vector, y_val))

svc 정확도: 0.87158203125


In [82]:
print(svc.predict(tfidf.transform(["excellent stay, delightful surprise stay monaco, thoroughly enjoyed stay, room comfortable lovely amenities friendly staff, especially enjoyed hour indulgence, definitely come,  "])))

['positive']


## 3) DecisionTreeClassifier

In [73]:
dtc = DecisionTreeClassifier()
dtc.fit(x_train_vector, y_train)

In [74]:
print('dtc 정확도:', dtc.score(x_val_vector, y_val))

dtc 정확도: 0.77734375


In [81]:
print(dtc.predict(tfidf.transform(["excellent stay, delightful surprise stay monaco, thoroughly enjoyed stay, room comfortable lovely amenities friendly staff, especially enjoyed hour indulgence, definitely come,  "])))

['positive']


## 4) GaussianNB

In [84]:
gnb = GaussianNB()
gnb.fit(x_train_vector.toarray(), y_train)

In [86]:
print('gnb 정확도:', gnb.score(x_val_vector.toarray(), y_val))

gnb 정확도: 0.54296875


## 5) LogisticRegression

In [88]:
log = LogisticRegression()
log.fit(x_train_vector,y_train)

In [89]:
print(log.score(x_val_vector, y_val))

0.86767578125


# 5. SVC 모델 튜닝

In [91]:
params_svc = {
    'C': [1,4,8,16,32],
    'kernel': ['linear', 'rbf']
}
svc = SVC()
svc_grid = GridSearchCV(svc, params_svc, cv = 5)
svc_grid.fit(x_train_vector, y_train)

In [93]:
print(svc_grid.best_params_)

{'C': 8, 'kernel': 'rbf'}
SVC(C=8)


In [94]:
# 파라미터
param = {
    'C':8,
    'kernel':'rbf'
}

# 최종 모델 생성
model = SVC(probability = True)

# 훈련
model.fit(x_train_vector, y_train)

# 예측
preds = model.predict(x_test_vector)

In [107]:
from sklearn.metrics import classification_report

print(classification_report(y_test,
                            preds,
                            labels = ['positive','negative']))

              precision    recall  f1-score   support

    positive       0.88      0.96      0.92      1949
    negative       0.81      0.58      0.68       611

    accuracy                           0.87      2560
   macro avg       0.85      0.77      0.80      2560
weighted avg       0.86      0.87      0.86      2560



In [96]:
df

Unnamed: 0,Review,Sentiment
0,Most beautiful stay at Mussoorie...such a gorg...,positive
1,The Savoy is one of those rare hotels that not...,positive
2,Def Best stay in Uttarakand ! Amazing nature. ...,positive
3,Very very friendly and efficient staff … loved...,positive
4,Exceptional property. Just great experience. V...,positive
...,...,...
12791,I stayed in this hotel with my 5 family member...,negative
12792,the best histrocally place in india,positive
12793,It was a wonderful experience in Mussoorie. Gr...,negative
12794,Sterling Resort at the first look doesnt seem ...,negative


In [99]:
x_test

1033     Visited their with family had a good time with...
2505     I was there a part of my niece Jyotsana’s wedd...
2017     A pleasurable stay. Very courteous staff. Exce...
4873     Neat and clean room, very good room service, v...
4770     This is the one of the best budget hotel in Mu...
                               ...                        
4550     Perfect stay would recomend anyone for stay th...
11710    The first impression is the last impression an...
2821     Superb service, beautiful property and great f...
987      Loved our brief stay at the Savoy. Great get a...
4915     It was a perfect stay . We were a group of 9 f...
Name: Review, Length: 2560, dtype: object

In [100]:
test = pd.DataFrame({
    'Review': x_test,
    'Sentiment': preds
})
test.to_csv('svc_test.csv', index = False)

In [101]:
svc_test = pd.read_csv('svc_test.csv')

In [105]:
svc_test.Sentiment.value_counts(normalize = True)

positive    0.829297
negative    0.170703
Name: Sentiment, dtype: float64