# 머신러닝 기반 감성 분석
## 1. NLTK 영화 리뷰 데이터셋 - movie_reviews
&nbsp; &nbsp; &nbsp;  1) train/test split <br/>
&nbsp; &nbsp; &nbsp;  2) TF-IDF 벡터로 변환 <br/>
&nbsp; &nbsp; &nbsp;  3) 나이브 베이즈 분류기 학습 <br/>
&nbsp; &nbsp; &nbsp;  +) 로지스틱 회귀분석 <br/>

In [1]:
# Load the NLTK movie_reviews corpus.
import nltk
nltk.download('movie_reviews')

from nltk.corpus import movie_reviews

# Every document in the corpus is addressed by its file id.
fileids = movie_reviews.fileids()

# Raw text of each review, in file-id order.
reviews = list(map(movie_reviews.raw, fileids))
# categories() returns a list per document; keep only its first entry as the label.
categories = [labels[0] for labels in map(movie_reviews.categories, fileids)]

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\ing06\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


In [3]:
# Split the reviews and their labels into training and test sets.
from sklearn.model_selection import train_test_split

# Hold out 20% of the documents for testing; the fixed random_state keeps
# the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    reviews,
    categories,
    test_size=0.2,
    random_state=7,
)

print('Train length: ', len(X_train))
print('Test length: ', len(X_test))

Train length:  1600
Test length:  400


In [6]:
# Convert the raw review texts into TF-IDF feature vectors.
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
# Fit the vectorizer on the training texts only, then reuse the fitted
# vectorizer to transform the test texts.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Check the shapes of the resulting matrices.
print('# Train set dimension: ', X_train_tfidf.shape)
print('# Test set dimension: ', X_test_tfidf.shape)

# Train set dimension:  (1600, 36189)
# Test set dimension:  (400, 36189)


In [7]:
# Naive Bayes classifier trained on the TF-IDF features.
from sklearn.naive_bayes import MultinomialNB

# fit() returns the estimator itself, so construction and training can be chained.
NB = MultinomialNB(alpha=0.01).fit(X_train_tfidf, y_train)

# Report accuracy on the training split, then on the held-out test split.
nb_train_score = NB.score(X_train_tfidf, y_train)
print('# Train score: {:.3f}'.format(nb_train_score))
nb_test_score = NB.score(X_test_tfidf, y_test)
print('# Test score: {:.3f}'.format(nb_test_score))
# Original author's note: this performs better than lexicon-based sentiment
# analysis using a sentiment dictionary (that comparison is not shown here).

# Train score: 0.998
# Test score: 0.797


In [9]:
# Logistic regression classifier trained on the same TF-IDF features.
from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construction and training can be chained.
lr = LogisticRegression().fit(X_train_tfidf, y_train)

# Report accuracy on the training split, then on the held-out test split.
lr_train_score = lr.score(X_train_tfidf, y_train)
print('# Regression train score: {:.3f}'.format(lr_train_score))
lr_test_score = lr.score(X_test_tfidf, y_test)
print('# Regression test score: {:.3f}'.format(lr_test_score))
# Original author's note: this performs better than the naive Bayes model.

# Predicted labels for the test documents.
y_pred = lr.predict(X_test_tfidf)

# Regression train score: 0.959
# Regression test score: 0.845
