In [1]:
from zipfile import ZipFile
z = ZipFile('sentiment.zip')
data = z.open('sentiment labelled sentences/imdb_labelled.txt')

In [2]:
import pandas as pd
df = pd.read_csv(data, sep='\t', header = None)

## 감성 분석 2가지 방법

1. 감성 단어를 알려주기: good -> 긍정, bad -> 부정
2. 기계학습


In [3]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=500, stop_words='english')
tdm = cv.fit_transform(df[0])

In [7]:
words = cv.get_feature_names()
words

['10',
 '20',
 '90',
 'ability',
 'absolutely',
 'acting',
 'action',
 'actor',
 'actors',
 'actress',
 'actresses',
 'actually',
 'addition',
 'adorable',
 'aerial',
 'age',
 'amazing',
 'angel',
 'animation',
 'anne',
 'annoying',
 'appearance',
 'appreciate',
 'art',
 'attempt',
 'attention',
 'audience',
 'avoid',
 'away',
 'awesome',
 'awful',
 'bad',
 'barely',
 'beautiful',
 'beginning',
 'believable',
 'believe',
 'best',
 'better',
 'big',
 'billy',
 'bit',
 'black',
 'book',
 'bore',
 'bored',
 'boring',
 'bought',
 'brain',
 'brilliant',
 'budget',
 'called',
 'came',
 'camera',
 'care',
 'cartoon',
 'cast',
 'casting',
 'certainly',
 'character',
 'characters',
 'charles',
 'cheap',
 'chemistry',
 'child',
 'children',
 'cinema',
 'cinematography',
 'classic',
 'clever',
 'close',
 'come',
 'comedy',
 'comes',
 'coming',
 'complete',
 'completely',
 'conclusion',
 'conflict',
 'consider',
 'convincing',
 'cool',
 'costs',
 'couldn',
 'course',
 'cover',
 'crap',
 'created',

## 긍정 점수

In [8]:
# 워드 사전에서 good 단어의 인덱스
# 181열 
words.index('good')

181

In [9]:
# 워드 사전에서 fun 단어의 인덱스
# 164열
words.index('fun')

164

In [10]:
# 긍정 단어 인덱스
pos_index = [181,160]

In [20]:
# 긍정 단어 빈도수 count 를 긍정 score라고 한다
pos_score = tdm[:,pos_index].sum(axis =1)

In [21]:
pos_score > 1

matrix([[False],
        [False],
        [False],
        [False],
        [False],
        [False],
        [False],
        [ True],
        [False],
        [False],
        [False],
        [False],
        [False],
        [False],
        [False],
        [False],
        [False],
        [False],
        [False],
        [ True],
        [False],
        [False],
        [False],
        [False],
        [False],
        [False],
        [False],
        [False],
        [False],
        [False],
        [False],
        [False],
        [False],
        [False],
        [False],
        [False],
        [False],
        [False],
        [False],
        [False],
        [False],
        [False],
        [False],
        [False],
        [False],
        [False],
        [False],
        [False],
        [False],
        [False],
        [False],
        [False],
        [False],
        [False],
        [False],
        [False],
        [False],
        [False],
        [False

## 부정 점수

In [22]:
# 워드 사전에서 bad 단어의 인덱스
# 31열
words.index('bad')

31

In [24]:
# 워드 사전에서 boring 단어의 인덱스
# 46열
words.index('boring')

46

In [25]:
neg_idx = [31,46]

In [26]:
neg_score = tdm[:,neg_idx].sum(axis=1)

## 예측

In [27]:
import numpy as np

In [29]:
y_pred = np.where(pos_score > neg_score, 1, 0)

## 정확도 평가

In [30]:
from sklearn.metrics import accuracy_score

In [31]:
accuracy_score(df[1],y_pred)

0.5120320855614974

In [32]:
df[1].mean()

0.516042780748663

In [33]:
y_pred.mean()

0.05481283422459893

## 단어추가해서 성능올리기

In [None]:
## 전처리만 해도 성능 올라갈 것 

In [52]:
# 긍정 단어 인덱스
pos_idx = [words.index('good')
             ,words.index('fun')
            ,words.index('best')
            ,words.index('love')
          ,words.index('adorable')
          ,words.index('amazing')
          ,words.index('beautiful')
          ,words.index('great')
          ,words.index('wonderful')
          ,words.index('funny')
          ,words.index('excellent')]

In [56]:
# 부정 단어 인덱스 
neg_idx = [words.index('bad')
             ,words.index('boring')
            ,words.index('lacks')
            ,words.index('awful')
          ,words.index('annoying')]

In [57]:
pos_score = tdm[:,pos_index].sum(axis =1)
neg_score = tdm[:,neg_idx].sum(axis=1)

In [58]:
y_pred = np.where(pos_score > neg_score, 1, 0)

In [59]:
accuracy_score(df[1],y_pred)

0.5401069518716578

단어사전보다 기계학습이 효율적이다!
그리고 전처리 과정을 넣어주면 더 성능 올라갈것