- movie review database를 이용한 sentiment analysis
- vocabulary = feature: positive, negative
- document = words list + 'pos'/'neg'

In [1]:
import random
from nltk.corpus import movie_reviews

documents = [(list(movie_reviews.words(fileid)), category)
            for category in movie_reviews.categories()
            for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

### frequency distribution(빈도수 분포) 확인 -> 가장 자주 나타나는 단어 확인

In [2]:
all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())

In [3]:
len(all_words)

1583820

In [4]:
from nltk.probability import FreqDist

fwds = FreqDist(all_words)
fwds.most_common(20)

[(',', 77717),
 ('the', 76529),
 ('.', 65876),
 ('a', 38106),
 ('and', 35576),
 ('of', 34123),
 ('to', 31937),
 ("'", 30585),
 ('is', 25195),
 ('in', 21822),
 ('s', 18513),
 ('"', 17612),
 ('it', 16107),
 ('that', 15924),
 ('-', 15595),
 (')', 11781),
 ('(', 11664),
 ('as', 11378),
 ('with', 10792),
 ('for', 9961)]

In [5]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
puncs = [' ', ',', '.', ';', "'", '--', '-', ':', '(', ')', '[', ']']
for x in puncs:
    stop_words.append(x)

In [6]:
filteredWords = []
for w in all_words:
    if w not in stop_words:
        filteredWords.append(w)
print(len(all_words))
print(len(filteredWords))

1583820
735506


### 자주 나타나는 3000개의 단어로 word feature 만들기

In [7]:
fwds = FreqDist(filteredWords)
word_features = list(fwds.keys())[:3000]

In [8]:
len(word_features)

3000

In [9]:
word_features[:10]

['plot',
 'two',
 'teen',
 'couples',
 'go',
 'church',
 'party',
 'drink',
 'drive',
 'get']

### positive/negative documents에서 3000개의 단어에 속하는 feature 단어 찾기

In [10]:
def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features

In [11]:
selected_features = find_features(movie_reviews.words('neg/cv000_29416.txt'))

### feature set 찾기: pos, neg에 대한 모든 review 문서에 대해

In [12]:
feature_sets = [(find_features(rev), category) for (rev, category) in documents]

In [13]:
len(feature_sets)

2000

### Naive Bayes Classifier를 이용한 학습

- data = 2000
- training = 1900
- test = 100

In [14]:
training_data = feature_sets[:1900]
test_data = feature_sets[1900:]

In [15]:
len(training_data), len(test_data)

(1900, 100)

In [16]:
import nltk

clf = nltk.NaiveBayesClassifier.train(training_data)

In [17]:
print('Classifier accuracy percent: ', (nltk.classify.accuracy(clf, test_data)) * 100)

Classifier accuracy percent:  85.0


#### 가장 가치있는 feature 확인

In [18]:
clf.show_most_informative_features(15)

Most Informative Features
                   sucks = True              neg : pos    =     10.5 : 1.0
                 frances = True              pos : neg    =      9.1 : 1.0
                 idiotic = True              neg : pos    =      7.2 : 1.0
                  regard = True              pos : neg    =      7.0 : 1.0
              schumacher = True              neg : pos    =      7.0 : 1.0
                  crappy = True              neg : pos    =      7.0 : 1.0
                    mena = True              neg : pos    =      7.0 : 1.0
             silverstone = True              neg : pos    =      7.0 : 1.0
                  suvari = True              neg : pos    =      7.0 : 1.0
           unimaginative = True              neg : pos    =      7.0 : 1.0
                  turkey = True              neg : pos    =      6.5 : 1.0
                 singers = True              pos : neg    =      6.4 : 1.0
                   jumbo = True              neg : pos    =      6.3 : 1.0

In [19]:
import pickle

save_clf = open('naivebayes.pickle', 'wb')
pickle.dump(clf, save_clf)
save_clf.close()

In [20]:
re_clf = open('naivebayes.pickle', 'rb')
clf = pickle.load(re_clf)
re_clf.close()