## Import modules

In [1]:
import nltk

from konlpy.tag import Twitter, Mecab, Kkma
# One tagger instance, reused by every twitter.pos(...) call in the cells below.
twitter = Twitter()

from random import shuffle

## Labelled sentence list / Random shuffle / POS tagging

In [75]:
# Load at most the first 2000 lines of each corpus. The `with` blocks make
# sure both file handles are closed as soon as the lines have been read.
# NOTE(review): the files are opened with the platform default encoding;
# pass encoding=... explicitly if that does not match the corpus files.
with open("positive_sents.txt") as pos_file:
    pos_sent = pos_file.readlines()[:2000]
with open("negative_sents.txt") as neg_file:
    neg_sent = neg_file.readlines()[:2000]

# Each entry has the shape [[raw_line], label]. The one-element inner list is
# kept on purpose: the tagging cell unpacks entries with
# `for [sent], n_p in sentences`. Lines keep their trailing newline.
positive_sent = [[[sent], 'pos'] for sent in pos_sent]
negative_sent = [[[sent], 'neg'] for sent in neg_sent]

# Mix both classes together in place so that later slices contain both labels.
sentences = positive_sent + negative_sent
shuffle(sentences)

In [4]:
# Preview three shuffled entries to confirm the [[text], label] layout.
print(sentences[:3])

[[['기말 프리젠테이션을 위해 한학기동안 13의 수업을 사용하여 준비합니다.\n'], 'pos'], [['중간 기말 모두 동일한 방식의 시험이고 수업내용에서 전부 나옵니다.\n'], 'pos'], [['잘하는 학생들에게 맞춰주시는 경향이 있지만 발표도 많이 하고 적극적으로 노력하는 모습을 보이면 좋은 점수 받을 수 있을거라고 생각합니다.\n'], 'neg']]


In [76]:
# POS-tag every labelled sentence (normalised and stemmed), producing
# [[(token, tag), ...], label] entries in the same order as `sentences`.
sent_list = [
    [twitter.pos(text, norm=True, stem=True), label]
    for [text], label in sentences
]

In [7]:
# Preview three tagged entries; each is [[(token, tag), ...], label].
print(sent_list[:3])

[[[('기다', 'Verb'), ('프리젠테이션', 'Noun'), ('을', 'Josa'), ('위해', 'Noun'), ('한', 'Determiner'), ('학기', 'Noun'), ('동안', 'Noun'), ('13', 'Number'), ('의', 'Noun'), ('수업', 'Noun'), ('을', 'Josa'), ('사용', 'Noun'), ('하다', 'Verb'), ('준비', 'Noun'), ('하다', 'Verb'), ('.', 'Punctuation')], 'pos'], [[('중간', 'Noun'), ('기다', 'Verb'), ('모두', 'Noun'), ('동일하다', 'Adjective'), ('방식', 'Noun'), ('의', 'Josa'), ('시험', 'Noun'), ('이고', 'Josa'), ('수업', 'Noun'), ('내용', 'Noun'), ('에서', 'Josa'), ('전부', 'Noun'), ('나오다', 'Verb'), ('.', 'Punctuation')], 'pos'], [[('잘하다', 'Verb'), ('학생', 'Noun'), ('들', 'Suffix'), ('에게', 'Josa'), ('맞추다', 'Verb'), ('경향', 'Noun'), ('이', 'Josa'), ('있다', 'Adjective'), ('발표', 'Noun'), ('도', 'Josa'), ('많이', 'Adverb'), ('하다', 'Verb'), ('적극', 'Noun'), ('적', 'Suffix'), ('으로', 'Josa'), ('노력', 'Noun'), ('하다', 'Verb'), ('모습', 'Noun'), ('을', 'Josa'), ('보이', 'Noun'), ('면', 'Josa'), ('좋다', 'Adjective'), ('점수', 'Noun'), ('받다', 'Verb'), ('수', 'Noun'), ('있다', 'Adjective'), ('생각', 'Noun'), ('하다', 'Verb'), ('.'

## Remove stopwords / Keep adjectives & adverbs

In [8]:
# Read the two word files (one word per line). The `with` blocks close the
# file handles once the lines are in memory.
# NOTE(review): the files are opened with the platform default encoding;
# pass encoding=... explicitly if that does not match the word files.
with open("pos_top100.txt") as pos_file:
    pos_top100 = pos_file.readlines()
with open("neg_top100.txt") as neg_file:
    neg_top100 = neg_file.readlines()

# Strip only the line terminator. Slicing off the last character
# unconditionally would cut a real character from the final word whenever the
# file does not end with a newline.
pos_stopwords_list = [line.rstrip("\n") for line in pos_top100]
neg_stopwords_list = [line.rstrip("\n") for line in neg_top100]

In [35]:
# Show the first ten entries of each stopword list.
print(pos_stopwords_list[:10])
print(neg_stopwords_list[:10])

['하다', '수업', '있다', '교수', '시험', '것', '좋다', '강의', '되다', '이다']
['하다', '수업', '있다', '교수', '시험', '것', '되다', '강의', '이다', '않다']


### Positive

In [10]:
# Extra words to drop in addition to the entries of pos_stopwords_list.
stopwords = ['괜찮다', '야하다', '그래도', '되게']
pos_word_list = []

for sent in pos_sent:
    tagged = twitter.pos(sent, norm=True, stem=True)
    # Keep only adverbs/adjectives that appear in neither stopword list.
    kept = [
        word
        for word, tag in tagged
        if tag in ('Adverb', 'Adjective')
        and not (word in pos_stopwords_list or word in stopwords)
    ]
    # Sentences with nothing left after filtering are skipped.
    if kept:
        pos_word_list.append(kept)

In [15]:
# Preview the first seven per-sentence lists of kept adjectives/adverbs.
print(pos_word_list[:7])

[['특히'], ['귀찮다'], ['복잡하다'], ['근데'], ['좋아하다'], ['높다'], ['근데', '확실하다', '엄청나다']]


In [11]:
# Flatten the per-sentence word lists into a single list of words.
pos_word = [word for word_list in pos_word_list for word in word_list]

# One (word, 'pos') pair per distinct word; set() removes repeats, so the
# resulting order is arbitrary.
positive = [(word, 'pos') for word in set(pos_word)]

In [17]:
# Preview seven (word, 'pos') pairs; their order is arbitrary because they
# were built by iterating a set.
print(positive[:7])

[('그나마', 'pos'), ('함께', 'pos'), ('그러면', 'pos'), ('그만큼', 'pos'), ('버겁다', 'pos'), ('그건', 'pos'), ('심하다', 'pos')]


### Negative

In [19]:
# Extra words to drop in addition to the entries of neg_stopwords_list.
stopwords = ['괜찮다', '야하다', '그래도', '되게']
neg_word_list = []

for sent in neg_sent:
    tagged = twitter.pos(sent, norm=True, stem=True)
    # Keep only adverbs/adjectives that appear in neither stopword list.
    kept = [
        word
        for word, tag in tagged
        if tag in ('Adverb', 'Adjective')
        and not (word in neg_stopwords_list or word in stopwords)
    ]
    # Sentences with nothing left after filtering are skipped.
    if kept:
        neg_word_list.append(kept)

In [20]:
# Flatten the per-sentence word lists into a single list of words.
neg_word = [word for word_list in neg_word_list for word in word_list]

# One (word, 'neg') pair per distinct word; set() removes repeats, so the
# resulting order is arbitrary.
negative = [(word, 'neg') for word in set(neg_word)]

In [51]:
# Combined vocabulary of (word, label) pairs, shuffled in place.
total = [*positive, *negative]
shuffle(total)

In [22]:
# Preview five entries of the shuffled, combined (word, label) list.
print(total[:5])

[('그러면', 'pos'), ('미치다', 'pos'), ('매주', 'neg'), ('가능하다', 'neg'), ('눈부시다', 'pos')]


## Data Features

In [52]:
def data_features(documents, word_features=None):
    """Build a bag-of-words presence feature dict for one tagged sentence.

    Args:
        documents: A sequence whose first element is an iterable of
            ``(word, tag)`` pairs. Only that first element is read; the
            callers in this notebook pass ``[tagged_sentence, label]``.
        word_features: Optional iterable of vocabulary words to test for.
            When omitted, the words of the module-level ``total`` list of
            ``(word, label)`` pairs are used, as before.

    Returns:
        A dict mapping ``'contains(<word>)'`` to ``True`` when the word occurs
        in the sentence and ``False`` otherwise, with one key per distinct
        vocabulary word.
    """
    if word_features is None:
        # Default vocabulary: every word collected in `total`. A word that is
        # listed under both labels collapses into a single key.
        word_features = [word for word, _label in total]

    # Set membership makes each vocabulary lookup O(1).
    document_words = {word for word, _tag in documents[0]}

    return {
        'contains({})'.format(word): (word in document_words)
        for word in word_features
    }

In [84]:
# Pair each tagged sentence's feature dict with its label (entry[1]),
# keeping the order of sent_list.
featuresets = [(data_features(entry), entry[1]) for entry in sent_list]

In [83]:
# Inspect one tagged sentence so it can be compared with its feature set.
print(sent_list[2])

[[('음', 'Noun'), ('그러니까', 'Adverb'), ('유익하다', 'Adjective'), ('힘들다', 'Adjective'), ('수업', 'Noun'), ('이라는', 'Josa'), ('표현', 'Noun'), ('이', 'Josa'), ('옳다', 'Adjective'), ('것', 'Noun'), ('같다', 'Adjective')], 'neg']


In [85]:
# Bare expression: evaluates to the (feature dict, label) tuple at index 2.
# Presumably left unprinted so the notebook echoes it as the cell result.
featuresets[2]

({'contains(가급적)': False,
  'contains(가깝다)': False,
  'contains(가능하다)': False,
  'contains(가볍다)': False,
  'contains(간간히)': False,
  'contains(간단하다)': False,
  'contains(강력하다)': False,
  'contains(강하다)': False,
  'contains(걍)': False,
  'contains(거뜬히)': False,
  'contains(거슬리다)': False,
  'contains(결국)': False,
  'contains(고맙다)': False,
  'contains(골고루)': False,
  'contains(관계없이)': False,
  'contains(괴롭다)': False,
  'contains(굉장하다)': False,
  'contains(굳다)': False,
  'contains(귀엽다)': False,
  'contains(귀찮다)': False,
  'contains(그건)': False,
  'contains(그걸)': False,
  'contains(그나마)': False,
  'contains(그래서)': False,
  'contains(그러니까)': True,
  'contains(그러면)': False,
  'contains(그런대로)': False,
  'contains(그렇게)': False,
  'contains(그립다)': False,
  'contains(그만)': False,
  'contains(그만큼)': False,
  'contains(그저)': False,
  'contains(근데)': False,
  'contains(기대하다)': False,
  'contains(길다)': False,
  'contains(깊다)': False,
  'contains(깍)': False,
  'contains(꼬이다)': False,
  'contains(꼼꼼하다)

In [80]:
# Hold out the first 10% of the feature sets for testing and train on the rest.
size = int(len(featuresets) * 0.1)
test_set = featuresets[:size]
train_set = featuresets[size:]

## Decision Tree Classifier

In [81]:
# Train NLTK's decision-tree classifier on the training feature sets.
classifier = nltk.DecisionTreeClassifier.train(train_set)
# Accuracy on the held-out test split; kept as the final bare expression,
# presumably so the notebook echoes the value as the cell result.
nltk.classify.accuracy(classifier, test_set)

0.575

## Naive Bayes Classifier

In [82]:
# Train NLTK's naive Bayes classifier on the same training feature sets.
# This rebinds `classifier`, so the decision tree from the previous cell is
# no longer reachable under that name.
classifier = nltk.NaiveBayesClassifier.train(train_set)
# Accuracy on the held-out test split; kept as the final bare expression,
# presumably so the notebook echoes the value as the cell result.
nltk.classify.accuracy(classifier, test_set)

0.565