# Introduction

## NLP input 
### NSMC
Naver Sentiment Movie Corpus (NSMC)
- 한국어 감성 분석 (sentiment analysis) 을 위하여 네이버 영화 리뷰를 정리해둔 Naver sentiment movie corpus v1.
<img src='img/'>


In [13]:
import numpy as np
import pandas as pd
from konlpy.utils import pprint


In [14]:
def read_data(filename, encoding='utf-8'):
    """Read a tab-separated text file and return its rows without the header.

    Args:
        filename: Path to a tab-separated text file whose first line is a header.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            that decoding does not depend on the platform's locale encoding.

    Returns:
        A list with one entry per line after the header line; each entry is
        the list of fields obtained by splitting that line on tab characters.
        An empty or header-only file yields an empty list.
    """
    with open(filename, 'r', encoding=encoding) as f:
        rows = [line.split('\t') for line in f.read().splitlines()]
    return rows[1:]   # drop the header row
# Load both corpus splits; read_data drops each file's header row.
train_data = read_data('./data/ratings_train.txt')
test_data = read_data('./data/ratings_test.txt')

# Report how many rows each split contains.
n_train, n_test = len(train_data), len(test_data)
print('[shape] train_data: ', n_train)
print('[shape] test_data: ', n_test)

# Show the first five training rows as a sample.
print('\n [Example]')
head_rows = train_data[:5]
pprint(head_rows)

[shape] train_data:  150000
[shape] test_data:  50000

 [Example]
[['9976970', '아 더빙.. 진짜 짜증나네요 목소리', '0'],
 ['3819312', '흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나', '1'],
 ['10265843', '너무재밓었다그래서보는것을추천한다', '0'],
 ['9045019', '교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정', '0'],
 ['6483659',
  '사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 던스트가 너무나도 이뻐보였다',
  '1']]


In [15]:
from konlpy.tag import Okt
# One shared tagger instance, reused by every tokenize() call.
pos_tagger = Okt()


def tokenize(doc):
    """Turn a document string into a list of 'token/tag' strings.

    Runs pos_tagger.pos on the document with norm=True and stem=True
    (both flags are optional) and joins every returned item with '/'.
    """
    tagged = pos_tagger.pos(doc, norm=True, stem=True)
    joined = []
    for pair in tagged:
        joined.append('/'.join(pair))
    return joined

# Pair the tokenized review text (row[1]) with its label (row[2]) for every row.
train_docs = list(map(lambda row: (tokenize(row[1]), row[2]), train_data))
test_docs = list(map(lambda row: (tokenize(row[1]), row[2]), test_data))

# Show the first tokenized training example.
pprint(train_docs[0])

(['아/Exclamation',
  '더빙/Noun',
  '../Punctuation',
  '진짜/Noun',
  '짜증나다/Adjective',
  '목소리/Noun'],
 '0')


In [17]:
import json
import os

# Cache files for the tokenized corpora. The same paths are used for the
# existence check, for loading and for saving, so a cache written by one run
# is actually found by the next one.
TRAIN_DOCS_CACHE = 'data/train_docs.json'
TEST_DOCS_CACHE = 'data/test_docs.json'

if os.path.isfile(TRAIN_DOCS_CACHE) and os.path.isfile(TEST_DOCS_CACHE):
    # Both caches exist: load them instead of re-tokenizing.
    # JSON has no tuple type, so rebuild the (tokens, label) tuples that the
    # tokenizing branch below produces.
    with open(TRAIN_DOCS_CACHE, encoding='utf-8') as f:
        train_docs = [(tokens, label) for tokens, label in json.load(f)]
    with open(TEST_DOCS_CACHE, encoding='utf-8') as f:
        test_docs = [(tokens, label) for tokens, label in json.load(f)]
else:
    # At least one cache is missing: tokenize both splits and save them.
    train_docs = [(tokenize(row[1]), row[2]) for row in train_data]
    test_docs = [(tokenize(row[1]), row[2]) for row in test_data]

    # Make sure the cache directory exists before writing into it.
    os.makedirs(os.path.dirname(TRAIN_DOCS_CACHE), exist_ok=True)
    with open(TRAIN_DOCS_CACHE, 'w', encoding="utf-8") as make_file:
        json.dump(train_docs, make_file, ensure_ascii=False, indent="\t")
    with open(TEST_DOCS_CACHE, 'w', encoding="utf-8") as make_file:
        json.dump(test_docs, make_file, ensure_ascii=False, indent="\t")


In [16]:
# Flatten the token lists (first element of every entry) into a single list.
tokens = []
for tagged_doc in train_docs:
    tokens.extend(tagged_doc[0])
print(len(tokens))

2159921


In [20]:
import nltk
# NOTE(review): the label 'NMSC' looks like a transposition of 'NSMC'; it is
# kept unchanged here because it is emitted at runtime.
text = nltk.Text(tokens, name='NMSC')
print(text)

# Total number of tokens
token_list = text.tokens
print(len(token_list))

# Number of distinct tokens
print(len(set(token_list)))

# The ten most frequent tokens together with their counts
freq_dist = text.vocab()
pprint(freq_dist.most_common(10))


<Text: NMSC>
2159921
49895
[('./Punctuation', 67778),
 ('영화/Noun', 50818),
 ('하다/Verb', 41209),
 ('이/Josa', 38540),
 ('보다/Verb', 38538),
 ('의/Josa', 30188),
 ('../Punctuation', 29055),
 ('가/Josa', 26627),
 ('에/Josa', 26468),
 ('을/Josa', 23118)]


## Sentiment Classification with Term Existence

In [21]:
# Use the 2000 most frequent tokens as features.
# WARNING: this code is written for ease of understanding; it is not time/memory efficient.
selected_words = [token for token, _count in text.vocab().most_common(2000)]
def term_exists(doc, words=None):
    """Build a term-existence feature dict for one tokenized document.

    Args:
        doc: Iterable of tokens that make up the document.
        words: Vocabulary to test for. When omitted (None), the module-level
            selected_words list is used.

    Returns:
        A dict mapping 'exists(<word>)' to True when <word> occurs in doc and
        to False otherwise, with one key per entry of the vocabulary.
    """
    if words is None:
        words = selected_words
    # Build the membership set once per document rather than once per word.
    present = set(doc)
    return {'exists({})'.format(word): (word in present) for word in words}
# Shortcut to save time: only a part of the training corpus may be used.
# NOTE: this rebinds train_docs to its first 10000 entries.
train_docs = train_docs[:10000]
train_xy = [(term_exists(doc_tokens), label) for doc_tokens, label in train_docs]
test_xy = [(term_exists(doc_tokens), label) for doc_tokens, label in test_docs]

In [22]:
# Show the first training example: its feature dict together with its label.
train_xy[0]

({'exists(./Punctuation)': False,
  'exists(영화/Noun)': False,
  'exists(하다/Verb)': False,
  'exists(이/Josa)': False,
  'exists(보다/Verb)': False,
  'exists(의/Josa)': False,
  'exists(../Punctuation)': True,
  'exists(가/Josa)': False,
  'exists(에/Josa)': False,
  'exists(을/Josa)': False,
  'exists(.../Punctuation)': False,
  'exists(도/Josa)': False,
  'exists(은/Josa)': False,
  'exists(들/Suffix)': False,
  'exists(,/Punctuation)': False,
  'exists(는/Josa)': False,
  'exists(없다/Adjective)': False,
  'exists(를/Josa)': False,
  'exists(있다/Adjective)': False,
  'exists(좋다/Adjective)': False,
  'exists(너무/Adverb)': False,
  'exists(?/Punctuation)': False,
  'exists(이/Determiner)': False,
  'exists(재밌다/Adjective)': False,
  'exists(정말/Noun)': False,
  'exists(것/Noun)': False,
  'exists(되다/Verb)': False,
  'exists(!/Punctuation)': False,
  'exists(진짜/Noun)': True,
  'exists(같다/Adjective)': False,
  'exists(적/Suffix)': False,
  'exists(으로/Josa)': False,
  'exists(이/Noun)': False,
  'exists(점/Nou

In [23]:

# Fit a Naive Bayes classifier on the term-existence training features.
classifier = nltk.NaiveBayesClassifier.train(train_xy)

# Accuracy of the fitted classifier on the test features.
test_accuracy = nltk.classify.accuracy(classifier, test_xy)
print(test_accuracy)

# List the ten most informative features.
classifier.show_most_informative_features(10)

0.80414
Most Informative Features
         exists(수작/Noun) = True                1 : 0      =     38.0 : 1.0
     exists(이딴/Modifier) = True                0 : 1      =     32.1 : 1.0
         exists(최악/Noun) = True                0 : 1      =     30.1 : 1.0
       exists(♥/Foreign) = True                1 : 0      =     24.5 : 1.0
         exists(노잼/Noun) = True                0 : 1      =     22.1 : 1.0
         exists(짜증/Noun) = True                0 : 1      =     19.5 : 1.0
        exists(쓰레기/Noun) = True                0 : 1      =     19.4 : 1.0
         exists(여운/Noun) = True                1 : 0      =     18.9 : 1.0
          exists(굿/Noun) = True                1 : 0      =     17.1 : 1.0
        exists(발연기/Noun) = True                0 : 1      =     16.9 : 1.0
