# 형태소 분석

## 영어

In [1]:
from nltk.corpus import gutenberg

In [2]:
# Read the whole novel into memory. The context manager closes the corpus
# stream as soon as the text has been read instead of leaving it open.
with gutenberg.open('austen-emma.txt') as stream:
    corpus = stream.read()

In [3]:
from nltk import regexp_tokenize, Text

In [4]:
# One token = one maximal run of ASCII letters/digits. A single character
# class with one quantifier matches exactly what the former nested, capturing
# form "([a-zA-Z0-9]+)+" matched, without the redundant outer repetition and
# without a capturing group (which changes what findall-based tokenizers return).
pattern = r"[a-zA-Z0-9]+"

In [5]:
tokens = regexp_tokenize(corpus, pattern)

In [6]:
len(tokens)

161980

In [7]:
en = Text(tokens)

In [8]:
type(en)

nltk.text.Text

## 한글

In [9]:
from konlpy.corpus import kolaw

In [10]:
kolaw.fileids()

['constitution.txt']

In [11]:
# Load the full text of the Korean constitution. The context manager closes
# the file handle once the text has been read.
with kolaw.open('constitution.txt') as stream:
    corpus = stream.read()

In [12]:
from konlpy.tag import Kkma

In [13]:
from konlpy.tag import Mecab # author's note: the best Korean morphological analyzer on macOS

In [14]:
kkma = Kkma()

In [15]:
tokens = kkma.morphs(corpus)

In [16]:
tokens[:5]

['대한민국', '헌법', '유구', '하', 'ㄴ']

In [17]:
ko = Text(tokens)

In [18]:
len(ko)

10053

***

## 정규식으로 구문 분석하기 (Chunking)

In [19]:
sentence = "The little yellow dog barked at the Persian cat"

In [20]:
tokens = sentence.split()

In [21]:
tokens

['The', 'little', 'yellow', 'dog', 'barked', 'at', 'the', 'Persian', 'cat']

In [22]:
import nltk

In [23]:
nltk.pos_tag(tokens) # part-of-speech tagging: one (word, tag) pair per token

[('The', 'DT'),
 ('little', 'JJ'),
 ('yellow', 'JJ'),
 ('dog', 'NN'),
 ('barked', 'VBD'),
 ('at', 'IN'),
 ('the', 'DT'),
 ('Persian', 'JJ'),
 ('cat', 'NN')]

In [24]:
enTags = nltk.pos_tag(tokens)

In [25]:
sentence = "내 친구는 잠을 많이 잔다."

In [26]:
kkma.pos(sentence) # Korean morphological analysis: one (morpheme, tag) pair per morpheme

[('내', 'NP'),
 ('친구', 'NNG'),
 ('는', 'JX'),
 ('잠', 'NNG'),
 ('을', 'JKO'),
 ('많이', 'MAG'),
 ('자', 'VV'),
 ('ㄴ다', 'EFN'),
 ('.', 'SF')]

In [27]:
koTags = kkma.pos(sentence)

In [28]:
# Chunk grammar for English noun phrases: an optional determiner (DT), then
# any number of adjectives (JJ), then any number of tags starting with NN.
enParser = nltk.RegexpParser("NP: {<DT>?<JJ>*<NN.*>*}")

In [29]:
enChunks = enParser.parse(enTags)

In [None]:
enChunks.draw()

In [30]:
# Chunk grammar for the Kkma-tagged Korean sentence:
#   NP - a run of tags starting with N (e.g. NP, NNG in the tagger output)
#   VP - tags starting with M, then tags starting with V, then tags starting
#        with E (e.g. MAG, VV, EFN in the tagger output)
koParser = nltk.RegexpParser("""
    NP: {<N.*>*<N.*>?}
    VP: {<M.*>*<V.*>*<E.*>*}
""")

In [31]:
koChunks = koParser.parse(koTags)

In [None]:
koChunks.draw()

In [None]:
# Variant of the Korean chunk grammar with an extra OP rule placed between NP
# and VP: tags starting with N followed by JKO tags (JKO is the tag Kkma gave
# to the particle after the noun in the sample sentence).
# NOTE(review): this cell has no execution count, so it appears never to have
# been run - confirm the OP rule chunks as intended.
koParser = nltk.RegexpParser("""
    NP: {<N.*>*<N.*>?}
    OP: {<N.*>*<JKO>*}
    VP: {<M.*>*<V.*>*<E.*>*}
""")

## Collocations
- 연어(collocation) 분석: 문장 안에서 서로 인접해 함께 자주 등장하는 단어 쌍을 찾는, 위치(인접성) 기반의 연관도 분석이다.
- 긍정(또는 부정)의 뜻을 가진 단어를 기준으로 삼아 함께 등장하는 단어를 살펴보면 감정 분석에도 활용할 수 있다.

In [32]:
from nltk import collocations

In [33]:
# Rank adjacent (morpheme, tag) pairs of the corpus by pointwise mutual
# information and print the five highest-scoring bigrams.
tagged_words = kkma.pos(corpus)
measures = collocations.BigramAssocMeasures()
finder = collocations.BigramCollocationFinder.from_words(tagged_words)
top_bigrams = finder.nbest(measures.pmi, 5)
for row in top_bigrams:
    print(row)

(('가부', 'NNG'), ('동수', 'NNG'))
(('강제', 'NNG'), ('노역', 'NNG'))
(('경자', 'NNG'), ('유전', 'NNG'))
(('고', 'ECS'), ('채취', 'NNG'))
(('공무', 'NNG'), ('담임', 'NNG'))


In [34]:
# Same PMI ranking, but over the surface morphemes only (tags dropped).
words = [morpheme for morpheme, _tag in tagged_words]
finder = collocations.BigramCollocationFinder.from_words(words)
top_bigrams = finder.nbest(measures.pmi, 5)
for row in top_bigrams:
    print(row)

('가부', '동수')
('강제', '노역')
('경자', '유전')
('공무', '담임')
('공중', '도덕')


In [35]:
# Same PMI ranking, but over the tag sequence only (morphemes dropped).
tags = [tag for _morpheme, tag in tagged_words]
finder = collocations.BigramCollocationFinder.from_words(tags)
top_bigrams = finder.nbest(measures.pmi, 5)
for row in top_bigrams:
    print(row)

('XR', 'XSA')
('JKC', 'VCN')
('EPT', 'EPT')
('VCN', 'ECD')
('ECD', 'VX')


## word Cloud

In [36]:
ko.vocab()

FreqDist({'의': 532, '하': 457, '.': 359, '에': 328, '는': 281, 'ㄴ다': 243, 'ㄴ': 234, '을': 232, '은': 195, '이': 192, ...})

In [37]:
type(ko.vocab())

nltk.probability.FreqDist

In [38]:
# Print every (token, count) pair of the Korean text's frequency distribution.
frequencies = ko.vocab()
for row in frequencies.items():
    print(row)

('대한민국', 11)
('헌법', 69)
('유구', 1)
('하', 457)
('ㄴ', 234)
('역사', 1)
('와', 42)
('전통', 1)
('에', 328)
('빛나', 1)
('는', 281)
('우리', 3)
('대하', 19)
('국민', 69)
('은', 195)
('3', 24)
('·', 145)
('1', 28)
('운동', 3)
('으로', 57)
('건립', 1)
('되', 113)
('임시', 3)
('정부', 23)
('의', 532)
('법통', 1)
('과', 82)
('불의', 2)
('항거', 1)
('4', 14)
('19', 2)
('민주', 6)
('이념', 1)
('을', 232)
('계승', 2)
('고', 39)
(',', 101)
('조국', 3)
('개혁', 1)
('평화적', 4)
('통일', 9)
('사명', 2)
('입각', 2)
('여', 149)
('정의', 2)
('인도', 1)
('동포애', 1)
('로써', 5)
('민족', 3)
('단결', 1)
('공고히', 1)
('모든', 37)
('사회적', 5)
('폐습', 1)
('를', 135)
('타파', 1)
('며', 45)
('자율', 2)
('조화', 2)
('바탕', 1)
('자유', 21)
('민주적', 4)
('기본', 8)
('질서', 8)
('더욱', 1)
('확고히', 1)
('하여', 3)
('정치', 4)
('경제', 18)
('사회', 4)
('문화', 3)
('영역', 4)
('있', 99)
('어서', 9)
('각인', 1)
('기회', 4)
('균등', 4)
('히', 1)
('능력', 3)
('최고', 3)
('도로', 1)
('발휘', 1)
('게', 6)
('권리', 21)
('따르', 7)
('책임', 5)
('의무', 20)
('완수', 1)
('안', 15)
('생활', 8)
('향상', 4)
('기하', 1)
('밖', 1)
('항구적', 1)
('이', 192)
('세계', 1)
('평화', 5)


('지속', 1)
('새로', 1)
('새롭', 1)
('존속', 1)


In [39]:
import csv

# Export every (token, count) pair of the Korean vocabulary to words.csv.
# newline='' hands line-ending control to the csv module, and
# lineterminator='\n' keeps the "\n" line endings the file had before.
with open('words.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f, lineterminator='\n')
    writer.writerow(['word', 'freq'])
    # Write rows through the writer rather than hand-formatting them: the
    # vocabulary contains tokens such as ',' that must be quoted, otherwise
    # the row gains an extra column. The stray space after the delimiter is
    # dropped as well, so the second column is exactly "freq" / the count.
    writer.writerows(ko.vocab().items())

## WordCloud

In [40]:
from collections import Counter

In [41]:
import pytagcloud

pygame 1.9.4
Hello from the pygame community. https://www.pygame.org/contribute.html


In [42]:
# Keep only the morphemes whose tag starts with 'NN'.
tokens = [
    morpheme
    for morpheme, tag in kkma.pos(corpus)
    if tag.startswith('NN')
]

In [43]:
tags = Counter(tokens)

In [44]:
# Keep the 40 most frequent entries. Note that `tags` is rebound here from a
# Counter to a list of (token, count) pairs, most frequent first.
tags  = tags.most_common(40)

In [45]:
# Turn the (token, count) pairs into pytagcloud tag entries; each entry is a
# dict with 'color', 'size' and 'tag' keys (see the echoed value of tagList).
tagList = pytagcloud.make_tags(tags, maxsize=40)

In [46]:
tagList

[{'color': (51, 205, 185), 'size': 44, 'tag': '법률'},
 {'color': (173, 60, 12), 'size': 35, 'tag': '수'},
 {'color': (60, 93, 182), 'size': 33, 'tag': '대통령'},
 {'color': (160, 146, 190), 'size': 30, 'tag': '국가'},
 {'color': (190, 169, 186), 'size': 29, 'tag': '헌법'},
 {'color': (142, 136, 32), 'size': 29, 'tag': '국민'},
 {'color': (38, 81, 179), 'size': 26, 'tag': '조'},
 {'color': (21, 164, 178), 'size': 25, 'tag': '국회'},
 {'color': (51, 207, 150), 'size': 25, 'tag': '때'},
 {'color': (111, 171, 88), 'size': 20, 'tag': '회의'},
 {'color': (130, 104, 152), 'size': 19, 'tag': '바'},
 {'color': (68, 203, 146), 'size': 16, 'tag': '필요'},
 {'color': (152, 32, 165), 'size': 16, 'tag': '위원'},
 {'color': (135, 44, 116), 'size': 16, 'tag': '국무'},
 {'color': (160, 144, 167), 'size': 15, 'tag': '기타'},
 {'color': (204, 29, 19), 'size': 15, 'tag': '선거'},
 {'color': (81, 145, 173), 'size': 14, 'tag': '보장'},
 {'color': (140, 66, 209), 'size': 13, 'tag': '정부'},
 {'color': (66, 40, 192), 'size': 13, 'tag': '사항'},
 ...]

In [47]:
# Render the tag entries to the image file wordcloud.jpg.
# NOTE(review): presumably "godoMaum" must be a font registered with
# pytagcloud that contains Hangul glyphs - confirm it is installed.
pytagcloud.create_tag_image(tagList, "wordcloud.jpg", fontname = "godoMaum")