# NLTK
---

In [None]:
!pip install konlpy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[K     |████████████████████████████████| 19.4 MB 1.3 MB/s 
Collecting JPype1>=0.7.0
  Downloading JPype1-1.4.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (453 kB)
[K     |████████████████████████████████| 453 kB 59.6 MB/s 
Installing collected packages: JPype1, konlpy
Successfully installed JPype1-1.4.0 konlpy-0.6.0


In [None]:
import torch

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

## [1] 토큰화(Tokenization)
---
- 문장/문서를 의미를 지닌 작은 단위로 나누는 것
- 나누어진 단위를 토큰이라고 함
- 종류
    - 문장 토큰화
    - 단어 토큰화
    


In [None]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# Download the NLTK corpora
# NOTE(review): 'all' requests every NLTK package; this notebook appears to use only
# the tokenizer, stopword and WordNet data — confirm before narrowing the download.
nltk.download('all', quiet=True)

True

In [None]:
# Comma-separated phrases in one sentence (input for word tokenization).
raw_text = "My life for aiur, I long for combat, What now calls, Honor guides me"

# Four short sentences (input for sentence tokenization).
# Adjacent string literals are concatenated with a single separating space;
# a backslash continuation inside the literal would embed the next line's
# leading indentation in the text.
raw_text2 = (
    "My life for aiur. "
    "I long for combat. "
    "What now calls. "
    "Honor guides me."
)

In [None]:
# Word-level tokenization
result = word_tokenize(raw_text)
print(result)

['My', 'life', 'for', 'aiur', ',', 'I', 'long', 'for', 'combat', ',', 'What', 'now', 'calls', ',', 'Honor', 'guides', 'me']


In [None]:
# Sentence-level tokenization
result = sent_tokenize(raw_text2)
print(result)

['My life for aiur.', 'I long for combat.', 'What now calls.', 'Honor guides me.']


### 여러 문장에 대한 토큰 추출

In [None]:
# Split the text into sentences first
sents = sent_tokenize(raw_text2)

# Flatten the word tokens of every sentence into one list
all_tokens = []
for sent in sents:
    all_tokens.extend(word_tokenize(sent))

print(all_tokens)

['My', 'life', 'for', 'aiur', '.', 'I', 'long', 'for', 'combat', '.', 'What', 'now', 'calls', '.', 'Honor', 'guides', 'me', '.']


### 한글 분석

In [None]:
from konlpy.tag import Okt

In [None]:
# Morphological analyzer (Okt) instance
okt = Okt()

# Split the sentence into morphemes
result = okt.morphs("오늘은 너무 졸립니다...")
print(result)

['오늘', '은', '너무', '졸립니다', '...']


In [None]:
# Split into morphemes and tag each one with its part of speech
result2 = okt.pos("오늘은 너무 잠이 옵니다.")
print(result2)

[('오늘', 'Noun'), ('은', 'Josa'), ('너무', 'Adverb'), ('잠', 'Noun'), ('이', 'Josa'), ('옵니다', 'Eomi'), ('.', 'Punctuation')]


In [None]:
# Stemming: POS-tag the sentence with stem=True
result2 = okt.pos("오늘은 월요일 입니다.", stem=True)
print(result2)

[('오늘', 'Noun'), ('은', 'Josa'), ('월요일', 'Noun'), ('이다', 'Adjective'), ('.', 'Punctuation')]


## [2] 정제 및 정규화
---
- 불용어 제거 => 노이즈 제거
- 텍스트의 동일화
    * 대소문자 통일
    * 문장의 길이

### [2-1] 불용어(Stopword)

In [None]:
# Number of entries in NLTK's English stopword list
len(nltk.corpus.stopwords.words("english"))

179

### [2-2] 어간 및 표제어 처리

In [None]:
from nltk.stem import LancasterStemmer

In [None]:
# Lancaster stemmer instance
lstem = LancasterStemmer()

# Stem two inflected forms of the same verb
lstem.stem("working"), lstem.stem('worked')

('work', 'work')

In [None]:
# Stem another pair of inflected forms
lstem.stem('amused'), lstem.stem('amusing')

('amus', 'amus')

In [None]:
# Lemmatization: extract the dictionary form (lemma) of a word
from nltk.stem import WordNetLemmatizer

In [None]:
# WordNet-based lemmatizer instance
welmma = WordNetLemmatizer()

# Lemmatize two inflected forms, passing 'v' (verb) as the part of speech
welmma.lemmatize('working', 'v'), welmma.lemmatize('worked', 'v')

('work', 'work')

In [None]:
# Lemmatize another pair of inflected forms as verbs
welmma.lemmatize('amused', 'v'), welmma.lemmatize('amusing', 'v')

('amuse', 'amuse')

## [3] 텍스트 벡터화
---
- 텍스트 -> 수치화
- 희소행렬(OHE): BOW 방식 -> Count기반, TF-IDF 기반
- 밀집벡터: Embedding 방식 -> Word2Vec

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [None]:
# Corpus containing a single document
corpus = [raw_text2]

# Count-based bag-of-words vectorization
# (the variable is named `ohe`, but the object is a CountVectorizer)
ohe = CountVectorizer()
result = ohe.fit_transform(corpus)
# Convert the returned matrix to a dense array for printing
print(result.toarray())

[[1 1 1 2 1 1 1 1 1 1 1 1]]


In [None]:
# Corpus containing a single document
corpus = [raw_text2]

# TF-IDF based vectorization of the same corpus
vec = TfidfVectorizer()
result = vec.fit_transform(corpus)
# Convert the returned matrix to a dense array for printing
print(result.toarray())

[[0.25819889 0.25819889 0.25819889 0.51639778 0.25819889 0.25819889
  0.25819889 0.25819889 0.25819889 0.25819889 0.25819889 0.25819889]]


### 예제

In [None]:
# Sample paragraph for the exercise below, one literal per sentence.
# Every literal except the last ends with an explicit space so that sentence
# boundaries survive concatenation; joining lines with a bare backslash
# continuation would glue "work." and "Wiki" together and make word_tokenize
# emit merged tokens such as 'work.wiki'.
sent = (
    'Wiki is in Ward is original description: The simplest online database that could possibly work. '
    'Wiki is a piece of server software that allows users to freely create and edit Web page content using any Web browser. '
    'Wiki supports hyperlinks and has a simple text syntax for creating new pages and crosslinks between internal pages on the fly. '
    'Wiki is unusual among group communication mechanisms in that it allows the organization of contributions to be edited in addition to the content itself. '
    'Like many simple concepts, "open editing" has some profound and subtle effects on Wiki usage. '
    'Allowing everyday users to create and edit any page in a Web site is exciting in that it encourages democratic use of the Web and promotes content composition by nontechnical users.'
)

In [None]:
# Normalize case before tokenizing
sent_lower = sent.lower()

# Word-level tokenization of the lowercased text
tokens = word_tokenize(sent_lower)

In [None]:
# English stopwords held in a set for membership checks
stop_word = set(nltk.corpus.stopwords.words("english"))

# Keep only the tokens that are not stopwords
result = [token for token in tokens if token not in stop_word]

print(result)
print(len(result))

['wiki', 'ward', 'original', 'description', ':', 'simplest', 'online', 'database', 'could', 'possibly', 'work.wiki', 'piece', 'server', 'software', 'allows', 'users', 'freely', 'create', 'edit', 'web', 'page', 'content', 'using', 'web', 'browser', '.', 'wiki', 'supports', 'hyperlinks', 'simple', 'text', 'syntax', 'creating', 'new', 'pages', 'crosslinks', 'internal', 'pages', 'fly.wiki', 'unusual', 'among', 'group', 'communication', 'mechanisms', 'allows', 'organization', 'contributions', 'edited', 'addition', 'content', 'itself.like', 'many', 'simple', 'concepts', ',', '``', 'open', 'editing', "''", 'profound', 'subtle', 'effects', 'wiki', 'usage', '.', 'allowing', 'everyday', 'users', 'create', 'edit', 'page', 'web', 'site', 'exciting', 'encourages', 'democratic', 'use', 'web', 'promotes', 'content', 'composition', 'nontechnical', 'users', '.']
84


## [4] Tokenizer 객체 생성
---

In [None]:
from tensorflow.keras.preprocessing.text import text_to_word_sequence, Tokenizer

In [None]:
# Tokenize the raw paragraph with Keras' text_to_word_sequence
tokens = text_to_word_sequence(sent)

# Print the token count followed by the tokens themselves
print(len(tokens), tokens)

128 ['wiki', 'is', 'in', 'ward', 'is', 'original', 'description', 'the', 'simplest', 'online', 'database', 'that', 'could', 'possibly', 'work', 'wiki', 'is', 'a', 'piece', 'of', 'server', 'software', 'that', 'allows', 'users', 'to', 'freely', 'create', 'and', 'edit', 'web', 'page', 'content', 'using', 'any', 'web', 'browser', 'wiki', 'supports', 'hyperlinks', 'and', 'has', 'a', 'simple', 'text', 'syntax', 'for', 'creating', 'new', 'pages', 'and', 'crosslinks', 'between', 'internal', 'pages', 'on', 'the', 'fly', 'wiki', 'is', 'unusual', 'among', 'group', 'communication', 'mechanisms', 'in', 'that', 'it', 'allows', 'the', 'organization', 'of', 'contributions', 'to', 'be', 'edited', 'in', 'addition', 'to', 'the', 'content', 'itself', 'like', 'many', 'simple', 'concepts', 'open', 'editing', 'has', 'some', 'profound', 'and', 'subtle', 'effects', 'on', 'wiki', 'usage', 'allowing', 'everyday', 'users', 'to', 'create', 'and', 'edit', 'any', 'page', 'in', 'a', 'web', 'site', 'is', 'exciting', '

In [None]:
# Keras Tokenizer used to build a vocabulary
my_token = Tokenizer()

# NOTE(review): `tokens` is a list of single words, so each word is passed as its
# own text — confirm this is intended rather than fit_on_texts([sent]).
my_token.fit_on_texts(tokens)

In [None]:
# Occurrence count recorded for each word
print(my_token.word_counts)

OrderedDict([('wiki', 5), ('is', 5), ('in', 5), ('ward', 1), ('original', 1), ('description', 1), ('the', 5), ('simplest', 1), ('online', 1), ('database', 1), ('that', 4), ('could', 1), ('possibly', 1), ('work', 1), ('a', 3), ('piece', 1), ('of', 3), ('server', 1), ('software', 1), ('allows', 2), ('users', 3), ('to', 4), ('freely', 1), ('create', 2), ('and', 6), ('edit', 2), ('web', 4), ('page', 2), ('content', 3), ('using', 1), ('any', 2), ('browser', 1), ('supports', 1), ('hyperlinks', 1), ('has', 2), ('simple', 2), ('text', 1), ('syntax', 1), ('for', 1), ('creating', 1), ('new', 1), ('pages', 2), ('crosslinks', 1), ('between', 1), ('internal', 1), ('on', 2), ('fly', 1), ('unusual', 1), ('among', 1), ('group', 1), ('communication', 1), ('mechanisms', 1), ('it', 2), ('organization', 1), ('contributions', 1), ('be', 1), ('edited', 1), ('addition', 1), ('itself', 1), ('like', 1), ('many', 1), ('concepts', 1), ('open', 1), ('editing', 1), ('some', 1), ('profound', 1), ('subtle', 1), ('ef

In [None]:
# Mapping from each word to its integer index
print(my_token.word_index)

{'and': 1, 'wiki': 2, 'is': 3, 'in': 4, 'the': 5, 'that': 6, 'to': 7, 'web': 8, 'a': 9, 'of': 10, 'users': 11, 'content': 12, 'allows': 13, 'create': 14, 'edit': 15, 'page': 16, 'any': 17, 'has': 18, 'simple': 19, 'pages': 20, 'on': 21, 'it': 22, 'ward': 23, 'original': 24, 'description': 25, 'simplest': 26, 'online': 27, 'database': 28, 'could': 29, 'possibly': 30, 'work': 31, 'piece': 32, 'server': 33, 'software': 34, 'freely': 35, 'using': 36, 'browser': 37, 'supports': 38, 'hyperlinks': 39, 'text': 40, 'syntax': 41, 'for': 42, 'creating': 43, 'new': 44, 'crosslinks': 45, 'between': 46, 'internal': 47, 'fly': 48, 'unusual': 49, 'among': 50, 'group': 51, 'communication': 52, 'mechanisms': 53, 'organization': 54, 'contributions': 55, 'be': 56, 'edited': 57, 'addition': 58, 'itself': 59, 'like': 60, 'many': 61, 'concepts': 62, 'open': 63, 'editing': 64, 'some': 65, 'profound': 66, 'subtle': 67, 'effects': 68, 'usage': 69, 'allowing': 70, 'everyday': 71, 'site': 72, 'exciting': 73, 'enc

### sequential
- 제공한 문장에 대한 단어사전 -> voca
- 단어사전에 존재하지 않는 단어 -> Out Of Vocabulary: OOV

In [None]:
# Small sample corpus: five short sentences
sentence = [
    "i love my dog",
    "i love my cat",
    "you love my cat",
    "we love our dog",
    "do you think my dog is amazing?",
]

In [None]:
# Keras Tokenizer for the sample sentences
tokenizer = Tokenizer()

# Build the vocabulary from the sentence list
tokenizer.fit_on_texts(sentence)

In [None]:
# Mapping from each word to its integer index
print(tokenizer.word_index)

{'love': 1, 'my': 2, 'dog': 3, 'i': 4, 'cat': 5, 'you': 6, 'we': 7, 'our': 8, 'do': 9, 'think': 10, 'is': 11, 'amazing': 12}


In [None]:
# Occurrence count recorded for each word
print(tokenizer.word_counts)

OrderedDict([('i', 2), ('love', 4), ('my', 4), ('dog', 3), ('cat', 2), ('you', 2), ('we', 1), ('our', 1), ('do', 1), ('think', 1), ('is', 1), ('amazing', 1)])


In [None]:
# Convert each sentence to a list of integer indices using the fitted vocabulary
seq_voca = tokenizer.texts_to_sequences(sentence)

# Print the stored result instead of converting the sentences a second time
print(seq_voca)

[[4, 1, 2, 3], [4, 1, 2, 5], [6, 1, 2, 5], [7, 1, 8, 3], [9, 6, 10, 2, 3, 11, 12]]


## [5] One-Hot-Encoding 변환
---
- sklearn OneHotEncoder 객체 생성
- keras 함수

In [None]:
from tensorflow.keras.utils import to_categorical

In [None]:
# One-hot encode the last (longest) sentence.
# num_classes is pinned to the vocabulary size + 1 (Tokenizer indices start at 1,
# so column 0 stays unused). Left to the default, the width is inferred from the
# largest index present in the given sequence, so different sentences would
# produce arrays of different widths.
to_categorical(seq_voca[4], num_classes=len(tokenizer.word_index) + 1)

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]],
      dtype=float32)

## [6] Padding
---
- 길이가 다를 경우 길이를 같게 맞추는 과정

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Pad all sequences to a common length using the default settings
# (the recorded output shows zeros added at the front of the shorter sequences)
pad_sequences(seq_voca)

array([[ 0,  0,  0,  4,  1,  2,  3],
       [ 0,  0,  0,  4,  1,  2,  5],
       [ 0,  0,  0,  6,  1,  2,  5],
       [ 0,  0,  0,  7,  1,  8,  3],
       [ 9,  6, 10,  2,  3, 11, 12]], dtype=int32)