# NLP Preprocessing

In [1]:
import warnings
# Ignore every warning raised from here on (presumably to keep cell output tidy).
# NOTE(review): a blanket 'ignore' also hides deprecation warnings from the
# libraries used below — consider filtering specific categories instead.
warnings.filterwarnings('ignore')

# I. Tokenization

> ## 1) 영어 : NLTK(Natural Language ToolKit)

In [2]:
import nltk

nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

> ### (1) 문장 토큰화 : sent_tokenize( )

In [3]:
from nltk import sent_tokenize

# Sample three-sentence document used by the tokenization cells.
# Adjacent string literals inside parentheses are joined at compile time, so
# the source can be wrapped freely without the indentation of continuation
# lines leaking into the text (which a backslash inside a literal would do).
sentences = (
    'The X-Files is an American science fiction drama television series '
    'created by Chris Carter. '
    'The original television series aired from September 10, 1993 '
    'to May 19, 2002 on Fox. '
    'The program spanned nine seasons, with 202 episodes.'
)

sent_tokenize(sentences)

['The X-Files is an American science fiction drama television series created by Chris Carter.',
 'The original television series aired from September 10, 1993 to May 19, 2002 on Fox.',
 'The program spanned nine seasons, with 202 episodes.']

> ### (2) 단어 토큰화 : word_tokenize( )

In [4]:
from nltk.tokenize import word_tokenize

In [5]:
text = 'The truth is out there.'

word_tokenize(text)

['The', 'truth', 'is', 'out', 'there', '.']

> ### (3) 단어 품사(Part Of Speech) 태깅 : pos_tag( )

In [6]:
from nltk.tag import pos_tag

In [7]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [8]:
x = word_tokenize(text)

pos_tag(x)

[('The', 'DT'),
 ('truth', 'NN'),
 ('is', 'VBZ'),
 ('out', 'RP'),
 ('there', 'RB'),
 ('.', '.')]

> ### (4) Stop Words

* Import Package and Download 'stopwords'

In [9]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

* 'English' Stop Words

In [10]:
print('English stop words :',len(nltk.corpus.stopwords.words('english')))
print(nltk.corpus.stopwords.words('english')[:20])

English stop words : 179
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his']


* tokenize_text( ) 정의
 - 여러 문장으로 이루어진 문서를 문장 단위로 나눈 뒤, 각 문장을 단어 토큰으로 분리하는 함수 정의

In [11]:
from nltk import word_tokenize, sent_tokenize

def tokenize_text(doc):
    """Split a document into sentences and each sentence into word tokens.

    Args:
        doc: Text to tokenize; it is handed to ``sent_tokenize`` unchanged.

    Returns:
        A list with one entry per sentence, each entry being the result of
        ``word_tokenize`` for that sentence.
    """
    return [word_tokenize(chunk) for chunk in sent_tokenize(doc)]

* 문장별 단어 토큰화 수행

In [12]:
word_tokens = tokenize_text(sentences)

print(type(word_tokens),len(word_tokens))

<class 'list'> 3


* 문장별 단어 토큰화 결과 확인

In [13]:
print(word_tokens)

[['The', 'X-Files', 'is', 'an', 'American', 'science', 'fiction', 'drama', 'television', 'series', 'created', 'by', 'Chris', 'Carter', '.'], ['The', 'original', 'television', 'series', 'aired', 'from', 'September', '10', ',', '1993', 'to', 'May', '19', ',', '2002', 'on', 'Fox', '.'], ['The', 'program', 'spanned', 'nine', 'seasons', ',', 'with', '202', 'episodes', '.']]


* Stop Words 제거

In [14]:
# English stop-word list; kept as a list under its original name.
stopwords = nltk.corpus.stopwords.words('english')
# Set copy of the same words: membership tests become hash lookups instead of
# a linear scan of the whole list for every token.
stopword_set = set(stopwords)

# For each tokenized sentence, lower-case every token and keep only those that
# are not stop words. Sentence order and token order are preserved.
all_tokens = [
    [token for token in (raw.lower() for raw in sentence)
     if token not in stopword_set]
    for sentence in word_tokens
]

* Stop Words 처리 결과

In [15]:
print(all_tokens)

[['x-files', 'american', 'science', 'fiction', 'drama', 'television', 'series', 'created', 'chris', 'carter', '.'], ['original', 'television', 'series', 'aired', 'september', '10', ',', '1993', 'may', '19', ',', '2002', 'fox', '.'], ['program', 'spanned', 'nine', 'seasons', ',', '202', 'episodes', '.']]


> ### (5) Stemming(어간 추출)

* 변화된 단어에서 규칙 기반으로 어간(stem)을 추출 (결과가 실제 단어가 아닐 수 있음, 예: amus, fant)
 - work
 - amuse
 - happy
 - fancy

In [16]:
from nltk.stem import LancasterStemmer

stemmer = LancasterStemmer()

# Each tuple holds inflected forms of one word; their stems are printed
# space-separated on a single line, one line per tuple.
for word_group in (
    ('working', 'works', 'worked'),
    ('amusing', 'amuses', 'amused'),
    ('happier', 'happiest'),
    ('fancier', 'fanciest'),
):
    print(*(stemmer.stem(word) for word in word_group))

work work work
amus amus amus
happy happiest
fant fanciest


> ### (6) Lemmatization(표제어 추출)

* 변화된 단어를 사전에 등재된 원형(표제어)으로 변환
 - Stemming 보다 정확한 처리 가능
 - '품사'를 지정하여 사용

In [17]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

* 'v' 동사, 'a' 형용사

In [18]:
from nltk.stem import WordNetLemmatizer

lemma = WordNetLemmatizer()

# Pairs of (part-of-speech tag, inflected forms); the lemmas of each group are
# printed space-separated on a single line, one line per group.
for pos, forms in (
    ('v', ('amusing', 'amuses', 'amused')),
    ('a', ('happier', 'happiest')),
    ('a', ('fancier', 'fanciest')),
):
    print(*(lemma.lemmatize(form, pos) for form in forms))

amuse amuse amuse
happy happy
fancy fancy


> ## 2) 한국어 : KoNLPy

> ### (1) KoNLPy 패키지 설치

In [19]:
!pip install konlpy

Collecting konlpy
[?25l  Downloading https://files.pythonhosted.org/packages/85/0e/f385566fec837c0b83f216b2da65db9997b35dd675e107752005b7d392b1/konlpy-0.5.2-py2.py3-none-any.whl (19.4MB)
[K     |████████████████████████████████| 19.4MB 1.3MB/s 
[?25hCollecting colorama
  Downloading https://files.pythonhosted.org/packages/44/98/5b86278fbbf250d239ae0ecb724f8572af1c91f4a11edf4d36a206189440/colorama-0.4.4-py2.py3-none-any.whl
Collecting JPype1>=0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/cd/a5/9781e2ef4ca92d09912c4794642c1653aea7607f473e156cf4d423a881a1/JPype1-1.2.1-cp37-cp37m-manylinux2010_x86_64.whl (457kB)
[K     |████████████████████████████████| 460kB 53.0MB/s 
[?25hCollecting beautifulsoup4==4.6.0
[?25l  Downloading https://files.pythonhosted.org/packages/9e/d4/10f46e5cfac773e22707237bfcd51bbffeaf0a576b0a847ec7ab15bd7ace/beautifulsoup4-4.6.0-py3-none-any.whl (86kB)
[K     |████████████████████████████████| 92kB 7.2MB/s 
Installing collected packages: col

> ### (2) Okt 형태소 분석기(Open Korean Text, 구 Twitter 형태소 분석기)

* 형태소(Morpheme)

In [20]:
from konlpy.tag import Okt

* 토큰화 : okt.morphs( )

In [21]:
# Create the Okt analyzer once; later cells call okt.pos() and okt.nouns()
# on this same instance.
okt = Okt()

# morphs() yields the sentence as a flat list of morpheme strings
# (the printed output is a single list of str).
print(okt.morphs('지난 몇 달간 전 세계 모든 사람은 코로나19로 인해 \
전례 없는 고통을 겪으며 다양한 방식으로 심각하게 피해를 겪었습니다.'))

['지난', '몇', '달', '간', '전', '세계', '모든', '사람', '은', '코로나', '19', '로', '인해', '전례', '없는', '고통', '을', '겪으며', '다양한', '방식', '으로', '심각하게', '피해', '를', '겪었습니다', '.']


* 품사 태깅 : okt.pos( )

In [22]:
print(okt.pos('지난 몇 달간 전 세계 모든 사람은 코로나19로 인해 \
전례 없는 고통을 겪으며 다양한 방식으로 심각하게 피해를 겪었습니다.'))

[('지난', 'Noun'), ('몇', 'Noun'), ('달', 'Noun'), ('간', 'Suffix'), ('전', 'Noun'), ('세계', 'Noun'), ('모든', 'Noun'), ('사람', 'Noun'), ('은', 'Josa'), ('코로나', 'Noun'), ('19', 'Number'), ('로', 'Noun'), ('인해', 'Adjective'), ('전례', 'Noun'), ('없는', 'Adjective'), ('고통', 'Noun'), ('을', 'Josa'), ('겪으며', 'Verb'), ('다양한', 'Adjective'), ('방식', 'Noun'), ('으로', 'Josa'), ('심각하게', 'Adjective'), ('피해', 'Noun'), ('를', 'Josa'), ('겪었습니다', 'Verb'), ('.', 'Punctuation')]


* 명사 추출 : okt.nouns( )

In [23]:
print(okt.nouns('지난 몇 달간 전 세계 모든 사람은 코로나19로 인해 \
전례 없는 고통을 겪으며 다양한 방식으로 심각하게 피해를 겪었습니다.'))

['지난', '몇', '달', '전', '세계', '모든', '사람', '코로나', '로', '전례', '고통', '방식', '피해']


> ### (3) Kkma 형태소 분석기

* 형태소(Morpheme)

In [24]:
from konlpy.tag import Kkma

* 토큰화 : kkma.morphs( )

In [25]:
kkma = Kkma()  # created once; later cells call kkma.pos() and kkma.nouns() on it

# morphs() yields the sentence as a flat list of morpheme strings
# (the printed output is a single list of str).
print(kkma.morphs('지난 몇 달간 전 세계 모든 사람은 코로나19로 인해 \
전례 없는 고통을 겪으며 다양한 방식으로 심각하게 피해를 겪었습니다.'))

['지나', 'ㄴ', '몇', '달', '간', '전', '세계', '모든', '사람', '은', '코로나', '19', '로', '인하', '어', '전례', '없', '는', '고통', '을', '겪', '으며', '다양', '하', 'ㄴ', '방식', '으로', '심각', '하', '게', '피해', '를', '겪', '었', '습니다', '.']


* 품사 태깅 : kkma.pos( )

In [26]:
print(kkma.pos('지난 몇 달간 전 세계 모든 사람은 코로나19로 인해 \
전례 없는 고통을 겪으며 다양한 방식으로 심각하게 피해를 겪었습니다.'))

[('지나', 'VV'), ('ㄴ', 'ETD'), ('몇', 'MDT'), ('달', 'NNG'), ('간', 'NNB'), ('전', 'NNG'), ('세계', 'NNG'), ('모든', 'MDT'), ('사람', 'NNG'), ('은', 'JX'), ('코로나', 'NNG'), ('19', 'NR'), ('로', 'JKM'), ('인하', 'VV'), ('어', 'ECS'), ('전례', 'NNG'), ('없', 'VA'), ('는', 'ETD'), ('고통', 'NNG'), ('을', 'JKO'), ('겪', 'VV'), ('으며', 'ECE'), ('다양', 'NNG'), ('하', 'XSV'), ('ㄴ', 'ETD'), ('방식', 'NNG'), ('으로', 'JKM'), ('심각', 'XR'), ('하', 'XSA'), ('게', 'ECD'), ('피해', 'NNG'), ('를', 'JKO'), ('겪', 'VV'), ('었', 'EPT'), ('습니다', 'EFN'), ('.', 'SF')]


* 명사 추출 : kkma.nouns( )

In [27]:
print(kkma.nouns('지난 몇 달간 전 세계 모든 사람은 코로나19로 인해 \
전례 없는 고통을 겪으며 다양한 방식으로 심각하게 피해를 겪었습니다.'))

['달', '달간', '간', '전', '세계', '사람', '코로나', '코로나19', '19', '전례', '고통', '다양', '방식', '피해']


# II. Encoding

> ## 1) Encoding with Keras

> ### (1) Import Package

In [28]:
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical

> ### (2) 실습 문장

In [29]:
# Practice text for the encoding cells: 24 space-separated words with repeats.
# The three literals are concatenated at compile time into one string.
sentence = (
    '가지마라 가지마라 그녀는 위험해 매력이 너무 넘치는 Girl '
    '하지마라 하지마라 사랑은 위험해 '
    '내가 내가 내가 먼저 네게 네게 네게 빠져 빠져 빠져 버려 baby'
)

> ## 2) 정수 인코딩(Integer Encoding)

> ### (1) Tokenizer.fit_on_texts( )

* Tokenization & Integer Indexing

In [30]:
from keras.preprocessing.text import Tokenizer

# Build a Tokenizer with no arguments and fit it on the practice text, passed
# as a one-element list. The word_index printed in the next cell starts at 1
# and gives the most frequent words the lowest indices.
tknz = Tokenizer()
tknz.fit_on_texts([sentence])

In [31]:
print(tknz.word_index)

{'내가': 1, '네게': 2, '빠져': 3, '가지마라': 4, '위험해': 5, '하지마라': 6, '그녀는': 7, '매력이': 8, '너무': 9, '넘치는': 10, 'girl': 11, '사랑은': 12, '먼저': 13, '버려': 14, 'baby': 15}


> ### (2) Tokenizer.texts_to_sequences( )

* Integer Encoding

In [32]:
LBE = tknz.texts_to_sequences([sentence])

In [33]:
print(LBE)

[[4, 4, 7, 5, 8, 9, 10, 11, 6, 6, 12, 5, 1, 1, 1, 13, 2, 2, 2, 3, 3, 3, 14, 15]]


> ## 3) 원-핫 인코딩(One-Hot Encoding)

> ### (1) to_categorical( )

* One-Hot Encoding

In [34]:
from keras.utils import to_categorical

# One-hot encode the integer sequence held in LBE.
# The printed rows have 16 columns: indices 1-15 for the vocabulary words plus
# column 0, which no word uses because word_index starts at 1.
OHE = to_categorical(LBE)

In [35]:
print(OHE)

[[[0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
  [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
  [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 

# 
# 
# 
# The End
# 
# 
# 