# 텍스트 전처리
----
- 패키지 설치
    * NLTK : pip install nltk
    * KoNLPy : pip install konlpy

In [58]:
# Install the NLTK package (IPython shell escape; runs pip in a subprocess)
!pip install nltk



In [59]:
!pip install konlpy



## [1] 토큰화(Tokenization)
---
- 문장/문서를 의미를 지닌 작은 단위로 나누는 것
- 나누어진 단어를 토큰(Token)이라 함
- 종류
    * 문장 토큰화
    * 단어 토큰화

In [60]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [61]:
import nltk

In [62]:
# Download the NLTK corpus datasets. The 'all' identifier requests every
# available NLTK data package; quiet=True suppresses the progress output.
nltk.download('all', quiet=True)

True

In [71]:
# Sample English texts used by the tokenization examples.
# NOTE: the backslash line continuations sit inside the string literals, so
# the leading spaces of each continued line become part of the string (they
# are visible when raw_text is echoed).
# NOTE(review): 'hen' looks like a truncated 'When' — confirm before fixing,
# since the recorded outputs depend on the current text.
raw_text1="hen tokenizing a Unicode string.\
           NLTK tokenizers can produce token-spans.\
           hen tokenizing a Unicode string."
raw_text2="This particular tokenizer requires the Punkt sentence tokenization.\
           which splits text on whitespace and punctuation."

In [72]:
# Word-level tokenization of the first sample text
result1=word_tokenize(raw_text1)

In [73]:
print(result1)

['hen', 'tokenizing', 'a', 'Unicode', 'string', '.', 'NLTK', 'tokenizers', 'can', 'produce', 'token-spans', '.', 'hen', 'tokenizing', 'a', 'Unicode', 'string', '.']


In [74]:
# Sentence-level tokenization: collect both sample texts into one list
raw_text=[raw_text1, raw_text2]

In [75]:
raw_text

['hen tokenizing a Unicode string.           NLTK tokenizers can produce token-spans.           hen tokenizing a Unicode string.',
 'This particular tokenizer requires the Punkt sentence tokenization.           which splits text on whitespace and punctuation.']

In [76]:
result=sent_tokenize(raw_text1)

In [77]:
print(result, len(result))

['hen tokenizing a Unicode string.', 'NLTK tokenizers can produce token-spans.', 'hen tokenizing a Unicode string.'] 3


### 여러 문장에서 토큰 추출
---

In [86]:
# Sentence-level extraction: split each document into sentences, then split
# every sentence into word tokens, printing each intermediate result.
for sent in raw_text:
    # Sentences found in the current document
    sentResult=sent_tokenize(sent)
    print(f'sent => {sentResult}')

    for ele in sentResult:
        print(f'ele => {ele}')
        # Word tokens extracted from the current sentence
        wordResult=word_tokenize(ele)
        print(f'wordResult => {wordResult}')

sent => ['hen tokenizing a Unicode string.', 'NLTK tokenizers can produce token-spans.', 'hen tokenizing a Unicode string.']
ele => hen tokenizing a Unicode string.
wordResult => ['hen', 'tokenizing', 'a', 'Unicode', 'string', '.']
ele => NLTK tokenizers can produce token-spans.
wordResult => ['NLTK', 'tokenizers', 'can', 'produce', 'token-spans', '.']
ele => hen tokenizing a Unicode string.
wordResult => ['hen', 'tokenizing', 'a', 'Unicode', 'string', '.']
sent => ['This particular tokenizer requires the Punkt sentence tokenization.', 'which splits text on whitespace and punctuation.']
ele => This particular tokenizer requires the Punkt sentence tokenization.
wordResult => ['This', 'particular', 'tokenizer', 'requires', 'the', 'Punkt', 'sentence', 'tokenization', '.']
ele => which splits text on whitespace and punctuation.
wordResult => ['which', 'splits', 'text', 'on', 'whitespace', 'and', 'punctuation', '.']


#### 한글 
---

In [106]:
from konlpy.tag import Okt

# Morphological analyzer object (Okt) used to split Korean text into morphemes
okt=Okt()

In [96]:
# Split the sentence into morphemes and print the resulting list
result=okt.morphs("오늘은 월요일입니다.")
print(result)

['오늘', '은', '월요일', '입니다', '.']


In [107]:
# Morpheme separation followed by tagging => part of speech for each morpheme
result2=okt.pos("오늘은 월요일입니다.")

In [105]:
print(result2)

[('오늘', 'Noun'), ('은', 'Josa'), ('월요일', 'Noun'), ('입니다', 'Adjective'), ('.', 'Punctuation')]


In [108]:
# Same POS tagging with stem=True: morphemes are returned in their base form
# (the recorded output shows '이다' where the unstemmed call gave '입니다').
result2=okt.pos("오늘은 월요일입니다.", stem=True)

In [110]:
print(result2)

[('오늘', 'Noun'), ('은', 'Josa'), ('월요일', 'Noun'), ('이다', 'Adjective'), ('.', 'Punctuation')]


## [2] 정제 & 정규화
---
- 불용어 제거 => 노이즈 제거
- 텍스트의 동일화 
    * 대문자 또는 소문자로 통일
    * 문장의 길이

### [2-1] 불용어 (Stopword)

In [111]:
en_stopwords=nltk.corpus.stopwords.words('english')

In [112]:
len(en_stopwords)

179

In [113]:
en_stopwords[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

### [2-2] 어간 및 표제어 처리
---

In [115]:
from nltk.stem import LancasterStemmer

In [121]:
# Stemming: create the Lancaster stemmer used in the following cells
lstem=LancasterStemmer()

In [117]:
lstem.stem('working'), lstem.stem('worked'), lstem.stem('worken')

('work', 'work', 'work')

In [118]:
lstem.stem('happy'), lstem.stem('happiness')

('happy', 'happy')

In [120]:
lstem.stem('amuse'), lstem.stem('amused')

('amus', 'amus')

In [122]:
# 표제어(사전에 등록된 단어 추출)
from nltk.stem import WordNetLemmatizer

In [123]:
wlemma=WordNetLemmatizer()

In [124]:
wlemma.lemmatize('working', 'v'), wlemma.lemmatize('worked', 'v')

('work', 'work')

In [125]:
wlemma.lemmatize('amusing', 'v'), wlemma.lemmatize('amused', 'v')

('amuse', 'amuse')

## [3] 텍스트 벡터화
---
- 텍스트 => 수치화
- 희소벡터(OHE) : BOW 방식 -->  Count기반, TF-IDF 기반
- 밀집벡터 : Embedding 방식, Word2Vec

In [126]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [127]:
corpus=[raw_text1, raw_text2]

In [128]:
ohe=CountVectorizer()

In [129]:
ohe.fit(corpus)

In [132]:
ret=ohe.transform(corpus)

In [137]:
print(type(ret), ret, sep='\n')

<class 'scipy.sparse._csr.csr_matrix'>
  (0, 1)	1
  (0, 2)	2
  (0, 3)	1
  (0, 6)	1
  (0, 11)	1
  (0, 13)	2
  (0, 17)	1
  (0, 20)	1
  (0, 21)	2
  (0, 22)	2
  (1, 0)	1
  (1, 4)	1
  (1, 5)	1
  (1, 7)	1
  (1, 8)	1
  (1, 9)	1
  (1, 10)	1
  (1, 12)	1
  (1, 14)	1
  (1, 15)	1
  (1, 16)	1
  (1, 18)	1
  (1, 19)	1
  (1, 23)	1
  (1, 24)	1


In [138]:
ret=ret.toarray()

In [142]:
print(ret.shape, ret, sep='\n')

(2, 25)
[[0 1 2 1 0 0 1 0 0 0 0 1 0 2 0 0 0 1 0 0 1 2 2 0 0]
 [1 0 0 0 1 1 0 1 1 1 1 0 1 0 1 1 1 0 1 1 0 0 0 1 1]]


In [144]:
# TF-IDF based vectorizer
tfIdf=TfidfVectorizer()

In [146]:
tf_corpus=tfIdf.fit_transform(corpus)

In [147]:
type(tf_corpus)

scipy.sparse._csr.csr_matrix

In [148]:
tf_corpus= tf_corpus.toarray()

In [149]:
print(tf_corpus)

[[0.         0.21320072 0.42640143 0.21320072 0.         0.
  0.21320072 0.         0.         0.         0.         0.21320072
  0.         0.42640143 0.         0.         0.         0.21320072
  0.         0.         0.21320072 0.42640143 0.42640143 0.
  0.        ]
 [0.25819889 0.         0.         0.         0.25819889 0.25819889
  0.         0.25819889 0.25819889 0.25819889 0.25819889 0.
  0.25819889 0.         0.25819889 0.25819889 0.25819889 0.
  0.25819889 0.25819889 0.         0.         0.         0.25819889
  0.25819889]]


## 실습 --------------------------------------------------
---
- 단어 단위 토큰화
- 불용어 제거

In [152]:
# 불용어 추출
from nltk import corpus

In [154]:
en_stopwords=corpus.stopwords.words('english')

In [157]:
texts='Wiki is in Ward is original description: The simplest online database that could possibly work.\
Wiki is a piece of server software that allows users to freely create and edit Web page content using any Web browser. Wiki supports hyperlinks and has a simple text syntax for creating new pages and crosslinks between internal pages on the fly.\
Wiki is unusual among group communication mechanisms in that it allows the organization of contributions to be edited in addition to the content itself.Like many simple concepts, "open editing" has some profound and subtle effects on Wiki usage. Allowing everyday users to create and edit any page in a Web site is exciting in that it encourages democratic use of the Web and promotes content composition by nontechnical users.'

In [158]:
wordTokens=word_tokenize(texts)

In [161]:
len(wordTokens), type(wordTokens)

(132, list)

In [170]:
# Stopword removal: keep only the tokens that are absent from en_stopwords
wordTokens2=[]
for word in wordTokens:
    # Skip stopwords early; everything else is kept in original order
    if word in en_stopwords:
        continue
    wordTokens2.append(word)

print(f'wordTokens2 : {len(wordTokens2)}')

wordTokens2 : 85


In [169]:
# Stopword removal written as a list comprehension
wordTokens3=[ word for word in wordTokens if word not in en_stopwords ]

# Report the size of wordTokens3 itself, matching the printed label
print(f'wordTokens3 : {len(wordTokens3)}')

wordTokens3 : 85


## Tokenizer 객체 생성
---

In [254]:
from tensorflow.keras.preprocessing.text import text_to_word_sequence, Tokenizer

In [255]:
raw_text='Wiki is in Ward is original description: The simplest online database that could possibly work.\
Wiki is a piece of server software that allows users to freely create and edit Web page content using any Web browser. Wiki supports hyperlinks and has a simple text syntax for creating new pages and crosslinks between internal pages on the fly.\
Wiki is unusual among group communication mechanisms in that it allows the organization of contributions to be edited in addition to the content itself.Like many simple concepts, "open editing" has some profound and subtle effects on Wiki usage. Allowing everyday users to create and edit any page in a Web site is exciting in that it encourages democratic use of the Web and promotes content composition by nontechnical users.'

In [260]:
# Split the raw text into word tokens (the recorded output shows lowercased
# tokens with punctuation removed)
tokens=text_to_word_sequence(raw_text)

In [261]:
print(len(tokens), tokens)

128 ['wiki', 'is', 'in', 'ward', 'is', 'original', 'description', 'the', 'simplest', 'online', 'database', 'that', 'could', 'possibly', 'work', 'wiki', 'is', 'a', 'piece', 'of', 'server', 'software', 'that', 'allows', 'users', 'to', 'freely', 'create', 'and', 'edit', 'web', 'page', 'content', 'using', 'any', 'web', 'browser', 'wiki', 'supports', 'hyperlinks', 'and', 'has', 'a', 'simple', 'text', 'syntax', 'for', 'creating', 'new', 'pages', 'and', 'crosslinks', 'between', 'internal', 'pages', 'on', 'the', 'fly', 'wiki', 'is', 'unusual', 'among', 'group', 'communication', 'mechanisms', 'in', 'that', 'it', 'allows', 'the', 'organization', 'of', 'contributions', 'to', 'be', 'edited', 'in', 'addition', 'to', 'the', 'content', 'itself', 'like', 'many', 'simple', 'concepts', 'open', 'editing', 'has', 'some', 'profound', 'and', 'subtle', 'effects', 'on', 'wiki', 'usage', 'allowing', 'everyday', 'users', 'to', 'create', 'and', 'edit', 'any', 'page', 'in', 'a', 'web', 'site', 'is', 'exciting', '

In [262]:
print(tokens)

['wiki', 'is', 'in', 'ward', 'is', 'original', 'description', 'the', 'simplest', 'online', 'database', 'that', 'could', 'possibly', 'work', 'wiki', 'is', 'a', 'piece', 'of', 'server', 'software', 'that', 'allows', 'users', 'to', 'freely', 'create', 'and', 'edit', 'web', 'page', 'content', 'using', 'any', 'web', 'browser', 'wiki', 'supports', 'hyperlinks', 'and', 'has', 'a', 'simple', 'text', 'syntax', 'for', 'creating', 'new', 'pages', 'and', 'crosslinks', 'between', 'internal', 'pages', 'on', 'the', 'fly', 'wiki', 'is', 'unusual', 'among', 'group', 'communication', 'mechanisms', 'in', 'that', 'it', 'allows', 'the', 'organization', 'of', 'contributions', 'to', 'be', 'edited', 'in', 'addition', 'to', 'the', 'content', 'itself', 'like', 'many', 'simple', 'concepts', 'open', 'editing', 'has', 'some', 'profound', 'and', 'subtle', 'effects', 'on', 'wiki', 'usage', 'allowing', 'everyday', 'users', 'to', 'create', 'and', 'edit', 'any', 'page', 'in', 'a', 'web', 'site', 'is', 'exciting', 'in',

### Tokenizer 객체 --------------------------------------------------------------------
- 제공한 문서/문장에 대한 단어사전(voca)
- 단어사전(voca)에 존재하지 않는 단어 => Out Of Voca : oov

In [290]:
# Sample sentences used to build the Tokenizer vocabulary
sentences = [
  'I love my dog',
  'I love my cat',
  'You love my dog!',
  'Do you think my dog is amazing?'
]
# Word index produced after fitting the Tokenizer on these sentences:
# {'my': 1, 'love': 2, 'dog': 3, 'i': 4, 'you': 5, 'cat': 6,
#  'do': 7, 'think': 8, 'is': 9, 'amazing': 10}

In [335]:
tokenizer = Tokenizer()

# Build the vocabulary: more frequent words receive lower integer indices
tokenizer.fit_on_texts(sentences)

In [336]:
# Word index: mapping from each word to its integer index
print(tokenizer.word_index)

{'my': 1, 'love': 2, 'dog': 3, 'i': 4, 'you': 5, 'cat': 6, 'do': 7, 'think': 8, 'is': 9, 'amazing': 10}


In [337]:
# Number of occurrences of each word in the fitted texts
print(tokenizer.word_counts)

OrderedDict([('i', 2), ('love', 3), ('my', 4), ('dog', 3), ('cat', 1), ('you', 2), ('do', 1), ('think', 1), ('is', 1), ('amazing', 1)])


In [338]:
# Convert the sentences to integer sequences using the fitted vocabulary
print(tokenizer.texts_to_sequences(sentences))

[[4, 2, 1, 3], [4, 2, 1, 6], [5, 2, 1, 3], [7, 5, 8, 1, 3, 9, 10]]


## One-Hot-Encoding 변환
---
- sklearn OneHotEncoder객체 생성
- keras 함수

In [339]:
from tensorflow.keras.utils import to_categorical

In [None]:
# One-hot encode the integer sequence of the first sentence.
# to_categorical requires the class-index data as its first argument; calling
# it with no arguments raises TypeError.
# num_classes is vocabulary size + 1 because the Tokenizer word indices start
# at 1, so index 0 is never assigned to a word.
encoded=tokenizer.texts_to_sequences(sentences)[0]
print(to_categorical(encoded, num_classes=len(tokenizer.word_index) + 1))