# 텍스트 전처리(Text Preprocessing)
---------------------
## 1) 정제(Cleaning): 분석에 불필요한 노이즈 제거(noise removal)  
### &nbsp; &nbsp; ex) 불용어(stopwords) 제거 <br/>
## 2) 토큰화(Tokenization): 주어진 텍스트를 토큰으로 나누는 작업
### &nbsp; &nbsp; ex) **단어 토큰화(word tokenization)**, 문장 토큰화 <br/>

In [2]:
# English tokenization with NLTK
import nltk
from nltk.corpus import gutenberg

# Fetch the Gutenberg sample corpus (nltk reports when it is already up to date)
nltk.download('gutenberg')

# List the ids of the texts bundled in the corpus
file_names = gutenberg.fileids()
print(file_names)

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\ing06\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [3]:
# Print a sample of Macbeth.
# raw() returns the whole file as one string and closes the underlying stream
# itself; the previous open(...).read() left the handle open until garbage
# collection.
doc_macbeth = gutenberg.raw('shakespeare-macbeth.txt')
print(doc_macbeth[:500])

[The Tragedie of Macbeth by William Shakespeare 1603]


Actus Primus. Scoena Prima.

Thunder and Lightning. Enter three Witches.

  1. When shall we three meet againe?
In Thunder, Lightning, or in Raine?
  2. When the Hurley-burley's done,
When the Battaile's lost, and wonne

   3. That will be ere the set of Sunne

   1. Where the place?
  2. Vpon the Heath

   3. There to meet with Macbeth

   1. I come, Gray-Malkin

   All. Padock calls anon: faire is foule, and foule is faire,
Houer through 


In [5]:
# 1. Sentence tokenization - split the text into sentence-level tokens
from nltk.tokenize import sent_tokenize

sample = doc_macbeth[:500]  # first 500 characters of the play
sentence_tokens = sent_tokenize(sample)
print(sentence_tokens)

['[The Tragedie of Macbeth by William Shakespeare 1603]\n\n\nActus Primus.', 'Scoena Prima.', 'Thunder and Lightning.', 'Enter three Witches.', '1.', 'When shall we three meet againe?', 'In Thunder, Lightning, or in Raine?', '2.', "When the Hurley-burley's done,\nWhen the Battaile's lost, and wonne\n\n   3.", 'That will be ere the set of Sunne\n\n   1.', 'Where the place?', '2.', 'Vpon the Heath\n\n   3.', 'There to meet with Macbeth\n\n   1.', 'I come, Gray-Malkin\n\n   All.', 'Padock calls anon: faire is foule, and foule is faire,\nHouer through']


In [6]:
# 2. Word tokenization
from nltk.tokenize import word_tokenize

# Unneeded characters such as brackets and periods come out as tokens as well
word_tokens = word_tokenize(sample)
print(word_tokens)

['[', 'The', 'Tragedie', 'of', 'Macbeth', 'by', 'William', 'Shakespeare', '1603', ']', 'Actus', 'Primus', '.', 'Scoena', 'Prima', '.', 'Thunder', 'and', 'Lightning', '.', 'Enter', 'three', 'Witches', '.', '1', '.', 'When', 'shall', 'we', 'three', 'meet', 'againe', '?', 'In', 'Thunder', ',', 'Lightning', ',', 'or', 'in', 'Raine', '?', '2', '.', 'When', 'the', 'Hurley-burley', "'s", 'done', ',', 'When', 'the', 'Battaile', "'s", 'lost', ',', 'and', 'wonne', '3', '.', 'That', 'will', 'be', 'ere', 'the', 'set', 'of', 'Sunne', '1', '.', 'Where', 'the', 'place', '?', '2', '.', 'Vpon', 'the', 'Heath', '3', '.', 'There', 'to', 'meet', 'with', 'Macbeth', '1', '.', 'I', 'come', ',', 'Gray-Malkin', 'All', '.', 'Padock', 'calls', 'anon', ':', 'faire', 'is', 'foule', ',', 'and', 'foule', 'is', 'faire', ',', 'Houer', 'through']


In [7]:
# Tokenize with a regular expression
from nltk.tokenize import RegexpTokenizer

# \w matches word characters: letters, digits and the underscore
# (roughly [0-9A-Za-z_]; for str patterns it also covers Unicode letters).
# The pattern is a raw string so that "\w" reaches the regex engine untouched
# instead of being parsed as an invalid string escape sequence.
tokenizer = RegexpTokenizer(r"[\w']+")

print(tokenizer.tokenize(sample))

['The', 'Tragedie', 'of', 'Macbeth', 'by', 'William', 'Shakespeare', '1603', 'Actus', 'Primus', 'Scoena', 'Prima', 'Thunder', 'and', 'Lightning', 'Enter', 'three', 'Witches', '1', 'When', 'shall', 'we', 'three', 'meet', 'againe', 'In', 'Thunder', 'Lightning', 'or', 'in', 'Raine', '2', 'When', 'the', 'Hurley', "burley's", 'done', 'When', 'the', "Battaile's", 'lost', 'and', 'wonne', '3', 'That', 'will', 'be', 'ere', 'the', 'set', 'of', 'Sunne', '1', 'Where', 'the', 'place', '2', 'Vpon', 'the', 'Heath', '3', 'There', 'to', 'meet', 'with', 'Macbeth', '1', 'I', 'come', 'Gray', 'Malkin', 'All', 'Padock', 'calls', 'anon', 'faire', 'is', 'foule', 'and', 'foule', 'is', 'faire', 'Houer', 'through']


In [8]:
# Keep only tokens that are at least 3 characters long.
# Raw string: avoids the invalid "\w" string-escape warning.
tokenizer = RegexpTokenizer(r"[\w']{3,}")
print(tokenizer.tokenize(sample))

['The', 'Tragedie', 'Macbeth', 'William', 'Shakespeare', '1603', 'Actus', 'Primus', 'Scoena', 'Prima', 'Thunder', 'and', 'Lightning', 'Enter', 'three', 'Witches', 'When', 'shall', 'three', 'meet', 'againe', 'Thunder', 'Lightning', 'Raine', 'When', 'the', 'Hurley', "burley's", 'done', 'When', 'the', "Battaile's", 'lost', 'and', 'wonne', 'That', 'will', 'ere', 'the', 'set', 'Sunne', 'Where', 'the', 'place', 'Vpon', 'the', 'Heath', 'There', 'meet', 'with', 'Macbeth', 'come', 'Gray', 'Malkin', 'All', 'Padock', 'calls', 'anon', 'faire', 'foule', 'and', 'foule', 'faire', 'Houer', 'through']


In [10]:
# Remove stopwords
from nltk.corpus import stopwords
english_stopwords = stopwords.words('english')
# Membership is tested once per token; a set avoids rescanning the list each time.
stopword_set = set(english_stopwords)

# Raw string so "\w" is not parsed as an invalid string escape sequence.
tokenizer = RegexpTokenizer(r"[\w']+")
token = tokenizer.tokenize(sample)

# NOTE: the comparison is case-sensitive, so capitalized tokens such as 'The'
# or 'When' are not filtered out. Compare word.lower() instead if
# case-insensitive removal is wanted.
result = [word for word in token if word not in stopword_set]
print(result)

['The', 'Tragedie', 'Macbeth', 'William', 'Shakespeare', '1603', 'Actus', 'Primus', 'Scoena', 'Prima', 'Thunder', 'Lightning', 'Enter', 'three', 'Witches', '1', 'When', 'shall', 'three', 'meet', 'againe', 'In', 'Thunder', 'Lightning', 'Raine', '2', 'When', 'Hurley', "burley's", 'done', 'When', "Battaile's", 'lost', 'wonne', '3', 'That', 'ere', 'set', 'Sunne', '1', 'Where', 'place', '2', 'Vpon', 'Heath', '3', 'There', 'meet', 'Macbeth', '1', 'I', 'come', 'Gray', 'Malkin', 'All', 'Padock', 'calls', 'anon', 'faire', 'foule', 'foule', 'faire', 'Houer']


<br/>
<br/>

## 3) 정규화: 같은 의미를 가진 다른 형태의 단어들을 통일시키는 작업
### &nbsp; &nbsp; - 어간 추출(Stemming): 단어의 형태가 변화할 때, 변하지 않는 부분(어간)을 추출
&nbsp; &nbsp; &nbsp; &nbsp; 따라서 어간 추출 결과는 사전에 등록되지 않은 단어일 확률이 높음
&nbsp; &nbsp; &nbsp; &nbsp; ex) 가다, 간다, 갔다 -> '가' &nbsp; &nbsp; &nbsp; 작다, 작고, 작으니 -> '작' 
<br/>
<br/>

### &nbsp; &nbsp; - 표제어 추출(Lemmatization): 단어를 기본형으로 변환하는 것
&nbsp; &nbsp; &nbsp; &nbsp; ex) cooking -> cook(동사)

In [1]:
# Stemming
# 1. Porter stemmer
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
# Stem three related words and print the results on a single line
print(*(stemmer.stem(word) for word in ('cooking', 'cookery', 'cookbooks')))

cook cookeri cookbook


In [4]:
from nltk.tokenize import word_tokenize

# Word-tokenize the first 500 characters of Macbeth and show the tokens
sample = doc_macbeth[:500]
tokens = word_tokenize(sample)
print("토큰화: ", tokens)

# Run every token through the current stemmer and show the stems
stem = list(map(stemmer.stem, tokens))
print("어간 추출: ", stem)

토큰화:  ['[', 'The', 'Tragedie', 'of', 'Macbeth', 'by', 'William', 'Shakespeare', '1603', ']', 'Actus', 'Primus', '.', 'Scoena', 'Prima', '.', 'Thunder', 'and', 'Lightning', '.', 'Enter', 'three', 'Witches', '.', '1', '.', 'When', 'shall', 'we', 'three', 'meet', 'againe', '?', 'In', 'Thunder', ',', 'Lightning', ',', 'or', 'in', 'Raine', '?', '2', '.', 'When', 'the', 'Hurley-burley', "'s", 'done', ',', 'When', 'the', 'Battaile', "'s", 'lost', ',', 'and', 'wonne', '3', '.', 'That', 'will', 'be', 'ere', 'the', 'set', 'of', 'Sunne', '1', '.', 'Where', 'the', 'place', '?', '2', '.', 'Vpon', 'the', 'Heath', '3', '.', 'There', 'to', 'meet', 'with', 'Macbeth', '1', '.', 'I', 'come', ',', 'Gray-Malkin', 'All', '.', 'Padock', 'calls', 'anon', ':', 'faire', 'is', 'foule', ',', 'and', 'foule', 'is', 'faire', ',', 'Houer', 'through']
어간 추출:  ['[', 'the', 'tragedi', 'of', 'macbeth', 'by', 'william', 'shakespear', '1603', ']', 'actu', 'primu', '.', 'scoena', 'prima', '.', 'thunder', 'and', 'lightn', '.

In [5]:
# 2. Lancaster stemmer
from nltk.stem import LancasterStemmer

stemmer = LancasterStemmer()
# Same three words as in the Porter example, for comparison
print(*map(stemmer.stem, ('cooking', 'cookery', 'cookbooks')))

cook cookery cookbook


In [None]:
# Lemmatization
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
# No part-of-speech hint: the lemmatizer falls back to its default (noun)
print(lemmatizer.lemmatize('cooking'))
# pos='v' is an argument of lemmatize(), not of print(); passing it to print()
# raised TypeError. With the verb hint the word is lemmatized as a verb.
print(lemmatizer.lemmatize('cooking', pos='v'))