<a href="https://colab.research.google.com/github/chu-ise/378A-2022/blob/main/notebooks/04/01_tokenization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
# Install the notebook's dependencies; %%capture keeps pip's output out of the notebook.
# ekorpkit is pinned to an exact version and installed with its "tokenize" extra.
# NOTE(review): transformers is not version-pinned, so a fresh run may resolve a
# different release than the one these outputs were recorded with — consider pinning.
%pip install ekorpkit[tokenize]==0.1.12.post0.dev14
%pip install transformers

# Tokenization

In [2]:
import nltk

# NLTK data packages fetched for this notebook, downloaded in this order.
NLTK_PACKAGES = ('punkt', 'wordnet', 'omw-1.4', 'stopwords')
download_status = [nltk.download(package) for package in NLTK_PACKAGES]

# Leave the return value of the last download as the cell's result.
download_status[-1]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Word Tokenization: word_tokenize vs. WordPunctTokenizer

In [3]:
from nltk.tokenize import WordPunctTokenizer, word_tokenize

# Sample sentence containing a contraction ("Don't"), an abbreviation ("Mr.")
# and a possessive ("Jone's"), used to compare the two word tokenizers.
text = (
    "Don't be fooled by the dark sounding name, "
    "Mr. Jone's Orphanage is as cheery as cheery goes for a pastry shop."
)


In [4]:
# For this sentence word_tokenize keeps "Mr." whole and splits the clitics
# "n't" and "'s" off their host words.
nltk_tokens = word_tokenize(text)
print(nltk_tokens)

['Do', "n't", 'be', 'fooled', 'by', 'the', 'dark', 'sounding', 'name', ',', 'Mr.', 'Jone', "'s", 'Orphanage', 'is', 'as', 'cheery', 'as', 'cheery', 'goes', 'for', 'a', 'pastry', 'shop', '.']


In [5]:
# For this sentence WordPunctTokenizer emits apostrophes and the period after
# "Mr" as separate tokens.
punct_tokenizer = WordPunctTokenizer()
print(punct_tokenizer.tokenize(text))

['Don', "'", 't', 'be', 'fooled', 'by', 'the', 'dark', 'sounding', 'name', ',', 'Mr', '.', 'Jone', "'", 's', 'Orphanage', 'is', 'as', 'cheery', 'as', 'cheery', 'goes', 'for', 'a', 'pastry', 'shop', '.']


## Penn Treebank Tokenization

In [6]:
from nltk.tokenize import TreebankWordTokenizer

# Sample text with a hyphenated word, a contraction, and a sentence-internal period.
text = (
    "Starting a home-based restaurant may be an ideal. "
    "it doesn't have a food chain or restaurant of their own."
)
tokenizer = TreebankWordTokenizer()


In [7]:
# In the recorded output "home-based" stays one token, "doesn't" becomes
# "does" + "n't", and the mid-text "ideal." keeps its period attached.
treebank_tokens = tokenizer.tokenize(text)
print(treebank_tokens)

['Starting', 'a', 'home-based', 'restaurant', 'may', 'be', 'an', 'ideal.', 'it', 'does', "n't", 'have', 'a', 'food', 'chain', 'or', 'restaurant', 'of', 'their', 'own', '.']


## 문장 토큰화(Sentence Tokenization)

In [8]:
# English scientific passage used for sentence segmentation; it contains decimals
# ("2.5 mm", "0.2.") and the abbreviation "1 h." that make sentence boundaries ambiguous.
text = "For strains harboring the pYV plasmid and Yop-encoding plasmids, bacteria were grown with aeration at 26 °C overnight in broth supplemented with 2.5 mm CaCl2 and 100 μg/ml ampicillin and then subcultured and grown at 26 °C until A600 of 0.2. At this point, the cultures were shifted to 37 °C and aerated for 1 h. A multiplicity of infection of 50:1 was used for YPIII(p-) incubations, and a multiplicity of infection of 25:1 was used for other derivatives. For the pYopE-expressing plasmid, 0.1 mm isopropyl-β-d-thiogalactopyranoside was supplemented during infection to induce YopE expression."

In [9]:
from ekorpkit.preprocessors.segmenter import NLTKSegmenter

# Segment the passage with the NLTK-backed segmenter. In the recorded output it
# yields three sentences and does not split after "1 h.".
seg = NLTKSegmenter()
nltk_sentences = seg.segment(text)
print(nltk_sentences)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
['For strains harboring the pYV plasmid and Yop-encoding plasmids, bacteria were grown with aeration at 26 °C overnight in broth supplemented with 2.5 mm CaCl2 and 100 μg/ml ampicillin and then subcultured and grown at 26 °C until A600 of 0.2.', 'At this point, the cultures were shifted to 37 °C and aerated for 1 h. A multiplicity of infection of 50:1 was used for YPIII(p-) incubations, and a multiplicity of infection of 25:1 was used for other derivatives.', 'For the pYopE-expressing plasmid, 0.1 mm isopropyl-β-d-thiogalactopyranoside was supplemented during infection to induce YopE expression.']


In [10]:
from ekorpkit.preprocessors.segmenter import PySBDSegmenter

# Segment the same passage with the PySBD-backed segmenter. In the recorded output
# it yields four sentences (splitting after "1 h.") and keeps trailing spaces.
seg = PySBDSegmenter()
pysbd_sentences = seg.segment(text)
print(pysbd_sentences)

['For strains harboring the pYV plasmid and Yop-encoding plasmids, bacteria were grown with aeration at 26 °C overnight in broth supplemented with 2.5 mm CaCl2 and 100 μg/ml ampicillin and then subcultured and grown at 26 °C until A600 of 0.2. ', 'At this point, the cultures were shifted to 37 °C and aerated for 1 h. ', 'A multiplicity of infection of 50:1 was used for YPIII(p-) incubations, and a multiplicity of infection of 25:1 was used for other derivatives. ', 'For the pYopE-expressing plasmid, 0.1 mm isopropyl-β-d-thiogalactopyranoside was supplemented during infection to induce YopE expression.']


In [11]:
# Korean passage used for sentence segmentation; it contains decimals and
# thousands separators (e.g. "7.3m", "1,000km") inside the sentences.
text = "일본기상청과 태평양지진해일경보센터는 3월 11일 오후 2시 49분경에 일본 동해안을 비롯하여 대만, 알래스카, 하와이, 괌, 캘리포니아, 칠레 등 태평양 연안 50여 국가에 지진해일 주의보와 경보를 발령하였다. 다행히도 우리나라는 지진발생위치로부터 1,000km 이상 떨어진데다 일본 열도가 가로막아 지진해일이 도달하지 않았다. 지진해일은 일본 소마항에 7.3m, 카마이시항에 4.1m, 미야코항에 4m 등 일본 동해안 전역에서 관측되었다. 지진해일이 원해로 전파되면서 대만(19시 40분)에서 소규모 지진해일과 하와이 섬에서 1.4m(23시 9분)의 지진해일이 관측되었다. 다음날인 3월 12일 새벽 1시 57분경에는 진앙지로부터 약 7,500km 떨어진 캘리포니아 크레센트시티에서 2.2m의 지진해일이 관측되었다."

In [12]:
from ekorpkit.preprocessors.segmenter import KSSSegmenter

# Segment the Korean passage with the KSS-backed segmenter. The recorded output
# has five sentences, with the in-sentence decimals left intact.
seg = KSSSegmenter()
kss_sentences = seg.segment(text)
print(kss_sentences)

['일본기상청과 태평양지진해일경보센터는 3월 11일 오후 2시 49분경에 일본 동해안을 비롯하여 대만, 알래스카, 하와이, 괌, 캘리포니아, 칠레 등 태평양 연안 50여 국가에 지진해일 주의보와 경보를 발령하였다.', '다행히도 우리나라는 지진발생위치로부터 1,000km 이상 떨어진데다 일본 열도가 가로막아 지진해일이 도달하지 않았다.', '지진해일은 일본 소마항에 7.3m, 카마이시항에 4.1m, 미야코항에 4m 등 일본 동해안 전역에서 관측되었다.', '지진해일이 원해로 전파되면서 대만(19시 40분)에서 소규모 지진해일과 하와이 섬에서 1.4m(23시 9분)의 지진해일이 관측되었다.', '다음날인 3월 12일 새벽 1시 57분경에는 진앙지로부터 약 7,500km 떨어진 캘리포니아 크레센트시티에서 2.2m의 지진해일이 관측되었다.']


## 표제어 추출(Lemmatization)

In [13]:
from nltk.stem import WordNetLemmatizer

# Words to lemmatize: a mix of nouns and inflected verb forms.
words = [
    'policy', 'doing', 'organization', 'have', 'going', 'love',
    'lives', 'fly', 'dies', 'watched', 'has', 'starting',
]
lemmatizer = WordNetLemmatizer()

In [14]:
# Lemmatize every word without passing a part of speech. In the recorded output
# the verb forms 'dies' and 'has' come back as 'dy' and 'ha' — presumably because
# no POS hint is given (TODO confirm against the NLTK default).
lemmas = list(map(lemmatizer.lemmatize, words))
print(lemmas)

['policy', 'doing', 'organization', 'have', 'going', 'love', 'life', 'fly', 'dy', 'watched', 'ha', 'starting']


In [15]:
# Passing the verb tag 'v' yields 'die' in the recorded output.
lemmatizer.lemmatize('dies', pos='v')

'die'

In [16]:
# Passing the verb tag 'v' yields 'start' in the recorded output.
lemmatizer.lemmatize('starting', pos='v')

'start'

## 어간 추출(Stemming)

In [17]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Sample sentence for the stemming examples.
text = (
    "Gold is often seen as an alternative currency in times of global "
    "economic uncertainty and a refuge from financial risk."
)
stemmer = PorterStemmer()

In [18]:
# Tokenize the sentence, then apply the Porter stemmer to each token.
tokenized_text = word_tokenize(text)
porter_stems = list(map(stemmer.stem, tokenized_text))
print(porter_stems)

['gold', 'is', 'often', 'seen', 'as', 'an', 'altern', 'currenc', 'in', 'time', 'of', 'global', 'econom', 'uncertainti', 'and', 'a', 'refug', 'from', 'financi', 'risk', '.']


In [19]:
# NOTE(review): 'electricical' looks like a typo but is kept verbatim as example
# input; the recorded output stems it to 'electric'.
words = ['formalize', 'allowance', 'electricical']

stemmed_words = list(map(stemmer.stem, words))
print(stemmed_words)

['formal', 'allow', 'electric']


In [20]:
from nltk.stem import LancasterStemmer, PorterStemmer

# The same sentence is tokenized once and fed to both stemmers for comparison.
text = (
    "Gold is often seen as an alternative currency in times of global "
    "economic uncertainty and a refuge from financial risk."
)
tokenized_text = word_tokenize(text)

porter_stemmer = PorterStemmer()
lancaster_stemmer = LancasterStemmer()


In [21]:
# Print the Porter stems first, then the Lancaster stems, one list per line.
for stemmer_under_test in (porter_stemmer, lancaster_stemmer):
    print([stemmer_under_test.stem(token) for token in tokenized_text])

['gold', 'is', 'often', 'seen', 'as', 'an', 'altern', 'currenc', 'in', 'time', 'of', 'global', 'econom', 'uncertainti', 'and', 'a', 'refug', 'from', 'financi', 'risk', '.']
['gold', 'is', 'oft', 'seen', 'as', 'an', 'altern', 'cur', 'in', 'tim', 'of', 'glob', 'econom', 'uncertainty', 'and', 'a', 'refug', 'from', 'fin', 'risk', '.']


## 불용어(Stopword)

In [22]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 

In [23]:
# Load NLTK's English stopword list and show its size plus the first ten entries.
stop_words_list = stopwords.words('english')
print(len(stop_words_list), stop_words_list[:10], sep='\n')

179
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]


In [24]:
# Tokenize the sentence, then drop every token that is an English stopword.
word_tokens = word_tokenize(text)

# A set gives constant-time membership checks instead of scanning the list per token.
stop_word_set = set(stop_words_list)

# Compare lower-cased tokens: NLTK's English stopword entries are lower-case, so a
# capitalised stopword (e.g. a sentence-initial "The") would otherwise slip through.
# The token's original spelling is what is kept in the result.
result = [word for word in word_tokens if word.lower() not in stop_word_set]

print(word_tokens)
print(result)


['Gold', 'is', 'often', 'seen', 'as', 'an', 'alternative', 'currency', 'in', 'times', 'of', 'global', 'economic', 'uncertainty', 'and', 'a', 'refuge', 'from', 'financial', 'risk', '.']
['Gold', 'often', 'seen', 'alternative', 'currency', 'times', 'global', 'economic', 'uncertainty', 'refuge', 'financial', 'risk', '.']


## POS Tagging and Subword Tokenization

In [25]:
# Single Korean sentence (the last sentence of the earlier Korean passage) used as
# input for the tokenizer examples that follow.
text = "다음날인 3월 12일 새벽 1시 57분경에는 진앙지로부터 약 7,500km 떨어진 캘리포니아 크레센트시티에서 2.2m의 지진해일이 관측되었다."

In [26]:
from ekorpkit.preprocessors.tokenizer import PynoriTokenizer

# Tokenize with the Pynori-backed tokenizer. The recorded output is a list of
# 'surface/TAG' strings, with whitespace emitted as ' /SP'.
pt = PynoriTokenizer()
pynori_tokens = pt.tokenize(text)
print(pynori_tokens)

[ekorpkit]: Initializing Pynori...


['다음/NNG', '날/NNG', '인/VCP+ETM', ' /SP', '3/SN', '월/NNBC', ' /SP', '12/SN', '일/NNBC', ' /SP', '새벽/NNG', ' /SP', '1/SN', '시/NNBC', ' /SP', '57/SN', '분경/NNG', '에/JKB', '는/JX', ' /SP', '진/VX+ETM', '앙지/NNG', '로부터/JKB', ' /SP', '약/MM', ' /SP', '7/SN', ',/SC', '500/SN', 'km/SL', ' /SP', '떨어진/VV+ETM', ' /SP', '캘리포니아/NNP', ' /SP', '크레/NNP', '센트/NNG', '시티/NNG', '에서/JKB', ' /SP', '2/SN', './SY', '2/SN', 'm/SL', '의/JKG', ' /SP', '지진/NNG', '해/XSV+EC', '일/NNG', '이/JKS', ' /SP', '관측/NNG', '되/XSV', '었/EP', '다/EF', './SF']


In [27]:
from ekorpkit.preprocessors.tokenizer import MecabTokenizer

# Tokenize with the mecab-backed tokenizer using its default options. The recorded
# output is again a list of 'surface/TAG' strings.
mt = MecabTokenizer()
mecab_tokens = mt.tokenize(text)
print(mecab_tokens)

[ekorpkit]: Initializing mecab...)


['다음/NNG', '날/NNG', '인/VCP+ETM', ' /SP', '3/SN', '월/NNBC', ' /SP', '12/SN', '일/NNBC', ' /SP', '새벽/NNG', ' /SP', '1/SN', '시/NNBC', ' /SP', '57/SN', '분경/NNG', '에/JKB', '는/JX', ' /SP', '진/VX+ETM', '앙지/NNG', '로부터/JKB', ' /SP', '약/MM', ' /SP', '7/SN', ',/SC', '500/SN', 'km/SL', ' /SP', '떨어진/VV+ETM', ' /SP', '캘리포니아/NNP', ' /SP', '크레/NNP', '센트/NNG', '시티/NNG', '에서/JKB', ' /SP', '2/SN', './SY', '2/SN', 'm/SL', '의/JKG', ' /SP', '지진/NNG', '해/XSV+EC', '일/NNG', '이/JKS', ' /SP', '관측/NNG', '되/XSV', '었/EP', '다/EF', './SF']


In [28]:
from ekorpkit.preprocessors.tokenizer import MecabTokenizer

# With tokenize_each_word=True and join_pos=False the recorded output carries no
# POS tags and marks word-internal pieces with a '##' prefix.
mt = MecabTokenizer(join_pos=False, tokenize_each_word=True)
mecab_pieces = mt.tokenize(text)
print(mecab_pieces)

[ekorpkit]: Initializing mecab...)


['다음', '##날인', '3', '##월', '12', '##일', '새벽', '1', '##시', '57', '##분경', '##에', '##는', '진', '##앙지', '##로부터', '약', '7', '##,', '##500', '##km', '떨어진', '캘리포니아', '크레', '##센트', '##시티', '##에서', '2', '.', '2', '##m', '##의', '지진', '##해', '##일', '##이', '관측', '##되', '##었', '##다', '.']


In [29]:
from ekorpkit.preprocessors.tokenizer import BWPTokenizer

# Tokenize with the BertWordPiece-backed tokenizer. The recorded output is a list
# of subword pieces with '##' marking continuations, and the recorded run shows
# files being downloaded during this cell.
bt = BWPTokenizer()
bwp_pieces = bt.tokenize(text)
print(bwp_pieces)

[ekorpkit]: Initializing BertWordPieceTokenizer...


Downloading:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/223k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/563 [00:00<?, ?B/s]

['다음', '##날', '##인', '3', '##월', '12', '##일', '새벽', '1', '##시', '57', '##분', '##경', '##에', '##는', '진', '##앙', '##지', '##로', '##부터', '약', '7', ',', '500', '##k', '##m', '떨어진', '캘리포니아', '크레', '##센트', '##시티', '##에', '##서', '2', '.', '2', '##m', '##의', '지진', '##해', '##일', '##이', '관측', '##되', '##었', '##다', '.']
