In [1]:
import nltk
from nltk.tokenize import word_tokenize, WordPunctTokenizer, TreebankWordTokenizer

# 텍스트 전처리

## NLTK word tokenizer

In [2]:
# Fetch the 'punkt' data package needed by word_tokenize(); a no-op when it is already up to date.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/choigww/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Sample passage containing several sentences, line breaks and a price.
text = 'Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks.'
# word_tokenize() separates '$' and the sentence-final periods into their own tokens
# while keeping the decimal number '3.88' intact.
word_tokens = word_tokenize(text)
print(word_tokens)

['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']


## 신문기사 웹페이지 html parsing

In [4]:
import requests
from bs4 import BeautifulSoup

In [5]:
# Article used as the English sample for the HTML-parsing and tokenization cells.
url = 'https://www.forbes.com/sites/adrianbridgwater/2019/04/15/what-drove-the-ai-renaissance/'
# Bound the wait so a stalled connection cannot hang the kernel indefinitely.
response = requests.get(url, timeout=10)
# Fail loudly on 4xx/5xx instead of parsing an error page as if it were the article.
response.raise_for_status()

In [6]:
# Peek at the first 1000 characters of the raw HTML to confirm what was fetched.
response.text[:1000]

'<!DOCTYPE html><html lang="en"><head><title>What Drove The AI Renaissance?</title><meta charset="utf-8"><meta http-equiv="Content-Language" content="en_US"><link rel="shortcut icon" href="https://i.forbesimg.com/48X48-F.png"><meta name="referrer" content="no-referrer-when-downgrade"><link rel="canonical" itemprop="url" href="https://www.forbes.com/sites/adrianbridgwater/2019/04/15/what-drove-the-ai-renaissance/"><link rel="amphtml" href="https://www.forbes.com/sites/adrianbridgwater/2019/04/15/what-drove-the-ai-renaissance/amp/"><link rel="alternate" type="application/rss+xml" title="What Drove The AI Renaissance? - RSS" href="https://www.forbes.com/sites/adrianbridgwater/feed/"><meta name="viewport" content="width=device-width,initial-scale=1,maximum-scale=5,minimum-scale=1,user-scalable=yes"><meta name="description" itemprop="description" content="The current renaissance of Artificial Intelligence (AI) with its sister discipline Machine Learning (ML) has led every IT firm worth its 

In [7]:
# Parse the downloaded HTML using the 'html.parser' backend.
soup = BeautifulSoup(response.text, 'html.parser')

In [8]:
# CSS selector: collect every <p> element found in the parsed page.
eng_news = soup.select('p')

## 신문기사 웹페이지 p태그 텍스트 토크나이징

In [9]:
# Text of the fourth <p> element (index 3), used as the sample paragraph below.
eng_news[3].text

"And yes, she does mean everybody's job from yours to mine and onward to the role of grain farmers in Egypt, pastry chefs in Paris and dog walkers in Oregon i.e. every job. We will now be able to help direct all workers’ actions and behavior with a new degree of intelligence that comes from predictive analytics, all stemming from the AI engines we will now increasingly depend upon."

In [10]:
# Tokenize the sample paragraph. In the output, the curly apostrophe in "workers’"
# becomes its own token and "i.e." is split into 'i.e' and '.'.
print(word_tokenize(eng_news[3].text))

['And', 'yes', ',', 'she', 'does', 'mean', 'everybody', "'s", 'job', 'from', 'yours', 'to', 'mine', 'and', 'onward', 'to', 'the', 'role', 'of', 'grain', 'farmers', 'in', 'Egypt', ',', 'pastry', 'chefs', 'in', 'Paris', 'and', 'dog', 'walkers', 'in', 'Oregon', 'i.e', '.', 'every', 'job', '.', 'We', 'will', 'now', 'be', 'able', 'to', 'help', 'direct', 'all', 'workers', '’', 'actions', 'and', 'behavior', 'with', 'a', 'new', 'degree', 'of', 'intelligence', 'that', 'comes', 'from', 'predictive', 'analytics', ',', 'all', 'stemming', 'from', 'the', 'AI', 'engines', 'we', 'will', 'now', 'increasingly', 'depend', 'upon', '.']


## WordPunctTokenizer
- Tokenize a text into a sequence of alphabetic and
non-alphabetic characters, using the regexp ``\w+|[^\w\s]+``.

In [11]:
# Same sample passage as in the word_tokenize example.
text = 'Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks.'
# The regexp-based tokenizer splits on every punctuation run, so '3.88' becomes '3', '.', '88'.
word_punct_tokens = WordPunctTokenizer().tokenize(text)
print(word_punct_tokens)

['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']


## TreebankWordTokenizer
- The Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank.
This is the method that is invoked by ``word_tokenize()``.  It assumes that the
text has already been segmented into sentences, e.g. using ``sent_tokenize()``.

In [12]:
# Same sample passage, deliberately NOT split into sentences first.
text = 'Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks.'
# Given unsegmented text, only the final period is split off; the earlier
# sentence-ending periods stay attached to their words ('York.', 'them.').
word_treebank_tokens = TreebankWordTokenizer().tokenize(text)
print(word_treebank_tokens)

['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks', '.']


## POS Tagging

In [13]:
from nltk import pos_tag
# Model data for the averaged-perceptron tagger used by pos_tag().
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/choigww/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [14]:
# Tag the tokens in word_tokens; the result is a list of (token, tag) pairs.
taggedToken = pos_tag(word_tokens)
print(taggedToken)

[('Good', 'JJ'), ('muffins', 'NNS'), ('cost', 'VBP'), ('$', '$'), ('3.88', 'CD'), ('in', 'IN'), ('New', 'NNP'), ('York', 'NNP'), ('.', '.'), ('Please', 'NNP'), ('buy', 'VB'), ('me', 'PRP'), ('two', 'CD'), ('of', 'IN'), ('them', 'PRP'), ('.', '.'), ('Thanks', 'NNS'), ('.', '.')]


## NER

In [15]:
# Data packages required by ne_chunk(): the 'words' corpus and the MaxEnt NE chunker model.
nltk.download('words')
nltk.download('maxent_ne_chunker')

[nltk_data] Downloading package words to /Users/choigww/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/choigww/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.


True

In [16]:
from nltk import ne_chunk
# Group the POS-tagged tokens into a tree whose subtrees mark named entities (e.g. GPE).
# Note in the output that sentence-initial 'Good' is also labelled GPE, a false positive.
neToken = ne_chunk(taggedToken)
print(neToken)

(S
  (GPE Good/JJ)
  muffins/NNS
  cost/VBP
  $/$
  3.88/CD
  in/IN
  (GPE New/NNP York/NNP)
  ./.
  Please/NNP
  buy/VB
  me/PRP
  two/CD
  of/IN
  them/PRP
  ./.
  Thanks/NNS
  ./.)


## 원형복원

nltk document > stem 검색 (영문 stemmer 굉장히 많음)

In [19]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()
# Stemming strips the suffix: 'running' -> 'run'.
ps.stem(word='running')

'run'

In [20]:
# A stem need not be a dictionary word: 'beautiful' -> 'beauti'.
ps.stem(word='beautiful')

'beauti'

In [21]:
# Another truncated, non-dictionary stem: 'believes' -> 'believ'.
ps.stem(word='believes')

'believ'

In [22]:
# WordNet corpus that WordNetLemmatizer looks words up in.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/choigww/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [23]:
from nltk.stem import WordNetLemmatizer
wl = WordNetLemmatizer()
# lemmatize() treats the word as a noun unless `pos` is given, so the verb form
# 'running' comes back unchanged; pass pos='v' to obtain 'run'.
wl.lemmatize('running')

'running'

In [24]:
# Unlike the Porter stemmer, the lemmatizer returns 'beautiful' unchanged.
wl.lemmatize('beautiful')

'beautiful'

In [27]:
# Pitfall of the default noun POS: 'has' is handled like a plural noun and becomes 'ha'.
# Use wl.lemmatize('has', pos='v') to get 'have'.
wl.lemmatize('has')

'ha'

In [29]:
# Default noun POS again: 'believes' is mapped to the noun 'belief'.
# Use wl.lemmatize('believes', pos='v') to get the verb lemma 'believe'.
wl.lemmatize('believes')

'belief'

In [33]:
# POS tags to drop: prepositions (IN), conjunctions (CC), interjections (UH),
# "to" (TO), modals (MD) and determiners (DT).
stopPos = ['IN', 'CC', 'UH', 'TO', 'MD', 'DT']
# Keep, in original order, every token whose tag is not in the stop list.
word = [token for token, pos in taggedToken if pos not in stopPos]

print(word)

['Good', 'muffins', 'cost', '$', '3.88', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'them', '.', 'Thanks', '.']


## Tagger
- Komoran
- Hannanum
- Okt
- Kkma

In [34]:
from konlpy.tag import Komoran
komoran = Komoran()
# Korean sample sentence reused by the other KoNLPy cells in this notebook.
kor_text = "인간이 컴퓨터와 대화하고 있다는 것을 깨닫지 못하고 인간과 대화를 계속할 수 있다면 컴퓨터는 지능적인 것으로 간주될 수 있습니다."

# morphs() returns the bare morpheme strings, without POS tags.
komoran_tokens = komoran.morphs(kor_text)
print(komoran_tokens)

['인간', '이', '컴퓨터', '와', '대화', '하', '고', '있', '다는', '것', '을', '깨닫', '지', '못하', '고', '인간', '과', '대화', '를', '계속', '하', 'ㄹ', '수', '있', '다면', '컴퓨터', '는', '지능', '적', '이', 'ㄴ', '것', '으로', '간주', '되', 'ㄹ', '수', '있', '습니다', '.']


In [35]:
from konlpy.tag import Hannanum
hannanum = Hannanum()
# Hannanum morphemes for kor_text; in the output '하고' and '지능적' stay unsplit.
hannanum_tokens = hannanum.morphs(kor_text)
print(hannanum_tokens)

['인간', '이', '컴퓨터', '와', '대화', '하고', '있', '다는', '것', '을', '깨닫', '지', '못하', '고', '인간', '과', '대화', '를', '계속', '하', 'ㄹ', '수', '있', '다면', '컴퓨터', '는', '지능적', '이', 'ㄴ', '것', '으로', '간주', '되', 'ㄹ', '수', '있', '습니다', '.']


In [37]:
from konlpy.tag import Okt
okt = Okt()
# Okt tokens for kor_text; the output keeps inflected forms such as '있다는' and '있습니다' whole.
okt_tokens = okt.morphs(kor_text)
print(okt_tokens)

['인간', '이', '컴퓨터', '와', '대화', '하고', '있다는', '것', '을', '깨닫지', '못', '하고', '인간', '과', '대화', '를', '계속', '할', '수', '있다면', '컴퓨터', '는', '지능', '적', '인', '것', '으로', '간주', '될', '수', '있습니다', '.']


In [40]:
from konlpy.tag import Kkma
kkma = Kkma()
# Kkma morphemes for kor_text; endings are split off as separate morphemes (e.g. '하', 'ㄹ').
kkma_tokens = kkma.morphs(kor_text)
print(kkma_tokens)

['인간', '이', '컴퓨터', '와', '대화', '하', '고', '있', '다는', '것', '을', '깨닫', '지', '못하', '고', '인간', '과', '대화', '를', '계속', '하', 'ㄹ', '수', '있', '다면', '컴퓨터', '는', '지능', '적', '이', 'ㄴ', '것', '으로', '간주', '되', 'ㄹ', '수', '있', '습니다', '.']


# 한글 POS tagging

In [46]:
# Komoran (morpheme, tag) pairs for the sample sentence, e.g. ('인간', 'NNG').
print(komoran.pos(kor_text))

[('인간', 'NNG'), ('이', 'JKS'), ('컴퓨터', 'NNG'), ('와', 'JC'), ('대화', 'NNG'), ('하', 'XSV'), ('고', 'EC'), ('있', 'VV'), ('다는', 'ETM'), ('것', 'NNB'), ('을', 'JKO'), ('깨닫', 'VV'), ('지', 'EC'), ('못하', 'VX'), ('고', 'EC'), ('인간', 'NNG'), ('과', 'JC'), ('대화', 'NNG'), ('를', 'JKO'), ('계속', 'NNG'), ('하', 'XSV'), ('ㄹ', 'ETM'), ('수', 'NNB'), ('있', 'VV'), ('다면', 'EC'), ('컴퓨터', 'NNG'), ('는', 'JX'), ('지능', 'NNG'), ('적', 'XSN'), ('이', 'VCP'), ('ㄴ', 'ETM'), ('것', 'NNB'), ('으로', 'JKB'), ('간주', 'NNG'), ('되', 'XSV'), ('ㄹ', 'ETM'), ('수', 'NNB'), ('있', 'VX'), ('습니다', 'EF'), ('.', 'SF')]


In [43]:
# Hannanum (morpheme, tag) pairs; the tags in this output are single letters such as 'N', 'J', 'P', 'E'.
print(hannanum.pos(kor_text))

[('인간', 'N'), ('이', 'J'), ('컴퓨터', 'N'), ('와', 'J'), ('대화', 'N'), ('하고', 'J'), ('있', 'P'), ('다는', 'E'), ('것', 'N'), ('을', 'J'), ('깨닫', 'P'), ('지', 'E'), ('못하', 'P'), ('고', 'E'), ('인간', 'N'), ('과', 'J'), ('대화', 'N'), ('를', 'J'), ('계속', 'N'), ('하', 'X'), ('ㄹ', 'E'), ('수', 'N'), ('있', 'P'), ('다면', 'E'), ('컴퓨터', 'N'), ('는', 'J'), ('지능적', 'N'), ('이', 'J'), ('ㄴ', 'E'), ('것', 'N'), ('으로', 'J'), ('간주', 'N'), ('되', 'X'), ('ㄹ', 'E'), ('수', 'N'), ('있', 'P'), ('습니다', 'E'), ('.', 'S')]


In [44]:
# Okt (token, tag) pairs; the tags are English words such as 'Noun', 'Josa', 'Verb'.
print(okt.pos(kor_text))

[('인간', 'Noun'), ('이', 'Josa'), ('컴퓨터', 'Noun'), ('와', 'Josa'), ('대화', 'Noun'), ('하고', 'Josa'), ('있다는', 'Adjective'), ('것', 'Noun'), ('을', 'Josa'), ('깨닫지', 'Verb'), ('못', 'Noun'), ('하고', 'Josa'), ('인간', 'Noun'), ('과', 'Josa'), ('대화', 'Noun'), ('를', 'Josa'), ('계속', 'Noun'), ('할', 'Verb'), ('수', 'Noun'), ('있다면', 'Adjective'), ('컴퓨터', 'Noun'), ('는', 'Josa'), ('지능', 'Noun'), ('적', 'Suffix'), ('인', 'Josa'), ('것', 'Noun'), ('으로', 'Josa'), ('간주', 'Noun'), ('될', 'Verb'), ('수', 'Noun'), ('있습니다', 'Adjective'), ('.', 'Punctuation')]


In [45]:
# Kkma (morpheme, tag) pairs for the sample sentence, e.g. ('인간', 'NNG').
print(kkma.pos(kor_text))

[('인간', 'NNG'), ('이', 'JKS'), ('컴퓨터', 'NNG'), ('와', 'JKM'), ('대화', 'NNG'), ('하', 'XSV'), ('고', 'ECE'), ('있', 'VXV'), ('다는', 'ETD'), ('것', 'NNB'), ('을', 'JKO'), ('깨닫', 'VV'), ('지', 'ECD'), ('못하', 'VX'), ('고', 'ECE'), ('인간', 'NNG'), ('과', 'JKM'), ('대화', 'NNG'), ('를', 'JKO'), ('계속', 'NNG'), ('하', 'XSV'), ('ㄹ', 'ETD'), ('수', 'NNB'), ('있', 'VA'), ('다면', 'ECE'), ('컴퓨터', 'NNG'), ('는', 'JX'), ('지능', 'NNG'), ('적', 'XSN'), ('이', 'VCP'), ('ㄴ', 'ETD'), ('것', 'NNB'), ('으로', 'JKM'), ('간주', 'NNG'), ('되', 'XSV'), ('ㄹ', 'ETD'), ('수', 'NNB'), ('있', 'VV'), ('습니다', 'EFN'), ('.', 'SF')]


### 빈도수 확인

In [49]:
from collections import Counter

In [53]:
# Count identical (morpheme, tag) pairs and list them from most to least frequent.
print(Counter(kkma.pos(kor_text)).most_common())

[(('인간', 'NNG'), 2), (('컴퓨터', 'NNG'), 2), (('대화', 'NNG'), 2), (('하', 'XSV'), 2), (('고', 'ECE'), 2), (('것', 'NNB'), 2), (('ㄹ', 'ETD'), 2), (('수', 'NNB'), 2), (('이', 'JKS'), 1), (('와', 'JKM'), 1), (('있', 'VXV'), 1), (('다는', 'ETD'), 1), (('을', 'JKO'), 1), (('깨닫', 'VV'), 1), (('지', 'ECD'), 1), (('못하', 'VX'), 1), (('과', 'JKM'), 1), (('를', 'JKO'), 1), (('계속', 'NNG'), 1), (('있', 'VA'), 1), (('다면', 'ECE'), 1), (('는', 'JX'), 1), (('지능', 'NNG'), 1), (('적', 'XSN'), 1), (('이', 'VCP'), 1), (('ㄴ', 'ETD'), 1), (('으로', 'JKM'), 1), (('간주', 'NNG'), 1), (('되', 'XSV'), 1), (('있', 'VV'), 1), (('습니다', 'EFN'), 1), (('.', 'SF'), 1)]


In [54]:
# Surface forms to discard regardless of their tag.
# Fixed: the last entry used to be ' 했다' with a leading space, which can never
# equal a token and therefore never filtered anything.
stop_words = ['의', '이', '로', '두고', '들', '를', '은', '과', '수', '했다']
# Okt tag names whose tokens are discarded.
stop_pos = ['Josa', 'Suffix', 'Punctuation', 'Adjective']

In [55]:
# Keep, in order, each Okt token that is neither a stop word nor tagged with a stop POS.
word = [
    token
    for token, pos in okt.pos(kor_text)
    if token not in stop_words and pos not in stop_pos
]

In [57]:
# Show the current contents of the filtered token list `word`.
print(word)

['인간', '컴퓨터', '대화', '것', '깨닫지', '못', '인간', '대화', '계속', '할', '컴퓨터', '지능', '것', '간주', '될']


In [58]:
import nltk
from nltk import bigrams, word_tokenize
from nltk.util import ngrams

In [59]:
# On this Korean sentence word_tokenize() yields whitespace-delimited words plus the
# final period (see the output), not morphemes.
tokens = word_tokenize(kor_text)
# bigrams() returns a lazy generator, so printing it shows only the object repr.
bigram = bigrams(tokens)
print(bigram)

# Iterating consumes the generator; call bigrams(tokens) again to loop a second time.
for t in bigram:
    print(t)

<generator object bigrams at 0x1ac29cb950>
('인간이', '컴퓨터와')
('컴퓨터와', '대화하고')
('대화하고', '있다는')
('있다는', '것을')
('것을', '깨닫지')
('깨닫지', '못하고')
('못하고', '인간과')
('인간과', '대화를')
('대화를', '계속할')
('계속할', '수')
('수', '있다면')
('있다면', '컴퓨터는')
('컴퓨터는', '지능적인')
('지능적인', '것으로')
('것으로', '간주될')
('간주될', '수')
('수', '있습니다')
('있습니다', '.')


In [60]:
# ngrams(tokens, 3) yields every run of three consecutive tokens.
trigram = ngrams(tokens, 3)
for t in trigram:
    print(t)

('인간이', '컴퓨터와', '대화하고')
('컴퓨터와', '대화하고', '있다는')
('대화하고', '있다는', '것을')
('있다는', '것을', '깨닫지')
('것을', '깨닫지', '못하고')
('깨닫지', '못하고', '인간과')
('못하고', '인간과', '대화를')
('인간과', '대화를', '계속할')
('대화를', '계속할', '수')
('계속할', '수', '있다면')
('수', '있다면', '컴퓨터는')
('있다면', '컴퓨터는', '지능적인')
('컴퓨터는', '지능적인', '것으로')
('지능적인', '것으로', '간주될')
('것으로', '간주될', '수')
('간주될', '수', '있습니다')
('수', '있습니다', '.')
