In [1]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('webtext')

[nltk_data] Downloading package punkt to /Users/leecoder/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/leecoder/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/leecoder/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/leecoder/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package webtext to
[nltk_data]     /Users/leecoder/nltk_data...
[nltk_data]   Package webtext is already up-to-date!


True

In [2]:
para = "Hello everyone. It's good to see you. Let's start our text mining class!"

from nltk.tokenize import sent_tokenize
print(sent_tokenize(para))


['Hello everyone.', "It's good to see you.", "Let's start our text mining class!"]


In [3]:
para_french = """Je t'ai demand si tu m'aimais bien, Tu m'a r pondu non.
Je t'ai demand si j' tais jolie, Tu m'a r pondu non.
Je t'ai demand si j' tais dans ton coeur, Tu m'a r pondu non."""

import nltk.data
tokenizer = nltk.data.load('tokenizers/punkt/french.pickle')
print(tokenizer.tokenize(para_french))

["Je t'ai demand si tu m'aimais bien, Tu m'a r pondu non.", "Je t'ai demand si j' tais jolie, Tu m'a r pondu non.", "Je t'ai demand si j' tais dans ton coeur, Tu m'a r pondu non."]


In [4]:
para_kor = '안녕하세요, 반갑습니다. 저는 텍스트마이닝 강의를 진행하고 있습니다.'

print(sent_tokenize(para_kor))

['안녕하세요, 반갑습니다.', '저는 텍스트마이닝 강의를 진행하고 있습니다.']


In [5]:
# 단어 토큰화

In [6]:
from nltk.tokenize import word_tokenize
print(word_tokenize(para))

['Hello', 'everyone', '.', 'It', "'s", 'good', 'to', 'see', 'you', '.', 'Let', "'s", 'start', 'our', 'text', 'mining', 'class', '!']


In [7]:
from nltk.tokenize import WordPunctTokenizer
print(WordPunctTokenizer().tokenize(para))

['Hello', 'everyone', '.', 'It', "'", 's', 'good', 'to', 'see', 'you', '.', 'Let', "'", 's', 'start', 'our', 'text', 'mining', 'class', '!']


In [8]:
print(word_tokenize(para_kor))

['안녕하세요', ',', '반갑습니다', '.', '저는', '텍스트마이닝', '강의를', '진행하고', '있습니다', '.']


In [9]:
# 정규표현식을 활용한 토큰화

In [10]:
import re
re.findall("[abc]","How are you, boy?")

['a', 'b']

In [11]:
re.findall("[0123456789]","3a7b5c9d")

['3', '7', '5', '9']

In [12]:
re.findall("[\w]","3a 7b_ '.^&5c9d")

['3', 'a', '7', 'b', '_', '5', 'c', '9', 'd']

In [13]:
re.findall("[\W]","3a 7b_ '.^&5c9d")

[' ', ' ', "'", '.', '^', '&']

In [14]:
re.findall("[_]+","a_v, c__d, e___f")

['_', '__', '___']

In [15]:
re.findall("[\w]+","How are you, boy?")

['How', 'are', 'you', 'boy']

In [16]:
re.findall("[o]{2,4}","oh, hoow are yoooou, booooooy?")

['oo', 'oooo', 'oooo', 'oo']

In [17]:
from nltk.tokenize import RegexpTokenizer

#regular expression (정규식) 을 이용한 tokenizer
#단어 단위로 tokenize \w: 문자나 숫자를 의미, 즉 문자나 숫자 혹은 '가 반복되는 것을 찾아 냄
tokenizer = RegexpTokenizer("[\w']+")

#can't를 하나의 단어로 인식
print(tokenizer.tokenize("Sorry, I can't go there."))

['Sorry', 'I', "can't", 'go', 'there']


In [18]:
text1 = "Sorry, I can't go there."
tokenizer = RegexpTokenizer("[\w']{3,}")
print(tokenizer.tokenize(text1.lower()))

['sorry', "can't", 'there']


In [19]:
# 노이즈와 불용어 제거

In [20]:
from nltk.corpus import stopwords # 일반적으로 분석대상이 아닌 단어들
eng_stops = set(stopwords.words('english')) #반복되지 않도록 set으로 변환

text1 = "Sorry, I couldn't go to movie yesterday."

tokenizer = RegexpTokenizer("[\w]+")
tokens = tokenizer.tokenize(text1.lower()) #word_tokenize로 토큰화

# stopwords를 제외한 단어들로만 list를 생성
result = [word for word in tokens if word not in eng_stops]
# 리스트 명 = [표현식 for 변수 in 반복 가능한 대상]


print(result)

['sorry', 'go', 'movie', 'yesterday']


In [21]:
print(eng_stops)

{'this', "you'd", 'who', 'any', 'no', 'haven', 'not', 'hers', 'only', 'didn', "needn't", 'her', 'it', "didn't", 'other', 'whom', 'him', 'ourselves', 'themselves', 'through', 'to', 'will', 'll', 'm', 'herself', 'against', "you've", 'mustn', 'me', 'is', 'did', 'then', 'under', 'when', 'd', 'won', 'on', 've', 'than', 'yours', 'my', 'ours', 'having', 'so', 'a', "don't", 'out', 'here', 'shan', 'their', 'very', 'being', 'just', 'he', 'himself', 'from', 'before', 'your', 'there', 'was', 'are', 'our', "weren't", "you'll", 'am', "she's", "won't", 'had', 'own', 'myself', 'what', 'shouldn', 'do', 'between', 'theirs', 'isn', 'as', 'the', 'yourself', "hadn't", 'but', 'his', 'by', 'don', 'itself', "haven't", 'wouldn', "shouldn't", 'has', "should've", 'needn', 'hasn', 'while', 'hadn', 'weren', 's', 'which', "wouldn't", 'those', 're', 'couldn', 'and', "mustn't", 'at', 'after', 'each', 'ain', 'if', 'does', 'because', 'they', 'y', 'i', 'up', 'in', 'same', 'about', 'or', 'down', 'some', 'aren', "aren't",

In [22]:
# 자신만의 stopwords를 만들고 이용
# 한글처리에도 유용함
# 나만의 stopword를 리스트로 정의

my_stopword = ['i','go','to']
result = [word for word in tokens if word not in my_stopword]
print(tokens)
print(result)

['sorry', 'i', 'couldn', 't', 'go', 'to', 'movie', 'yesterday']
['sorry', 'couldn', 't', 'movie', 'yesterday']


In [23]:
# 2.3 정규화

In [24]:
#2.3.1 어간추출

In [25]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
print(stemmer.stem('cooking'),stemmer.stem('cookery'),stemmer.stem('cookbooks'))

cook cookeri cookbook


In [26]:
from nltk.tokenize import word_tokenize
para = "Hello everyone. It's good to see you. Let's start our text mining class!"
tokens = word_tokenize(para)
print(tokens)
result = [stemmer.stem(token) for token in tokens] #모든 토큰에 대해 스테밍 실행
print(result)

['Hello', 'everyone', '.', 'It', "'s", 'good', 'to', 'see', 'you', '.', 'Let', "'s", 'start', 'our', 'text', 'mining', 'class', '!']
['hello', 'everyon', '.', 'it', "'s", 'good', 'to', 'see', 'you', '.', 'let', "'s", 'start', 'our', 'text', 'mine', 'class', '!']


In [27]:
from nltk.stem import LancasterStemmer
stemmer = LancasterStemmer()
print(stemmer.stem('cooking'),stemmer.stem('cookery'),stemmer.stem('cookbooks'))

cook cookery cookbook


In [28]:
# 2.3.2 표제어 추출

In [29]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize('cooking'))
print(lemmatizer.lemmatize('cooking',pos='v')) #품사를 지정
print(lemmatizer.lemmatize('cookery'))
print(lemmatizer.lemmatize('cookbooks'))

cooking
cook
cookery
cookbook


In [30]:
#lemmatizing와 stemming 비교
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
print('stemming result:', stemmer.stem('believes'))
print('stemming result:', lemmatizer.lemmatize('believes'))
print('stemming result:', lemmatizer.lemmatize('believes',pos='v'))

stemming result: believ
stemming result: belief
stemming result: believe


In [31]:
# 2.4 품사태깅

In [32]:
#2.4.2 품사태깅
import nltk
from nltk.tokenize import word_tokenize

tokens = word_tokenize("Hello everyone. It's good to see you. Let's start our text mining class!")
print(nltk.pos_tag(tokens))

[('Hello', 'NNP'), ('everyone', 'NN'), ('.', '.'), ('It', 'PRP'), ("'s", 'VBZ'), ('good', 'JJ'), ('to', 'TO'), ('see', 'VB'), ('you', 'PRP'), ('.', '.'), ('Let', 'VB'), ("'s", 'POS'), ('start', 'VB'), ('our', 'PRP$'), ('text', 'NN'), ('mining', 'NN'), ('class', 'NN'), ('!', '.')]


In [33]:
nltk.help.upenn_tagset("VB")

VB: verb, base form
    ask assemble assess assign assume atone attention avoid bake balkanize
    bank begin behold believe bend benefit bevel beware bless boil bomb
    boost brace break bring broil brush build ...


In [34]:
my_tag_set = ['NN','VB','JJ']
my_words = [word for word, tag in nltk.pos_tag(tokens) if tag in my_tag_set]
print(my_words)

['everyone', 'good', 'see', 'Let', 'start', 'text', 'mining', 'class']


In [35]:
words_with_tag = ['/'.join(item) for item in nltk.pos_tag(tokens)]
print(words_with_tag)

['Hello/NNP', 'everyone/NN', './.', 'It/PRP', "'s/VBZ", 'good/JJ', 'to/TO', 'see/VB', 'you/PRP', './.', 'Let/VB', "'s/POS", 'start/VB', 'our/PRP$', 'text/NN', 'mining/NN', 'class/NN', '!/.']


In [36]:
sentence = '''절망의 반대가 희망은 아니다.
어두운 밤하늘에 별이 빛나듯
희망은 절망 속에 싹트는 거지
만약에 우리가 희망함이 적다면
그 누가 세상을 비추어줄까.
정희성, 희망 공부'''

tokens = word_tokenize(sentence)
print(tokens)
print(nltk.pos_tag(tokens))

['절망의', '반대가', '희망은', '아니다', '.', '어두운', '밤하늘에', '별이', '빛나듯', '희망은', '절망', '속에', '싹트는', '거지', '만약에', '우리가', '희망함이', '적다면', '그', '누가', '세상을', '비추어줄까', '.', '정희성', ',', '희망', '공부']
[('절망의', 'JJ'), ('반대가', 'NNP'), ('희망은', 'NNP'), ('아니다', 'NNP'), ('.', '.'), ('어두운', 'VB'), ('밤하늘에', 'JJ'), ('별이', 'NNP'), ('빛나듯', 'NNP'), ('희망은', 'NNP'), ('절망', 'NNP'), ('속에', 'NNP'), ('싹트는', 'NNP'), ('거지', 'NNP'), ('만약에', 'NNP'), ('우리가', 'NNP'), ('희망함이', 'NNP'), ('적다면', 'NNP'), ('그', 'NNP'), ('누가', 'NNP'), ('세상을', 'NNP'), ('비추어줄까', 'NNP'), ('.', '.'), ('정희성', 'NN'), (',', ','), ('희망', 'NNP'), ('공부', 'NNP')]


In [46]:
from konlpy.tag import Okt
t = Okt()

In [49]:
print('형태소:',t.morphs(sentence))
print()
print('명사:',t.nouns(sentence))
print()
print('품사 태깅 결과:',t.pos(sentence))

형태소: ['절망', '의', '반대', '가', '희망', '은', '아니다', '.', '\n', '어', '두운', '밤하늘', '에', '별', '이', '빛나듯', '\n', '희망', '은', '절망', '속', '에', '싹트는', '거지', '\n', '만약', '에', '우리', '가', '희망', '함', '이', '적다면', '\n', '그', '누가', '세상', '을', '비추어줄까', '.', '\n', '정희성', ',', '희망', '공부']

명사: ['절망', '반대', '희망', '어', '두운', '밤하늘', '별', '희망', '절망', '속', '거지', '만약', '우리', '희망', '함', '그', '누가', '세상', '정희성', '희망', '공부']

품사 태깅 결과: [('절망', 'Noun'), ('의', 'Josa'), ('반대', 'Noun'), ('가', 'Josa'), ('희망', 'Noun'), ('은', 'Josa'), ('아니다', 'Adjective'), ('.', 'Punctuation'), ('\n', 'Foreign'), ('어', 'Noun'), ('두운', 'Noun'), ('밤하늘', 'Noun'), ('에', 'Josa'), ('별', 'Noun'), ('이', 'Josa'), ('빛나듯', 'Verb'), ('\n', 'Foreign'), ('희망', 'Noun'), ('은', 'Josa'), ('절망', 'Noun'), ('속', 'Noun'), ('에', 'Josa'), ('싹트는', 'Verb'), ('거지', 'Noun'), ('\n', 'Foreign'), ('만약', 'Noun'), ('에', 'Josa'), ('우리', 'Noun'), ('가', 'Josa'), ('희망', 'Noun'), ('함', 'Noun'), ('이', 'Josa'), ('적다면', 'Verb'), ('\n', 'Foreign'), ('그', 'Noun'), ('누가', 'Noun'), ('세상