컴퓨터는 텍스트보다 숫자를 더 잘 처리한다.
=> 자연어 처리에서는 텍스트를 숫자로 바꾸는 여러 가지 기법을 사용한다.
이러한 기법을 적용하려면 먼저 각 단어를 고유한 정수에 맵핑(mapping)시키는 전처리 작업이 필요하다.

**단어에 정수를 부여하는 한 가지 방법은, 단어를 빈도수 순으로 정렬한 단어 집합을 만들고 빈도수가 높은 단어부터 차례로 낮은 정수를 부여하는 것이다.**

In [15]:
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords

from collections import defaultdict

# Sample corpus, written one sentence per literal; adjacent string literals
# are concatenated by Python into a single string.
text = (
    "A barber is a person. "
    "a barber is good person. "
    "a barber is huge person. "
    "he Knew A Secret! "
    "The Secret He Kept is huge secret. "
    "Huge secret. "
    "His barber kept his word. "
    "a barber kept his word. "
    "His barber kept his secret. "
    "But keeping and keeping such a huge secret to himself was driving the barber crazy. "
    "the barber went up a huge mountain."
)
# Split the corpus into sentence strings.
sentences = sent_tokenize(text)

In [26]:
# Build a word-frequency table and a cleaned copy of every sentence.
# A token is kept when, after lowercasing, it is longer than two characters
# and is not an English stop word.
stop_words = set(stopwords.words("english"))
vocab = defaultdict(int)
sentence_list = []
for raw_sentence in sentences:
    lowered = (token.lower() for token in word_tokenize(raw_sentence))
    kept = [tok for tok in lowered if len(tok) > 2 and tok not in stop_words]
    for tok in kept:
        vocab[tok] += 1
    # One list of surviving tokens per input sentence (possibly empty).
    sentence_list.append(kept)
print(vocab)

defaultdict(<class 'int'>, {'barber': 8, 'person': 3, 'good': 1, 'huge': 5, 'knew': 1, 'secret': 6, 'kept': 4, 'word': 2, 'keeping': 2, 'driving': 1, 'crazy': 1, 'went': 1, 'mountain': 1})


In [27]:
# (word, count) pairs ordered from most to least frequent; the sort is
# stable, so words with equal counts keep their order from `vocab`.
vocab_sorted = list(vocab.items())
vocab_sorted.sort(key=lambda item: item[1], reverse=True)
print(vocab_sorted)

[('barber', 8), ('secret', 6), ('huge', 5), ('kept', 4), ('person', 3), ('word', 2), ('keeping', 2), ('good', 1), ('knew', 1), ('driving', 1), ('crazy', 1), ('went', 1), ('mountain', 1)]


In [32]:
# Two-way lookup tables between words and integer ids.
# Id 0 is reserved for the "OOV" entry; the remaining ids follow the
# position of each word in `vocab_sorted`, starting at 1.
word2idx = {"OOV": 0}
idx2word = {0: "OOV"}
for rank, (token, count) in enumerate(vocab_sorted, start=1):
    # The list is ordered by descending count, so the first word seen
    # fewer than twice means every remaining word is dropped as well.
    if count < 2:
        break
    word2idx[token] = rank
    idx2word[rank] = token
print(f"word2idx : {word2idx}")
print(f"idx2word : {idx2word}")

word2idx : {'OOV': 0, 'barber': 1, 'secret': 2, 'huge': 3, 'kept': 4, 'person': 5, 'word': 6, 'keeping': 7}
idx2word : {0: 'OOV', 1: 'barber', 2: 'secret', 3: 'huge', 4: 'kept', 5: 'person', 6: 'word', 7: 'keeping'}


In [35]:
# Integer-encode every cleaned sentence.
# Words missing from `word2idx` are mapped to the id of the "OOV" entry.
# A plain dict lookup with a default replaces the former bare `except:`,
# which would also have hidden unrelated errors (e.g. KeyboardInterrupt).
oov_index = word2idx["OOV"]
encode = [
    [word2idx.get(word, oov_index) for word in sentence]
    for sentence in sentence_list
]
print(encode)

[[1, 5], [1, 0, 5], [1, 3, 5], [0, 2], [2, 4, 3, 2], [3, 2], [1, 4, 6], [1, 4, 6], [1, 4, 2], [7, 7, 3, 2, 0, 1, 0], [1, 0, 3, 0]]


Counter 사용하기

In [43]:
import numpy as np

from collections import Counter

# Show the token collection before it is flattened.
print(sentence_list)
# Flatten the per-sentence token lists into one 1-D array of tokens.
# NOTE: this rebinds `sentence_list`, so from here on the name refers to
# the flat array, not the nested list.
sentence_list = np.hstack(sentence_list)
# Counter maps each token to its number of occurrences. It is not a sorted
# dict, but its printed form lists entries from most to least common.
vocab_Counter = Counter(sentence_list)
print(vocab_Counter)

['barber' 'person' 'barber' 'good' 'person' 'barber' 'huge' 'person'
 'knew' 'secret' 'secret' 'kept' 'huge' 'secret' 'huge' 'secret' 'barber'
 'kept' 'word' 'barber' 'kept' 'word' 'barber' 'kept' 'secret' 'keeping'
 'keeping' 'huge' 'secret' 'driving' 'barber' 'crazy' 'barber' 'went'
 'huge' 'mountain']
Counter({'barber': 8, 'secret': 6, 'huge': 5, 'kept': 4, 'person': 3, 'word': 2, 'keeping': 2, 'good': 1, 'knew': 1, 'driving': 1, 'crazy': 1, 'went': 1, 'mountain': 1})


In [52]:
# NLTK의 FreqDist => Counter같이 사용가능
from nltk import FreqDist

# Count token frequencies with NLTK's FreqDist (a Counter subclass).
vocab = FreqDist(sentence_list)
print(vocab.most_common(5))

# Iterating a FreqDist directly yields bare word strings, so indexing each
# item with [0] would key the mapping by the word's first letter.
# most_common() yields (word, count) pairs from most to least frequent, so
# each word's rank (starting at 1) becomes its integer id.
word2idx = {
    word: rank
    for rank, (word, _count) in enumerate(vocab.most_common(), start=1)
}

[('barber', 8), ('secret', 6), ('huge', 5), ('kept', 4), ('person', 3)]
