# 07) 패딩(Padding)

### 1. Numpy로 패딩하기

In [1]:
import numpy as np
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize

In [2]:
# Sample corpus: each inner list holds the word tokens of one sentence.
sentences = [
    ['barber', 'person'],
    ['barber', 'good', 'person'],
    ['barber', 'huge', 'person'],
    ['knew', 'secret'],
    ['secret', 'kept', 'huge', 'secret'],
    ['huge', 'secret'],
    ['barber', 'kept', 'word'],
    ['barber', 'kept', 'word'],
    ['barber', 'kept', 'secret'],
    ['keeping', 'keeping', 'huge', 'secret', 'driving', 'barber', 'crazy'],
    ['barber', 'went', 'huge', 'mountain'],
]

* 정수 인코딩 수행

In [3]:
 # Raw sample paragraph that the following cells split into sentences and word tokens.
 # NOTE(review): this statement starts with a stray leading space. IPython appears to
 # tolerate it, but a plain Python script would reject it as an unexpected indent —
 # confirm before exporting this notebook to a .py file.
 text = "A barber is a person. a barber is good person. a barber is huge person. he Knew A Secret! The Secret He Kept is huge secret. Huge secret. His barber kept his word. a barber kept his word. His barber kept his secret. But keeping and keeping such a huge secret to himself was driving the barber crazy. the barber went up a huge mountain."

In [4]:
# Split the paragraph into a list of sentence strings.
# Note: `text` is rebound here from a single str to a list of str.
text = sent_tokenize(text)
print(text)

['A barber is a person.', 'a barber is good person.', 'a barber is huge person.', 'he Knew A Secret!', 'The Secret He Kept is huge secret.', 'Huge secret.', 'His barber kept his word.', 'a barber kept his word.', 'His barber kept his secret.', 'But keeping and keeping such a huge secret to himself was driving the barber crazy.', 'the barber went up a huge mountain.']


In [5]:
# Flatten the per-sentence token lists into one flat list of words.
# A nested comprehension makes a single pass over the tokens, whereas
# sum(sentences, []) builds a new, longer list on every addition and is
# therefore quadratic in the total number of words.
words = [word for sentence in sentences for word in sentence]
print(words)

['barber', 'person', 'barber', 'good', 'person', 'barber', 'huge', 'person', 'knew', 'secret', 'secret', 'kept', 'huge', 'secret', 'huge', 'secret', 'barber', 'kept', 'word', 'barber', 'kept', 'word', 'barber', 'kept', 'secret', 'keeping', 'keeping', 'huge', 'secret', 'driving', 'barber', 'crazy', 'barber', 'went', 'huge', 'mountain']


In [6]:
# Cleaning and word tokenization: lower-case every token, then drop
# English stop words and tokens of two characters or fewer.
stop_words = set(stopwords.words('english'))
sentences = []

for raw_sentence in text:
    # Lower-casing first reduces the number of distinct words.
    tokens = (token.lower() for token in word_tokenize(raw_sentence))
    kept = [
        token
        for token in tokens
        if token not in stop_words and len(token) > 2
    ]
    sentences.append(kept)
print(sentences)

[['barber', 'person'], ['barber', 'good', 'person'], ['barber', 'huge', 'person'], ['knew', 'secret'], ['secret', 'kept', 'huge', 'secret'], ['huge', 'secret'], ['barber', 'kept', 'word'], ['barber', 'kept', 'word'], ['barber', 'kept', 'secret'], ['keeping', 'keeping', 'huge', 'secret', 'driving', 'barber', 'crazy'], ['barber', 'went', 'huge', 'mountain']]


In [7]:
# Count how many times each word occurs in the flattened word list.
vocab = Counter(words)
print(vocab)

Counter({'barber': 8, 'secret': 6, 'huge': 5, 'kept': 4, 'person': 3, 'word': 2, 'keeping': 2, 'good': 1, 'knew': 1, 'driving': 1, 'crazy': 1, 'went': 1, 'mountain': 1})


In [8]:
# Sort the (word, count) pairs from most to least frequent.
# Counter.most_common() with no argument returns every pair in descending
# count order (ties keep first-seen order), which matches the previous
# hand-written sorted(..., key=lambda x: x[1], reverse=True) call.
vocab_sorted = vocab.most_common()

In [9]:
# Assign integer indices 1, 2, 3, ... in descending-frequency order,
# leaving out words that occur only once.
frequent_words = [word for word, frequency in vocab_sorted if frequency > 1]
word_to_index = {
    word: index for index, word in enumerate(frequent_words, start=1)
}
print(word_to_index)

{'barber': 1, 'secret': 2, 'huge': 3, 'kept': 4, 'person': 5, 'word': 6, 'keeping': 7}


In [10]:
# Give out-of-vocabulary (OOV) words the next index after the existing entries.
word_to_index['OOV'] =len(word_to_index) + 1

In [11]:
word_to_index #OOV has been assigned its own dedicated value

{'barber': 1,
 'secret': 2,
 'huge': 3,
 'kept': 4,
 'person': 5,
 'word': 6,
 'keeping': 7,
 'OOV': 8}

In [12]:
# Integer-encode every sentence: each word is replaced by its index in
# word_to_index, and words that are not in the vocabulary get the 'OOV' index.
# dict.get() replaces the former bare `except:`, which would also have
# swallowed unrelated errors (e.g. KeyboardInterrupt), not just missing keys.
oov_index = word_to_index['OOV']
encoded = [
    [word_to_index.get(word, oov_index) for word in s]
    for s in sentences
]
print(encoded)

[[1, 5], [1, 8, 5], [1, 3, 5], [8, 2], [2, 4, 3, 2], [3, 2], [1, 4, 6], [1, 4, 6], [1, 4, 2], [7, 7, 3, 2, 8, 1, 8], [1, 8, 3, 8]]


* 동일한 길이로 맞춰주기 위해 가장 긴 문장의 길이 계산

In [13]:
# Length of the longest encoded sentence; used as the common padded length.
# default=0 keeps an empty corpus from raising ValueError in max().
max_len = max(map(len, encoded), default=0)
print(max_len)

7


* Zero Padding

In [14]:
# Zero-pad each encoded sentence in place up to max_len, then stack the
# equal-length rows into a 2-D NumPy array.
for row in encoded:
    missing = max_len - len(row)
    row.extend([0] * missing)  # a non-positive count extends by nothing
padded_np = np.array(encoded)
padded_np

array([[1, 5, 0, 0, 0, 0, 0],
       [1, 8, 5, 0, 0, 0, 0],
       [1, 3, 5, 0, 0, 0, 0],
       [8, 2, 0, 0, 0, 0, 0],
       [2, 4, 3, 2, 0, 0, 0],
       [3, 2, 0, 0, 0, 0, 0],
       [1, 4, 6, 0, 0, 0, 0],
       [1, 4, 6, 0, 0, 0, 0],
       [1, 4, 2, 0, 0, 0, 0],
       [7, 7, 3, 2, 8, 1, 8],
       [1, 8, 3, 8, 0, 0, 0]])