# 2.3 통계 기반 기법

2.3.1 파이썬으로 말뭉치(corpus) 전처리하기

In [None]:
# Sample sentence that serves as the corpus.
text = 'You say goodbye and I say hello.'
# Normalise in one pass: lowercase everything, then put a space in front of
# the period so it becomes its own token when the text is split on spaces.
text = text.lower().replace('.', ' .')
text

'you say goodbye and i say hello .'

In [None]:
# Tokenise the normalised text on single spaces to get the word list.
words = text.split(sep=' ')
words

['you', 'say', 'goodbye', 'and', 'i', 'say', 'hello', '.']

In [8]:
# The length of an empty list and of an empty string is 0.
len([]), len('')

(0, 0)

In [None]:
# Give every distinct word an integer id and keep both lookup directions.
word_to_id = {}
id_to_word = {}

for word in words:
    if word in word_to_id:
        continue  # already registered; a repeated word keeps its first id
    new_id = len(word_to_id)  # ids count up from 0 in order of first appearance
    word_to_id[word] = new_id
    id_to_word[new_id] = word

In [10]:
# Display the word -> id mapping.
word_to_id

{'you': 0, 'say': 1, 'goodbye': 2, 'and': 3, 'i': 4, 'hello': 5, '.': 6}

In [5]:
# Display the id -> word mapping.
id_to_word

{0: 'you', 1: 'say', 2: 'goodbye', 3: 'and', 4: 'i', 5: 'hello', 6: '.'}

In [11]:
import numpy as np

# Encode the word list as a NumPy array of word ids, preserving word order.
corpus = np.array([word_to_id[w] for w in words])
corpus

array([0, 1, 2, 3, 4, 1, 5, 6])

In [1]:
from utils import preprocess
# Sample sentence that serves as the corpus.
text = 'You say goodbye and I say hello.'
# preprocess (helper from utils) returns the word-id array together with the
# word -> id and id -> word dictionaries.
corpus, word_to_id, id_to_word = preprocess(text)
# Display all three results.
corpus, word_to_id, id_to_word

(array([0, 1, 2, 3, 4, 1, 5, 6]),
 {'you': 0, 'say': 1, 'goodbye': 2, 'and': 3, 'i': 4, 'hello': 5, '.': 6},
 {0: 'you', 1: 'say', 2: 'goodbye', 3: 'and', 4: 'i', 5: 'hello', 6: '.'})

2.3.4 동시발생 행렬

In [3]:
import numpy as np
from utils import preprocess
# Sample sentence that serves as the corpus.
text = 'You say goodbye and I say hello.'
# Word-id array plus the word -> id and id -> word dictionaries.
corpus, word_to_id, id_to_word = preprocess(text)

# Word-id sequence for the sentence.
print(corpus)

# id -> word lookup table.
print(id_to_word)

[0 1 2 3 4 1 5 6]
{0: 'you', 1: 'say', 2: 'goodbye', 3: 'and', 4: 'i', 5: 'hello', 6: '.'}


In [4]:
# Build the co-occurrence matrix: entry [a, b] counts how many times word id b
# appears within window_size positions of word id a in the corpus.
vocab_size = len(id_to_word)
window_size = 1
corpus_size = len(corpus)
co_matrix = np.zeros((vocab_size, vocab_size), dtype=np.int32)

for idx, word_id in enumerate(corpus):
    for offset in range(1, window_size + 1):
        # Visit the left neighbour first, then the right one.
        for neighbor_idx in (idx - offset, idx + offset):
            # Skip positions that fall outside the corpus at either end.
            if 0 <= neighbor_idx < corpus_size:
                co_matrix[word_id, corpus[neighbor_idx]] += 1

co_matrix

array([[0, 1, 0, 0, 0, 0, 0],
       [1, 0, 1, 0, 1, 1, 0],
       [0, 1, 0, 1, 0, 0, 0],
       [0, 0, 1, 0, 1, 0, 0],
       [0, 1, 0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 1, 0]])

In [None]:
import numpy as np
from utils import preprocess
from utils import create_co_matrix
# Sample sentence that serves as the corpus.
text = 'You say goodbye and I say hello.'
# Word-id array plus the word -> id and id -> word dictionaries.
corpus, word_to_id, id_to_word = preprocess(text)

# Number of distinct words; passed to the helper along with the corpus.
vocab_size = len(id_to_word)
# Build the co-occurrence matrix with the helper from utils. No window size is
# passed here, so presumably the helper's default applies -- confirm in utils.
create_co_matrix(corpus, vocab_size)

array([[0, 1, 0, 0, 0, 0, 0],
       [1, 0, 1, 0, 1, 1, 0],
       [0, 1, 0, 1, 0, 0, 0],
       [0, 0, 1, 0, 1, 0, 0],
       [0, 1, 0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 1, 0]])

2.3.5 벡터 간 유사도