<a href="https://colab.research.google.com/github/falconlee236/DeepLearningFrom_Scratch/blob/main/Book_2/ch02/Chapter_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Chapter 2: Natural Language and Distributed Representations of Words

**2.3 Statistics-Based Techniques**

*2.3.1 Corpus Preprocessing in Python*

In [None]:
# Sample sentence used as the toy corpus in the cells below.
text = "You say goodbye and I say hello."

In [None]:
# Lowercase the sentence and detach each period so it becomes its own token.
text = text.lower().replace('.', ' .')
text

In [None]:
# Tokenize on whitespace. split() with no argument (the same call preprocess()
# uses) collapses runs of spaces, so no empty-string tokens can leak into the
# vocabulary if the text ever contains consecutive spaces.
words = text.split()
words

In [None]:
# Build both lookup tables in one pass; IDs follow order of first appearance.
word_to_id = {}
id_to_word = {}

for word in words:
    if word in word_to_id:
        # Already registered: keep the ID assigned at first sight.
        continue
    new_id = len(word_to_id)
    word_to_id[word] = new_id
    id_to_word[new_id] = word

In [None]:
# Mapping from word ID to word.
id_to_word

In [None]:
# Mapping from word to word ID.
word_to_id

In [None]:
# Look up the word stored under ID 1.
id_to_word[1]

In [None]:
# Look up the ID assigned to 'hello'.
word_to_id['hello']

In [None]:
import numpy as np

# Encode the token sequence as a NumPy array of word IDs.
corpus = np.array([word_to_id[w] for w in words])
corpus

In [None]:
def preprocess(text):
    """Convert raw text into a corpus of word IDs plus vocabulary lookup tables.

    The text is lowercased, every '.' is split off into its own token, and the
    result is tokenized on whitespace. IDs are assigned in order of first
    appearance, starting from 0.

    Args:
        text: Input string.

    Returns:
        A tuple ``(corpus, word_to_id, id_to_word)`` where ``corpus`` is a 1-D
        integer NumPy array of word IDs, ``word_to_id`` maps each word to its
        ID, and ``id_to_word`` is the inverse mapping.
    """
    words = text.lower().replace('.', ' .').split()

    word_to_id = {}
    id_to_word = {}
    for word in words:
        if word not in word_to_id:
            new_id = len(word_to_id)
            word_to_id[word] = new_id
            id_to_word[new_id] = word

    # dtype=int keeps the array integer-typed even for empty input; without it
    # np.array([]) would be float64 and could not be used for indexing.
    corpus = np.array([word_to_id[w] for w in words], dtype=int)

    return corpus, word_to_id, id_to_word

In [None]:
# Run the whole preprocessing pipeline in a single call.
text = "You say goodbye and I say hello."
corpus, word_to_id, id_to_word = preprocess(text)

*2.3.4 Co-occurrence Matrix*

In [None]:
import numpy as np
text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)

print(corpus)
# [0 1 2 3 4 1 5 6]

print(id_to_word)
# {0: 'you', 1: 'say', 2: 'goodbye', 3: 'and', 4: 'i', 5: 'hello', 6: '.'}

In [None]:
# Hand-written co-occurrence matrix for the sample sentence with a window of 1:
# row i counts how often each word appears directly next to the word with ID i.
C = np.array([
              [0, 1, 0, 0, 0, 0, 0],
              [1, 0, 1, 0, 1, 1, 0],
              [0, 1, 0, 1, 0, 0, 0],
              [0, 0, 1, 0, 1, 0, 0],
              [0, 1, 0, 1, 0, 0, 0],
              [0, 1, 0, 0, 0, 0, 1],
              [0, 0, 0, 0, 0, 1, 0]
], dtype=np.int32)

In [None]:
print(C[0]) # vector representation of the word with ID 0
# [0 1 0 0 0 0 0]

print(C[4]) # vector representation of the word with ID 4
# [0 1 0 1 0 0 0]

print(C[word_to_id['goodbye']]) # vector representation of 'goodbye'
# [0 1 0 1 0 0 0]

In [None]:
def create_co_matrix(corpus, vocab_size, window_size=1):
    """Build a co-occurrence count matrix from a sequence of word IDs.

    Args:
        corpus: Indexable sequence of word IDs.
        vocab_size: Number of distinct word IDs; the result is
            ``(vocab_size, vocab_size)``.
        window_size: How many positions to the left and to the right of each
            word count as its context.

    Returns:
        An int32 array where entry ``[a, b]`` is the number of times word ``b``
        appears within ``window_size`` positions of word ``a``.
    """
    n_tokens = len(corpus)
    counts = np.zeros((vocab_size, vocab_size), dtype=np.int32)

    for pos in range(n_tokens):
        center_id = corpus[pos]
        for offset in range(1, window_size + 1):
            # Visit the left neighbour first, then the right one, skipping
            # positions that fall outside the corpus.
            for neighbour_pos in (pos - offset, pos + offset):
                if 0 <= neighbour_pos < n_tokens:
                    counts[center_id, corpus[neighbour_pos]] += 1

    return counts

*2.3.5 Similarity Between Vectors*

In [None]:
def cos_similarity(x, y):
    """Return the cosine similarity of vectors ``x`` and ``y``.

    Each vector is divided by its L2 norm and the dot product of the two
    normalized vectors is returned. A zero vector has norm 0, so it causes a
    division by zero here.
    """
    x_norm = np.sqrt(np.sum(x ** 2))
    y_norm = np.sqrt(np.sum(y ** 2))
    return np.dot(x / x_norm, y / y_norm)

In [None]:
def cos_similarity(x, y, eps=1e-8):
    """Return the cosine similarity of vectors ``x`` and ``y``.

    Args:
        x: First vector (array-like of numbers).
        y: Second vector, same length as ``x``.
        eps: Small constant added to each sum of squares so that a zero vector
            does not cause a division by zero.

    Returns:
        The dot product of the two (approximately) unit-normalized vectors.
    """
    # Work in float64: squaring integer count vectors (e.g. int32) wraps around
    # silently for large counts, which can make the sum of squares negative and
    # the result nan.
    x = np.asarray(x, dtype=np.float64)
    y = np.asarray(y, dtype=np.float64)
    nx = x / np.sqrt(np.sum(x ** 2) + eps)
    ny = y / np.sqrt(np.sum(y ** 2) + eps)
    return np.dot(nx, ny)

In [None]:
# Compare the co-occurrence vectors of "you" and "i" with cosine similarity.
import numpy as np

text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)
vocab_size = len(word_to_id)
C = create_co_matrix(corpus, vocab_size)

c0 = C[word_to_id['you']] # word vector of "you"
c1 = C[word_to_id['i']] # word vector of "i"
print(cos_similarity(c0, c1))
# approximately 0.7071

*2.3.6 Ranking Similar Words*

In [None]:
def most_similar(query, word_to_id, id_to_word, word_matrix, top=5):
    """Print the words most similar to ``query``, ranked by cosine similarity.

    Args:
        query: Word to search for.
        word_to_id: Mapping from word to word ID.
        id_to_word: Mapping from word ID to word.
        word_matrix: Matrix whose row ``i`` is the vector of the word with ID ``i``.
        top: Stop after this many words have been printed.

    Returns:
        None. The ranking is written to standard output.
    """
    # Unknown words cannot be looked up, so report it and stop.
    if query not in word_to_id:
        print(f"{query} is not found.")
        return

    print('\n[query] ' + query)
    query_vec = word_matrix[word_to_id[query]]

    # Score every vocabulary entry against the query vector.
    vocab_size = len(id_to_word)
    similarity = np.zeros(vocab_size)
    for word_id in range(vocab_size):
        similarity[word_id] = cos_similarity(word_matrix[word_id], query_vec)

    # Negating the scores turns argsort's ascending order into a descending
    # ranking; the query word itself is left out of the listing.
    shown = 0
    for word_id in (-1 * similarity).argsort():
        candidate = id_to_word[word_id]
        if candidate != query:
            print(f"{candidate}: {similarity[word_id]}")
            shown += 1
            if shown >= top:
                break

In [None]:
# argsort returns the indices that would sort the array in ascending order.
x = np.array([100, -20, 2])
x.argsort()

In [None]:
# Negating the values first gives the indices in descending order of x.
(-x).argsort()

In [None]:
# Rank the words closest to 'you' using the raw co-occurrence vectors.
import numpy as np

text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)
vocab_size = len(word_to_id)
C = create_co_matrix(corpus, vocab_size)

most_similar('you', word_to_id, id_to_word, C, top=5)

**2.4 Improving Statistics-Based Techniques**

*2.4.1 Pointwise Mutual Information (PMI)*

In [None]:
def ppmi(C, verbose=False, eps=1e-8):
    """Compute the Positive Pointwise Mutual Information (PPMI) matrix.

    For each cell, ``PMI = log2(C[i, j] * N / (S[i] * S[j]))`` where ``N`` is
    the sum of all counts and ``S`` holds the column sums of ``C``. Negative
    values are clipped to 0.

    Args:
        C: 2-D NumPy co-occurrence count matrix.
        verbose: If True, print progress roughly every 1% of the cells.
        eps: Small constant added to the denominator to avoid division by zero.

    Returns:
        A float32 array with the same shape as ``C`` holding the PPMI values.
    """
    M = np.zeros_like(C, dtype=np.float32)
    N = np.sum(C)
    S = np.sum(C, axis=0)
    total = C.shape[0] * C.shape[1]
    # total // 100 is 0 for matrices with fewer than 100 cells, which would make
    # the modulo below raise ZeroDivisionError; report every cell in that case.
    report_every = max(1, total // 100)
    cnt = 0

    for i in range(C.shape[0]):
        for j in range(C.shape[1]):
            count = C[i, j]
            # A zero count means PMI = log2(0) = -inf, which clips to 0 anyway.
            # Skipping it leaves M[i, j] at 0 and avoids NumPy's divide-by-zero
            # RuntimeWarning on every empty cell.
            if count > 0:
                pmi = np.log2(count * N / (S[j] * S[i] + eps))
                M[i, j] = max(0, pmi)

            if verbose:
                cnt += 1
                if cnt % report_every == 0:
                    print(f"{100*cnt/total: .1f}% complete")
    return M

In [None]:
import numpy as np

# Same sample sentence as the earlier cells (the original had 'goodby', which
# put a misspelled word into the vocabulary).
text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)
vocab_size = len(word_to_id)
C = create_co_matrix(corpus, vocab_size)
W = ppmi(C)

np.set_printoptions(precision=3) # print floats with 3 digits of precision
print("co-occurence matrix")
print(C)
print('-'*50)
print('PPMI')
print(W)

*2.4.3 Dimensionality Reduction with SVD*

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Same sample sentence as the earlier cells. The original had 'goodby', and
# because the vocabulary words are used as plot labels, the typo showed up in
# the figure.
text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)
vocab_size = len(word_to_id)
C = create_co_matrix(corpus, vocab_size)
W = ppmi(C)

# SVD: W = U @ diag(S) @ V. Note that NumPy returns the third factor already
# transposed (it is V^T in the usual notation).
U, S, V = np.linalg.svd(W)

In [None]:
print(C[0]) # row 0 of the co-occurrence matrix
print(W[0]) # row 0 of the PPMI matrix
print(U[0]) # row 0 of U from the SVD

In [None]:
# Keep only the first two components of row 0 of U (a 2-D word vector).
print(U[0, :2])

In [None]:
# Label each point with its word, using the first two columns of U as the
# x and y coordinates.
for word, word_id in word_to_id.items():
    plt.annotate(word, (U[word_id, 0], U[word_id, 1]))

plt.scatter(U[:, 0], U[:, 1], alpha=0.5)
plt.show()

*2.4.4 PTB Dataset*

In [None]:
%cd drive/MyDrive/DeepLearningFrom_Scratch/Book_2/ch02

In [None]:
import sys
# Put the parent directory on the import path; presumably that is where the
# `dataset` package lives (verify against the repository layout).
sys.path.append('..')
from dataset import ptb

# NOTE(review): ptb.load_data is assumed to return the same
# (corpus, word_to_id, id_to_word) triple as preprocess() - confirm in dataset/ptb.py.
corpus, word_to_id, id_to_word = ptb.load_data('train')

print(f'corpus size: {len(corpus)}')
print(f"corpus[:30]: {corpus[:30]}")
print()
print(f"id_to_word[0]: {id_to_word[0]}")
print(f"id_to_word[1]: {id_to_word[1]}")
print(f"id_to_word[2]: {id_to_word[2]}")
print()
print(f"word_to_id['car']: {word_to_id['car']}")
print(f"word_to_id['happy']: {word_to_id['happy']}")
print(f"word_to_id['lexus']: {word_to_id['lexus']}")

*2.4.5 Evaluation on the PTB Dataset*

In [None]:
import sys
sys.path.append("..")
import numpy as np
from dataset import ptb

window_size = 2
wordvec_size = 100
# Fixed seed so the randomized SVD, and therefore the printed rankings, are
# reproducible from one run to the next.
svd_seed = 0

corpus, word_to_id, id_to_word = ptb.load_data('train')
vocab_size = len(word_to_id)
print('co occurence number calculating ...')
C = create_co_matrix(corpus, vocab_size, window_size)
print('PPMI calculating ...')
W = ppmi(C, verbose=True)

print('SVD calculating ...')
try:
    # truncated SVD (fast!)
    from sklearn.utils.extmath import randomized_svd
    U, S, V = randomized_svd(W, n_components=wordvec_size, n_iter=5, random_state=svd_seed)

except ImportError:
    # full SVD (slow); used only when scikit-learn is not installed
    U, S, V = np.linalg.svd(W)

# Keep the first wordvec_size columns of U as the dense word vectors.
word_vecs = U[:, :wordvec_size]

querys = ['you', 'year', 'car', 'toyota']
for query in querys:
    most_similar(query, word_to_id, id_to_word, word_vecs, top=5)