In [22]:
from nltk.tokenize import word_tokenize
import nltk
import re
import numpy as np
from collections import defaultdict
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\furka\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [18]:
def get_data():
    with open("data/shakespeare.txt") as f:
        text = f.read()
    text = re.sub(r"[,!?;-]", ".", text)  
    data = word_tokenize(text)
    data = [w.lower() for w in data if w.isalpha() or w == "."]
    return data

In [4]:
def get_dict(data):
    vocab = sorted(list(set(data)))
    word2Ind = {}
    Ind2word = {}
    idx = 0
    for word in vocab:
        word2Ind[word] = idx
        Ind2word[idx] = word
        idx += 1
    return word2Ind, Ind2word

In [5]:
def initialize_model(N,V):
    '''
    Inputs: 
        N:  dimension of hidden vector 
        V:  dimension of vocabulary
        random_seed: random seed for consistent results in the unit tests
     Outputs: 
        W1, W2, b1, b2: initialized weights and biases
    '''
    W1 = np.random.rand(N, V)
    b1 = np.random.rand(N, 1)
    W2 = np.random.rand(V, N)
    b2 = np.random.rand(V, 1)
    
    return W1, W2, b1, b2

In [6]:
def softmax(z):
    '''
    Inputs: 
        z: output scores from the hidden layer
    Outputs: 
        yhat: prediction (estimate of y)
    '''
    yhat = np.exp(z) / np.sum(np.exp(z), axis = 0, keepdims = True)
    return yhat

In [7]:
def forward_prop(x, W1, W2, b1, b2):
    '''
    Inputs: 
        x:  average one hot vector for the context 
        W1, W2, b1, b2:  matrices and biases to be learned
     Outputs: 
        z:  output score vector
    '''
    h = np.dot(W1, x) + b1
    h = np.maximum(0, h)
    z = np.dot(W2, h) + b2
    return z, h

In [8]:
def compute_cost(y, yhat, batch_size):
    logprobs = np.multiply(np.log(yhat), y)
    cost = - (1/batch_size) * np.sum(logprobs)
    cost = np.squeeze(cost)
    return cost

In [23]:
def get_idx(context_words, word2Ind):
    """
    Inputs:
        context_word:
        word2Ind
    Outputs:
        idxs : kelimelerin idx karsiligi liste
    """
    idxs = []
    for word in context_words:
        idxs.append(word2Ind[word])
    return idxs


In [32]:
def pack_idx_with_frequency(context_words, word2Ind):
    """
    Inputs:
        context_words : center word oncesi ve sonraki kelimler
        word2Ind : kelimlerin karsiliginda indexlere sahip dict
    Outputs:
        
    """
    
    freq_dict = defaultdict(int)
    for word in context_words:
        freq_dict[word] += 1
    idxs = get_idx(context_words, word2Ind)
    packed = []
    for i in range(len(idxs)):
        idx = idxs[i]
        freq = freq_dict[context_words[i]]
        packed.append((idx, freq))
    return packed

In [33]:
def get_vectors(data, word2Ind, V, C):
    """
    Inputs:
        data : corpus
        word2Ind : kelimlerin karsiliginda indexlere sahip dict
        V : vocablary
        C : context half size
    Outputs:
        x : context'de bulunan kelimlerin one-hot toplamlari / context_size
        y : center word one-hot vektor
    """
    i = C # center word i ile takip edilir
    while True:
        y = np.zeros(V)
        x = np.zeros(V)
        center_word = data[i]
        y[word2Ind[center_word]] = 1
        context_words = data[i - 2: i] + data[i + 1 : i + 1 + C]
        context_len = len(context_words)
        for idx, freq in pack_idx_with_frequency(context_words, word2Ind):
            x[idx] = freq / context_len
        yield x, y
        i += 1
        if i >= len(data):
            i = 0
            print("i'ye 0 atandi")

In [34]:
def get_batches(data, word2Ind, V, C, batch_size):
    """
    data : corpus
    word2Ind : Kelimlerin indexlerine sahip bir dict
    V : Vocablary
    C : context half size
    batch_size : Modele verilecek girdi sayisi
    """
    batch_x = []
    batch_y = []
    for x, y in get_vectors(data, word2Ind, V, C):
        while len(batch_x) < batch_size:
            batch_x.append(x)
            batch_y.append(y)
        else:
            yield np.array(batch_x).T, np.array(batch_y).T
            batch_x = []
            batch_y = []
        

### Test All Functions

In [37]:
data = get_data()
word2Ind, Ind2word = get_dict(data)
tmp_V = len(word2Ind)
tmp_C = 2
tmp_N = 50
tmp_batch_size = 4

tmp_x, tmp_y = next(get_batches(data, word2Ind, tmp_V, tmp_C, tmp_batch_size))
print(tmp_x.shape, tmp_y.shape)


W1, W2, b1, b2 = initialize_model(tmp_N, tmp_V)
tmp_z, tmp_h = forward_prop(tmp_x, W1, W2, b1, b2)
tmp_yhat = softmax(tmp_z)
tmp_cost = compute_cost(tmp_y, tmp_yhat, tmp_batch_size)
print(tmp_cost)

(5775, 4) (5775, 4)
12.812297426050804


## Backpropagation

\begin{align}
 \frac{\partial J}{\partial \mathbf{W_1}} &= \rm{ReLU}\left ( \mathbf{W_2^\top} (\mathbf{\hat{y}} - \mathbf{y})\right )\mathbf{x}^\top \tag{1}\\
 \frac{\partial J}{\partial \mathbf{W_2}} &= (\mathbf{\hat{y}} - \mathbf{y})\mathbf{h^\top} \tag{1}\\
 \frac{\partial J}{\partial \mathbf{b_1}} &= \rm{ReLU}\left ( \mathbf{W_2^\top} (\mathbf{\hat{y}} - \mathbf{y})\right ) \tag{3}\\
 \frac{\partial J}{\partial \mathbf{b_2}} &= \mathbf{\hat{y}} - \mathbf{y} \tag{4}
\end{align}


### Ustteki backprop islemi tek bir ifade icin batch icin alinirsa belli degisiklikler olacaktir

In [60]:
def back_prop(x, yhat, y, h, W1, W2, b1, b2, batch_size):
    '''
    Inputs: 
        x:  average one hot vector for the context 
        yhat: prediction (estimate of y)
        y:  target vector
        h:  hidden vector (see eq. 1)
        W1, W2, b1, b2:  matrices and biases  
        batch_size: batch size 
     Outputs: 
        grad_W1, grad_W2, grad_b1, grad_b2:  gradients of matrices and biases   
    '''
    grad_W2 = (1 / batch_size) * np.dot(yhat - y, h.T)
    grad_b2 = (1 / batch_size) * np.sum(yhat - y, axis = 1, keepdims=True)
    grad_W1 = (1 / batch_size) * np.dot(np.maximum(0, np.dot(W2.T, yhat - y)), x.T)
    grad_b1 = (1 / batch_size) * np.sum(np.maximum(0, np.dot(W2.T, yhat - y)), axis = 1, keepdims=True)
    return grad_W1, grad_W2, grad_b1, grad_b2

In [61]:
def update_parameters(parameters, grads, alpha):
    W1, W2, b1, b2 = parameters
    grad_W1, grad_W2, grad_b1, grad_b2 = grads
    W1 = W1 - alpha * grad_W1
    W2 = W2 - alpha * grad_W2
    b1 = b1 - alpha * grad_b1
    b2 = b2 - alpha * grad_b2
    return W1, W2, b1, b2

In [65]:
def gradient_descent(data, word2Ind, N, V, num_iters, alpha=0.03, batch_size = 128, C = 2):
    
    '''
    This is the gradient_descent function
    
      Inputs: 
        data:      text
        word2Ind:  words to Indices
        N:         dimension of hidden vector  
        V:         dimension of vocabulary 
        num_iters: number of iterations 
        alpha : learning rate
        batch_size : number of examples in each batch
        C : context half size
     Outputs: 
        W1, W2, b1, b2:  updated matrices and biases after num_iters iterations

    '''
    parameters = initialize_model(N, V)
    
    i = 0
    for x, y in get_batches(data, word2Ind, V, C, batch_size):
        z, h = forward_prop(x, *parameters)
        yhat = softmax(z)
        cost = compute_cost(y, yhat, batch_size)
        grads = back_prop(x, yhat, y, h, *parameters, batch_size)
        parameters = update_parameters(parameters, grads, alpha)
        i += 1
        if i == num_iters:
            break
        if (i + 1) % 10 == 0:
            print(f"{i+1}.iter Cost : {cost:.6f}")
        if i % 100 == 0:
            alpha *= 0.66
    return W1, W2, b1, b2

In [66]:
data = get_data()
word2Ind, Ind2word = get_dict(data)
V = len(word2Ind)
N = 50
C = 2
batch_size = 128
alpha = 0.03
num_iters = 150


In [67]:
gradient_descent(data, word2Ind, N, V, num_iters, alpha, batch_size, C)

10.iter Cost : 7.919861
20.iter Cost : 9.987556
30.iter Cost : 9.420588
40.iter Cost : 9.730634
50.iter Cost : 7.951681
60.iter Cost : 1.908529
70.iter Cost : 1.691551
80.iter Cost : 10.574863
90.iter Cost : 8.802042
100.iter Cost : 7.780031
110.iter Cost : 12.238310
120.iter Cost : 11.485120
130.iter Cost : 9.196244
140.iter Cost : 9.154715
150.iter Cost : 7.964914


(array([[0.6219698 , 0.12671832, 0.64021677, ..., 0.14819813, 0.80576001,
         0.81387778],
        [0.30409119, 0.87743403, 0.27619172, ..., 0.36376133, 0.06986089,
         0.4669891 ],
        [0.03019791, 0.50882708, 0.44058658, ..., 0.30806755, 0.88930183,
         0.28626545],
        ...,
        [0.41081758, 0.43521317, 0.87382505, ..., 0.34322489, 0.04860274,
         0.46534924],
        [0.23565185, 0.81324709, 0.28863926, ..., 0.30923743, 0.84514841,
         0.22144138],
        [0.39902241, 0.30826413, 0.54928167, ..., 0.12998885, 0.07113925,
         0.23175899]]),
 array([[0.85062585, 0.19433653, 0.04235671, ..., 0.84236114, 0.80016593,
         0.29392048],
        [0.02274909, 0.99639508, 0.02554441, ..., 0.94838042, 0.81382462,
         0.15721227],
        [0.00897219, 0.82024191, 0.2140257 , ..., 0.62165875, 0.90084365,
         0.09101711],
        ...,
        [0.03522637, 0.64034248, 0.64632307, ..., 0.3075528 , 0.67987709,
         0.59643766],
        [0.6

In [74]:
num_iters = 150
alpha = 0.03
gradient_descent(data, word2Ind, N, V, num_iters, alpha, batch_size, C)

10.iter Cost : 10.814292
20.iter Cost : 8.810384
30.iter Cost : 11.351997
40.iter Cost : 9.166524
50.iter Cost : 12.098641
60.iter Cost : 2.335376
70.iter Cost : 1.993430
80.iter Cost : 11.902983
90.iter Cost : 9.226281
100.iter Cost : 6.141662
110.iter Cost : 8.760855
120.iter Cost : 9.700995
130.iter Cost : 6.349505
140.iter Cost : 9.719155
150.iter Cost : 8.535599


(array([[0.6219698 , 0.12671832, 0.64021677, ..., 0.14819813, 0.80576001,
         0.81387778],
        [0.30409119, 0.87743403, 0.27619172, ..., 0.36376133, 0.06986089,
         0.4669891 ],
        [0.03019791, 0.50882708, 0.44058658, ..., 0.30806755, 0.88930183,
         0.28626545],
        ...,
        [0.41081758, 0.43521317, 0.87382505, ..., 0.34322489, 0.04860274,
         0.46534924],
        [0.23565185, 0.81324709, 0.28863926, ..., 0.30923743, 0.84514841,
         0.22144138],
        [0.39902241, 0.30826413, 0.54928167, ..., 0.12998885, 0.07113925,
         0.23175899]]),
 array([[0.85062585, 0.19433653, 0.04235671, ..., 0.84236114, 0.80016593,
         0.29392048],
        [0.02274909, 0.99639508, 0.02554441, ..., 0.94838042, 0.81382462,
         0.15721227],
        [0.00897219, 0.82024191, 0.2140257 , ..., 0.62165875, 0.90084365,
         0.09101711],
        ...,
        [0.03522637, 0.64034248, 0.64632307, ..., 0.3075528 , 0.67987709,
         0.59643766],
        [0.6

In [75]:
# visualizing the word vectors here
from matplotlib import pyplot
%config InlineBackend.figure_format = 'svg'
words = ['king', 'queen','lord','man', 'woman','dog','wolf',
         'rich','happy','sad']

embs = (W1.T + W2)/2.0 # bu satira dikkat burada embedding vektoru aliniyor.
 
# given a list of words and the embeddings, it returns a matrix with all the embeddings
idx = [word2Ind[word] for word in words]
X = embs[idx, :]
print(X.shape, idx)  # X.shape:  Number of words of dimension N each 

(10, 50) [2744, 3949, 2960, 3022, 5672, 1452, 5671, 4189, 2315, 4276]
