In [1]:
import numpy as np



In [2]:
with open("./shakespeare.txt", "r", encoding="utf-8") as file:
    content = file.read()
    
sample = content[:1000]
sample

'This is the 100th Etext file presented by Project Gutenberg, and\nis presented in cooperation with World Library, Inc., from their\nLibrary of the Future and Shakespeare CDROMS.  Project Gutenberg\noften releases Etexts that are NOT placed in the Public Domain!!\n\nShakespeare\n\n*This Etext has certain copyright implications you should read!*\n\n<<THIS ELECTRONIC VERSION OF THE COMPLETE WORKS OF WILLIAM\nSHAKESPEARE IS COPYRIGHT 1990-1993 BY WORLD LIBRARY, INC., AND IS\nPROVIDED BY PROJECT GUTENBERG ETEXT OF ILLINOIS BENEDICTINE COLLEGE\nWITH PERMISSION.  ELECTRONIC AND MACHINE READABLE COPIES MAY BE\nDISTRIBUTED SO LONG AS SUCH COPIES (1) ARE FOR YOUR OR OTHERS\nPERSONAL USE ONLY, AND (2) ARE NOT DISTRIBUTED OR USED\nCOMMERCIALLY.  PROHIBITED COMMERCIAL DISTRIBUTION INCLUDES BY ANY\nSERVICE THAT CHARGES FOR DOWNLOAD TIME OR FOR MEMBERSHIP.>>\n\n*Project Gutenberg is proud to cooperate with The World Library*\nin the presentation of The Complete Works of William Shakespeare\nfor your

---
    training data should look like:
    
    <center>       ->    <context>
    [0, ..., 1, 0] ->    [0, ..., 1, 0]
    [0, ..., 1, 0] ->    [0, ..., 0, 1]
    [0, ..., 1, 0] ->    [1, ..., 0, 0]
    ...            ->    ...
    
    where the center word (input) have a number of <context words>
    based on the size of the window hence:
        if window_size = 2; two context words from both sides (left & right).
    
    objective:
        based on the given input <center word> try to predict the output <context word>
        and when we reach to a good performance we hope that the neural network will
        try to extract hiddin features, which are going to be the meanings of input words
        in a numerical representation.
        
    steps:
        1. split input text into chunks (tokens)
        2. assign a token_id per token
        3. encode thoes tokens based on their ids (one-hot encoding)
        4. building the training data as intended above
        5. 
    
    NOTE: this is going to be the basic solution and from here i'll try to improve.
    
    


In [3]:
def tokenize(text):
    # simple tokenizer (not the best)
    return text.lower().split(' ')

def encode(token_id, vocab_size):
    vector = [0] * vocab_size  # which is going to be the input vector size
    vector[token_id] = 1
    return np.array(vector)  # for training (later)

In [4]:
tokens = tokenize(sample)
tokens[:8]

['this', 'is', 'the', '100th', 'etext', 'file', 'presented', 'by']

In [5]:
token_to_id = {t: i for i, t in enumerate(set(tokens))}
id_to_token = {i: t for i, t in enumerate(set(tokens))}

In [6]:
vocab_size = len(set(tokens))
vocab_size

90

In [7]:
def combine(*iterables):
    # to combine two iterables
    # into single iterable
    for iterable in iterables:
        yield from iterable

def build_train(tokens, token_to_id, vocab_size, window_size=2, verbose=False):
    center_vectors = []
    context_vectors = []
    for i, center in enumerate(tokens):
        # context words iterable on the right side
        r_ctx = range(i+1, min(i+window_size+1, len(tokens)))
        # context words iterable on the left side
        l_ctx = range(max(0, i-window_size), i)
        # combined context iterables from both sides
        c_ctx = combine(l_ctx, r_ctx)
    
        # get numerical representation (one-hot encoding)
        # for both: 1. center word 2. context word    
        cnt_w_id = token_to_id[center]  
        cnt_w_vector = encode(cnt_w_id, vocab_size)  # 1. center word
        for w_id in c_ctx:
            if verbose:
                # <center word>  ->  <ctx word>
                print(f"{center} -> {tokens[w_id]}")
                
            ctx_w_id = token_to_id[tokens[w_id]]
            ctx_w_vector = encode(ctx_w_id, vocab_size)  # 2. context word
            
            center_vectors.append(cnt_w_vector)
            context_vectors.append(ctx_w_vector)
    # store them as numpy.array for training        
    return np.array(center_vectors), np.array(context_vectors)

In [8]:
cnt, ctx = build_train(tokens[:1000], token_to_id, vocab_size)
print("center words shape: ", cnt.shape)
print("context words shape: ", ctx.shape)

center words shape:  (546, 90)
context words shape:  (546, 90)


    center words are duplicated:
    where the same center word has more than context words
    the size depends on the `window_size`
    
    center word (1) -> context word (1)
                    -> context word (2)
                    -> ...
                    -> context word (N)
    ...
    center word (M) -> context word (1)
                    -> context word (2)
                    -> ...
                    -> context word (N)


---

    Forward pass:
    -------------
    A1   = X @ W1
    A2   = A1 @ W2
    Z    = softmax(A2)
    Loss = cross_entropy(Z, y)
    
    
    How changing W2 impacted the loss?
    ----------------------------------
    >> dLoss(Z, y)/dW2 = dLoss/dZ * dZ/dA2 * dA2/dW2 
    >> dLoss(Z, y)/dW2 = (z - y)           * A1
    shape convention:
    - (z - y): (#samples, vocab_size)
    - A1:      (#samples, embedding_size)
    
    shape of target matrix to update (W2): (embedding_size, vocab_size)
    
    ==> A1.T @ (z-y)
    
    
    How changing W1 impacted the loss?
    ----------------------------------
    >> dLoss(Z, t)/dW1 = dLoss/dZ * dZ/dA2 * dA2/dA1 * dA1/dW1
    >> dLoss(Z, t)/dW1 = (z - y)           * W2      * X
    
    shape convention:
    - (z - y): (#samples, vocab_size)
    - W2:      (embedding_size, vocab_size)
    - X:       (#samples, vocab_size) 
    
    shape of target matrix to update (W1): (vocab_size, embedding_size)
    
    ==> x.T @ ( (z-y) @ W2.T )
    

In [9]:
class Model:
    def __init__(self, X, y, vocab_size, embedding_size):
        self.X = X
        self.y = y
        self.vocab_size = vocab_size
        self.w1 = np.random.rand(vocab_size, embedding_size)
        self.w2 = np.random.rand(embedding_size, vocab_size)
    
    def forward(self):
        self.a1 = self.X @ self.w1
        self.a2 = self.a1 @ self.w2
        self.z = self._softmax(self.a2)
        return self.z
    
    def backward(self, alpha):
        self.forward()
        # derivatives
        dl = self.z - self.y
        dw2 = self.a1.T @ dl
        dw1 = self.X.T @ (dl @ self.w2.T)
        # update weights
        self.w2 = self.w2 - (alpha * dw2)
        self.w1 = self.w1 - (alpha * dw1)
        # compute loss
        return self._cross_entropy(self.y, self.z)
    
    @staticmethod    
    def _softmax(a):
        a = a - np.max(a)
        return np.exp(a) / np.sum(np.exp(a))
    
    @staticmethod
    def _cross_entropy(actual, predicted):
        return - np.sum(actual * np.log(predicted))

In [10]:
model = Model(cnt, ctx, vocab_size, 10)

In [11]:
model.backward(alpha=0.0001)

6019.210012276704