In [2]:
import sys
import os

sys.path.append(os.path.abspath(".."))

from tokenizer.tokenizer import BPETokenizer

In [3]:
class Embedding:
    """Token plus position embedding lookup for batches of raw strings.

    Strings are tokenized with ``BPETokenizer``, right-padded to the longest
    sequence in the batch, and looked up in two randomly initialised tables:
    ``vec_matrix`` (indexed by token id) and ``pos_matrix`` (indexed by
    position). Both tables have shape ``(num_embed, embed_dim)``, so
    ``num_embed`` bounds the largest usable token id as well as the longest
    usable sequence.

    Padding uses index 0 in both tables: padded slots receive the embedding
    of token id 0 and of position 0. No padding mask is returned; callers
    that need one can rebuild it from the ``lengths`` returned by ``vector``.
    """

    def __init__(self, num_embed=512, embed_dim=768):
        """Create the tokenizer and draw both lookup tables from N(0, 1).

        Args:
            num_embed: Number of rows in each table.
            embed_dim: Width of every embedding vector.
        """
        self.tokenizer = BPETokenizer()
        self.vec_matrix = np.random.randn(num_embed, embed_dim)
        self.pos_matrix = np.random.randn(num_embed, embed_dim)

    def vector(self, list_of_strings):
        """Tokenize and embed a batch of strings.

        Args:
            list_of_strings: Iterable of strings to embed.

        Returns:
            A tuple ``(embeddings, lengths)`` where ``embeddings`` has shape
            ``(batch, max_len, embed_dim)`` and ``lengths`` is a list with the
            unpadded token count of each string.

        Raises:
            ValueError: If ``list_of_strings`` is empty.
        """
        # Coerce to int64 up front: an empty token list would otherwise become
        # a float64 array, which NumPy refuses to use as an index.
        # NOTE(review): assumes ``encode`` returns a flat sequence of integer
        # token ids -- confirm against BPETokenizer.
        token_ids = [
            np.asarray(self.tokenizer.encode(s), dtype=np.int64)
            for s in list_of_strings
        ]
        if not token_ids:
            raise ValueError("list_of_strings must contain at least one string")

        lengths = [len(ids) for ids in token_ids]
        max_len = max(lengths)

        # One (batch, max_len) index matrix, pre-filled with the pad id 0.
        padded = np.zeros((len(token_ids), max_len), dtype=np.int64)
        for row, ids in zip(padded, token_ids):
            row[:len(ids)] = ids

        return self.vec_matrix[padded], lengths

    def positional(self, lengths):
        """Look up position embeddings for sequences of the given lengths.

        Args:
            lengths: Unpadded length of each sequence in the batch.

        Returns:
            Array of shape ``(batch, max(lengths), embed_dim)``. Slots at or
            beyond a sequence's own length use position index 0.

        Raises:
            ValueError: If ``lengths`` is empty.
        """
        lengths = list(lengths)
        if not lengths:
            raise ValueError("lengths must contain at least one sequence length")

        positions = np.arange(max(lengths), dtype=np.int64)
        # Keep 0..len-1 for real tokens; padded slots fall back to index 0.
        in_sequence = positions[np.newaxis, :] < np.asarray(lengths)[:, np.newaxis]
        padded = np.where(in_sequence, positions, 0)

        return self.pos_matrix[padded]

    def forward(self, list_of_strings):
        """Return token embeddings plus position embeddings for a batch.

        Args:
            list_of_strings: Iterable of strings to embed.

        Returns:
            Array of shape ``(batch, max_len, embed_dim)``.
        """
        vec_embed, seq_len = self.vector(list_of_strings)
        pos_embed = self.positional(seq_len)

        return vec_embed + pos_embed

In [4]:
import numpy as np

# Smoke-test the embedding layer on a small batch of variable-length sentences.
sentences = [
    "The sun is bright.",
    "Hello world!",
    "AI is transforming the world.",
    "Short",
]

embed = Embedding()

# Last expression of the cell: the combined token + position embeddings.
embed.forward(sentences)

array([[[-0.67289758, -0.65876184, -1.16885848, ..., -0.14573598,
          1.53256203, -0.98027233],
        [ 0.70515587, -0.60343866, -1.57071777, ...,  0.87306986,
         -1.5945032 , -1.39735547],
        [ 1.30462285, -1.70272355,  0.61659251, ..., -2.36780905,
         -1.72164236, -0.30326686],
        ...,
        [-1.8928477 , -1.7798639 , -1.33986841, ...,  0.23387694,
          0.18005925,  0.57449301],
        [-1.8928477 , -1.7798639 , -1.33986841, ...,  0.23387694,
          0.18005925,  0.57449301],
        [-1.8928477 , -1.7798639 , -1.33986841, ...,  0.23387694,
          0.18005925,  0.57449301]],

       [[-1.56446585, -1.05304898, -1.35402063, ...,  1.0427826 ,
         -0.95605947, -0.07663333],
        [ 0.0812989 , -0.99898133, -1.32176596, ..., -0.62964367,
         -1.73148353, -0.68579836],
        [ 1.40792434, -1.54112876,  2.67011315, ..., -0.21924224,
         -0.10274299,  0.01540285],
        ...,
        [-1.8928477 , -1.7798639 , -1.33986841, ...,  

In [11]:
class Attention:
    """Causal multi-head self-attention over embedded strings, in plain NumPy.

    Each head owns its own query/key/value projection of shape
    ``(embed_dim, head_dim)``; head outputs are concatenated and mapped back
    to ``embed_dim`` by ``w_o``. All weights are drawn from a standard normal
    via ``np.random.randn`` with no additional scaling.
    """

    def __init__(self, embed_dim=768, head_dim=64, num_head=12):
        """Draw the projection weights and build the embedding layer.

        Args:
            embed_dim: Width of the input and output vectors.
            head_dim: Width of each head's query/key/value vectors.
            num_head: Number of attention heads.
        """
        self.embed_dim = embed_dim
        self.head_dim = head_dim
        self.num_head = num_head
        self.w_q = self._init_head_weights()
        self.w_k = self._init_head_weights()
        self.w_v = self._init_head_weights()
        self.w_o = np.random.randn(self.head_dim * self.num_head, self.embed_dim)
        # The embedding width must match the projections; relying on the
        # Embedding default would break any non-default ``embed_dim``.
        self.embed = Embedding(embed_dim=self.embed_dim)

    def _init_head_weights(self):
        """Return one ``(embed_dim, head_dim)`` random matrix per head."""
        return [
            np.random.randn(self.embed_dim, self.head_dim)
            for _ in range(self.num_head)
        ]

    @staticmethod
    def softmax(x, axis=-1):
        """Softmax along ``axis``, shifted by the max to avoid overflow in exp."""
        exp_x = np.exp(x - np.max(x, axis=axis, keepdims=True))
        return exp_x / np.sum(exp_x, axis=axis, keepdims=True)

    def sdpa(self, q, k, v):
        """Scaled dot-product attention with a causal mask.

        Args:
            q: Queries of shape ``(..., seq, head_dim)``.
            k: Keys, same layout as ``q``.
            v: Values, same leading and sequence axes as ``k``.

        Returns:
            Attention output of shape ``(..., seq, v.shape[-1])``. Query
            position ``i`` only attends to key positions ``0..i``.
        """
        scaled_scores = np.matmul(q, np.swapaxes(k, -2, -1)) / np.sqrt(self.head_dim)

        # Size the mask from the last two axes so any number of leading
        # batch/head axes broadcasts, rather than assuming rank-3 input.
        q_len, k_len = scaled_scores.shape[-2:]
        future = np.triu(np.ones((q_len, k_len), dtype=bool), k=1)
        masked_scores = np.where(future, -np.inf, scaled_scores)

        weights = self.softmax(masked_scores)
        return np.matmul(weights, v)

    def MultiHead(self, list_of_strings):
        """Embed the strings and apply causal multi-head self-attention.

        Args:
            list_of_strings: Batch of strings passed to ``self.embed.forward``.

        Returns:
            Array of shape ``(batch, max_len, embed_dim)``.
        """
        x = self.embed.forward(list_of_strings)

        heads = []
        for w_q, w_k, w_v in zip(self.w_q, self.w_k, self.w_v):
            query = np.matmul(x, w_q)
            key = np.matmul(x, w_k)
            value = np.matmul(x, w_v)
            heads.append(self.sdpa(query, key, value))

        # Concatenate along the feature axis, then project back to embed_dim.
        concat = np.concatenate(heads, axis=-1)
        return np.matmul(concat, self.w_o)

In [12]:
attention = Attention()

# Named `sample_sentences` rather than `input` so the built-in `input()`
# is not shadowed for the rest of the kernel session.
sample_sentences = [
    "The sun is bright.",
    "Hello world!",
    "AI is transforming the world.",
    "Short",
]

# Last expression of the cell: the multi-head attention output for the batch.
attention.MultiHead(sample_sentences)

  query = np.matmul(x, self.w_q[i])
  query = np.matmul(x, self.w_q[i])
  query = np.matmul(x, self.w_q[i])
  key = np.matmul(x, self.w_k[i])
  key = np.matmul(x, self.w_k[i])
  key = np.matmul(x, self.w_k[i])
  value = np.matmul(x, self.w_v[i])
  value = np.matmul(x, self.w_v[i])
  value = np.matmul(x, self.w_v[i])
  scores = np.matmul(q, np.swapaxes(k, -2, -1))
  scores = np.matmul(q, np.swapaxes(k, -2, -1))
  scores = np.matmul(q, np.swapaxes(k, -2, -1))
  z = np.matmul(a, v)
  z = np.matmul(a, v)
  z = np.matmul(a, v)
  output = np.matmul(concat, self.w_o)
  output = np.matmul(concat, self.w_o)
  output = np.matmul(concat, self.w_o)


array([[[ 8.76981741e+02, -3.76012659e+02,  5.74969991e+02, ...,
         -3.77158841e+02, -3.38260857e+01, -4.33874268e+02],
        [ 2.87835573e+02, -7.70914279e+02, -4.00145085e+02, ...,
          8.37603895e+02,  7.08407259e+02,  9.95718250e+02],
        [ 1.13469171e+03, -1.57253042e+03,  1.29967458e+03, ...,
          8.15974310e+02, -1.57962632e+02,  3.99832623e+02],
        ...,
        [ 1.28590369e+03, -3.01137607e+02, -4.06931925e+02, ...,
         -1.46399214e+03, -7.86071001e+02, -1.76032089e+03],
        [ 1.28590369e+03, -3.01137607e+02, -4.06931925e+02, ...,
         -1.46399214e+03, -7.86071001e+02, -1.76032089e+03],
        [ 1.28590369e+03, -3.01137607e+02, -4.06931925e+02, ...,
         -1.46399214e+03, -7.86071001e+02, -1.76032089e+03]],

       [[ 3.13004882e+02,  3.91440123e+02,  1.21314096e+03, ...,
         -3.23846965e+02,  6.09130755e+02,  5.78467781e+01],
        [ 8.39614271e+02, -8.80535325e+02, -5.80785279e+02, ...,
         -2.76071634e+02,  1.03998787e