In [3]:
import sys
import os
sys.path.append(os.path.abspath('../src'))
import math
import numpy as np
from config import seq_len, batch_size, d_model, vocab_size, n_heads
from utils import tokenize, clean, encode_position

In [None]:
# Normalise the raw corpus on disk, then read the cleaned text back and tokenize it.
raw_corpus_path = '../data/raw_corpus.txt'
clean_corpus_path = '../data/clean_corpus.txt'

clean(raw_corpus_path, clean_corpus_path)

with open(clean_corpus_path, mode='r', encoding='utf-8') as f:
    corpus = f.read()

# The return value is intentionally not bound here; the next cell loads the
# token ids from '../data/tokenized_corpus_ids.npy'.
# NOTE(review): presumably tokenize() writes that file as a side effect — confirm in utils.
tokenize(corpus)

In [19]:
tokenized_corpus = np.load('../data/tokenized_corpus_ids.npy')

# Every example needs seq_len input tokens plus one more token for the
# shifted target, hence len - seq_len usable start positions.
n_sequences = len(tokenized_corpus) - seq_len

# Fail early with a readable message instead of the cryptic
# "number sections must be larger than 0" that np.array_split would raise.
if n_sequences < batch_size:
    raise ValueError(
        f"Corpus has {len(tokenized_corpus)} tokens, which yields "
        f"{max(n_sequences, 0)} training sequences of length {seq_len}; "
        f"at least one full batch ({batch_size} sequences) is required."
    )

# Make training examples with stride = 1.
# sliding_window_view returns read-only *views* into tokenized_corpus, so the
# (n_sequences, seq_len) inputs and targets cost no extra memory, unlike
# materialising every window through a Python list comprehension.
windows = np.lib.stride_tricks.sliding_window_view(tokenized_corpus, seq_len)
x_sequences = windows[:n_sequences]        # window i      -> tokens[i : i + seq_len]
y_sequences = windows[1:n_sequences + 1]   # window i + 1  -> tokens[i + 1 : i + seq_len + 1]

# Drop the trailing partial batch so every batch holds exactly batch_size rows.
n_batches = len(x_sequences) // batch_size

x_batches = np.array_split(x_sequences[:n_batches*batch_size], n_batches)
y_batches = np.array_split(y_sequences[:n_batches*batch_size], n_batches)

# Consider using Xavier initialization here
# NOTE(review): no random seed is set anywhere in view, so the embedding (and
# every later weight draw) differs between kernel restarts.
embedding_factor = 1 / math.sqrt(d_model)  # Scale factor for embedding matrix
embedding_matrix = np.random.randn(vocab_size, d_model) * embedding_factor

# Example: 0th batch
# Integer-array indexing looks up one d_model-sized row per token id:
# (batch_size, seq_len) -> (batch_size, seq_len, d_model).
X = embedding_matrix[x_batches[0]]
X = encode_position(X)
# Attention
## Initialize weights (Xavier normal)
def xavier_normal(fan_in, fan_out, shape=None):
    """Draw weights from a zero-mean normal with Xavier (Glorot) scaling.

    Args:
        fan_in: Number of input units used in the variance formula.
        fan_out: Number of output units used in the variance formula.
        shape: Shape of the returned array. Defaults to ``(fan_in, fan_out)``.

    Returns:
        numpy.ndarray of the requested shape, sampled with standard deviation
        ``sqrt(2 / (fan_in + fan_out))`` from NumPy's global random state.
    """
    size = (fan_in, fan_out) if shape is None else shape
    sigma = np.sqrt(2 / (fan_in + fan_out))
    return np.random.normal(0, sigma, size)

# Projection weights, drawn in K, Q, V order so the global RNG stream is
# consumed exactly as three separate calls would consume it.
W_K, W_Q, W_V = (xavier_normal(d_model, d_model) for _ in range(3))

## Compute K, Q, and V
# shapes: (batch_size, seq_len, d_model) @ (d_model, d_model) = (batch_size, seq_len, d_model)
K = X @ W_K
Q = X @ W_Q
V = X @ W_V

## TODO: Better way to reshape and transpose when axes are kept (?)

## Split
d_k = d_model // n_heads  # per-head feature width
head_shape = (batch_size, seq_len, n_heads, d_k)

# Split the feature axis into heads, then move the head axis in front of the
# sequence axis: (batch_size, seq_len, n_heads, d_k) -> (batch_size, n_heads, seq_len, d_k)
K = np.swapaxes(K.reshape(head_shape), 1, 2)
Q = np.swapaxes(Q.reshape(head_shape), 1, 2)
V = np.swapaxes(V.reshape(head_shape), 1, 2)

## Get scores
# shapes: (batch_size, n_heads, seq_len, d_k) @ (batch_size, n_heads, d_k, seq_len)
#         = (batch_size, n_heads, seq_len, seq_len)
K_T = np.swapaxes(K, -1, -2)
scores = (Q @ K_T) / np.sqrt(d_k)

## TODO: Masking
## TODO: Softmax
