In [1]:
import re

class SimpleTokenizer:
    """Word-level tokenizer with a vocabulary that grows as texts are fitted.

    Words are the matches of the regex ``\\w+``, so punctuation and whitespace
    are discarded and matching is case-sensitive. Two ids are reserved:
    ``0`` for ``"[PAD]"`` and ``1`` for ``"[UNK]"``.

    Attributes:
        token_to_id: dict mapping each known word to its integer id.
        id_to_token: dict mapping each integer id back to its word.
        num_tokens: current vocabulary size (also the next id to hand out).
    """

    def __init__(self):
        self.token_to_id = {"[PAD]": 0, "[UNK]": 1}
        # Reverse lookup needed by untokenize(); fit() keeps it in sync.
        self.id_to_token = {0: "[PAD]", 1: "[UNK]"}
        self.num_tokens = 2

    # Fit the tokenizer onto the dataset
    def fit(self, texts):
        """Add every unseen word in ``texts`` to the vocabulary.

        Ids are assigned in first-seen order. Calling ``fit`` again with the
        same texts is a no-op, so the method can be called incrementally.

        Args:
            texts: iterable of strings.
        """
        for text in texts:
            for word in re.findall(r'\w+', text):
                if word not in self.token_to_id:
                    self.token_to_id[word] = self.num_tokens
                    self.id_to_token[self.num_tokens] = word
                    self.num_tokens += 1

    def tokenize(self, text):
        """Convert ``text`` into a list of ids; unknown words map to ``[UNK]``.

        Args:
            text: string to encode.

        Returns:
            list[int] with one id per ``\\w+`` match, in order of appearance.
        """
        unk_id = self.token_to_id["[UNK]"]
        return [self.token_to_id.get(word, unk_id) for word in re.findall(r'\w+', text)]

    def untokenize(self, token_list):
        """Convert ids back into a space-separated string.

        Ids that are not in the vocabulary are rendered as ``"[UNK]"``.
        Punctuation and original spacing are not recoverable because
        ``tokenize`` discards them.

        Args:
            token_list: iterable of integer ids.

        Returns:
            The words joined by single spaces (no trailing space).
        """
        return " ".join(self.id_to_token.get(token, "[UNK]") for token in token_list)

In [2]:
import torch
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    """Dataset of raw text pieces loaded eagerly from a UTF-8 file.

    Args:
        filename: path of the text file to read.
        is_multi_line: when truthy, each item is one line of the file
            (line terminators are kept). Otherwise the file is cut into
            consecutive pieces of ``chunk_size`` characters.
        chunk_size: number of characters per piece in chunked mode; the
            final piece may be shorter.

    Attributes:
        chunks: list of strings, one per dataset item.
    """

    def __init__(self, filename, is_multi_line=True, chunk_size=1024):
        self.chunks = []
        with open(filename, 'r', encoding="utf-8") as handle:
            if not is_multi_line:
                # read() yields "" once the file is exhausted; using it as the
                # sentinel stops iteration exactly at EOF.
                self.chunks.extend(iter(lambda: handle.read(chunk_size), ""))
            else:
                self.chunks = list(handle)

    def __len__(self):
        """Return the number of stored text pieces."""
        return len(self.chunks)

    def __getitem__(self, idx):
        """Return the text piece stored at position ``idx``."""
        return self.chunks[idx]

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Example usage: build a vocabulary from two sentences, then encode one of them.
texts = ["Hello world!", "How are you?"]
tokenizer = SimpleTokenizer()
# fit() hands out ids in first-seen order, starting at 2 because ids 0 and 1
# are reserved for "[PAD]" and "[UNK]".
tokenizer.fit(texts)
# Punctuation is dropped by the tokenizer's \w+ regex, so "!" yields no id.
tokenized_text = tokenizer.tokenize("Hello world!")
print(tokenized_text)  # Output: [2, 3]

[2, 3]


In [4]:
# Usage: load the corpus in fixed-size character chunks and extend the vocabulary.
# is_multi_line=False selects chunked reading with the default chunk_size (1024).
dataset = TextDataset('datasets/iac_mini.txt', is_multi_line=False)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# `tokenizer` is not created in this cell; it is the instance built in the
# example cell above, so its vocabulary already contains those example words.
# `texts` is rebound here and no longer refers to the two example sentences.
# NOTE(review): chunks are cut by character count, so a word that straddles a
# chunk boundary is seen by fit() as two separate fragments.
texts = dataset.chunks
tokenizer.fit(texts)

# Process data in batches
# Only the first batch is visited (see the `break`), and `tokens` is
# overwritten on every iteration without being stored anywhere.
for batch in dataloader:
    for line in batch:
        tokens = tokenizer.tokenize(line)
        # print(f"Original text: {line}")
        # print(f"Tokenized text: {tokens}")
    break

In [5]:
import torch
import torch.nn as nn

# Embedding dim is how many "features" you're allowing the model to use
# to describe the vocabulary
# vocab_size is a snapshot of the tokenizer's vocabulary at this point and
# includes the two reserved entries "[PAD]" and "[UNK]". If tokenizer.fit() is
# called again afterwards, this value (and the layer below) will be stale.
vocab_size = tokenizer.num_tokens
embedding_dim = 128

# One learnable vector of length embedding_dim per token id.
# NOTE(review): "[PAD]" has id 0 in the tokenizer; consider passing
# padding_idx=0 if padded positions should not be trained — confirm intent.
embedding_layer = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

In [6]:
import torch

# Not referenced by PositionalEncoding below (its max_len defaults to 5000);
# pass max_len=max_context_len explicitly if the two are meant to agree.
max_context_len = 512
# Create positional encodings to let the model learn the value of positions
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position information to a sequence of embeddings.

    The table is precomputed once for ``max_len`` positions and stored as a
    non-trainable buffer named ``pe`` with shape ``(max_len, 1, d_model)``.

    Args:
        d_model: embedding size (last dimension of the input). Both even and
            odd values are supported.
        max_len: number of positions to precompute. Inputs longer than this
            cannot be encoded.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()

        # Create positional encodings
        # Define a tensor of context length by input embedding dim
        pe = torch.zeros(max_len, d_model)
        # Column vector of position indices: [[0], [1], [2], ..., [max_len - 1]]
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        # Per-frequency scaling 10000^(-2i / d_model), computed in log space
        # so that the values stay in a numerically comfortable range.
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-torch.log(torch.tensor(10000.0)) / d_model))
        # Even columns carry the sine component.
        pe[:, 0::2] = torch.sin(position * div_term)
        # Odd columns carry the cosine component. When d_model is odd there is
        # one fewer odd column than even columns, so drop the last frequency;
        # for even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # Insert a singleton axis at dim 1 so the table broadcasts over it.
        pe = pe.unsqueeze(1)
        # A buffer follows the module across devices and is saved in the
        # state dict, but is not a parameter and receives no gradient.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` with the positional table added.

        Args:
            x: tensor whose dimension 0 is the position axis and whose last
                dimension is ``d_model``, i.e. ``(seq_len, batch, d_model)``.
                Batch-first tensors must be transposed before calling this.
                ``seq_len`` must not exceed ``max_len``.

        Returns:
            Tensor of the same shape as ``x``.
        """
        # Add positional encodings to the input embeddings
        return x + self.pe[:x.size(0)]


In [None]:
# Disabled scratch cell: everything below sits inside a string literal, so none
# of it is executed. It walks through the positional-encoding construction on a
# tiny example (embed_dim=4, max_len=8) and prints the table before and after
# the unsqueeze/transpose step.
'''
import torch
embed_dim = 4
max_len = 8
pe = torch.zeros(max_len, embed_dim)
# print(pe)
position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
# print(position)
z = torch.arange(0, embed_dim, 2).float()
y = torch.tensor(10000.0)
x = (-torch.log(torch.tensor(10000.0)) / embed_dim)
div_term = torch.exp(torch.arange(0, embed_dim, 2).float() * (-torch.log(torch.tensor(10000.0)) / embed_dim))
myx = position*div_term
# print(f"pos * div = {myx}")
# print(f"sin = {torch.sin(myx)}")
pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)
print(f"Positional Encoding: {pe}")
pe = pe.unsqueeze(0).transpose(0, 1)
print(f"Positional Encoding: {pe}")
'''

In [None]:
import torch
import torch.nn as nn
import math

class SelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention over a batch-first input.

    Args:
        embed_size: size of the last dimension of the input and the output.
        heads: number of attention heads; must divide ``embed_size`` evenly.

    Raises:
        AssertionError: if ``embed_size`` is not divisible by ``heads``.
    """

    def __init__(self, embed_size, heads):
        super().__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads

        assert (
            self.head_dim * heads == embed_size
        ), "Embedding size needs to be divisible by heads"

        # The projections are applied to the full embedding in forward() and
        # only afterwards split into heads, so they must map
        # embed_size -> embed_size (not head_dim -> head_dim).
        self.values = nn.Linear(embed_size, embed_size, bias=False)
        self.keys = nn.Linear(embed_size, embed_size, bias=False)
        self.queries = nn.Linear(embed_size, embed_size, bias=False)
        self.fc_out = nn.Linear(heads * self.head_dim, embed_size)

    def _split_heads(self, projected):
        """Reshape (batch, seq, embed_size) into (batch, heads, seq, head_dim)."""
        batch_size, seq_len, _ = projected.shape
        per_head = projected.view(batch_size, seq_len, self.heads, self.head_dim)
        # Heads must precede the sequence axis so the matmuls in forward()
        # contract over positions within each head.
        return per_head.transpose(1, 2)

    def forward(self, x, mask=None):
        """Apply self-attention to ``x``.

        Args:
            x: tensor of shape ``(batch, seq_len, embed_size)``.
            mask: optional tensor broadcastable to
                ``(batch, heads, seq_len, seq_len)``. Positions where the mask
                equals 0 are excluded from attention. A query row whose keys
                are all masked produces NaN after the softmax.

        Returns:
            Tensor of shape ``(batch, seq_len, embed_size)``.
        """
        batch_size, seq_len, _ = x.shape

        # Run through the initial linear layers, then split into heads.
        queries = self._split_heads(self.queries(x))
        keys = self._split_heads(self.keys(x))
        values = self._split_heads(self.values(x))

        # (batch, heads, seq_len, seq_len): similarity of every query
        # position with every key position, scaled by sqrt(head_dim).
        attention_scores = torch.matmul(queries, keys.transpose(-2, -1)) / math.sqrt(self.head_dim)

        if mask is not None:
            attention_scores = attention_scores.masked_fill(mask == 0, float("-inf"))

        attention_weights = torch.nn.functional.softmax(attention_scores, dim=-1)

        # Weighted sum of values, still separated by head.
        out = torch.matmul(attention_weights, values)
        # Concatenate heads: (batch, heads, seq, head_dim) -> (batch, seq, embed_size).
        out = out.transpose(1, 2).reshape(batch_size, seq_len, self.embed_size)

        return self.fc_out(out)

        


In [None]:
import torch
import math

# Define the input tensor x
# Toy input of shape (2, 3, 4) holding the values 1..24 in order; the model
# below treats dim 0 as the batch and splits the last dim (size 4) into heads.
x = torch.tensor([[[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]],
                  [[13, 14, 15, 16], [17, 18, 19, 20], [21, 22, 23, 24]]], dtype=torch.float32)

# Define the linear transformation layers for queries, keys, and values
class LinearLayers(torch.nn.Module):
    """Minimal multi-head self-attention used to experiment with tensor shapes.

    Args:
        embed_size: size of the last input dimension (default 4).
        heads: number of attention heads; must divide ``embed_size``
            (default 2).

    Raises:
        ValueError: if ``embed_size`` is not divisible by ``heads``.
    """

    def __init__(self, embed_size=4, heads=2):
        super().__init__()
        if embed_size % heads != 0:
            raise ValueError("embed_size must be divisible by heads")
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads
        self.queries = torch.nn.Linear(embed_size, embed_size)
        self.keys = torch.nn.Linear(embed_size, embed_size)
        self.values = torch.nn.Linear(embed_size, embed_size)
        self.fc_out = torch.nn.Linear(embed_size, embed_size)

    def _split_heads(self, projected):
        """Reshape (batch, seq, embed_size) into (batch, heads, seq, head_dim)."""
        per_head = projected.view(projected.size(0), -1, self.heads, self.head_dim)
        # Put heads before positions; otherwise the matmuls in forward() would
        # compare the heads of a single token with each other instead of
        # comparing tokens.
        return per_head.transpose(1, 2)

    def forward(self, x):
        """Apply unmasked self-attention.

        Args:
            x: tensor of shape ``(batch, seq_len, embed_size)``.

        Returns:
            Tensor of shape ``(batch, seq_len, embed_size)``.
        """
        # Project, then split the last dimension into (heads, head_dim).
        queries = self._split_heads(self.queries(x))
        keys = self._split_heads(self.keys(x))
        values = self._split_heads(self.values(x))

        # Compute attention scores: (batch, heads, seq_len, seq_len)
        attention_scores = torch.matmul(queries, keys.transpose(-2, -1)) / math.sqrt(self.head_dim)

        # Softmax normalization over the key positions
        attention_weights = torch.nn.functional.softmax(attention_scores, dim=-1)

        # Compute output: (batch, heads, seq_len, head_dim)
        out = torch.matmul(attention_weights, values)

        # Concatenating heads: back to (batch, seq_len, embed_size)
        out = out.transpose(1, 2).reshape(x.size(0), -1, self.embed_size)
        out = self.fc_out(out)
        return out

# Instantiate the model and run the forward pass
# No random seed is set in this cell, so the layer weights — and therefore the
# printed values — differ from run to run.
model = LinearLayers()
output = model(x)
print(output)
