In [None]:
import time
import torch
import torch.nn as nn
from torch.nn import functional as F
import mmap
import random
import pickle

In [18]:
device = (
    'cuda' if torch.cuda.is_available()
    else 'mps' if torch.backends.mps.is_available()
    else 'cpu'
)
print(device)

cuda


### GPT Hyperparameters

In [None]:
# MODEL VARIABLES
batch_size = 8                  # Number of sequences processed in parallel during one training step.
block_size = 32                 # context window length
n_embd = 384                    # Embedding dimension — the size of each token’s vector.
n_head = 8                      # Number of attention heads. Each head learns a different relational pattern among tokens. Each head operates on n_embd / n_head = 48 features here.
n_layer = 1                     # Number of Transformer blocks/layers stacked.
dropout = 0.2                   # Randomly zeroes out 20% of connections during training to prevent overfitting.

# TRAINING CONTROL VARIABLES
max_iters = 2000                # How many training steps (batches) to run. One iteration = one optimizer update.
learning_rate = 3e-4            # Step size in optimization (how far parameters move per update).
eval_iters = 100                # How many mini-batches to average when estimating training/validation loss.

In [30]:
# read in our corpus into text string
chars = ""
with open('data\philosophers.txt','r', encoding='utf-8') as f:
    text = f.read()
    chars = sorted(list(set(text)))

vocab_size = len(chars)
print(chars)
print(text[:200])

['\n', ' ', '!', '"', "'", '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'æ', '—', '’', '“', '”', '\ufeff']
﻿

=== PLATO ===

How you, O Athenians, have been affected by my accusers, I cannot tell;
but I know that they almost made me forget who I was—so persuasively
did they speak; and yet they have hardly 


In [31]:
# encoder and decoder for strings to integers and vice versa

#chars = sorted(set(text))
string_to_int = {ch:i for i,ch in enumerate(chars)}
int_to_string = {i:ch for i,ch in enumerate(chars)}
encode = lambda s: [string_to_int[c] for c in s]
decode = lambda l: ''.join([int_to_string[i] for i in l])

In [22]:
# memory map for using small snippets of text from a single file of any size
def get_random_chunk(split):
    filename = "data\\philosophers.txt"
    with open(filename, 'rb') as f:
        with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
            # Determine the file size and a random position to start reading
            file_size = len(mm)
            start_pos = random.randint(0, (file_size) - block_size*batch_size)

            # Seek to the random position and read the block of text
            mm.seek(start_pos)
            block = mm.read(block_size*batch_size-1)

            # Decode the block to a string, ignoring any invalid byte sequences
            decoded_block = block.decode('utf-8', errors='ignore').replace('\r', '')
            
            # Train and test splits
            data = torch.tensor(encode(decoded_block), dtype=torch.long)
            
    return data
#

In [32]:
data = torch.tensor(encode(text), dtype=torch.long)
# use first 80% of corpus for training; remaining 20% for validation
n = int(0.8 * len(data))
train_data = data[:n]
val_data = data[n:]

def get_batch(split):
    data = get_random_chunk(split) # Elliot's version - uses a different method to get data from train/test
    #data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    #print(ix)
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x,y = x.to(device), y.to(device)
    return x, y

#### Classes of the GPT Model
* Head
* MultiHeadAttention
* FeedForward
* Block
* GPTLanguageModel

In [None]:
class Head(nn.Module):
    """ one head of self-attention"""

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # input of size (batch, time-step, channels)
        # output of size (batch, time-step, head size)

        B,T,C = x.shape
        k = self.key(x)
        q = self.query(x)
        v = self.value(x)
        # computer attention scores ("affinities")

        # head_size = subproblems
        # scaling controls so no one individual head dominates / "hear all voices evenly"
        weights = q @ k.transpose(-2,-1) * k.shape[-1]**-0.5 # (B,T, hs) @ (B, hs, T) -> (B,T,T)

        weights = weights.masked_fill(self.tril[:T, :T] == 0, float('-inf'))  # (B, T, T)
        weights = F.softmax(weights, dim=-1) # (B, T, T)
        weights = self.dropout(weights)

        # perform the weighted aggregation of the values
        out = weights @ v # (B, T, T) @ (B,T, hs) -> (B ,T, hs)
        return out

class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel"""
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])     # heads running in parallel - from HEAD class
        self.proj = nn.Linear(head_size * num_heads, n_embd)                        # projection
        self.dropout = nn.Dropout(dropout)                                          # turn off some neurons

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)                         # concatenate along feature dimension (B,T,F) -> (B,T, [h1, h1, h1, h1, h2,h2,h2,h2,h3,h3,h3,h3])
        out = self.dropout(self.proj(out))
        return out


class FeedForward(nn.Module):
    """ a simple linear layer followed by a non-linearity"""
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4* n_embd, n_embd),
            nn.Dropout(dropout) # used to prevent overfitting. look up later.
        )

    def forward(self, x):
        return self.net(x)


class Block(nn.Module):
    """ Transformer block: communication followed by computation"""
    # Blocks have two main sub-components: the self attention layer, and the feed forward network

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head                        # number of features that each head will capture in multi-head attention
        self.sa = MultiHeadAttention(n_head, head_size)     # self-attention layer from MultiHeadAttention class
        self.ffwd = FeedForward(n_embd)                     # feed-forward network
        self.ln1 = nn.LayerNorm(n_embd)                     # post-norm
        self.ln2 = nn.LayerNorm(n_embd)                     # post-norm

    def forward(self, x):
        y = self.sa(x)                  # self-attention layer from MultiHeadAttention class
        x = self.ln1(x+y)               # add+norm
        y = self.ffwd(x)                # feed-forward network
        x = self.ln2(x+y)               # add+norm
        return x

class GPTLanguageModel(nn.Module):
    def __init__(self, vocabulary_size):
        super().__init__()
        self.token_embeddings_table = nn.Embedding(vocabulary_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)]) # decoder layers
        self.ln_f = nn.LayerNorm(n_embd) #normalization
        self.lm_head = nn.Linear(n_embd, vocabulary_size) # linear transformation

        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    # forward transforms tokens into embeddings, passing them through attention blocks, and projecting them back to vocabulary scores
    def forward(self, index, targets=None):                                         # index is the X tensor passed from model(X,Y); targets is the Y tensor

        B, T = index.shape                                                          # index is a tensor with B batches, and T time-steps (or sequence lengths)

        tok_emb = self.token_embeddings_table(index)                                # token embedding vectors for the index tensor. should be shape of (B, T, number of embedding features for each token - aka n_embd aka C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))     # positional embedding vector for the index tensor. shape is (T, C aka n_embd)
        x = tok_emb + pos_emb                                                       # token and positional embeddings combined to form tensor of shape (B,T,C aka n_embd)
        x = self.blocks(x)                                                          # transformer block(s) from the Block class
        x = self.ln_f(x)                                                            # normalization
        logits = self.lm_head(x)                                                    # linear transformation

        if targets is None:
            return logits, None
        
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        targets = targets.view(B*T)
        loss = F.cross_entropy(logits, targets)
        return logits, loss #return logits.view(B, T, C), loss
    

    def generate(self, index, max_new_tokens):
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            index_cond = index[:, -block_size:]

            # get the predictions
            logits, loss = self.forward(index_cond) #logits, _ = self.forward(index)          # (B,T,C)
            #print(f'index {index}')
            #print(logits)

            # focus only on the last time step
            logits = logits[:, -1, :]                # take last time step -> (B,C)
            #print('last')
            #print(logits)
            probs = F.softmax(logits, dim=-1)        # turn into probabilities
            #print('probs')
            #print(probs)
            index_next = torch.multinomial(probs, num_samples=1) # sample next token -> (B,1)
            #print(index_next)
            index = torch.cat((index, index_next), dim=1)
        return index


In [33]:
model = GPTLanguageModel(vocab_size).to(device)

#m = model.to(device)

#next(m.parameters()).device

In [None]:
@torch.no_grad()                            # decorator that disables gradient tracking - unneccesary during evaluation
def estimate_loss() -> dict:                # returns a dictionary that contains the average loss for training and validation splits
    out = {}                                # instantiate dictionary that will be returned by the function
    model.eval()                            # switch model to evaluation mode; disables dropout randomness, among other things unnecessary when not training
    for split in ['train','val']:           # done to return a loss for training and validation
        losses = torch.zeros(eval_iters)    # eval_iters = number of mini-batches of data to be used during loss estimation.
        for k in range(eval_iters):
            X,Y = get_batch(split)          # get your X and Y values
            logits, loss = model(X,Y)       # model(X,Y) calls the forward() method
            losses[k] = loss.item()
        out[split] = losses.mean()          # average loss among the n number of batches saved.
    model.train()                           # switch model back to train mode
    return out

In [None]:
start_time = time.perf_counter()

# create the optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    
    losses = estimate_loss()

    if iter % 20 == 0: print(f"step: {iter}, train loss: {losses['train']:.3f}, val loss {losses['val']:.3f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    #evaluate the loss
    logits,loss = model(xb,yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
print(loss.item())

with open('model-03.pkl', 'wb') as f:
    pickle.dump(model,f)
print('model saved')

end_time = time.perf_counter()
elapsed_time = end_time - start_time
print(f"Code block executed in {elapsed_time:.4f} seconds.")

0
step: 0, train loss: 4.453, val loss 4.453
1
step: 1, train loss: 4.420, val loss 4.424
2
step: 2, train loss: 4.388, val loss 4.389
3
step: 3, train loss: 4.357, val loss 4.361
4
step: 4, train loss: 4.325, val loss 4.330
5
step: 5, train loss: 4.296, val loss 4.295
6
step: 6, train loss: 4.263, val loss 4.264
7
step: 7, train loss: 4.223, val loss 4.226
8
step: 8, train loss: 4.196, val loss 4.200
9
step: 9, train loss: 4.158, val loss 4.157
10
step: 10, train loss: 4.124, val loss 4.118
11
step: 11, train loss: 4.081, val loss 4.085
12
step: 12, train loss: 4.039, val loss 4.042
13
step: 13, train loss: 3.999, val loss 4.000
14
step: 14, train loss: 3.964, val loss 3.955
15
step: 15, train loss: 3.916, val loss 3.918
16
step: 16, train loss: 3.873, val loss 3.859
17
step: 17, train loss: 3.815, val loss 3.824
18
step: 18, train loss: 3.775, val loss 3.773
19
step: 19, train loss: 3.724, val loss 3.731
20
step: 20, train loss: 3.685, val loss 3.687
21
step: 21, train loss: 3.633, v

In [None]:
prompt = "Hello! Can you see me?"
context = torch.tensor(encode(prompt), dtype=torch.long, device=device)
generated_chars = decode(model.generate(context.unsqueeze(0), max_new_tokens=100)[0].tolist())
print(generated_chars)

Hello! Can you see me?
erodonom anctit t of
who An tey arens),of thee y jorequt gearman ugeriatof and es walivell s, evene


In [35]:
prompt = "Hello! Can you see me?"
context = torch.tensor(encode(prompt), dtype=torch.long, device=device)
generated_chars = decode(model.generate(context.unsqueeze(0), max_new_tokens=100)[0].tolist())
print(generated_chars)

Hello! Can you see me?.um s , may the a. ir an Ae t theny oyhemonelil t t fr
or th f5iun
man 9; a
dhino
omftes tm thusseve


In [None]:
import time

start_time = time.perf_counter()

# create the optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    print(iter)
    losses = estimate_loss()
    print(f"step: {iter}, train loss: {losses['train']:.3f}, val loss {losses['val']:.3f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    #evaluate the loss
    logits,loss = model.forward(xb,yb) #don't use forward
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
print(loss.item())

with open('model-03.pkl', 'wb') as f:
    pickle.dump(model,f)
print('model saved')

end_time = time.perf_counter()
elapsed_time = end_time - start_time
print(f"Code block executed in {elapsed_time:.4f} seconds.")

0
step: 0, train loss: 2.527, val loss 2.558
1
step: 1, train loss: 2.559, val loss 2.578
2
step: 2, train loss: 2.586, val loss 2.581
3
step: 3, train loss: 2.586, val loss 2.627
4
step: 4, train loss: 2.583, val loss 2.575
5
step: 5, train loss: 2.610, val loss 2.565
6
step: 6, train loss: 2.588, val loss 2.549
7
step: 7, train loss: 2.550, val loss 2.579
8
step: 8, train loss: 2.595, val loss 2.548
9
step: 9, train loss: 2.576, val loss 2.544
10
step: 10, train loss: 2.543, val loss 2.556
11
step: 11, train loss: 2.566, val loss 2.543
12
step: 12, train loss: 2.549, val loss 2.557
13
step: 13, train loss: 2.569, val loss 2.546
14
step: 14, train loss: 2.580, val loss 2.540
15
step: 15, train loss: 2.575, val loss 2.531
16
step: 16, train loss: 2.567, val loss 2.542
17
step: 17, train loss: 2.547, val loss 2.544
18
step: 18, train loss: 2.538, val loss 2.537
19
step: 19, train loss: 2.543, val loss 2.546
20
step: 20, train loss: 2.538, val loss 2.569
21
step: 21, train loss: 2.522, v

In [37]:
prompt = "Hello! Can you see me?"
context = torch.tensor(encode(prompt), dtype=torch.long, device=device)
generated_chars = decode(model.generate(context.unsqueeze(0), max_new_tokens=100)[0].tolist())
print(generated_chars)

Hello! Can you see me?" kene satttanel furs yof isersuch
onf or force was he cophasodler eastemeed tists fit foors, was to


In [14]:
import inspect
help(nn.Linear)
#print(inspect.signature(nn.Module.__init__))

Help on class Linear in module torch.nn.modules.linear:

class Linear(torch.nn.modules.module.Module)
 |  Linear(in_features: int, out_features: int, bias: bool = True, device=None, dtype=None) -> None
 |  
 |  Applies an affine linear transformation to the incoming data: :math:`y = xA^T + b`.
 |  
 |  This module supports :ref:`TensorFloat32<tf32_on_ampere>`.
 |  
 |  On certain ROCm devices, when using float16 inputs this module will use :ref:`different precision<fp16_on_mi200>` for backward.
 |  
 |  Args:
 |      in_features: size of each input sample
 |      out_features: size of each output sample
 |      bias: If set to ``False``, the layer will not learn an additive bias.
 |          Default: ``True``
 |  
 |  Shape:
 |      - Input: :math:`(*, H_\text{in})` where :math:`*` means any number of
 |        dimensions including none and :math:`H_\text{in} = \text{in\_features}`.
 |      - Output: :math:`(*, H_\text{out})` where all but the last dimension
 |        are the same shap

In [15]:
class fakename(nn.Linear):
    def __init__(self, name, in_features, out_features):
        super().__init__(in_features, out_features)
        self.name = name

me = fakename("Carlos", 4, 4)