In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import nanogpt_train
from tokenize_support import TokenizationProcedure

# Seed torch's global random number generator so random draws made through torch
# (e.g. the torch.randint call used for batch sampling) are repeatable across runs.
torch.manual_seed(15)

<torch._C.Generator at 0x7f7d5b4f6bb0>

### Tokenize the data

In [2]:
# Read the raw training corpus from disk.
with open('input.txt', 'r', encoding = 'utf-8') as f:
    text = f.read()

# Choose the tokenization scheme and build the matching tokenizer.
processing_type = "byte"
tokenizer = TokenizationProcedure(processing_type = processing_type)

# Encode the whole corpus into a tensor of integer token ids.
token_ids = tokenizer.encode_text(text)
data = torch.tensor(token_ids, dtype = torch.long)

# Train/validation split: the first 90% of tokens train the model,
# the remaining 10% are held out for validation.
n = int(0.9*len(data))
train_data, val_data = data[:n], data[n:]



In [3]:
# Sanity check: look at the first ten encoded token ids.
data[:10]

tensor([ 7127, 84479,   734, 13036,   581, 18988,  1062,  6544,    11,  9598])

In [4]:
# Number of tokens in the training split (first 90% of the corpus).
train_data.shape

torch.Size([267845])

In [5]:
# Number of tokens in the validation split (last 10% of the corpus).
val_data.shape

torch.Size([29761])

In [6]:
# ---- Hyperparameters ----

# Batch geometry
batch_size = 16        # how many independent sequences are processed in parallel
context_length = 256   # maximum context length for predictions (a.k.a. block size)

# Training / evaluation schedule
max_iters = 3500       # total number of optimisation steps
eval_interval = 500    # estimate the loss every this many steps
eval_iters = 200       # number of batches averaged per split when estimating the loss
learning_rate = 3e-4

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Model size
n_embd = 384
num_blocks = 2
num_heads = 6
# The head size is derived rather than set directly: the embedding
# dimensions are split evenly amongst the requested heads.
head_size = n_embd//num_heads

# Dropout rates
gen_dropout = 0.2
attn_dropout = 0.2
residual_cxn_dropout = 0.2

# Vocabulary size for LLMs should generally cover every token the tokenizer can
# produce, not just those found in this data set, so the "byte" setting takes
# n_vocab from the underlying tokenizer. Any other setting falls back to the
# number of unique tokens actually present in the encoded corpus.
vocab_size = (
    tokenizer.tokenizer.n_vocab
    if processing_type == "byte"
    else len(data.unique())
)



In [7]:
# Arguments handed to nanogpt_train.GPTConfig.
# Every value is taken from the hyperparameter variables defined earlier in the
# notebook, so there is a single source of truth. Previously several values were
# hard-coded copies, batch_size had diverged (64 here vs. batch_size = 16 in the
# hyperparameter section), and the 'head_size' key was listed twice.
model_args = {'batch_size': batch_size,
              'context_length': context_length,
              'max_iters': max_iters,
              'eval_interval': eval_interval,
              'learning_rate': learning_rate,
              'device': device,
              'eval_iters': eval_iters,
              'n_embd': n_embd,
              'gen_dropout': gen_dropout,
              'num_blocks': num_blocks,
              'num_heads': num_heads,
              'head_size': head_size,
              'attn_dropout': attn_dropout,
              'residual_cxn_dropout': residual_cxn_dropout,
              'vocab_size': vocab_size}


In [8]:
# Build the model configuration from the argument dictionary.
gptconfig = nanogpt_train.GPTConfig(**model_args)
# Display the configured per-head size as a sanity check.
gptconfig.head_size

64

In [9]:
# Instantiate the model from the configuration and print its layer structure.
model = nanogpt_train.PrelimGPT(gptconfig)
print(model)

PrelimGPT(
  (token_embedding_table): Embedding(200019, 384)
  (position_embedding_table): Embedding(256, 384)
  (attention_blocks): Sequential(
    (0): Block(
      (multihead_attention): CausalSelfAttention(
        (kqv): Linear(in_features=384, out_features=1152, bias=False)
        (dropout): Dropout(p=0.2, inplace=False)
        (attn_dropout): Dropout(p=0.2, inplace=False)
        (residual_dropout): Dropout(p=0.2, inplace=False)
        (projection): Linear(in_features=384, out_features=384, bias=True)
      )
      (ln1): LayerNorm()
      (MLP): MLP(
        (sequential_MLP): Sequential(
          (0): Linear(in_features=384, out_features=1536, bias=False)
          (1): GELU(approximate='none')
          (2): Linear(in_features=1536, out_features=384, bias=False)
          (3): Dropout(p=0.2, inplace=False)
        )
      )
      (ln2): LayerNorm()
    )
    (1): Block(
      (multihead_attention): CausalSelfAttention(
        (kqv): Linear(in_features=384, out_features=11

In [10]:
#Batch splitting
def get_batch(split, batch_size, context_length):
    """Sample a random batch of input/target sequences from one data split.

    Args:
        split: 'train' selects ``train_data``; any other value selects ``val_data``.
        batch_size: number of sequences to draw.
        context_length: number of tokens in each sequence.

    Returns:
        A tuple ``(x, y)`` of stacked tensors moved to ``device``. Each row of
        ``y`` is the corresponding row of ``x`` shifted forward by one token,
        i.e. the next-token targets.
    """
    source = val_data
    if split == 'train':
        source = train_data

    # One random start offset per sequence; the upper bound leaves room for the
    # target window, which reaches one token further than the input window.
    starts = torch.randint(len(source) - context_length, (batch_size,))

    inputs = []
    targets = []
    for start in starts:
        inputs.append(source[start:start+context_length])
        targets.append(source[start+1:start+context_length+1])

    # Stack the per-sequence slices into batches and move them to the
    # compute device for GPU compatibility.
    x = torch.stack(inputs).to(device)
    y = torch.stack(targets).to(device)

    return x,y

@torch.no_grad()  # nothing inside this function needs gradient tracking
def estimate_loss(model, split=None):
    """Estimate the mean loss on both the train and validation splits.

    The model is switched to eval mode while measuring, and its previous
    train/eval mode is restored afterwards, even if an exception is raised
    part-way through.

    Args:
        model: module called as ``model(X, Y)`` and returning ``(logits, loss)``.
        split: unused. Both splits are always evaluated; the parameter is kept
            (now optional) so existing two-argument calls keep working.

    Returns:
        dict with keys 'train' and 'val', each holding the mean loss over
        ``eval_iters`` randomly sampled batches as a 0-dim tensor.
    """
    out = {}
    was_training = model.training
    model.eval()
    try:
        for split_name in ('train', 'val'):
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                # get_batch requires the batch geometry explicitly; use the
                # notebook-level hyperparameters.
                X, Y = get_batch(split_name, batch_size, context_length)
                _, loss = model(X, Y)
                losses[k] = loss.item()
            out[split_name] = losses.mean()
    finally:
        # Put the model back in whichever mode the caller had it in.
        model.train(was_training)
    return out


In [None]:
# Smoke test: draw one training batch using the notebook-level hyperparameters.
get_batch(split = 'train', batch_size = batch_size, context_length = context_length)

In [11]:
# Release cached GPU memory before the training model is built in the next cell.
torch.cuda.empty_cache()

In [12]:
## Train the model

def _is_cuda_oom(error):
    """Return True when `error` looks like a CUDA out-of-memory failure."""
    return "out of memory" in str(error)


def _free_gpu_cache():
    """Announce the out-of-memory condition and release cached GPU memory."""
    print("Cuda ran out of memory , we need to clear the GPU cache")
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()


def _report_gpu_memory():
    """Print allocated, free and cached memory (in GB) for every visible GPU."""
    for n_gpu in range(torch.cuda.device_count()):
        local_device = torch.device(f"cuda:{n_gpu}")
        total_memory = torch.cuda.get_device_properties(local_device).total_memory / (1024**3)
        # Query this specific device; without the argument torch reports the
        # current device only, so every GPU would show the same numbers.
        allocated_memory = torch.cuda.memory_allocated(local_device) / (1024**3)
        reserved_memory = torch.cuda.memory_reserved(local_device) / (1024**3)
        open_memory = total_memory - reserved_memory

        # Printed inside the loop so each GPU is reported, not just the last one.
        print(f"GPU {n_gpu}:")
        print(f"Allocated: {allocated_memory:.2f} GB")
        print(f"Free Memory: {open_memory:.2f} GB")
        print(f"Cached: {reserved_memory:.2f} GB")


# Build a fresh model from the configuration and move it to the compute device.
gptconfig = nanogpt_train.GPTConfig(**model_args)
model = nanogpt_train.PrelimGPT(gptconfig)

print(model)

m = model.to(device)

# Create a PyTorch optimizer over the model parameters.
optimizer = torch.optim.AdamW(params = m.parameters(), lr = learning_rate)

for step in range(max_iters):
    at_eval_step = step % eval_interval == 0

    # Periodically estimate train/val loss (skipped at step 0, before any update).
    if at_eval_step and step != 0:
        try:
            losses = estimate_loss(model, 'eval')
            print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        except RuntimeError as e:
            if not _is_cuda_oom(e):
                raise
            _free_gpu_cache()
            # An interrupted evaluation may have left the model in eval mode.
            model.train()

    # Sample a training batch; get_batch needs the batch geometry explicitly.
    xb, yb = get_batch('train', batch_size, context_length)

    # Forward pass.
    try:
        logits, loss = m(xb, yb)
    except RuntimeError as e:
        if not _is_cuda_oom(e):
            raise
        _free_gpu_cache()
        # No loss was produced for this batch, so skip the backward pass and
        # optimizer step instead of reusing a stale (or undefined) `loss`.
        continue

    if device == 'cuda' and at_eval_step:
        _report_gpu_memory()

    # Zero the gradients prior to running gradient calculations.
    optimizer.zero_grad(set_to_none=True)

    # Calculate the gradients...
    loss.backward()

    # ...and step the optimizer to update the parameters.
    optimizer.step()

PrelimGPT(
  (token_embedding_table): Embedding(200019, 384)
  (position_embedding_table): Embedding(256, 384)
  (attention_blocks): Sequential(
    (0): Block(
      (multihead_attention): CausalSelfAttention(
        (kqv): Linear(in_features=384, out_features=1152, bias=False)
        (dropout): Dropout(p=0.2, inplace=False)
        (attn_dropout): Dropout(p=0.2, inplace=False)
        (residual_dropout): Dropout(p=0.2, inplace=False)
        (projection): Linear(in_features=384, out_features=384, bias=True)
      )
      (ln1): LayerNorm()
      (MLP): MLP(
        (sequential_MLP): Sequential(
          (0): Linear(in_features=384, out_features=1536, bias=False)
          (1): GELU(approximate='none')
          (2): Linear(in_features=1536, out_features=384, bias=False)
          (3): Dropout(p=0.2, inplace=False)
        )
      )
      (ln2): LayerNorm()
    )
    (1): Block(
      (multihead_attention): CausalSelfAttention(
        (kqv): Linear(in_features=384, out_features=11

In [None]:
# Generation section
# Switch to eval mode so the dropout layers are disabled while sampling; the
# model is otherwise still in train mode after the training loop. It is left in
# eval mode afterwards; call m.train() before resuming training.
m.eval()

# Start from a single sequence holding one token with id 0.
context = torch.zeros((1,1), dtype = torch.long, device = device)

# No backward pass is needed during generation, so skip gradient tracking.
with torch.no_grad():
    generated_ids = m.generate(context, max_new_tokens = 500)[0].tolist()

# Turn the generated ids into text.
# NOTE(review): `decode` is not defined or imported anywhere in the visible
# notebook; presumably this should go through `tokenizer` -- confirm which
# decoding method TokenizationProcedure exposes.
print(''.join(decode(generated_ids)))