In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import nanogpt_train

# Seed PyTorch's global random number generator.
torch.manual_seed(1337)

# ---------------------------------------------------------------------------
# Hyperparameters
# ---------------------------------------------------------------------------

# Batching
batch_size = 64        # number of independent character sequences processed in parallel
context_length = 256   # maximum context length for predictions (Cody's name for "block size")

# Training schedule
max_iters = 3500       # total optimizer steps (previously 3000)
eval_interval = 500    # the training loop reports losses every `eval_interval` steps
eval_iters = 200       # presumably the number of batches averaged per loss estimate -- TODO confirm
learning_rate = 3e-4   # learning rate passed to the optimizer

# Compute device: prefer CUDA when it is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Model size
n_embd = 384           # presumably the embedding width -- TODO confirm against the model definition
num_blocks = 2
num_heads = 6

# Dropout rates
gen_dropout = 0.2
attn_dropout = 0.2
residual_cxn_dropout = 0.2



### Tokenize the data

In [None]:
# Read the whole training corpus into memory as one string.
with open('input.txt', 'r', encoding = 'utf-8') as f:
    text = f.read()

# To inspect the corpus size, uncomment:
# print(f"length of the dataset in characters: {len(text)}")


# Vocabulary: every distinct character in the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)
all_chars = ''.join(chars)

print(f"All Chars: {all_chars}")
print(f"Total number: {vocab_size} chars")


# Character-level tokenization: each character maps to its index in `chars`.
stoi = {ch: idx for idx, ch in enumerate(chars)}   # character -> integer id
itos = dict(enumerate(chars))                      # integer id -> character


def encode(s):
    """Return the list of integer ids for the characters of `s`, in order."""
    return [stoi[c] for c in s]


def decode(l):
    """Return the list of characters for the ids in `l` (a list, not a joined string)."""
    return [itos[c] for c in l]


# The full corpus as a 1-D tensor of token ids (one integer per character).
data = torch.tensor(encode(text), dtype = torch.long)
# To inspect the encoded corpus, uncomment:
# print("Every integer in data represents a single character ", data.shape)


# Train / validation split: the first 90% of tokens train, the remainder validates.
n = int(0.9*len(data))
train_data, val_data = data[:n], data[n:]



In [None]:
## Run the model
# Instantiate the model and move it to the selected device.
# NOTE(review): BigramLanguageModel, estimate_loss and get_batch are not defined
# in the cells visible here -- presumably they come from an earlier cell or from
# nanogpt_train. Confirm this cell survives Restart Kernel -> Run All.
model = BigramLanguageModel()
m = model.to(device)

# Create a PyTorch optimizer over all model parameters.
optimizer = torch.optim.AdamW(params = m.parameters(), lr = learning_rate)

# Training loop. The counter has a real name because it is read inside the loop,
# and because `_` is IPython's "last output" variable and should not be clobbered.
for step in range(max_iters):

    # Report the estimated losses every `eval_interval` steps, and also on the
    # final step so the end-of-training loss is always logged.
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = estimate_loss()
        # Both losses use the same `.4f` precision.
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # Get a batch of training inputs and targets.
    xb, yb = get_batch('train')

    # Forward pass: the model returns the logits and the loss.
    logits, loss = m(xb, yb)

    # Zero the gradients before computing new ones.
    optimizer.zero_grad(set_to_none=True)

    # Backward pass: compute the gradients of the loss.
    loss.backward()

    # Step the optimizer to update the parameters.
    optimizer.step()


# Generation section
# Sample in eval mode (dropout layers disabled) and without autograd tracking,
# then restore the previous mode so later cells can continue training.
was_training = m.training
m.eval()
# Start from a single token with id 0 as the context.
context = torch.zeros((1,1), dtype = torch.long, device = device)
with torch.no_grad():
    generated_ids = m.generate(context, max_new_tokens = 500)[0].tolist()
m.train(was_training)

# decode() returns a list of characters; join them into the final text.
print(''.join(decode(generated_ids)))