# gpt-dev

In [2]:
# Text to JSON traslation 

#<------------- ENCODE -------------------------------------------------------------><------------- DECODE ------------------------------------------------------------------------------->
#Archivia i dettagli del cliente Dario Verdi il cell 3400000103 e dverdi@examples.org<START>{"nome": "Dario","cognome": "Verdi","email": "dverdi@examples.org","telefono": 3400000103}<END>

In [3]:
import torch

if torch.cuda.is_available():
    print("CUDA is available. List of devices:")
    print(torch.cuda.device_count())
    print(torch.cuda.get_device_name(0))
else:
    print("CUDA is not available.")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

CUDA is available. List of devices:
1
NVIDIA GeForce RTX 3060
cuda


In [4]:
# Read the data file
raw = 'jsonrequests2.txt'

with open(raw,'r',encoding='utf-8') as f:
    text = f.read()

print("length of the dataset in characters: ", len(text))
print("Look the first characters of the data: ",text[:200])

chars = sorted(list(set(text)))
vocab_size = len(chars)

print("List of the vocabulary: ",''.join(chars))
print("Vocab size: ", vocab_size)


length of the dataset in characters:  18158
Look the first characters of the data:  {"prompt":"Inserisci nel database l'utente Giulia Monti, contatti: email giuliamonti@example.it, telefono 3400000000","completion": {"nome": "Giulia","cognome": "Monti","email": "giuliamonti@example.i
List of the vocabulary:  
 "'+,-.0123456789:;@ABCDEFGHILMNOPQRSTVabcdefghiklmnopqrstuvwxyz{}
Vocab size:  67


In [5]:
# Create a mapping between characters and integers
stoi = {ch:i for i,ch in enumerate(chars)}
itos = {i:ch for i,ch in enumerate(chars)}

encode = lambda s: [stoi[c] for c in s] 
decode = lambda l: ''.join([itos[i] for i in l])

example ="Archivia i dettagli del cliente Dario Verdi il cell 3400000103 e dverdi@examples.org"

print(encode(example))
print(decode(encode(example)))

[21, 56, 42, 47, 48, 60, 48, 40, 1, 48, 1, 43, 44, 58, 58, 40, 46, 50, 48, 1, 43, 44, 50, 1, 42, 50, 48, 44, 52, 58, 44, 1, 24, 40, 56, 48, 53, 1, 39, 44, 56, 43, 48, 1, 48, 50, 1, 42, 44, 50, 50, 1, 11, 12, 8, 8, 8, 8, 8, 9, 8, 11, 1, 44, 1, 43, 60, 44, 56, 43, 48, 20, 44, 62, 40, 51, 54, 50, 44, 57, 7, 53, 56, 46]
Archivia i dettagli del cliente Dario Verdi il cell 3400000103 e dverdi@examples.org


In [6]:
# Map the entire dataset into integers
import torch
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:10])

torch.Size([18158]) torch.int64
tensor([65,  2, 54, 56, 53, 51, 54, 58,  2, 18])


In [7]:
# Division the dataset into training and validation
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

In [8]:
# define the context x and target y for the training
block_size = 8 
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"When input is: {context.tolist()} the target is: {target}")

When input is: [65] the target is: 2
When input is: [65, 2] the target is: 54
When input is: [65, 2, 54] the target is: 56
When input is: [65, 2, 54, 56] the target is: 53
When input is: [65, 2, 54, 56, 53] the target is: 51
When input is: [65, 2, 54, 56, 53, 51] the target is: 54
When input is: [65, 2, 54, 56, 53, 51, 54] the target is: 58
When input is: [65, 2, 54, 56, 53, 51, 54, 58] the target is: 2


In [9]:
# get random chucks of the data
torch.manual_seed(42)
batch_size = 2 #how many chucks of data we want to process in parallel
block_size = 8 #the length of each chuck 

# get the batch of data
def get_batch(split):
        data = train_data if split == 'train' else val_data
        ix = torch.randint(len(data) -block_size, (batch_size,))
        x = torch.stack([data[i:i+block_size] for i in ix])
        y = torch.stack([data[i+1:i+block_size+1] for i in ix])
        return x,y

xb,yb = get_batch('train')
print(xb.shape, yb.shape)

for b in range(batch_size): #Batch
        for t in range(block_size): #Time
                context = xb[b,:t+1]
                target = yb[b,t]
                print(f"When characters input is: '{decode(context.tolist())}' the target is: {target}")

torch.Size([2, 8]) torch.Size([2, 8])
When characters input is: 'd' the target is: 7
When characters input is: 'd.' the target is: 42
When characters input is: 'd.c' the target is: 53
When characters input is: 'd.co' the target is: 51
When characters input is: 'd.com' the target is: 2
When characters input is: 'd.com"' the target is: 5
When characters input is: 'd.com",' the target is: 2
When characters input is: 'd.com","' the target is: 58
When characters input is: '5' the target is: 5
When characters input is: '5,' the target is: 1
When characters input is: '5, ' the target is: 54
When characters input is: '5, p' the target is: 53
When characters input is: '5, po' the target is: 57
When characters input is: '5, pos' the target is: 58
When characters input is: '5, post' the target is: 40
When characters input is: '5, posta' the target is: 1


# Parameters

In [10]:
# hyper-params testing with CPU
n_embd = 32  # Size of each embedding vector
block_size = 8  # Length of the sequence to be processed
vocab_size = len(chars)
batch_size = 32
n_head = 4  # Number of attention heads
n_layer = 2  # Number of transformer layers
dropout = 0.2  # Dropout rate prevent the neural network from overfitting

In [11]:
# hyper-params testing with GPU
block_size = 256
vocab_size = len(chars)
n_layer = 6
n_head = 6
n_embd = 384
bias = False
assert not bias, "this notebook assumes bias=False just for simplicity"

batch_size = 32 # how many independent sequences to be trained in parallel, the v-ram occupied is proportional to the batch size 
dropout = 0.2  # Dropout rate prevent the neural network from overfitting

# Decode Trasformer Model 

- Reference: [Attention is All You Need](https://arxiv.org/abs/1706.03762)

In [12]:
import torch.nn as nn
from torch.nn import functional as F

torch.manual_seed(42)

class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)
        
    def forward (self, x):
        B,T,C = x.shape
        k = self.key(x) # (B,T,C)
        q = self.query (x) # (B,T,C)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * C**-0.5 # (В, T, C) @ (В, C, T) -> (B,T,T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei) 
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T, C)
        out = wei @ v # (В, Т, Т) @ (B, Т, C) - (B,T,C)
        return out
    
class MultiHeadAttention(nn.Module):
    def __init__(self, n_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(n_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self,x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedForward(nn.Module):
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd,4*n_embd),
            nn.ReLU(),
            nn.Linear(4*n_embd,n_embd),
            nn.Dropout(dropout),
        )
    def forward(self,x):
        return self.net(x)
    
    
class Block(nn.Module):
    def __init__(self,n_embd,n_heads):
        super().__init__()
        head_size = n_embd // n_heads
        self.sa = MultiHeadAttention(n_heads,head_size)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self,x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

class BigramLanguageModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size,n_embd)
        self.position_embedding_table = nn.Embedding(block_size,n_embd)
        
        self.blocks = nn.Sequential(*[Block(n_embd,n_heads=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd,vocab_size)

    def forward(self,idx,targets=None):

        B,T = idx.shape

        tok_emb = self.token_embedding_table(idx) # B,T,H=n_embd
        pos_emb = self.position_embedding_table(torch.arange(T,device=idx.device)) # T,H=n_embd
        x = tok_emb + pos_emb # B, T,H=n_embd for broadcasting
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x) # B,T,C=vocab_size

        if targets is None:
            loss = None
        else:
            B,T,C  = logits.shape
            logits = logits.view(B*T,C)
            targets = targets.view(B*T) 
            loss = F.cross_entropy(logits,targets)

        return logits,loss
    
    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        idx = idx.to(device)
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, loss = self(idx_cond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx 

xb = xb.to(device)
yb = yb.to(device)
m = BigramLanguageModel().to(device)

logits, loss = m(xb,yb)
print(logits.shape) # B batch,T time,C channel
print(loss)


print(decode(m.generate(idx = torch.zeros((1,1), dtype = torch.long), max_new_tokens=100)[0].tolist()))


torch.Size([16, 67])
tensor(4.4523, device='cuda:0', grad_fn=<NllLossBackward0>)

S9onO8c'Txc'u:sm;pR3dn:kCs7iT E5cRDpID3u91bMs{,:+
LotiwVqFaOFCGRM6hqy+8uoOomI59{HDngQ R;"I3i@GCtT.A'


In [13]:
optimizer = torch.optim.Adam(m.parameters(), lr=1e-3)

In [14]:
for steps in range(5000):
    xb,yb = get_batch('train')
    xb = xb.to(device)
    yb = yb.to(device)

    logits,loss = m(xb,yb)  
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    if steps % 500 == 0:
        print(f'At step {steps} the loss is {loss.item()}')

print(loss.item())

At step 0 the loss is 4.384756088256836
At step 500 the loss is 0.19141004979610443
At step 1000 the loss is 0.08020901679992676
At step 1500 the loss is 0.054469846189022064
At step 2000 the loss is 0.04692617803812027
At step 2500 the loss is 0.046780623495578766
At step 3000 the loss is 0.03848539665341377
At step 3500 the loss is 0.03568541258573532
At step 4000 the loss is 0.03606744483113289
At step 4500 the loss is 0.03547069430351257
0.03820186108350754


In [15]:
# print the number of parameters:
print(sum(p.numel() for p in m.parameters())/1e6, 'M Parameters') 

10.790467 M Parameters


In [16]:
# generate text with the model
print(decode(m.generate(idx = torch.zeros((1,1), dtype = torch.long), max_new_tokens=900)[0].tolist()))


{"prompt":"Inserisci in archivio l'utente Olivia Ferri, e-mail: oliviaferri@online.it, telefono 0039 3400000112","completion": {"nome": "Olivia","cognome": "Ferri","email": "oliviaferri@online.it","telefono": 3400000112}}
{"prompt":"Crea nel database il profilo di Pietro Bianchi, contatti: tel +39 3400000113, email pietrob@myemail.it","completion": {"nome": "Pietro","cognome": "Bianchi","email": "pietro.bianchi@myAile.com","telefono": 34000001134}}
{"prompt":"Aggiungi al registro Luca Neri, contatti: numero di tel 00393400000107, e-mail l.neri@domain.it","completion": {"nome": "Luca","cognome": "Neri","email": "l.neri@domain.it","telefono": 3400000109}}
{"prompt":"Aggiungi al sistema l'utente Marta Bianco con dettagli: email mbianco@sample.com, cell +39 3400000110","completion": {"nome": "Marta","cognome": "Bianco","email": "mbianco@sample.com","telefono": 3400000110}}
{"prompt":"Registr


----------------------------------------

# Testing 

## Layer Norm

In [None]:
class BatchNorm1d:
  
  def __init__(self, dim, eps=1e-5, momentum=0.1):
    self.eps = eps
    # parameters (trained with backprop)
    self.gamma = torch.ones(dim)
    self.beta = torch.zeros(dim)
  
  def __call__(self, x):
    # calculate the forward pass
    xmean = x.mean(1, keepdim=True) # batch mean
    xvar = x.var(1, keepdim=True) # batch variance
    xhat = (x - xmean) / torch.sqrt(xvar + self.eps) # normalize to unit variance
    self.out = self.gamma * xhat + self.beta
    return self.out
  
  def parameters(self):
    return [self.gamma, self.beta]
  
torch.manual_seed(42)
module = BatchNorm1d(100)
x = torch.randn(32, 100)
x = module(x)
x.shape


In [None]:
x[:,0].mean(), x[:,0].std()

In [None]:
x[0,:].mean(), x[0,:].std()

# Self attention for a single head 

In [None]:
torch.manual_seed(1337)
B,T,C = 4,8,32
x= torch.randn(B,T,C)

head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

k = key(x)  # B,T,16
q = query(x) # B,T,16

wei = q @ k.transpose(-2,-1) * head_size**-0.5 # (B,T,16) @ (B,16,T) = (B,T,T) 

tril = torch.tril(torch.ones(T,T))
wei = wei.masked_fill(tril==0,float('-inf')) #the future can't communicate with the past
wei = F.softmax(wei, dim=-1)

v = value(x)
out = wei @ v


In [None]:
wei[0]