In [1]:
# More compressed gpt-dev

import torch

# Query CUDA availability once and reuse the answer for both the report and
# the device selection below.
cuda_available = torch.cuda.is_available()

if cuda_available:
    print("CUDA is available. List of devices:")
    n_devices = torch.cuda.device_count()
    print(n_devices)
    # Name every visible GPU (not only device 0) so the output matches the
    # "List of devices" heading on multi-GPU machines.
    for device_index in range(n_devices):
        print(torch.cuda.get_device_name(device_index))
else:
    print("CUDA is not available.")

# Device that the model and the batches are moved to later in the notebook.
device = torch.device("cuda" if cuda_available else "cpu")
print(device)


CUDA is available. List of devices:
1
NVIDIA GeForce RTX 3060
cuda


In [5]:
from datasets import load_dataset

# Load the "train" split of the gretelai/synthetic_text_to_sql dataset.
dataset = load_dataset(
    "gretelai/synthetic_text_to_sql",
    split="train",
)


In [11]:
# Inspect the loaded dataset: summary repr, (rows, columns) shape, column
# names and per-column feature types.
print(dataset)
# The row count is the first entry of `dataset.shape`, so no separate
# `len(dataset)` expression is needed (a bare mid-cell expression would be
# evaluated and silently discarded anyway).
print(dataset.shape)
print(dataset.column_names)
print(dataset.features)

Dataset({
    features: ['id', 'domain', 'domain_description', 'sql_complexity', 'sql_complexity_description', 'sql_task_type', 'sql_task_type_description', 'sql_prompt', 'sql_context', 'sql', 'sql_explanation'],
    num_rows: 100000
})
(100000, 11)
['id', 'domain', 'domain_description', 'sql_complexity', 'sql_complexity_description', 'sql_task_type', 'sql_task_type_description', 'sql_prompt', 'sql_context', 'sql', 'sql_explanation']
{'id': Value(dtype='int32', id=None), 'domain': Value(dtype='string', id=None), 'domain_description': Value(dtype='string', id=None), 'sql_complexity': Value(dtype='string', id=None), 'sql_complexity_description': Value(dtype='string', id=None), 'sql_task_type': Value(dtype='string', id=None), 'sql_task_type_description': Value(dtype='string', id=None), 'sql_prompt': Value(dtype='string', id=None), 'sql_context': Value(dtype='string', id=None), 'sql': Value(dtype='string', id=None), 'sql_explanation': Value(dtype='string', id=None)}


In [None]:
# Load the raw training text from disk.
raw = 'jsonrequests2.txt'
with open(raw, mode='r', encoding='utf-8') as f:
    text = f.read()

# Character-level vocabulary: every distinct character of the text, sorted.
chars = sorted(set(text))
vocab_size = len(chars)

# Lookup tables between characters and their integer ids.
stoi = {ch: i for i, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    """Map a string to the list of integer ids of its characters."""
    return [stoi[c] for c in s]


def decode(l):
    """Map a sequence of integer ids back to the string they spell."""
    return ''.join(itos[i] for i in l)


# Encode the entire text as a single 1-D tensor of integer ids.
import torch
data = torch.tensor(encode(text), dtype=torch.long)

# Split the encoded text: the first 90% for training, the rest for validation.
n = int(0.9*len(data))
train_data, val_data = data[:n], data[n:]

# Illustration of the training pairs hidden in one window: for each t, the
# context is the first t+1 tokens of x and the target is the token that
# follows them (y is x shifted by one position).
block_size = 8  # the length of each chunk
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context, target = x[:t+1], y[t]

# Settings for drawing random chunks of the data.
torch.manual_seed(42)
batch_size = 2  # how many chunks of data we want to process in parallel

# get the batch of data
def get_batch(split):
    """Draw a random batch of input/target windows.

    Reads the notebook-level ``train_data``, ``val_data``, ``block_size`` and
    ``batch_size`` at call time.

    Args:
        split: ``'train'`` selects ``train_data``; any other value selects
            ``val_data``.

    Returns:
        A pair ``(x, y)`` of stacked windows, each with ``batch_size`` rows of
        ``block_size`` tokens; ``y`` is ``x`` shifted one position to the right
        in the underlying data.
    """
    source = train_data if split == 'train' else val_data
    # Start offsets are drawn so that start + block_size + 1 never runs past
    # the end of the source tensor.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])
    return torch.stack(inputs), torch.stack(targets)

# Draw one training batch; xb holds the inputs and yb the shifted targets.
xb, yb = get_batch('train')

# Walk every (batch row, time step) pair: the context is the row's first t+1
# input tokens and the target is the token that follows them.
for b in range(batch_size):  # Batch
    for t in range(block_size):  # Time
        context, target = xb[b, :t + 1], yb[b, t]


In [None]:
# Hyper-parameters for the run on the GPU.

# --- batching / regularisation ---
batch_size = 32  # independent sequences trained in parallel; VRAM use grows with the batch size
dropout = 0.2    # dropout rate, to keep the network from overfitting

# --- model shape ---
block_size = 256         # context length: sizes the position table and the causal mask
vocab_size = len(chars)  # one token id per distinct character
n_embd = 384             # embedding width
n_head = 6               # attention heads per block
n_layer = 6              # number of stacked blocks

# NOTE(review): `bias` is not read by any model code visible in this notebook;
# the assertion only guards against someone flipping the flag.
bias = False
assert not bias, "this notebook assumes bias=False just for simplicity"

In [None]:
import torch.nn as nn
from torch.nn import functional as F

# Reseed torch's global RNG before the model classes below are defined and
# instantiated (presumably so that weight initialisation is repeatable).
torch.manual_seed(42)

class Head(nn.Module):
    """One head of causal (masked) self-attention.

    Reads the notebook-level hyper-parameters ``n_embd``, ``block_size`` and
    ``dropout`` when constructed.

    Args:
        head_size: Output width of the key, query and value projections.
    """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular matrix of ones used as the causal mask. Registered
        # as a buffer: it follows the module across devices but is not trained.
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Attend over ``x`` of shape (B, T, n_embd); return (B, T, head_size)."""
        _, T, _ = x.shape
        k = self.key(x)    # (B, T, head_size)
        q = self.query(x)  # (B, T, head_size)
        # Scaled dot-product scores. The scale is 1/sqrt(d_k), where d_k is the
        # key/query width (head_size) -- not the embedding width of x, which
        # would shrink the scores too much and flatten the softmax.
        head_size = k.shape[-1]
        wei = q @ k.transpose(-2, -1) * head_size**-0.5  # (B, T, hs) @ (B, hs, T) -> (B, T, T)
        # Block attention to future positions: masked scores become -inf and
        # therefore get zero weight after the softmax.
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))  # (B, T, T)
        wei = F.softmax(wei, dim=-1)  # (B, T, T)
        wei = self.dropout(wei)
        # Weighted aggregation of the values.
        v = self.value(x)  # (B, T, head_size)
        out = wei @ v      # (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return out
    
class MultiHeadAttention(nn.Module):
    """Several ``Head`` modules run on the same input, concatenated and projected.

    Reads the notebook-level ``n_embd`` and ``dropout`` when constructed.

    Args:
        n_heads: Number of attention heads.
        head_size: Output width of each head.
    """

    def __init__(self, n_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(n_heads)])
        # The projection consumes the concatenated head outputs, whose width is
        # n_heads * head_size. That equals n_embd only when n_embd is divisible
        # by n_heads, so size the layer from the actual concatenated width.
        self.proj = nn.Linear(n_heads * head_size, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return the projected, dropout-regularised concatenation of all heads."""
        out = torch.cat([h(x) for h in self.heads], dim=-1)  # (B, T, n_heads*head_size)
        out = self.dropout(self.proj(out))                   # (B, T, n_embd)
        return out

class FeedForward(nn.Module):
    """Two-layer MLP: widen to 4*n_embd, ReLU, project back, then dropout.

    Reads the notebook-level ``dropout`` when constructed.

    Args:
        n_embd: Input and output width of the MLP.
    """

    def __init__(self, n_embd):
        super().__init__()
        hidden = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to the last dimension of ``x``."""
        return self.net(x)
    
    
class Block(nn.Module):
    """Transformer block: pre-LayerNorm self-attention, then pre-LayerNorm MLP.

    Both sub-layers are wrapped in residual connections.

    Args:
        n_embd: Embedding width of the block's input and output.
        n_heads: Number of attention heads; each gets ``n_embd // n_heads``
            channels.
    """

    def __init__(self, n_embd, n_heads):
        super().__init__()
        per_head = n_embd // n_heads
        self.sa = MultiHeadAttention(n_heads, per_head)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Return ``x`` after the attention and feed-forward residual updates."""
        attended = x + self.sa(self.ln1(x))
        return attended + self.ffwd(self.ln2(attended))

class BigramLanguageModel(nn.Module):
    """Token-level language model built from a stack of ``Block`` modules.

    Reads the notebook-level ``vocab_size``, ``n_embd``, ``block_size``,
    ``n_head`` and ``n_layer`` when constructed.
    """

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)

        self.blocks = nn.Sequential(*[Block(n_embd, n_heads=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the cross-entropy loss.

        Args:
            idx: Integer token ids of shape (B, T).
            targets: Optional integer token ids of shape (B, T).

        Returns:
            ``(logits, loss)``. Without targets, logits have shape
            (B, T, vocab_size) and loss is ``None``. With targets, logits are
            returned flattened to (B*T, vocab_size) alongside the scalar loss.
        """
        B, T = idx.shape

        tok_emb = self.token_embedding_table(idx)  # (B, T, n_embd)
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device))  # (T, n_embd)
        x = tok_emb + pos_emb  # pos_emb broadcasts over the batch -> (B, T, n_embd)
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)  # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            # cross_entropy expects (N, C) logits and (N,) targets, so fold the
            # batch and time dimensions together.
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Autoregressively sample ``max_new_tokens`` tokens after ``idx``.

        Sampling runs in eval mode (dropout disabled) on the device that holds
        the model's parameters; the previous train/eval mode is restored
        afterwards.

        Args:
            idx: Integer token ids of shape (B, T) used as the prompt.
            max_new_tokens: Number of tokens to append.

        Returns:
            Tensor of shape (B, T + max_new_tokens) with the prompt followed
            by the sampled tokens.
        """
        # Follow the model's own device rather than a notebook-level global, so
        # generation works wherever the weights actually live.
        idx = idx.to(next(self.parameters()).device)
        # Longest context the position table can index; taken from the model
        # itself so a later change to the global block_size cannot break it.
        context_len = self.position_embedding_table.num_embeddings

        was_training = self.training
        self.eval()  # dropout must be off while sampling
        try:
            for _ in range(max_new_tokens):
                idx_cond = idx[:, -context_len:]      # crop to the last context_len tokens
                logits, _ = self(idx_cond)
                logits = logits[:, -1, :]             # keep only the last time step
                probs = F.softmax(logits, dim=-1)
                idx_next = torch.multinomial(probs, num_samples=1)
                idx = torch.cat((idx, idx_next), dim=1)
        finally:
            self.train(was_training)
        return idx

# Move the batch drawn by the earlier get_batch('train') call, and a freshly
# initialised model, onto the selected device.
xb = xb.to(device)
yb = yb.to(device)
m = BigramLanguageModel().to(device)

# One forward pass with targets, as a sanity check of the untrained model.
logits, loss = m(xb,yb)
print(logits.shape) # (B*T, C): forward flattens batch and time when targets are passed
print(loss)


# Sample 100 tokens from the untrained model, starting from a single token with id 0.
print(decode(m.generate(idx = torch.zeros((1,1), dtype = torch.long), max_new_tokens=100)[0].tolist()))


In [None]:
# Optimisation settings, named so they can be tuned in one place.
learning_rate = 1e-3
max_iters = 5000
log_interval = 500  # print the current batch loss every this many steps

optimizer = torch.optim.Adam(m.parameters(), lr=learning_rate)

# Make sure dropout is active while optimising, whatever mode earlier cells
# left the model in.
m.train()

for steps in range(max_iters):
    # Draw a fresh random training batch and move it to the model's device.
    xb, yb = get_batch('train')
    xb = xb.to(device)
    yb = yb.to(device)

    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    if steps % log_interval == 0:
        print(f'At step {steps} the loss is {loss.item()}')

# Loss of the last training batch.
print(loss.item())

In [None]:
# Report the model size in millions of parameters.
n_params = sum(p.numel() for p in m.parameters())
print(n_params/1e6, 'M Parameters')

# Sample 900 new tokens from the model, starting from a single token with
# id 0, and decode them back to text.
start = torch.zeros((1,1), dtype = torch.long)
sample = m.generate(idx = start, max_new_tokens=900)
print(decode(sample[0].tolist()))