# first time with fineweb-edu

In [1]:
from tqdm import tqdm
import tiktoken
import torch 
import datasets
import random

dataset = datasets.load_dataset("HuggingFaceFW/fineweb-edu", data_files=["sample/10BT/000_00000.parquet"], split="train")
dataset = dataset.train_test_split()

enc = tiktoken.get_encoding("gpt2")
assert enc.decode(enc.encode("hello world")) == "hello world"

def encode(string):
    return torch.tensor(enc.encode(string), dtype=torch.long)

def decode(tensor):
    return enc.decode(tensor.cpu().squeeze().numpy())

num_samples = 100_000
dataset_tok_train = torch.cat([encode(dataset["train"][i]["text"]) for i in tqdm(range(num_samples))])
dataset_tok_test = torch.cat([encode(dataset["test"][i]["text"]) for i in tqdm(range(num_samples))])

def get_sample(split, sample_length, batch_size):
    tokens = dataset_tok_train if split == "train" else dataset_tok_test
    idcs = torch.randint(len(tokens)-sample_length, (batch_size,))
    x = torch.stack([torch.tensor(tokens[x:x+sample_length]) for x in idcs])
    y = torch.stack([torch.tensor(tokens[x+1:x+sample_length+1]) for x in idcs])
    return x, y

100%|███████████████████████████████████████████| 100000/100000 [00:46<00:00, 2150.13it/s]
100%|███████████████████████████████████████████| 100000/100000 [00:40<00:00, 2464.28it/s]


In [2]:
print(f"Train data: {len(dataset_tok_train):,} tokens")
print(f"Test data: {len(dataset_tok_test):,} tokens")

Train data: 103,013,759 tokens
Test data: 103,356,615 tokens


In [3]:
import torch
import torch.nn as nn
from torch.nn import functional as F

class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # input of size (batch, time-step, channels)
        # output of size (batch, time-step, head size)
        B,T,C = x.shape
        k = self.key(x)   # (B,T,hs)
        q = self.query(x) # (B,T,hs)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * k.shape[-1]**-0.5 # (B, T, hs) @ (B, hs, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,hs)
        # out = F.scaled_dot_product_attention(q, k, v)
        out = wei @ v # (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return out

class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

class GPTLanguageModel(nn.Module):
    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

        # better init, not covered in the original GPT video, but important, will cover in followup video
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx


In [4]:
vocab_size = 50_272
n_embd = 384
n_head = 6
n_layer = 6
dropout = 0.2
batch_size = 64
block_size = 128 
device = "mps"
learning_rate = 3e-4

model = GPTLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')


49.34896 M parameters


In [5]:
import time

log_interval = 10
eval_interval = 50
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

curr_time = time.time()
tok_total = 0 

for curr_step in range(1, 1000):

    # every once in a while evaluate the loss on train and val sets
    #     losses = estimate_loss()
    #     print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_sample('train', block_size, batch_size)
    tok_step = xb.view(-1).size(0)
    tok_total += tok_step

    # evaluate the loss
    logits, loss = model(xb.to(device), yb.to(device))
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    step_time = time.time() - curr_time
    if curr_step % log_interval == 0:
        print(f"step {curr_step}: loss {loss.detach().item():.2f} {step_time*1000:.2f}ms/step {(tok_step/step_time):.0f}tok/s (total {tok_total:,} tok)")

    if curr_step % eval_interval == 0:
        model.eval()
        xb, yb = get_sample('validation', block_size, batch_size)
        logits, loss = model(xb.to(device), yb.to(device))
        print(f"step {curr_step} eval: val_loss {loss.detach().item():.2f}")
        # with torch.no_grad():
        #     output = model.generate(encode("hello my name is").to(device).unsqueeze(0), 10)
        #     print(decode(output))
        model.train()
    curr_time = time.time()


  x = torch.stack([torch.tensor(tokens[x:x+sample_length]) for x in idcs])
  y = torch.stack([torch.tensor(tokens[x+1:x+sample_length+1]) for x in idcs])


step 10: loss 9.56 1236.76ms/step 6624tok/s (total 81,920 tok)
step 20: loss 8.65 1239.44ms/step 6609tok/s (total 163,840 tok)
step 30: loss 8.12 1234.64ms/step 6635tok/s (total 245,760 tok)
step 40: loss 7.80 1294.27ms/step 6329tok/s (total 327,680 tok)
step 50: loss 7.61 1353.85ms/step 6051tok/s (total 409,600 tok)
step 50 eval: val_loss 7.68
step 60: loss 7.70 1280.32ms/step 6398tok/s (total 491,520 tok)
step 70: loss 7.42 1430.43ms/step 5727tok/s (total 573,440 tok)
step 80: loss 7.41 1796.88ms/step 4559tok/s (total 655,360 tok)
step 90: loss 7.35 1856.76ms/step 4412tok/s (total 737,280 tok)
step 100: loss 7.15 1905.73ms/step 4299tok/s (total 819,200 tok)
step 100 eval: val_loss 7.31
step 110: loss 7.12 1600.46ms/step 5119tok/s (total 901,120 tok)
step 120: loss 7.13 1481.66ms/step 5529tok/s (total 983,040 tok)
step 130: loss 7.10 1517.05ms/step 5400tok/s (total 1,064,960 tok)
step 140: loss 7.04 1638.87ms/step 4999tok/s (total 1,146,880 tok)
step 150: loss 7.07 1631.76ms/step 5020

In [8]:
with torch.no_grad():
    output = model.generate(encode("This").to(device).unsqueeze(0), 50)
    print(decode(output))


This is done.
-,
active, please become a :-) of a fire and concern on feedback (DSv). The largest assessment beyond all reasons good can come upon on the stitches at Earth and flu-well. This reason why we fourth hearing


In [7]:
with torch.no_grad():
    output = model.generate(encode("First Citizen:").to(device).unsqueeze(0), 50)
    print(decode(output))


First Citizen: suggests we don't enjoy again that needs you canaffe, with an�atored to live.
When you can use a statement that case us or barn for the speed of reReplceived, you can find months outside the shape of a length


In [9]:
import time

log_interval = 10
eval_interval = 50
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

curr_time = time.time()
tok_total = 0 

for curr_step in range(1, 1000):

    # every once in a while evaluate the loss on train and val sets
    #     losses = estimate_loss()
    #     print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_sample('train', block_size, batch_size)
    tok_step = xb.view(-1).size(0)
    tok_total += tok_step

    # evaluate the loss
    logits, loss = model(xb.to(device), yb.to(device))
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    step_time = time.time() - curr_time
    if curr_step % log_interval == 0:
        print(f"step {curr_step}: loss {loss.detach().item():.2f} {step_time*1000:.2f}ms/step {(tok_step/step_time):.0f}tok/s (total {tok_total:,} tok)")

    if curr_step % eval_interval == 0:
        model.eval()
        xb, yb = get_sample('validation', block_size, batch_size)
        logits, loss = model(xb.to(device), yb.to(device))
        print(f"step {curr_step} eval: val_loss {loss.detach().item():.2f}")
        # with torch.no_grad():
        #     output = model.generate(encode("hello my name is").to(device).unsqueeze(0), 10)
        #     print(decode(output))
        model.train()
    curr_time = time.time()


  x = torch.stack([torch.tensor(tokens[x:x+sample_length]) for x in idcs])
  y = torch.stack([torch.tensor(tokens[x+1:x+sample_length+1]) for x in idcs])


step 10: loss 5.96 1185.10ms/step 6912tok/s (total 81,920 tok)
step 20: loss 5.82 1200.04ms/step 6826tok/s (total 163,840 tok)
step 30: loss 5.84 1196.65ms/step 6846tok/s (total 245,760 tok)
step 40: loss 5.86 1215.82ms/step 6738tok/s (total 327,680 tok)
step 50: loss 5.86 1231.36ms/step 6653tok/s (total 409,600 tok)
step 50 eval: val_loss 5.83
step 60: loss 5.87 1265.65ms/step 6473tok/s (total 491,520 tok)
step 70: loss 5.84 1269.98ms/step 6450tok/s (total 573,440 tok)
step 80: loss 5.93 1302.95ms/step 6287tok/s (total 655,360 tok)
step 90: loss 5.86 1293.50ms/step 6333tok/s (total 737,280 tok)
step 100: loss 5.73 1366.26ms/step 5996tok/s (total 819,200 tok)
step 100 eval: val_loss 5.78
step 110: loss 5.83 1748.00ms/step 4686tok/s (total 901,120 tok)
step 120: loss 5.99 1729.59ms/step 4736tok/s (total 983,040 tok)
step 130: loss 5.87 1754.72ms/step 4669tok/s (total 1,064,960 tok)
step 140: loss 5.84 1708.36ms/step 4795tok/s (total 1,146,880 tok)
step 150: loss 5.80 1637.67ms/step 5002

In [10]:
with torch.no_grad():
    output = model.generate(encode("This").to(device).unsqueeze(0), 50)
    print(decode(output))


This contains Kohiedcheter shelled dead stem sugar bodies that all sea bacteria can bring up, noun and green breakfast.
Ozy containing water potential• Take be very hard again, ‘I amNFTuta’ I app which will


In [11]:
with torch.no_grad():
    output = model.generate(encode("This").to(device).unsqueeze(0), 50)
    print(decode(output))


This decision will equation which can be generated by the two categories of instruction. That is be made in one; it's, or the test – a little process also shows the most because a, given set of input transfer across the space. Since the mouse


In [12]:
with torch.no_grad():
    output = model.generate(encode("This").to(device).unsqueeze(0), 50)
    print(decode(output))


This natural understanding of leadership at the battle and the people heard.
6. In this comet, we have clushing the air without trapping CLG until the Sierra Parks Shin had gone higher over and if you five months. I think about picking in bridges


In [13]:
import time

log_interval = 10
eval_interval = 50
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

curr_time = time.time()
tok_total = 0 

for curr_step in range(1, 4000):

    # every once in a while evaluate the loss on train and val sets
    #     losses = estimate_loss()
    #     print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_sample('train', block_size, batch_size)
    tok_step = xb.view(-1).size(0)
    tok_total += tok_step

    # evaluate the loss
    logits, loss = model(xb.to(device), yb.to(device))
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    step_time = time.time() - curr_time
    if curr_step % log_interval == 0:
        print(f"step {curr_step}: loss {loss.detach().item():.2f} {step_time*1000:.2f}ms/step {(tok_step/step_time):.0f}tok/s (total {tok_total:,} tok)")

    if curr_step % eval_interval == 0:
        model.eval()
        xb, yb = get_sample('validation', block_size, batch_size)
        logits, loss = model(xb.to(device), yb.to(device))
        print(f"step {curr_step} eval: val_loss {loss.detach().item():.2f}")
        # with torch.no_grad():
        #     output = model.generate(encode("hello my name is").to(device).unsqueeze(0), 10)
        #     print(decode(output))
        model.train()
    curr_time = time.time()


  x = torch.stack([torch.tensor(tokens[x:x+sample_length]) for x in idcs])
  y = torch.stack([torch.tensor(tokens[x+1:x+sample_length+1]) for x in idcs])


step 10: loss 5.41 1178.31ms/step 6952tok/s (total 81,920 tok)
step 20: loss 5.55 1191.49ms/step 6875tok/s (total 163,840 tok)
step 30: loss 5.44 1189.79ms/step 6885tok/s (total 245,760 tok)
step 40: loss 5.49 1189.49ms/step 6887tok/s (total 327,680 tok)
step 50: loss 5.55 1190.74ms/step 6880tok/s (total 409,600 tok)
step 50 eval: val_loss 5.46
step 60: loss 5.46 1186.89ms/step 6902tok/s (total 491,520 tok)
step 70: loss 5.48 1200.09ms/step 6826tok/s (total 573,440 tok)
step 80: loss 5.43 1207.70ms/step 6783tok/s (total 655,360 tok)
step 90: loss 5.42 1229.86ms/step 6661tok/s (total 737,280 tok)
step 100: loss 5.50 1231.87ms/step 6650tok/s (total 819,200 tok)
step 100 eval: val_loss 5.46
step 110: loss 5.59 1250.18ms/step 6553tok/s (total 901,120 tok)
step 120: loss 5.48 1285.18ms/step 6374tok/s (total 983,040 tok)
step 130: loss 5.41 1305.08ms/step 6277tok/s (total 1,064,960 tok)
step 140: loss 5.51 1384.97ms/step 5915tok/s (total 1,146,880 tok)
step 150: loss 5.44 1570.17ms/step 5217

In [14]:
with torch.no_grad():
    output = model.generate(encode("This").to(device).unsqueeze(0), 50)
    print(decode(output))


This will be not guaranteed any protection and can be abandoned.
Based on a study of both healthcare and business governance for civilian workers, on such domestic rights and other locations, those living fellow workers can afford that Judiciary has been studied.
It has also


In [15]:
with torch.no_grad():
    output = model.generate(encode("This").to(device).unsqueeze(0), 50)
    print(decode(output))


This 150, as carbon dioxide increases in the body temperature, and the elasticity of nature and capacity in the body. The metabolism of gas, acidic dust, also tends to rise in cholesterol levels, first from excess defects to an oncolic acid,


In [16]:
import time

log_interval = 10
eval_interval = 50
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

curr_time = time.time()
tok_total = 0 

for curr_step in range(1, 4000):

    # every once in a while evaluate the loss on train and val sets
    #     losses = estimate_loss()
    #     print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_sample('train', block_size, batch_size)
    tok_step = xb.view(-1).size(0)
    tok_total += tok_step

    # evaluate the loss
    logits, loss = model(xb.to(device), yb.to(device))
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    step_time = time.time() - curr_time
    if curr_step % log_interval == 0:
        print(f"step {curr_step}: loss {loss.detach().item():.2f} {step_time*1000:.2f}ms/step {(tok_step/step_time):.0f}tok/s (total {tok_total:,} tok)")

    if curr_step % eval_interval == 0:
        model.eval()
        xb, yb = get_sample('validation', block_size, batch_size)
        logits, loss = model(xb.to(device), yb.to(device))
        print(f"step {curr_step} eval: val_loss {loss.detach().item():.2f}")
        # with torch.no_grad():
        #     output = model.generate(encode("hello my name is").to(device).unsqueeze(0), 10)
        #     print(decode(output))
        model.train()
    curr_time = time.time()


  x = torch.stack([torch.tensor(tokens[x:x+sample_length]) for x in idcs])
  y = torch.stack([torch.tensor(tokens[x+1:x+sample_length+1]) for x in idcs])


step 10: loss 4.92 1186.28ms/step 6906tok/s (total 81,920 tok)
step 20: loss 4.93 1189.59ms/step 6886tok/s (total 163,840 tok)
step 30: loss 4.88 1196.98ms/step 6844tok/s (total 245,760 tok)
step 40: loss 4.92 1193.92ms/step 6861tok/s (total 327,680 tok)
step 50: loss 4.96 1186.49ms/step 6904tok/s (total 409,600 tok)
step 50 eval: val_loss 4.93
step 60: loss 4.83 1192.20ms/step 6871tok/s (total 491,520 tok)
step 70: loss 4.89 1214.15ms/step 6747tok/s (total 573,440 tok)
step 80: loss 4.99 1204.09ms/step 6803tok/s (total 655,360 tok)
step 90: loss 4.95 1232.73ms/step 6645tok/s (total 737,280 tok)
step 100: loss 5.01 1215.88ms/step 6738tok/s (total 819,200 tok)
step 100 eval: val_loss 4.81
step 110: loss 5.05 1221.89ms/step 6704tok/s (total 901,120 tok)
step 120: loss 4.88 1220.96ms/step 6709tok/s (total 983,040 tok)
step 130: loss 4.99 1224.10ms/step 6692tok/s (total 1,064,960 tok)
step 140: loss 4.83 1263.16ms/step 6485tok/s (total 1,146,880 tok)
step 150: loss 4.88 1309.07ms/step 6258

KeyboardInterrupt: 

In [17]:
with torch.no_grad():
    output = model.generate(encode("This").to(device).unsqueeze(0), 50)
    print(decode(output))


This setup will be placed inside brakes.
- Address for more visible traffic behaviour, improve performance, cloud tools, systems, in commands, charging, electric/verresolveably pressing from drive.
- Panel 4: A Contract also flex joins following


In [19]:
with torch.no_grad():
    output = model.generate(encode("Hello").to(device).unsqueeze(0), 50)
    print(decode(output))


Hello & More EICK N Page! honored by Yes!!'To stay true | TODAY!Thus when ||rez | St David Erep e
for essential date : Kanow for underground mine with ice
disasterful wagon ship opened: Brian Wol


In [22]:
with torch.no_grad():
    output = model.generate(encode("HTML").to(device).unsqueeze(0), 50)
    print(decode(output))


HTML
verb Statistics and graphicalization papers can be applied to a text for color and use of their first or best sentence conversions to use files or nonverbal symbols. The extra position is even more than spend some time just a week. Once downloaded from the
