# compare MPS vs CPU

In [1]:
device = "cpu"

In [2]:
import tiktoken
import torch 
import datasets
import random

# r50k_base vocab size: 50,257 https://arxiv.org/pdf/2404.09894
enc = tiktoken.get_encoding("r50k_base")
assert enc.decode(enc.encode("hello world")) == "hello world"

def encode(string):
    return torch.tensor(enc.encode(string))

def decode(tensor):
    return enc.decode([tensor.item()])
    
def get_sample(split, sample_length):
    tokens = dataset_tok[split]["tok"][0]
    s = random.randint(0, len(tokens)-sample_length)
    return torch.tensor(tokens[s:s+sample_length], device=device), torch.tensor(tokens[s+sample_length], device=device)

dataset = datasets.load_dataset('karpathy/tiny_shakespeare')
dataset_tok = dataset.map(lambda row: {"tok": encode(row["text"])}, remove_columns="text")

In [3]:
# for i in tqdm(range(1000)): get_sample("train", 10)

NameError: name 'tqdm' is not defined

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim

class MLP(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_classes):
        super(MLP, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.fc1 = nn.Linear(embedding_dim, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        out = self.embedding(x)
        out = out.mean(dim=0)  # Average the embeddings to get a single vector per sentence
        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)
        return out

    def generate(self, prompt, tokens):
        with torch.no_grad():
            for _ in range(tokens):
                input_tok = encode(prompt).to(device)
                output_tok = self(input_tok)
                output = decode(torch.argmax(output_tok))
                prompt = prompt + output
            return prompt
        
model = MLP(vocab_size=50_272, embedding_dim=1024, hidden_size=512, num_classes=50_272)

## cpu

In [5]:
from tqdm import tqdm 
import time

model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Training loop
curr_time = time.time()
curr_step = 1
curr_tok = 0

for curr_step in range(1, 150):
    inputs, labels = get_sample("train", 10)
    tok_cnt = inputs.size()[0]
    
    outputs = model(inputs)
    loss = criterion(outputs, labels)
    
    # Backward and optimize
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if (curr_step % 20 == 0):
        step_time = time.time() - curr_time
        print(f"step {curr_step}: loss {loss.detach().item():.2f} {step_time*1000:.2f}ms {(tok_cnt/step_time):.0f}tok/s (total {curr_tok:,} tok)")

    if (curr_step % 100 == 0):
        print(model.generate("Oh my lord", 10))
    
    curr_time = time.time() 
    curr_step = curr_step + 1
    curr_tok = curr_tok + tok_cnt

step 20: loss 10.82 78.95ms 127tok/s (total 190 tok)
step 40: loss 10.85 81.87ms 122tok/s (total 390 tok)
step 60: loss 10.91 80.32ms 125tok/s (total 590 tok)
step 80: loss 10.84 78.58ms 127tok/s (total 790 tok)
step 100: loss 10.69 79.25ms 126tok/s (total 990 tok)
Oh my lord Utahlav








step 120: loss 10.70 79.28ms 126tok/s (total 1,190 tok)
step 140: loss 10.96 80.29ms 125tok/s (total 1,390 tok)


## MPS

In [6]:
device="mps"

In [7]:
from tqdm import tqdm 
import time

model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Training loop
curr_time = time.time()
curr_step = 1
curr_tok = 0

for curr_step in range(1, 150):
    inputs, labels = get_sample("train", 10)
    tok_cnt = inputs.size()[0]
    
    outputs = model(inputs)
    loss = criterion(outputs, labels)
    
    # Backward and optimize
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if (curr_step % 20 == 0):
        step_time = time.time() - curr_time
        print(f"step {curr_step}: loss {loss.detach().item():.2f} {step_time*1000:.2f}ms {(tok_cnt/step_time):.0f}tok/s (total {curr_tok:,} tok)")

    if (curr_step % 100 == 0):
        print(model.generate("Oh my lord", 10))
    
    curr_time = time.time() 
    curr_step = curr_step + 1
    curr_tok = curr_tok + tok_cnt

step 20: loss 10.05 45.34ms 221tok/s (total 190 tok)
step 40: loss 10.78 44.80ms 223tok/s (total 390 tok)
step 60: loss 10.79 45.28ms 221tok/s (total 590 tok)
step 80: loss 11.04 44.76ms 223tok/s (total 790 tok)
step 100: loss 10.93 45.26ms 221tok/s (total 990 tok)
Oh my lord










step 120: loss 4.64 44.26ms 226tok/s (total 1,190 tok)
step 140: loss 10.83 47.11ms 212tok/s (total 1,390 tok)


# more simple NN: embedding only

In [1]:
device = "mps"

In [2]:
import tiktoken
import torch 
import datasets
import random

# r50k_base vocab size: 50,257 https://arxiv.org/pdf/2404.09894
enc = tiktoken.get_encoding("r50k_base")
assert enc.decode(enc.encode("hello world")) == "hello world"

def encode(string):
    return torch.tensor(enc.encode(string))

def decode(tensor):
    return enc.decode([tensor.item()])
    
def get_sample(split, sample_length):
    tokens = dataset_tok[split]["tok"][0]
    s = random.randint(0, len(tokens)-sample_length)
    return torch.tensor(tokens[s:s+sample_length], device=device), torch.tensor(tokens[s+sample_length], device=device)

dataset = datasets.load_dataset('karpathy/tiny_shakespeare')
dataset_tok = dataset.map(lambda row: {"tok": encode(row["text"])}, remove_columns="text")

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim

class MLP(nn.Module):
    def __init__(self, vocab_size):
        super(MLP, self).__init__()
        self.embedding = nn.Embedding(vocab_size, vocab_size)

    def forward(self, x):
        out = self.embedding(x)
        out = out.mean(dim=0)  # Average the embeddings to get a single vector per sentence
        return out

    def generate(self, prompt, tokens):
        with torch.no_grad():
            for _ in range(tokens):
                input_tok = encode(prompt).to(device)
                output_tok = self(input_tok)
                output = decode(torch.argmax(output_tok))
                prompt = prompt + output
            return prompt
        
model = MLP(vocab_size=50_272)

In [4]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"{count_parameters(model)/1_000_000}M parameters")

2527.273984M parameters


In [6]:
model

MLP(
  (embedding): Embedding(50272, 50272)
)

In [5]:
from tqdm import tqdm 
import time

model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Training loop
curr_time = time.time()
curr_step = 1
curr_tok = 0

for curr_step in range(1, 150):
    inputs, labels = get_sample("train", 10)
    tok_cnt = inputs.size()[0]
    
    outputs = model(inputs)
    loss = criterion(outputs, labels)
    
    # Backward and optimize
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if (curr_step % 20 == 0):
        step_time = time.time() - curr_time
        print(f"step {curr_step}: loss {loss.detach().item():.2f} {step_time*1000:.2f}ms {(tok_cnt/step_time):.0f}tok/s (total {curr_tok:,} tok)")

    if (curr_step % 100 == 0):
        print(model.generate("Oh my lord", 10))
    
    curr_time = time.time() 
    curr_step = curr_step + 1
    curr_tok = curr_tok + tok_cnt

RuntimeError: MPS backend out of memory (MPS allocated: 18.85 GB, other allocations: 9.42 GB, max allowed: 27.20 GB). Tried to allocate 256 bytes on shared pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

# benchmark get_sample function

In [1]:
import tiktoken
import torch 
import datasets
import random

# r50k_base vocab size: 50,257 https://arxiv.org/pdf/2404.09894
enc = tiktoken.get_encoding("r50k_base")
assert enc.decode(enc.encode("hello world")) == "hello world"

def encode(string):
    return torch.tensor(enc.encode(string))

def decode(tensor):
    return enc.decode([tensor.item()])
    
dataset = datasets.load_dataset('karpathy/tiny_shakespeare')
dataset_tok = dataset.map(lambda row: {"tok": encode(row["text"])}, remove_columns="text")

In [6]:
from tqdm import tqdm 

def get_sample(split, sample_length):
    tokens = dataset_tok[split]["tok"][0]
    s = random.randint(0, len(tokens)-sample_length)
    return torch.tensor(tokens[s:s+sample_length]), torch.tensor(tokens[s+sample_length])

for i in tqdm(range(200)): get_sample("train", 10)

100%|████████████████████████████████████████████████████████████████████████████| 200/200 [00:08<00:00, 24.83it/s]


## maybe creating a new tensor takes a lot of time?

In [8]:
import tiktoken
import torch 
import datasets
import random

# r50k_base vocab size: 50,257 https://arxiv.org/pdf/2404.09894
enc = tiktoken.get_encoding("r50k_base")
assert enc.decode(enc.encode("hello world")) == "hello world"

def encode(string):
    return torch.tensor(enc.encode(string))

def decode(tensor):
    return enc.decode([tensor.item()])
    
dataset = datasets.load_dataset('karpathy/tiny_shakespeare')
dataset_tok = dataset.map(lambda row: {"tok": torch.tensor(encode(row["text"]))}, remove_columns="text")

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

  dataset_tok = dataset.map(lambda row: {"tok": torch.tensor(encode(row["text"]))}, remove_columns="text")


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [9]:
from tqdm import tqdm 

def get_sample(split, sample_length):
    tokens = dataset_tok[split]["tok"][0]
    s = random.randint(0, len(tokens)-sample_length)
    return tokens[s:s+sample_length], tokens[s+sample_length]

for i in tqdm(range(200)): get_sample("train", 10)

100%|████████████████████████████████████████████████████████████████████████████| 200/200 [00:07<00:00, 25.05it/s]


In [10]:
import tiktoken
import torch 
import datasets
import random

# r50k_base vocab size: 50,257 https://arxiv.org/pdf/2404.09894
enc = tiktoken.get_encoding("r50k_base")
assert enc.decode(enc.encode("hello world")) == "hello world"

def encode(string):
    return torch.tensor(enc.encode(string))

def decode(tensor):
    return enc.decode([tensor.item()])
    
dataset = datasets.load_dataset('karpathy/tiny_shakespeare')
dataset_tok = dataset.map(lambda row: {"tok": torch.tensor(encode(row["text"]), device="mps")}, remove_columns="text")

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

  dataset_tok = dataset.map(lambda row: {"tok": torch.tensor(encode(row["text"]), device="mps")}, remove_columns="text")


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [11]:
from tqdm import tqdm 

def get_sample(split, sample_length):
    tokens = dataset_tok[split]["tok"][0]
    s = random.randint(0, len(tokens)-sample_length)
    return tokens[s:s+sample_length], tokens[s+sample_length]

for i in tqdm(range(200)): get_sample("train", 10)

100%|████████████████████████████████████████████████████████████████████████████| 200/200 [00:07<00:00, 25.36it/s]


In [12]:
from tqdm import tqdm 

def get_sample(split, sample_length):
    tokens = dataset_tok[split]["tok"][0]
    s = 5
    return tokens[s:s+sample_length], tokens[s+sample_length]

for i in tqdm(range(200)): get_sample("train", 10)

100%|████████████████████████████████████████████████████████████████████████████| 200/200 [00:07<00:00, 25.61it/s]


In [13]:
from tqdm import tqdm 

def get_sample(split, sample_length):
    tokens = dataset_tok[split]["tok"][0]
    s = 5
    return tokens[s:s+sample_length]

for i in tqdm(range(200)): get_sample("train", 10)

100%|████████████████████████████████████████████████████████████████████████████| 200/200 [00:07<00:00, 25.32it/s]


In [14]:
from tqdm import tqdm 

def get_sample(split, sample_length):
    tokens = dataset_tok[split]["tok"][0]
    s = 5
    return tokens

for i in tqdm(range(200)): get_sample("train", 10)

100%|████████████████████████████████████████████████████████████████████████████| 200/200 [00:07<00:00, 25.36it/s]


In [15]:
from tqdm import tqdm 

def get_sample(split, sample_length):
    # tokens = dataset_tok[split]["tok"][0]
    s = 5
    return s

for i in tqdm(range(200)): get_sample("train", 10)

100%|█████████████████████████████████████████████████████████████████████████| 200/200 [00:00<00:00, 49902.49it/s]


In [19]:
tok = torch.tensor(dataset_tok["train"]["tok"][0][0:10])

In [21]:
tok.dtype

torch.int64

In [23]:
tok.long()
tok.dtype

torch.int64

In [30]:
torch.randint(5, (4,2))

tensor([[3, 0],
        [3, 4],
        [4, 3],
        [4, 2]])

In [25]:
torch.randint?

[0;31mDocstring:[0m
randint(low=0, high, size, \*, generator=None, out=None, dtype=None, layout=torch.strided, device=None, requires_grad=False) -> Tensor

Returns a tensor filled with random integers generated uniformly
between :attr:`low` (inclusive) and :attr:`high` (exclusive).

The shape of the tensor is defined by the variable argument :attr:`size`.

.. note::
    With the global dtype default (``torch.float32``), this function returns
    a tensor with dtype ``torch.int64``.

Args:
    low (int, optional): Lowest integer to be drawn from the distribution. Default: 0.
    high (int): One above the highest integer to be drawn from the distribution.
    size (tuple): a tuple defining the shape of the output tensor.

Keyword args:
    generator (:class:`torch.Generator`, optional): a pseudorandom number generator for sampling
    out (Tensor, optional): the output tensor.
    dtype (`torch.dtype`, optional) - the desired data type of returned tensor. Default: if ``None``,
        

In [32]:
import torch.nn as nn

emb = nn.Embedding(10,10)

In [34]:
input = torch.tensor([0, 2, 3])

In [35]:
emb

Embedding(10, 10)

In [36]:
emb(input)

tensor([[ 0.4692, -0.6647, -0.0064,  0.4490, -1.2883,  1.1145,  0.1919, -0.7953,
          2.2297, -1.0140],
        [ 0.7702,  0.7249, -0.0917,  1.4448,  1.1533,  1.2294,  0.3328,  1.9844,
         -1.0909, -0.6624],
        [-0.5557,  0.6507,  0.6215, -1.9482,  0.0234, -0.1776,  1.6709, -1.6300,
         -0.6850,  0.2105]], grad_fn=<EmbeddingBackward0>)

In [37]:
emb(input).size()

torch.Size([3, 10])

In [38]:
torch.randint(10, (2,2))

tensor([[3, 4],
        [6, 3]])

In [39]:
emb(torch.randint(10, (2,2))).size()

torch.Size([2, 2, 10])

In [40]:
torch.randint(10, (3,4))

tensor([[6, 6, 5, 9],
        [3, 0, 1, 1],
        [2, 4, 1, 4]])

In [41]:
emb(torch.randint(10, (3,4))).size()

torch.Size([3, 4, 10])

In [44]:
emb.weight.dtype

torch.float32

In [45]:
emb.weight

Parameter containing:
tensor([[ 0.4692, -0.6647, -0.0064,  0.4490, -1.2883,  1.1145,  0.1919, -0.7953,
          2.2297, -1.0140],
        [-1.7927,  0.0889, -0.8772, -0.2689,  0.3096,  1.0809,  0.8955, -1.7298,
         -0.5292, -0.2212],
        [ 0.7702,  0.7249, -0.0917,  1.4448,  1.1533,  1.2294,  0.3328,  1.9844,
         -1.0909, -0.6624],
        [-0.5557,  0.6507,  0.6215, -1.9482,  0.0234, -0.1776,  1.6709, -1.6300,
         -0.6850,  0.2105],
        [-1.4775,  0.5935, -2.4034,  1.5521, -0.8200, -0.6712,  0.2298,  0.7088,
          0.4819,  0.7531],
        [ 0.6940,  0.2879, -0.4894,  1.2471,  0.3382,  0.8189, -0.1822,  0.7593,
         -1.4281,  0.5939],
        [-0.5219, -0.5266,  1.2750,  1.7248,  0.8310,  0.0730,  1.3571,  0.2483,
          0.1648,  0.2519],
        [ 1.0359, -0.2647, -0.0677,  0.4956,  1.1466, -0.2427,  0.3803, -0.0629,
          0.2553, -0.6983],
        [ 1.2476, -0.5968,  1.3025, -1.9699,  0.8386,  0.4369, -1.6739, -0.2482,
         -0.9271,  1.1132