<a href="https://colab.research.google.com/github/dsogden/Bigram/blob/main/LanguageModeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split

In [2]:
path = '/content/drive/MyDrive/names.txt'
with open(path, 'r', encoding='utf-8') as f:
  names = f.read().splitlines()
print(names[:10])

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia', 'harper', 'evelyn']


In [60]:
class Tokenizer:
    def __init__(self, names: list[str]):
        self.chars = sorted(list(set(''.join(names))))
        self.stoi = {s: i + 1 for i, s in enumerate(self.chars)}
        self.stoi['.'] = 0
        self.itos = {i: s for s, i in self.stoi.items()}
        self.vocab_size = len(self.stoi)

    def encode(self, name: str, max_length: int) -> list[int]:
        padding = max_length - len(name)
        return [self.stoi[c] for c in name] + [0] * (padding + 1)

    def decode(self, tokens: list[int]) -> str:
        return ''.join([self.itos[t] for t in tokens])

In [21]:
tokenizer = Tokenizer(names)
max_length = max([len(name) for name in names])
# print(max_length)
encoded = torch.tensor([tokenizer.stoi[c] for name in names for c in name])
print(encoded[:1000])

tensor([ 4, 12, 12,  0, 14, 11,  8, 21,  8,  0,  0, 21,  0,  8, 18,  0,  1,  4,
        11, 11,  0, 18, 14, 15,  7,  8,  0,  2,  7,  0, 17, 11, 14, 19, 19,  4,
        12,  8,  0,  0, 12,  4, 11,  8,  0,  7,  0, 17, 15,  4, 17,  4, 21,  4,
        11, 24, 13,  0,  1,  8,  6,  0,  8, 11,  4, 12,  8, 11, 24,  4, 11,  8,
        25,  0,  1,  4, 19,  7, 12,  8, 11,  0,  4, 11, 11,  0,  0, 21,  4, 17,
        24, 18, 14,  5,  8,  0,  2,  0, 12,  8, 11,  0,  0, 17,  8,  0, 18,  2,
         0, 17, 11,  4, 19, 19, 21,  8,  2, 19, 14, 17,  8,  0, 12,  0,  3,  8,
        18, 14, 13, 11, 20, 13,  0,  6, 17,  0,  2,  4,  2,  7, 11, 14,  4, 15,
         4, 13,  4, 11, 14, 15,  4, 11,  0, 24, 11,  0, 17,  8, 11,  4, 24, 25,
        14,  4, 24, 13, 14, 17,  0, 11,  8, 11, 24,  4, 11,  4,  0, 13, 14, 17,
         7,  0, 13, 13,  0,  7, 11,  8, 11, 11,  8,  0, 13,  0,  3,  3,  8, 18,
        14, 13,  0, 20,  1, 17,  4, 24,  4, 11, 11,  8,  4, 18, 19,  4, 11, 11,
         0, 13,  0, 19,  0, 11,  8,  4, 

In [24]:
n = int(0.9 * len(encoded))
train_data = encoded[:n]
val_data = encoded[n:]

block_size = 8
x = train_data[:block_size]
y = train_data[1:block_size + 1]
for t in range(block_size):
    context = x[:t + 1]
    target = y[t]
    print(f'{context} --> {target}')

tensor([4]) --> 12
tensor([ 4, 12]) --> 12
tensor([ 4, 12, 12]) --> 0
tensor([ 4, 12, 12,  0]) --> 14
tensor([ 4, 12, 12,  0, 14]) --> 11
tensor([ 4, 12, 12,  0, 14, 11]) --> 8
tensor([ 4, 12, 12,  0, 14, 11,  8]) --> 21
tensor([ 4, 12, 12,  0, 14, 11,  8, 21]) --> 8


In [29]:
torch.manual_seed(1337)
batch_size = 4
block_size = 8

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i + block_size] for i in ix])
    y = torch.stack([data[i + 1:i + block_size + 1] for i in ix])
    return x, y
x, y = get_batch('train')

In [44]:
torch.manual_seed(1337)
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        logits = self.token_embedding_table(idx) # B, T, C
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B * T , C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, loss = self(idx)
            logits = logits[:, -1, :] # B, C
            probs = F.softmax(logits, dim=-1) # generate probabilities
            idx_next = torch.multinomial(probs, num_samples=1) # B, 1
            idx = torch.cat((idx, idx_next), dim=1) # B, T + 1
        return idx

m = BigramLanguageModel(tokenizer.vocab_size)
logits, loss = m(x, y)
print(logits.shape)
print(loss)
idx = torch.zeros((1, 1), dtype=torch.long)

decode = tokenizer.decode(m.generate(idx, max_new_tokens=15)[0].tolist())
print(decode)

torch.Size([32, 26])
tensor(3.8399, grad_fn=<NllLossBackward0>)
awqvfnabpgcecctt


In [54]:
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)
batch_size = 32
for steps in range(10000):
    xb, yb = get_batch('train')
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
print(loss.item())

2.593684673309326


In [59]:
idx = torch.zeros((1, 1), dtype=torch.long)
decode = tokenizer.decode(m.generate(idx, max_new_tokens=15)[0].tolist())
print(decode)

ajalyvytynkecela


In [63]:
encoded = torch.tensor([tokenizer.encode(name, 15) for name in names])
print(encoded[:1000].shape)

torch.Size([1000, 16])


In [108]:
class SimpleModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, inputs, targets=None):
        if targets is None:
            logits = self.token_embedding_table(inputs)
            loss = None
        else:
            logits = self.token_embedding_table(inputs)
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, loss = self(idx)
            logits = logits[:, -1, :] # B, C
            probs = F.softmax(logits, dim=-1) # generate probabilities
            idx_next = torch.multinomial(probs, num_samples=1) # B,
            idx = torch.cat((idx, idx_next), dim=1) # B, T + 1
        return idx

In [109]:
m = SimpleModel(tokenizer.vocab_size)
logits, loss = m(encoded, encoded)
print(logits.shape)

torch.Size([512528, 26])


In [110]:
idx = torch.zeros((1, 1), dtype=torch.long)
decode = tokenizer.decode(m.generate(idx, max_new_tokens=15)[0].tolist())
print(decode)

aocxxmdliewdzfqj


In [112]:
epochs = 100
batch_size = 32
learning_rate = 0.001
m = SimpleModel(tokenizer.vocab_size)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(m.parameters(), lr=learning_rate)
for epoch in range(epochs):
    m.train()
    running_loss = 0.0
    for i in range(0, len(encoded) - batch_size, batch_size):
        inputs = encoded[i: i + batch_size]
        targets = encoded[i + 1: i + batch_size + 1]
        optimizer.zero_grad()
        logits, loss = m(inputs, targets)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    running_loss /= len(encoded) / batch_size
    if epoch % 10 == 0 or (epoch == epochs - 1):
        print(f'Epoch {epoch + 1}/{epochs}, Loss: {running_loss:0.4f}')

Epoch 1/100, Loss: 4.1618
Epoch 11/100, Loss: 1.1135
Epoch 21/100, Loss: 1.1107
Epoch 31/100, Loss: 1.1106
Epoch 41/100, Loss: 1.1105
Epoch 51/100, Loss: 1.1105
Epoch 61/100, Loss: 1.1105
Epoch 71/100, Loss: 1.1105
Epoch 81/100, Loss: 1.1105
Epoch 91/100, Loss: 1.1105


In [117]:
idx = torch.zeros((1, 1), dtype=torch.long)
decode = tokenizer.decode(m.generate(idx, max_new_tokens=16)[0].tolist())
print(decode)

aaaasssaaaaaaaaaa


In [118]:
class SimpleModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
        self.linear = nn.Linear(vocab_size, vocab_size)

    def forward(self, inputs, targets=None):
        if targets is None:
            logits = self.token_embedding_table(inputs)
            loss = None
        else:
            embed = self.token_embedding_table(inputs)
            logits = self.linear(embed)
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, loss = self(idx)
            logits = logits[:, -1, :] # B, C
            probs = F.softmax(logits, dim=-1) # generate probabilities
            idx_next = torch.multinomial(probs, num_samples=1) # B,
            idx = torch.cat((idx, idx_next), dim=1) # B, T + 1
        return idx

In [119]:
epochs = 100
batch_size = 32
learning_rate = 0.001
m = SimpleModel(tokenizer.vocab_size)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(m.parameters(), lr=learning_rate)
for epoch in range(epochs):
    m.train()
    running_loss = 0.0
    for i in range(0, len(encoded) - batch_size, batch_size):
        inputs = encoded[i: i + batch_size]
        targets = encoded[i + 1: i + batch_size + 1]
        optimizer.zero_grad()
        logits, loss = m(inputs, targets)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    running_loss /= len(encoded) / batch_size
    if epoch % 10 == 0 or (epoch == epochs - 1):
        print(f'Epoch {epoch + 1}/{epochs}, Loss: {running_loss:0.4f}')

Epoch 1/100, Loss: 1.2847
Epoch 11/100, Loss: 1.1072
Epoch 21/100, Loss: 1.1068
Epoch 31/100, Loss: 1.1069
Epoch 41/100, Loss: 1.1069
Epoch 51/100, Loss: 1.1071
Epoch 61/100, Loss: 1.1072
Epoch 71/100, Loss: 1.1073
Epoch 81/100, Loss: 1.1074
Epoch 91/100, Loss: 1.1075
Epoch 100/100, Loss: 1.1076


In [121]:
idx = torch.zeros((1, 1), dtype=torch.long)
decode = tokenizer.decode(m.generate(idx, max_new_tokens=5)[0].tolist())
print(decode)

anwfnh


In [123]:
with open(path, 'r', encoding='utf-8') as f:
  names_new = f.read()
print(names_new[:10])

emma
olivi


In [130]:
chars = sorted(list(set(names_new)))
vocab_size = len(chars)
print(vocab_size)
print(''.join(chars))
stoi = {s: i for i, s in enumerate(chars)}
itos = {i: s for s, i in stoi.items()}
print(stoi)
print(itos)

27

abcdefghijklmnopqrstuvwxyz
{'\n': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26}
{0: '\n', 1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z'}


In [153]:
data = torch.tensor([stoi[c] for c in names_new])
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

torch.manual_seed(1337)
batch_size = 4
block_size = 8

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i + block_size] for i in ix])
    y = torch.stack([data[i + 1:i + block_size + 1] for i in ix])
    return x, y
x, y = get_batch('train')
print(x.shape, y.shape)

torch.Size([4, 8]) torch.Size([4, 8])


In [154]:
class SimpleModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, inputs, targets=None):
        if targets is None:
            logits = self.token_embedding_table(inputs)
            loss = None
        else:
            logits = self.token_embedding_table(inputs)
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, loss = self(idx)
            logits = logits[:, -1, :] # B, C
            probs = F.softmax(logits, dim=-1) # generate probabilities
            idx_next = torch.multinomial(probs, num_samples=1) # B,
            idx = torch.cat((idx, idx_next), dim=1) # B, T + 1
        return idx

In [172]:
m = SimpleModel(vocab_size)
batch_size = 32
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)
for steps in range(10000):
    xb, yb = get_batch('train')
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
print(loss.item())

2.4008350372314453


In [190]:
idx = torch.zeros((100, 1), dtype=torch.long)
decode = lambda l: ''.join([itos[i] for i in l])
decoded = decode([l for l in m.generate(idx, max_new_tokens=26)[0].tolist()])
print(decoded)


fus
aiecuqcon
yodoe
listei


In [219]:
idx = torch.zeros((5, 1), dtype=torch.long)
# decoded = decode([l for l in m.generate(idx, max_new_tokens=26).tolist()])
vals = [decode(l).split('\n') for l in m.generate(idx, max_new_tokens=26).tolist()]
vals

[['', 'say', 'jerimyan', 'tabeaki', 'ane', 'a'],
 ['', 'me', 't', 'vel', 'ncanteson', 'ikillky'],
 ['', 'kyasein', 'ka', 'wa', 'corahorvaroh'],
 ['', 'anale', 'e', 'letstellesttailari'],
 ['', 'a', 'kencri', 'anavellyatu', 'ny', 'ly']]

In [252]:
class SimpleModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
        self.rnn = nn.LSTM(
            vocab_size, vocab_size, batch_first=True
        )
        self.logits = nn.Linear(vocab_size, vocab_size)

    def forward(self, inputs, targets=None):
        if targets is None:
            embed = self.token_embedding_table(inputs)
            rnn_output, _ = self.rnn(embed)
            logits = self.logits(rnn_output)
            loss = None
        else:
            embed = self.token_embedding_table(inputs)
            rnn_output, _ = self.rnn(embed)
            logits = self.logits(rnn_output)
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, loss = self(idx)
            logits = logits[:, -1, :] # B, C
            probs = F.softmax(logits, dim=-1) # generate probabilities
            idx_next = torch.multinomial(probs, num_samples=1) # B,
            idx = torch.cat((idx, idx_next), dim=1) # B, T + 1
        return idx

In [253]:
m = SimpleModel(vocab_size)
batch_size = 32
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)
for steps in range(10000):
    xb, yb = get_batch('train')
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
print(loss.item())

2.136561155319214


In [255]:
idx = torch.zeros((10, 1), dtype=torch.long)
vals = [decode(l).split('\n') for l in m.generate(idx, max_new_tokens=26).tolist()]
for val in vals:
  print(val[1:])

['ovel', 'lucka', 'qeita', 'mirkah', 've']
['tilloiza', 'chabreigh', 'pacleil']
['averion', 'mikhya', 'zayen', 'dakel']
['adlosel', 'caleelyn', 'zairanie', '']
['saiven', 'maysa', 'colstra', 'alion']
['losya', 'jayona', 'sakona', 'tarist']
['nehiphariah', 'nistin', 'joni', 'jh']
['adalynn', 'mecaira', 'hami', 'abrde']
['khath', 'sakios', 'lonakeeyn', 'lec']
['benden', 'sasevian', 'terren', 'kan']


In [262]:
class Attention(nn.Module):
    def __init__(self, hidden_size):
        super().__init__()
        self.Q = nn.Linear(hidden_size, hidden_size, bias=False)
        self.K = nn.Linear(hidden_size, hidden_size, bias=False)
        self.V = nn.Linear(hidden_size, 1, bias=False)

    def forward(self, query, keys, d_model=512):
        Q = self.Q(query)
        K = self.K(keys)
        V = self.V(keys)
        attn_weights = torch.bmm(Q, K.transpose(1, 2))
        attn_weights = F.softmax(attn_weights / d_model ** 0.5, dim=-1)
        attn_output = torch.bmm(attn_weights, V)
        return attn_output

attention = Attention(10)
embedding = nn.Embedding(10, 10)

query = embedding(torch.randint(low=0, high=9, size=(10, 10)))
keys = embedding(torch.randint(low=0, high=9, size=(10, 10)))
attn_output = attention(query, keys)
print(attn_output.shape)

torch.Size([10, 10, 1])


In [266]:
(attn_output + query).shape

torch.Size([10, 10, 10])

In [268]:
class IntermediateModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
        self.attention = Attention(vocab_size)
        self.logits = nn.Linear(vocab_size, vocab_size)

    def forward(self, inputs, targets=None):
        if targets is None:
            embed = self.token_embedding_table(inputs)
            attn_output = self.attention(embed, embed, 27)
            logits = self.logits(embed + attn_output)
            loss = None
        else:
            embed = self.token_embedding_table(inputs)
            attn_output = self.attention(embed, embed, 27)
            logits = self.logits(embed + attn_output)
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, loss = self(idx)
            logits = logits[:, -1, :] # B, C
            probs = F.softmax(logits, dim=-1) # generate probabilities
            idx_next = torch.multinomial(probs, num_samples=1) # B,
            idx = torch.cat((idx, idx_next), dim=1) # B, T + 1
        return idx

In [269]:
m = IntermediateModel(vocab_size)
batch_size = 32
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)
for steps in range(10000):
    xb, yb = get_batch('train')
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
print(loss.item())

2.398796558380127


In [273]:
idx = torch.zeros((10, 1), dtype=torch.long)
vals = [decode(l).split('\n') for l in m.generate(idx, max_new_tokens=26).tolist()]
for val in vals:
  print(val[1:])

['n', 'a', 'ssacanyle', 'e', 'li', 'sshaman']
['aiahaharasaiyadashaiayarah']
['e', 'zaia', 'lalirissivilllirena']
['se', 'dh', 'min', 's', 'lini', 'manaquura']
['tronlonnnonne', 'evpessronceg']
['ayziziaiara', 'h', 'zaa', 'aishah', 'n']
['yahishaeyah', 'saraa', 'h', 'aneili']
['d', 'tzoiswai', 'bek', 'hmitrneeya', '']
['ahta', 'lalyllis', 'calavobiri', 'a']
['a', 'aryalay', 'an', 'aliarahie', 'a', 'i']


In [283]:
class IntermediateModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
        self.attention = Attention(vocab_size)
        self.proj = nn.Linear(vocab_size, vocab_size)
        self.norm = nn.LayerNorm(vocab_size)
        self.logits = nn.Linear(vocab_size, vocab_size)

    def forward(self, inputs, targets=None):
        if targets is None:
            embed = self.token_embedding_table(inputs)
            attn_output = self.attention(embed, embed, 27)
            normed = self.norm(attn_output + embed)
            proj = self.proj(normed)
            normed = self.norm(normed + proj)
            logits = self.logits(normed)
            loss = None
        else:
            embed = self.token_embedding_table(inputs)
            attn_output = self.attention(embed, embed, 27)
            normed = self.norm(attn_output + embed)
            proj = self.proj(normed)
            normed = self.norm(normed + proj)
            logits = self.logits(normed)
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, loss = self(idx)
            logits = logits[:, -1, :] # B, C
            probs = F.softmax(logits, dim=-1) # generate probabilities
            idx_next = torch.multinomial(probs, num_samples=1) # B,
            idx = torch.cat((idx, idx_next), dim=1) # B, T + 1
        return idx

In [284]:
m = IntermediateModel(vocab_size)
batch_size = 32
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)
for steps in range(10000):
    xb, yb = get_batch('train')
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
print(loss.item())

2.460574150085449


In [288]:
idx = torch.zeros((10, 1), dtype=torch.long)
vals = [decode(l).split('\n') for l in m.generate(idx, max_new_tokens=26).tolist()]
for val in vals:
  print(val[1:])

['wthaya', 'telalllu', 'ma', 'tynazon']
['er', 'rie', 'ry', 'yif', 'de', 'bricerida']
['gaitha', 'dabi', 'alailrana', 'elol']
['a', 'owaliessaeebemeyadan', 'sil']
['nahlch', 'sstonn', 'xyanerolyn', 'h']
['alilynivi', 'je', 'jadav', 'ja', 'jh', 'f']
['keelighanyannjaveinaoky', 'rm']
['a', 'gahan', 'ka', 'kahn', 'n', 'ste', 'synn']
['riliah', 'kan', 'zach', 'kedlayhliz']
['aie', 'nnysa', 'aryaru', 'tailormch']


In [293]:
pos = torch.linspace(0, 26, 27)
for i in range(pos.shape[0] // 2):
    pos[2 * i] = torch.sin(pos[2 * i] / 10000 ** (2 * i / 27))
    pos[2 * i + 1] = torch.cos(pos[2 * i + 1] / 10000 ** (2 * i / 27))
pos

tensor([0.0000e+00, 5.4030e-01, 8.4734e-01, 5.4331e-02, 8.5317e-01, 2.8906e-01,
        6.9967e-01, 6.1841e-01, 4.9886e-01, 8.3229e-01, 3.2405e-01, 9.3483e-01,
        1.9884e-01, 9.7658e-01, 1.1777e-01, 9.9201e-01, 6.8142e-02, 9.9738e-01,
        3.8770e-02, 9.9916e-01, 2.1779e-02, 9.9974e-01, 1.2110e-02, 9.9992e-01,
        6.6781e-03, 9.9998e-01, 2.6000e+01])