In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import pickle
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
block_size = 128
batches = 16
train_loop = 10000
learning_rate = 5e-5
train_split = 0.8
debug_loop = 500
embeds = 512
heads = 8
layers = 8
dropout = 0.2

cuda


In [4]:
chars = ""
with open('grammar.txt', 'r', encoding='utf-8') as f:
    text = f.read()
    chars = sorted(set(text))
print(chars)
vocab_size = len(chars)
print(vocab_size)

['\n', ' ', '!', '&', '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '}', '´', 'Á', 'à', 'á', 'æ', 'è', 'é', 'í', 'ò', 'ó', 'ù', 'ú', 'Ā', 'ā', 'Ă', 'ă', 'ē', 'ĕ', 'ħ', 'Ī', 'ī', 'Ĭ', 'ĭ', 'ō', 'Ŏ', 'ŏ', 'œ', 'Ū', 'ū', 'ŭ', 'ȳ', '̄', '̆', 'Α', 'Κ', 'Λ', 'Ν', 'Ο', 'Τ', '‘', '’', '“', '”', '⪥', '\ufeff']
126


In [5]:
str2int = { ch:i for i,ch in enumerate(chars) }
int2str = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [ str2int[c] for c in s ]
decode = lambda l: ''.join([ int2str[i] for i in l ])
data = torch.tensor(encode(text), dtype=torch.long).to(device)
print(data[:100])

tensor([125,   0,   0,   0,   0,  39,  71,  68,  57,  74,  56,  58,  57,   1,
         55,  78,   1,  26,  61,  71,  62,  72,   1,  26,  74,  71,  67,  68,
         76,   1,  54,  67,  57,   1,  73,  61,  58,   1,  38,  67,  65,  62,
         67,  58,   1,  27,  62,  72,  73,  71,  62,  55,  74,  73,  58,  57,
          0,  39,  71,  68,  68,  59,  71,  58,  54,  57,  62,  67,  60,   1,
         43,  58,  54,  66,   1,  54,  73,   1,  61,  73,  73,  69,  21,  10,
         10,  76,  76,  76,   9,  69,  60,  57,  69,   9,  67,  58,  73,   1,
          4,  43], device='cuda:0')


In [6]:
n = int(train_split*len(data))
train = data[:n]
valid = data[n:]

def get_batch(split):
    data = train if split=='train' else valid
    ix = torch.randint(len(data) - block_size, (batches,))
    # print(ix)
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

x, y = get_batch('train')
print('inputs:')
print(x)
print('target:')
print(y)

inputs:
tensor([[ 68,  67,  72,  ...,  54,  71,  66],
        [ 58,   1,  69,  ...,  58,  68,  69],
        [ 68,   1,  54,  ...,   9, 123,   1],
        ...,
        [  1,  73,  68,  ...,  59,  62,  67],
        [ 67,  68,  73,  ...,  61,  62,  56],
        [ 58,  72,  73,  ...,   1,  57,  62]], device='cuda:0')
target:
tensor([[ 67,  72,   7,  ...,  71,  66,   0],
        [  1,  69,  58,  ...,  68,  69,  65],
        [  1,  54,  57,  ..., 123,   1,  31],
        ...,
        [ 73,  68,   1,  ...,  62,  67,  62],
        [ 68,  73,  61,  ...,  62,  56,  61],
        [ 72,  73,  62,  ...,  57,  62,  57]], device='cuda:0')


In [7]:
class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(embeds, head_size, bias=False)
        self.query = nn.Linear(embeds, head_size, bias=False)
        self.value = nn.Linear(embeds, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)
    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)
        q = self.query(x)
        wei = q @ k.transpose(-2, -1) * k.shape[-1]**-.5
        wei = wei.masked_fill(self.tril[:T, :T]==0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        v = self.value(x)
        out = wei @ v
        return out

class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(head_size * num_heads, embeds)
        self.dropout = nn.Dropout(dropout)
    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedForward(nn.Module):
    def __init__(self, embed):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(embeds, 4*embeds),
            nn.ReLU(),
            nn.Linear(4*embeds, embeds),
            nn.Dropout(dropout),
        )
    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    def __init__(self, embeds, heads):
        super().__init__()
        head_size = embeds // heads
        self.sa = MultiHeadAttention(heads, head_size)
        self.ffwd = FeedForward(embeds)
        self.ln1 = nn.LayerNorm(embeds)
        self.ln2 = nn.LayerNorm(embeds)
    def forward(self, x):
        y = self.sa(x)
        x = self.ln1(x + y)
        y = self.ffwd(x)
        x = self.ln2(x + y)
        return x

class Kiwi(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, embeds)
        self.position_embedding = nn.Embedding(block_size, embeds)
        self.blocks = nn.Sequential(*[Block(embeds, heads=heads) for _ in range(layers)])
        self.final_norm = nn.LayerNorm(embeds)
        self.head = nn.Linear(embeds, vocab_size)
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, index, targets=None):
        B, T = index.shape

        tok = self.token_embedding(index)
        pos = self.position_embedding(torch.arange(T, device=device))
        x = tok + pos
        x = self.blocks(x)
        x = self.final_norm(x)
        logits = self.head(x)
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss
        
    def generate(self, index, max_new_tokens):
        for iter in range(max_new_tokens):
            index_crop = index[:, -block_size:]
            logits, loss = self.forward(index_crop)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            index_next = torch.multinomial(probs, num_samples=1)
            index = torch.cat((index, index_next), dim=1)
            print(f"\r<{'*' * int(10 * iter/max_new_tokens)}{' ' * (10 - int(10*iter/max_new_tokens))}>", end='', flush=False)
        print("\r<**********>")
        return index

m = Kiwi(vocab_size)
model = Kiwi(vocab_size).to(device)

In [8]:
@torch.no_grad()
def estimate_loss(model):
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(debug_loop)
        for k in range(debug_loop):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [7]:
%%time
optim = torch.optim.AdamW(model.parameters(), lr=learning_rate)
for iter in range(train_loop):
    if iter % debug_loop == 0:
        losses = estimate_loss(model)
        print(f'step: {iter}, train loss: {losses['train']:.6f}, valid loss: {losses['val']:.6f}')

    xb, yb = get_batch('train')
    logits, loss = model.forward(xb, yb)
    optim.zero_grad(set_to_none=True)
    loss.backward()
    optim.step()
print(loss.item())
with open('kiwi.pkl', 'wb') as f:
    pickle.dump(model, f)

step: 0, train loss: 4.980935, valid loss: 4.971207
step: 500, train loss: 2.453805, valid loss: 2.603982
step: 1000, train loss: 2.142277, valid loss: 2.315857
step: 1500, train loss: 1.924116, valid loss: 2.148293
step: 2000, train loss: 1.787321, valid loss: 2.071828
step: 2500, train loss: 1.685532, valid loss: 2.002305
step: 3000, train loss: 1.594479, valid loss: 1.938929
step: 3500, train loss: 1.523731, valid loss: 1.911186
step: 4000, train loss: 1.446487, valid loss: 1.902528
step: 4500, train loss: 1.384791, valid loss: 1.877297
step: 5000, train loss: 1.333591, valid loss: 1.873751
step: 5500, train loss: 1.271168, valid loss: 1.868645
step: 6000, train loss: 1.220224, valid loss: 1.869128
step: 6500, train loss: 1.164963, valid loss: 1.879513
step: 7000, train loss: 1.115938, valid loss: 1.881689
step: 7500, train loss: 1.062489, valid loss: 1.890433
step: 8000, train loss: 1.009505, valid loss: 1.917896
step: 8500, train loss: 0.953280, valid loss: 1.940237
step: 9000, tr

In [9]:
%%time
with open('kiwi.pkl', 'rb') as f:
    model = pickle.load(f)
ctx = torch.zeros((1, 1), dtype=torch.long, device=device)
generated = decode(model.generate(ctx, max_new_tokens=1000)[0].tolist())
print(generated)

  return torch.load(io.BytesIO(b))


<**********>


“A Cing up to agent,” and stand some what two “King oōke,” that “call out conjOh! probaking and
a hymng subbstantives, which are you wile to trick thim buscks.

There we armire word in partive can phrases is some sight three, is done,
owed boving it. The same word which is were to ouh, of will at no rend to
the plate
seconds of for metal pronouble to whether, and that every infeed with ristring
out of by all meging ward, inswaints, and they would
get nought to erms.

Here walt in verbs Griviate are divided into may be gentlement and in concommon
alway. Mass ee fiasi--is, we shaall be drown in the mow hand.

Passive a shine pronounce whaped a pun and as sunds till Schootma’s by
short,” &c.

When engard in shock to disase sigure pronouns the degries of a horsol.

Asformed intow, however, with shater the tan the plural of line.


RULE II.

Of those good “some,” “dimmide” which, áccor.

And nothing is to the same peculiar emphasis.

“Did _who_ waifter, make wishat,”
“What ar

In [10]:
import speech_recognition as sr
import pyttsx3

gender = 1

listener = sr.Recognizer()
engine = pyttsx3.init('sapi5')
voices = engine.getProperty("voices")
engine.setProperty('voice',voices[gender].id)

def talk(audio):
    engine.say(audio)
    engine.runAndWait()

talk(generated)