In [17]:
import torch
import torch.nn as nn
from torch.nn import functional as F
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)
block_size = 8
batch_size = 4
max_iters = 1000
# eval_interval = 2500
learning_rate = 3e-4
eval_iters = 250

cpu


In [18]:
with open('wizard_of_oz.txt', 'r', encoding='utf-8') as f:
    text = f.read()
chars = sorted(set(text))
print(chars)
vocab_size = len(chars)

#this will create a list of all the characters in the text file
#the set function will remove all the duplicates
#the sorted function will sort the characters in the list
   #use a tokenizer to convert each element into  an integer

    #

['\n', ' ', '!', '"', '&', "'", '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\ufeff']


In [19]:
string_to_int = { ch:i for i,ch in enumerate(chars) }
int_to_string = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [string_to_int[c] for c in s]
decode = lambda l: ''.join([int_to_string[i] for i in l])

data = torch.tensor(encode(text), dtype=torch.long)
print(data[:100])

tensor([80, 28, 39, 42, 39, 44, 32, 49,  1, 25, 38, 28,  1, 44, 32, 29,  1, 47,
        33, 50, 25, 42, 28,  1, 33, 38,  1, 39, 50,  0,  0,  1,  1, 26, 49,  0,
         0,  1,  1, 36, 11,  1, 30, 42, 25, 38, 35,  1, 26, 25, 45, 37,  0,  0,
         1,  1, 25, 45, 44, 32, 39, 42,  1, 39, 30,  1, 44, 32, 29,  1, 47, 33,
        50, 25, 42, 28,  1, 39, 30,  1, 39, 50,  9,  1, 44, 32, 29,  1, 36, 25,
        38, 28,  1, 39, 30,  1, 39, 50,  9,  1])


In [20]:
n = int(0.8 * len(data))
train_data = data[:n]
val_data = data[n:] 

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - (block_size), (batch_size,))
    print(ix)
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

x, y = get_batch('train')
print('inputs:')
# print(x.shape)
print(x)
print('targets:') 
print(y)


tensor([ 36165,  25887, 175999,  33061])
inputs:
tensor([[ 1, 76, 61, 58, 67,  1, 33,  0],
        [61, 58,  1, 54, 67, 62, 66, 54],
        [58,  1, 47, 62, 79, 54, 71, 57],
        [73, 68, 65, 57,  1, 74, 72,  1]])
targets:
tensor([[76, 61, 58, 67,  1, 33,  0, 72],
        [58,  1, 54, 67, 62, 66, 54, 65],
        [ 1, 47, 62, 79, 54, 71, 57, 11],
        [68, 65, 57,  1, 74, 72,  1, 78]])


In [21]:
## Biogram data model is a model that predicts the next character given the previous character  
## bio means two so we're going to use two characters to predict the next character

x = train_data[:block_size]
y = train_data[1:block_size+1]

for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(y)
    print('when input is', context, 'target is', target)


tensor([28, 39, 42, 39, 44, 32, 49,  1])
when input is tensor([80]) target is tensor(28)
tensor([28, 39, 42, 39, 44, 32, 49,  1])
when input is tensor([80, 28]) target is tensor(39)
tensor([28, 39, 42, 39, 44, 32, 49,  1])
when input is tensor([80, 28, 39]) target is tensor(42)
tensor([28, 39, 42, 39, 44, 32, 49,  1])
when input is tensor([80, 28, 39, 42]) target is tensor(39)
tensor([28, 39, 42, 39, 44, 32, 49,  1])
when input is tensor([80, 28, 39, 42, 39]) target is tensor(44)
tensor([28, 39, 42, 39, 44, 32, 49,  1])
when input is tensor([80, 28, 39, 42, 39, 44]) target is tensor(32)
tensor([28, 39, 42, 39, 44, 32, 49,  1])
when input is tensor([80, 28, 39, 42, 39, 44, 32]) target is tensor(49)
tensor([28, 39, 42, 39, 44, 32, 49,  1])
when input is tensor([80, 28, 39, 42, 39, 44, 32, 49]) target is tensor(1)


In [22]:
class BigGramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super(BigGramLanguageModel, self).__init__()
        self.embed = nn.Embedding(vocab_size, vocab_size)
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
        # self.rnn = nn.LSTM(vocab_size, vocab_size, num_layers=1)
        # self.proj = nn.Linear(hidden_dim, vocab_size)

    def forward(self, index, targets = None):
        logits = self.token_embedding_table(index)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss    

    def generate(self, index, max_new_tokens):
        #index is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            #get the predictions
            logits, loss = self.forward(index)
            #focus only  on the last time step
            logits = logits[:, -1, :] # becoms (B, C)
            probs = F.softmax(logits, dim=-1) # (B, C)
            #sample from the distribution
            index_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            #append sampled index to the running sequence
            index = torch.cat([index, index_next], dim=1)# (B, T+1)
        return index

model = BigGramLanguageModel(vocab_size)
m = model.to(device)
context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)




N"'x,OMyE!'cdH(J2uj"_b8J1iZ)B224m6?0DKYU"3Y6"'zp1k﻿_b?3nhP4MysMr8XBu.rQR-G6CgTPJ*lDbUJo(﻿ipIxjOoOV4&fb]f?7B,PzDT9yQq!5[MWgG8Q[LIsRyI1sx !*]gWI1rENc04ENpGhsd
o"H(GW'7F91cmpau3TUfnGWx
r!﻿[7ZMlbSGK:H)"34Y5HF8rTGJHwlKrXCpu.'fx&(i':GPrAAnzdZi36
IrQze-bY﻿x&G8T!aUc(eI2XPB,mZLJ5Nz4F(SE4ZdWv0i?vIUdY*G.CgE'[,nXTWE'15Ucx3'xciN:-ri';n0R"﻿mQNpCsgWjzdtu9]hAhDwB(Supv,,-Z(S?BL:9lfOdr-MdtUB(oDY*YowZhItQmzz:Q;DBGsKKrdF
z4OA updLaj,T_DSE1T5w4cU!"3SE4t
vENj(_IpXk fn﻿vd9aPr*jg5!SN3sMGrXUh&7;B)?0d 7khbO9JNqTtK(Vg[7Y_


In [23]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    # sample a batch of data
    xb, yb = get_batch('train')

    #evaulate the loss
    logits, loss = model.forward(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    
    
print(loss.item())


tensor([ 74980, 161595,   6712,  57995])
tensor([ 45438, 155212,  45686,  30929])
tensor([117202, 114812,  29165,  79911])
tensor([133265, 156579,  65988,  48359])
tensor([80496, 14624, 55397, 33447])
tensor([153101, 102058, 140481,  32995])
tensor([87573, 39610, 61479,  7890])
tensor([153815,  98165,  69234,  14151])
tensor([153794, 142744,  88487,  32891])
tensor([102466,  84398,  47273,  33143])
tensor([ 59197, 160484,  12508,  91422])
tensor([133348,    692, 132691, 134132])
tensor([ 97367, 146917, 139253, 102300])
tensor([ 75919,  75657,  37918, 185586])
tensor([ 29714, 102585,  98433, 148336])
tensor([  4317, 146716, 101514,   5960])
tensor([139611,  77471, 108645, 105120])
tensor([161507,  38982,  45683,    616])
tensor([ 43130, 163175,  34400, 177660])
tensor([74495, 37250, 30344,  4414])
tensor([136127,   9840,  81651,  75452])
tensor([ 88770,  12421, 144669,   1438])
tensor([179520,  62168, 147781,  91029])
tensor([135712,  78423, 101511,  27455])
tensor([119583,   4878,  210

In [28]:

context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)


E
s,.c﻿IAN[h?7KH(.m'G2)"SuR)QrBnCEM9xa5;*
Z.k ;j:V0]*uAv,4t]gm0"AL!(Ift0DB;s'Bpm9v,(9R]E)L&Wr&kS?HF(bH"x6v7WTQ8Bpz_V7))M8TGs4pQdNpF&GARw_pi5v9JAE'2sG8jZG TE'xkE;Nubrc7!15c])250Dpa,E-E7MM6'G ;SuwC8)83]EKF&*JVup1?O-8
ZauJ!XR ;_6jzRoH4HuZpj(&&DW8lpjAM5v)qZa-9d_]YewYsi8)E'J,-GhB0JxNrE'p 0JS5,E"[GJL7Ms5wZiGtoIHqN))CD"BOjU_YbedC]Bqh
fnnokbG-GKo 6p]t9bBu?-)]4uZhshNup-4eWCud?"zU00fn]mPJ;g:rQdDlyQG PrL0PPoDni;d67HZ:p,O;HPGA?2TUw4?O)L?"M:z*!Y."PrXx[nA(Hnne0J)8r8B?*x!BAVn ;:2pHyyqz[&dUlThPrIV.GKEMGoI.mDp0i
