In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import torch.optim as optim

In [2]:
# First Read the dataset:
file = open("/Users/diego/Scripts/og-language-models/tiny-cicero.txt", "r")
contents = file.read()
#print(contents)
file.close()

In [3]:
vocabulary = list(set(contents))
vocabulary = sorted(vocabulary)
VOCAB_SIZE = len(vocabulary)
print(vocabulary)

[' ', '!', '$', '&', '(', ')', ',', '-', '.', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '£', 'Æ', 'ä', 'æ', 'è', 'é', 'Œ', 'œ', 'α', 'δ', 'μ', 'ν', 'ο', 'π', 'ς', 'τ', '—', '‘', '’', '“', '”', '\ufeff']


In [4]:
print("Vocabulary Length: ", len(vocabulary))
print("Content Length: ", len(contents))

Vocabulary Length:  86
Content Length:  851509


In [5]:
## First we must make encode and decoder functions for our dataset
string_to_int = {ch : i for i, ch in enumerate(vocabulary)}
int_to_string = {i : ch for i, ch in enumerate(vocabulary)}
encode = lambda s : [string_to_int[c] for c in s]
decode = lambda l : ''.join([int_to_string[i] for i in l])
print(encode("Hello World"))
print(decode(encode("Hello World")))

[19, 42, 49, 49, 52, 0, 34, 52, 55, 49, 41]
Hello World


In [6]:
data = torch.tensor(encode(contents), dtype=int)
print(data.shape,data.dtype)
print(data[:1000])

torch.Size([851509]) torch.int64
tensor([85, 26, 25,  0, 17, 29, 20, 16, 25, 15, 30, 19, 20, 27,  0, 24, 12, 29,
        14, 32, 30,  0, 31, 32, 23, 23, 20, 32, 30,  0, 14, 20, 14, 16, 29, 26,
         0, 31, 19, 16,  0, 38, 58, 44, 58, 55,  0, 28, 58, 46, 51, 57, 58, 56,
         0, 24, 58, 40, 46, 58, 56,  0, 30, 40, 38, 42, 59, 52, 49, 38,  0, 58,
        56, 42, 41,  0, 57, 52,  0, 55, 42, 40, 52, 58, 51, 57,  0, 38,  0, 51,
        58, 50, 39, 42, 55,  0, 52, 43,  0, 56, 57, 52, 55, 46, 42, 56,  0, 38,
        39, 52, 58, 57,  0, 45, 46, 56,  0, 43, 38, 57, 45, 42, 55,  7, 46, 51,
         7, 49, 38, 60,  0, 18, 38, 46, 58, 56,  0, 23, 38, 42, 49, 46, 58, 56,
         6,  0, 38, 40, 40, 58, 55, 38, 57, 42, 49, 62,  0, 55, 42, 50, 42, 50,
        39, 42, 55, 42, 41,  0, 38, 51, 41,  0, 40, 45, 38, 55, 50, 46, 51, 44,
        49, 62,  0, 57, 52, 49, 41, 10,  0, 38, 51, 41,  0, 60, 45, 42, 51, 42,
        59, 42, 55,  0, 45, 42,  0, 57, 38, 49, 48, 42, 41,  0, 38, 39, 52, 58,
       

In [7]:
n = int(0.9*len(data))
train_data = data[n:]
val_data = data[:n]
train_data = train_data.float()
val_data = val_data.float()

In [8]:
## Now, we define our context window
torch.manual_seed(1337)
BATCH_SIZE = 4
CONTEXT_SIZE = 8

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - CONTEXT_SIZE, (BATCH_SIZE,))
    x = torch.stack([data[i:i+CONTEXT_SIZE] for i in ix])
    y = torch.stack([data[i+1:i+CONTEXT_SIZE+1] for i in ix])
    return x, y

In [9]:
xb, yb = get_batch('train')
print('inputs:')
print(xb)
print('targets:')
print(yb)

inputs:
tensor([[57., 46., 51., 44.,  0., 56., 52., 50.],
        [42., 56.,  6.,  0., 30., 46., 55.,  6.],
        [56., 42.,  0., 46., 51.,  0., 57., 45.],
        [46., 56., 57., 42., 41.,  0., 38., 43.]])
targets:
tensor([[46., 51., 44.,  0., 56., 52., 50., 42.],
        [56.,  6.,  0., 30., 46., 55.,  6.,  0.],
        [42.,  0., 46., 51.,  0., 57., 45., 42.],
        [56., 57., 42., 41.,  0., 38., 43., 57.]])


In [109]:
## LSTM Model

## Short Term Memory Block
class stmBlock(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        
        # Forget Gate:
        self.Wif = nn.Linear(input_dim, hidden_dim, bias = False)
        self.Whf = nn.Linear(hidden_dim, hidden_dim)

        # Input Gate:
        self.Wii = nn.Linear(input_dim, hidden_dim, bias = False)
        self.Whi = nn.Linear(hidden_dim, hidden_dim)

        # Candidate Gate:
        self.Wic = nn.Linear(input_dim, hidden_dim, bias = False)
        self.Whc = nn.Linear(hidden_dim, hidden_dim)

        # Output Gate:
        self.Wio = nn.Linear(input_dim, hidden_dim, bias = False)
        self.Who = nn.Linear(hidden_dim, hidden_dim)
        
        self.sigmoid = nn.Sigmoid()
        self.tanh = nn.Tanh()
        

    def forward(self, x, c_prev = None, h_prev = None):
        # If not first block:
        if h_prev is not None:
            f = self.sigmoid(self.Wif(x) + self.Whf(h_prev))
            i = self.sigmoid(self.Wii(x) + self.Whi(h_prev))
            c = self.tanh(self.Wic(x) + self.Whc(h_prev))
            o = self.sigmoid(self.Wio(x) + self.Who(h_prev))
        else:
            f = self.sigmoid(self.Wif(x))
            i = self.sigmoid(self.Wii(x))
            c = self.tanh(self.Wic(x))
            o = self.sigmoid(self.Wio(x))

        if c_prev == None:
            c_t = i * c
        else:
            c_t = f * c_prev + i * c
        h = o * self.tanh(c_t)
        return c_t, h

## Now for the entire LSTM Module
class LSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.input_dim = input_dim
        self.stmBlocks = nn.ModuleList([stmBlock(input_dim, hidden_dim) for i in range(CONTEXT_SIZE)])
        # We make an additional list of linear layers for the ouput of each block
        self.linLays = nn.ModuleList([nn.Linear(hidden_dim, output_dim) for i in range(CONTEXT_SIZE)])
        self.loss = nn.CrossEntropyLoss()

    def forward(self, x, targets=None):
        logits = []
        h = None
        c = None
        for i in range(CONTEXT_SIZE):
            element = x[:, i].unsqueeze(1)
            print("Element ", element)
            c, h = self.stmBlocks[i](c, h, element) if h is not None else self.stmBlocks[i](element)
            print("h ", h.shape)
            o = self.linLays[i](h)
            print("Ouput: ", o.shape)
            logits.append(o)
        
        print(len(logits))
        logits = torch.stack(logits, dim=1)
        print(logits.shape)
        if targets is not None:
            loss = self.loss(logits.view(-1, VOCAB_SIZE), targets.view(-1).long())
            return logits, loss
        return logits, None


In [110]:
def evaluate(model):
    lossAvg = 0
    counter = 0
    for i in range(400):
        counter+=1
        batch = get_batch("test")
        logits, loss = model(batch[0], batch[1])
        lossAvg += loss.item()
    return lossAvg / counter

In [111]:
model = LSTM(1, 256, VOCAB_SIZE)
optimizer = optim.Adam(model.parameters(), lr = 0.0001)

In [113]:
training_iterations = 3000
for i in range(training_iterations):
    optimizer.zero_grad()
    batch = get_batch("train")
    print(batch[0].shape)
    outputs, loss = model(batch[0], batch[1])
    loss.backward()
    #print("Loss: ", loss.item())
    optimizer.step()

torch.Size([4, 8])
Element  tensor([[55.],
        [ 0.],
        [55.],
        [38.]])
h  torch.Size([4, 256])
Ouput:  torch.Size([4, 86])
Element  tensor([[57.],
        [50.],
        [52.],
        [49.]])


RuntimeError: mat1 and mat2 shapes cannot be multiplied (4x256 and 1x256)

In [21]:
print("Test Loss: ", evaluate(model))

Test Loss:  3.005908076763153


In [22]:
## Let's do some testing:
def infer(input_data):
    with torch.no_grad():
        outputs, _ = model(input_data)
        predicted_indices = torch.argmax(outputs, dim=-1)
        return predicted_indices
        
test_batch = get_batch("test")
result = infer(test_batch[0])
for i in range(BATCH_SIZE):
    print("Input: ", decode(test_batch[0].tolist()[i]))
    print(decode(result.tolist()[i]))

Input:  llow you
     h  
Input:  dd to it
 ta   h 
Input:   turn an
te    te
Input:  ordered 
   th  t


In [None]:
### Truly terrible results