<a href="https://colab.research.google.com/github/eisbetterthanpi/RNN/blob/main/RNNs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q torchdata portalocker

In [None]:
# @title data
# https://github.com/Sam-Armstrong/tinyGPT/blob/main/Training.py
# https://colab.research.google.com/github/karpathy/minGPT/blob/master/play_char.ipynb
# https://github.com/karpathy/nanoGPT

import torch
import torch.nn as nn
from torch.utils.data import Dataset

class CharDataset(Dataset): # https://github.com/karpathy/minGPT
    """Character-level dataset of non-overlapping (input, target) windows.

    The vocabulary is the sorted set of characters found in the corpus.
    Sample ``idx`` spans ``seq_len + 1`` consecutive characters: the input is
    the first ``seq_len`` of them and the target is the same span shifted one
    position to the right.
    """

    def __init__(self, raw_data, seq_len):
        # raw_data is joined into a single corpus string, so it may be a str
        # or any iterable of str.
        corpus = ''.join(raw_data)
        vocab = sorted(set(corpus))
        self.vocab_size = len(vocab)
        self.stoi = {}
        self.itos = {}
        for index, ch in enumerate(vocab):
            self.stoi[ch] = index
            self.itos[index] = ch
        self.data = self.data_process(corpus)
        self.seq_len = seq_len

    def data_process(self, data):
        """Encode the string ``data`` as a 1-D tensor of vocabulary indices."""
        lookup = self.stoi.get
        return torch.tensor([lookup(ch) for ch in data])

    def __len__(self):
        """Number of complete ``seq_len + 1`` windows that fit in the corpus."""
        window = self.seq_len + 1
        return len(self.data) // window

    def __getitem__(self, idx):
        """Return ``(x, y)`` for window ``idx``; ``y`` is ``x`` shifted by one character."""
        window = self.seq_len + 1
        start = idx * window
        chunk = self.data[start:start + window]
        return chunk[:-1], chunk[1:]

# # data = list(open('input.txt', 'r').read()) # for using a text corpus contained within a .txt file
# # from torchtext.datasets import WikiText2
# # train_iter, val_iter, test_iter = WikiText2() # line by line of wiki  = Valkyria Chronicles III =
# from torchtext.datasets import PennTreebank
# train_iter, val_iter, test_iter = PennTreebank()
# seq_len = 128
# train_dataset = CharDataset(train_iter, seq_len) # one line of poem is roughly 50 characters
# test_dataset = CharDataset(test_iter, seq_len) # one line of poem is roughly 50 characters
# from torch.utils.data.dataloader import DataLoader
# batch_size = 512 #512
# train_loader = DataLoader(train_dataset, shuffle = True, pin_memory = True, batch_size = batch_size, num_workers = 2) # num_workers = 4
# test_loader = DataLoader(test_dataset, shuffle = True, pin_memory = True, batch_size = batch_size, num_workers = 0)

# def encode(context): return torch.tensor([train_dataset.stoi.get(c) for c in context], device=device)
# def decode(x): return ''.join([train_dataset.itos[int(i)] for i in x])
# # for x,y in train_loader:
# #     break
# # n=2
# # print(decode(x[n]))
# # print(decode(y[n]))


In [None]:
# @title dataloader
# https://www.tensorflow.org/text/tutorials/text_generation
# path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')
# Read, then decode for py2 compat.
import requests
# Fetch the tiny-Shakespeare corpus and keep a local copy next to the notebook.
url = "https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt"
out = requests.get(url, timeout=30)  # bounded wait instead of hanging the cell forever
out.raise_for_status()  # fail loudly rather than training on an HTTP error page
with open("shakespeare.txt", "wb") as f:
    f.write(out.content)
# Read the bytes back through a context manager so the handle is closed
# deterministically; binary mode + explicit decode keeps line endings untouched.
with open("shakespeare.txt", "rb") as f:
    text = f.read().decode(encoding="utf-8")


from torch.utils.data.dataloader import DataLoader

# Window length (in characters) and mini-batch size used for training.
seq_len = 100
batch_size = 64 #512

# one line of poem is roughly 50 characters
train_dataset = CharDataset(text, seq_len)
# test_dataset = CharDataset(test_iter, seq_len) # one line of poem is roughly 50 characters
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=2,  # num_workers = 4
    pin_memory=True,
)
# test_loader = DataLoader(test_dataset, shuffle = True, pin_memory = True, batch_size = batch_size, num_workers = 0)

def encode(context):
    """Map each character of ``context`` to its vocabulary index, as a tensor on ``device``."""
    indices = [train_dataset.stoi.get(ch) for ch in context]
    return torch.tensor(indices, device=device)
def decode(x):
    """Turn an iterable of vocabulary indices back into the string they spell."""
    id_to_char = train_dataset.itos
    return ''.join(id_to_char[int(i)] for i in x)
# for x,y in train_loader:
#     break


# # from torchtext.datasets import WikiText2
# # train_iter, val_iter, test_iter = WikiText2() # line by line of wiki  = Valkyria Chronicles III =
# # seq_len = 128
# # data = ''.join(train_iter)
# # for i,x in enumerate(train_iter):
# for i,x in enumerate(train_loader):
#     print(x)
#     print(len(x))
#     # break
#     if i>3: break
# # print(train_iter)

# # print(train_dataset.vocab_size)
# # print(train_dataset.vocab_size)

# data = ''.join(train_iter)
# chars = sorted(list(set(data)))
# print(chars)



In [31]:
# @title RNN from scratch
# https://github.com/georgeyiasemis/Recurrent-Neural-Networks-from-scratch-using-PyTorch/blob/main/rnncells.py
import torch
import torch.nn as nn
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

class RNNCell(nn.Module):
    """Single-step tanh RNN cell: ``h' = tanh(x2h(x) + h2h(h))``."""

    def __init__(self, in_dim, d_model, bias=True):
        super().__init__()
        self.d_model = d_model
        self.x2h = nn.Linear(in_dim, d_model, bias=bias)
        self.h2h = nn.Linear(d_model, d_model, bias=bias)

    def forward(self, input, hx=None):
        """Advance the cell by one timestep.

        Args:
            input: tensor whose first dimension is the batch and whose last
                dimension is ``in_dim``.
            hx: previous hidden state ``(batch, d_model)``; zeros when omitted.

        Returns:
            The new hidden state, ``(batch, d_model)``.
        """
        if hx is None:
            # Allocate next to the input (same device and dtype) instead of
            # relying on a module-level `device` global that may be missing
            # or disagree with where the model actually lives.
            hx = input.new_zeros(input.size(0), self.d_model)
        hy = self.x2h(input) + self.h2h(hx)
        return torch.tanh(hy)

class LSTMCell(nn.Module):
    """Single-step LSTM cell; all four gates come from one fused projection per input."""

    def __init__(self, in_dim, d_model, bias=True):
        super().__init__()
        self.d_model = d_model
        self.xh = nn.Linear(in_dim, 4*d_model, bias=bias)
        self.hh = nn.Linear(d_model, 4*d_model, bias=bias)

    def forward(self, input, hxcx=None):
        """Advance the cell by one timestep.

        Args:
            input: tensor whose first dimension is the batch and whose last
                dimension is ``in_dim``.
            hxcx: optional ``(hx, cx)`` pair of previous hidden and cell
                states, each ``(batch, d_model)``; both are zeros when omitted.

        Returns:
            ``(hy, cy)``: the new hidden and cell states.
        """
        if hxcx is None:
            # Allocate next to the input (same device and dtype) rather than
            # on a module-level `device` global.
            hx = input.new_zeros(input.size(0), self.d_model)
            cx = torch.zeros_like(hx)
        else:
            hx, cx = hxcx
        gates = self.xh(input) + self.hh(hx) # gates = self.xh(torch.cat([input, hx], dim=-1))
        # Chunk order fixes which slice of the fused projection feeds which gate.
        input_gate, forget_gate, cell_gate, output_gate = gates.chunk(4, dim=-1)
        f_t = torch.sigmoid(forget_gate)
        i_t = torch.sigmoid(input_gate)
        g_t = torch.tanh(cell_gate)
        o_t = torch.sigmoid(output_gate)
        cy = cx * f_t + i_t * g_t
        hy = o_t * torch.tanh(cy)
        return hy, cy

class GRUCell(nn.Module):
    """Single-step GRU cell with fused reset/update/candidate projections."""

    def __init__(self, in_dim, d_model, bias=True):
        super().__init__()
        self.d_model = d_model
        self.x2h = nn.Linear(in_dim, 3*d_model, bias=bias)
        self.h2h = nn.Linear(d_model, 3*d_model, bias=bias)

    def forward(self, input, hx=None): # input: (batch_size, in_dim), hx: (batch_size, d_model)
        """Advance the cell by one timestep.

        Args:
            input: ``(batch_size, in_dim)`` input for this step.
            hx: previous hidden state ``(batch_size, d_model)``; zeros when omitted.

        Returns:
            The new hidden state, ``(batch_size, d_model)``.
        """
        if hx is None:
            # Allocate next to the input (same device and dtype) rather than
            # on a module-level `device` global.
            hx = input.new_zeros(input.size(0), self.d_model)
        x_t = self.x2h(input)
        h_t = self.h2h(hx)
        x_reset, x_upd, x_new = x_t.chunk(3, dim=-1)
        h_reset, h_upd, h_new = h_t.chunk(3, dim=-1)
        reset_gate = torch.sigmoid(x_reset + h_reset)
        update_gate = torch.sigmoid(x_upd + h_upd)
        # The reset gate scales only the hidden-state contribution to the candidate.
        new_gate = torch.tanh(x_new + reset_gate*h_new)
        # update_gate close to 1 keeps the old state; close to 0 takes the candidate.
        hy = update_gate*hx + (1-update_gate)*new_gate
        return hy # (batch_size, d_model)


class RNN(nn.Module): # https://github.com/georgeyiasemis/Recurrent-Neural-Networks-from-scratch-using-PyTorch/blob/main/rnnmodels.py
    """Character-level language model built from stacked hand-written recurrent cells.

    Token ids are embedded, pushed through ``num_layers`` cells one timestep at
    a time, and every timestep's top-layer hidden state is projected to
    ``out_dim`` logits.

    The cell type is chosen in ``__init__`` by (un)commenting one line.
    ``forward`` inspects the cells and handles both the LSTM ``(h, c)`` state
    and the single-tensor RNN/GRU state, so there is exactly one ``forward``
    instead of two same-named definitions where the first was silently shadowed.
    """

    def __init__(self, in_dim, d_model, num_layers, out_dim):
        """
        Args:
            in_dim: number of rows in the embedding table (must exceed the largest token id).
            d_model: embedding width and hidden-state width of every layer.
            num_layers: number of stacked cells.
            out_dim: size of the output logits per timestep.
        """
        super().__init__()
        self.num_layers = num_layers
        self.d_model = d_model
        self.tok_emb = nn.Embedding(in_dim, d_model)
        # self.rnn = nn.ModuleList([RNNCell(d_model, d_model) for _ in range(num_layers)])
        self.rnn = nn.ModuleList([LSTMCell(d_model, d_model) for _ in range(num_layers)])
        # self.rnn = nn.ModuleList([GRUCell(d_model, d_model) for _ in range(num_layers)])
        self.fc = nn.Linear(d_model, out_dim)

        # Optional explicit initialisation (disabled; PyTorch defaults are used):
        # for param in self.parameters():
        #     if param.dim() > 1:
        #         nn.init.xavier_uniform_(param)

    def forward(self, x, hc=None):
        """Run the stacked cells over a batch of token-id sequences.

        Args:
            x: integer token ids, ``[batch_size, seq_len]``.
            hc: optional initial state. For LSTM cells a ``(h0, c0)`` pair, for
                RNN/GRU cells a single ``h0``; each tensor is
                ``[num_layers, batch_size, d_model]``. Zeros when omitted.

        Returns:
            ``(out, state)`` where ``out`` is ``[batch_size, seq_len, out_dim]``
            and ``state`` is ``(ht, ct)`` for LSTM cells or ``ht`` otherwise,
            each ``[num_layers, batch_size, d_model]``.
        """
        batch_size, seq_len = x.shape
        x = self.tok_emb(x) #[batch_size, seq_len, d_model]
        is_lstm = all(isinstance(layer, LSTMCell) for layer in self.rnn)

        if hc is None:
            # Allocate next to the activations rather than on a global `device`.
            h0 = x.new_zeros(self.num_layers, batch_size, self.d_model)
            c0 = torch.zeros_like(h0)
        elif is_lstm:
            h0, c0 = hc
        else:
            h0, c0 = hc, None

        # One [batch_size, d_model] state tensor per layer, updated in place each step.
        ht = list(h0.unbind(0))
        ct = list(c0.unbind(0)) if is_lstm else None

        out = []
        for t in range(seq_len):
            layer_in = x[:, t]
            for i, layer in enumerate(self.rnn):
                if is_lstm:
                    # Feed back BOTH states from the previous timestep. The
                    # earlier code kept passing the initial c0, so the cell
                    # state was reset at every step.
                    ht[i], ct[i] = layer(layer_in, (ht[i], ct[i]))
                else:
                    ht[i] = layer(layer_in, ht[i])
                layer_in = ht[i]  # output of layer i is the input of layer i+1
            out.append(layer_in) # seq_len * [batch_size, d_model]

        out = torch.stack(out, dim=1) # [batch_size, seq_len, d_model]
        out = self.fc(out) # [batch_size, seq_len, out_dim]
        ht = torch.stack(ht, dim=0)
        if is_lstm:
            return out, (ht, torch.stack(ct, dim=0)) # lstm
        return out, ht # rnn/gru


hidden_size = 128#128
num_layers = 2#2
num_classes = vocab_size = train_dataset.vocab_size
# The first RNN argument sizes the token-embedding table, so it has to be the
# vocabulary size. It used to be seq_len, which only worked while
# seq_len >= vocab_size and would raise an index error otherwise.
# NOTE: this changes the embedding shape, so checkpoints saved with the old
# setting will not load into the new model.
input_size = vocab_size

model = RNN(input_size, hidden_size, num_layers, num_classes).to(device)



In [None]:
# @title RNN pytorch
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Default to the CPU and switch to the GPU only when PyTorch reports one.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

import math

# Character-level language model: embedding -> built-in recurrent stack -> linear head
class RNN(nn.Module):
    """Language model backed by one of PyTorch's built-in recurrent modules.

    Swap the module assigned to ``self.rnn`` to change architecture. ``forward``
    passes the state through untouched, so it works unchanged for ``nn.RNN`` /
    ``nn.GRU`` (tensor state) and ``nn.LSTM`` (``(h, c)`` tuple state). This
    replaces two same-named ``forward`` definitions, the first of which was
    shadowed and referenced a ``self.lstm`` attribute that is never created.
    """

    def __init__(self, in_dim, d_model, num_layers, out_dim):
        """
        Args:
            in_dim: number of rows in the embedding table (must exceed the largest token id).
            d_model: embedding width and hidden size of the recurrent stack.
            num_layers: number of stacked recurrent layers.
            out_dim: size of the output logits per timestep.
        """
        super().__init__()
        self.num_layers = num_layers
        self.d_model = d_model
        # self.rnn = nn.RNN(d_model, d_model, num_layers, batch_first=True)
        self.rnn = nn.GRU(d_model, d_model, num_layers, batch_first=True)
        # self.rnn = nn.LSTM(d_model, d_model, num_layers, batch_first=True)
        self.fc = nn.Linear(d_model, out_dim)

        self.tok_emb = nn.Embedding(in_dim, d_model)
        # for p in self.parameters():
        #     if p.dim() > 1:
        #         nn.init.xavier_uniform_(p)

    def forward(self, x, h=None):
        """Run the recurrent stack over a batch of token-id sequences.

        Args:
            x: integer token ids, ``[batch, seq_len]``.
            h: optional initial state in the format the recurrent module
                expects (a tensor ``[num_layers, batch, d_model]``, or an
                ``(h, c)`` pair of such tensors for ``nn.LSTM``).

        Returns:
            ``(out, h)``: logits ``[batch, seq_len, out_dim]`` and the final state.
        """
        x = self.tok_emb(x)# * math.sqrt(self.d_model)
        # Passing None lets the recurrent module create its own zero state on
        # the right device/dtype, so no global `device` is needed here.
        out, h = self.rnn(x, h) # [batch, seq_len, d_model], final state
        # out = out[:, -1, :] # keep only the last timestep
        out = self.fc(out) # [batch, seq_len, out_dim]
        return out, h


# sequence_length = 28
hidden_size = 128
num_layers = 2
num_classes = vocab_size = train_dataset.vocab_size
# The first RNN argument sizes the token-embedding table, so it has to be the
# vocabulary size. It used to be seq_len, which only worked while
# seq_len >= vocab_size and would raise an index error otherwise.
# NOTE: this changes the embedding shape, so checkpoints saved with the old
# setting will not load into the new model.
input_size = vocab_size

model = RNN(input_size, hidden_size, num_layers, num_classes).to(device)
# print(model)


In [None]:
# @title wandb
# https://docs.wandb.ai/quickstart
!pip install wandb
import wandb
# Credentials must not be stored in the notebook: enter the API key when
# wandb.login() asks for it (presumably wandb can also pick it up from its
# environment configuration — confirm in the wandb docs).
# NOTE(review): an API key used to be pasted in a comment on this line; treat
# that key as compromised and rotate it.
wandb.login()
# Start a run in the "tiny_gpt" project; `config` is free-form metadata
# describing this experiment.
run = wandb.init(
    project="tiny_gpt",
    config={
        "model": "adam 1e-3",
        "optim": "adam",
        # "learning_rate": 5,
    })


In [28]:
# @title train test generate
import torch
# Allow TF32 kernels for matmuls and cuDNN convolutions.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
# Gradient scaler for the mixed-precision loop in strain().
# torch.cuda.amp.GradScaler() is deprecated in favour of torch.amp.GradScaler("cuda");
# fall back to the old spelling on PyTorch builds that predate the new class.
# Enabling it only when CUDA is present avoids the "CUDA is not available" warning
# (the scaler disables itself on CPU either way).
try:
    scaler = torch.amp.GradScaler("cuda", enabled=torch.cuda.is_available())
except AttributeError:
    scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())

def strain(model, dataloader, optimizer, loss_fn, scheduler=None): # train function with automatic mixed precision
    """Train ``model`` for one pass over ``dataloader`` using mixed precision.

    Relies on the module-level ``device`` and ``scaler`` objects.

    Args:
        model: module returning ``(logits, state)`` when called on a batch.
        dataloader: iterable of ``(data, targets)`` batches.
        optimizer: optimizer stepped once per batch through the grad scaler.
        loss_fn: called as ``loss_fn(logits[N, vocab], targets[N])``.
        scheduler: accepted for API compatibility; currently not stepped.

    Returns:
        The mean of the per-batch loss values.
    """
    model.train()
    total_loss = 0
    use_amp = torch.cuda.is_available()  # autocast("cuda") is only meaningful with a GPU
    for data, targets in dataloader:
        data, targets = data.to(device), targets.to(device)
        # torch.autocast replaces the deprecated torch.cuda.amp.autocast().
        with torch.autocast(device_type="cuda", enabled=use_amp):
            # The models in this notebook return (logits, state); calling
            # .reshape on that tuple used to crash here.
            logits, _ = model(data)
            loss = loss_fn(logits.reshape(-1, logits.size(-1)), targets.flatten()) # [batch*seq_len, vocab_size], [batch*seq_len]
        optimizer.zero_grad()
        scaler.scale(loss).backward()
        # torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        scaler.step(optimizer)
        scaler.update()
        # if scheduler is not None: scheduler.step()
        total_loss += loss.item()
        # Best-effort logging: skipped when the wandb cell was never run.
        # NOTE(review): with a mean-reduced loss_fn this divides an already
        # averaged loss by the batch size — confirm that is the intended metric.
        try: wandb.log({"train loss": loss.item()/len(targets)})
        except NameError: pass
    return total_loss / len(dataloader)


from tqdm import tqdm
def train(loader, model, loss_fn, optimizer, epoch=None):
    """Train ``model`` for one pass over ``loader`` and return the mean batch loss.

    Relies on the module-level ``device``.

    Args:
        loader: sized iterable of ``(x, y)`` batches of token ids.
        model: module returning ``(logits, state)`` when called on ``x``.
        loss_fn: called as ``loss_fn(logits[N, vocab], targets[N])``.
        optimizer: optimizer stepped once per batch.
        epoch: zero-based epoch index shown in the progress bar. When omitted,
            a module-level ``epoch`` (as set by the notebook's run loop) is
            used if present, else 0. Previously the function read that global
            unconditionally and raised NameError when called outside the loop.

    Returns:
        The mean of the per-batch loss values.
    """
    if epoch is None:
        epoch = globals().get("epoch", 0)
    model.train()
    total_loss = 0
    pbar = tqdm(enumerate(loader), total = len(loader))
    for it, (x, y) in pbar:
        x, y = x.to(device), y.to(device)
        logits, _ = model(x)
        # loss = loss_fn(logits.reshape(-1, logits.size(-1)), y[:,-1]) # [batch, vocab_size], [batch]
        loss = loss_fn(logits.flatten(end_dim=-2), y.flatten()) # [batch*seq_len, vocab_size], [batch*seq_len]
        optimizer.zero_grad()
        loss.backward()
        # torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.step()
        total_loss+=loss.item()
        # Best-effort logging: skipped when the wandb cell was never run.
        # NOTE(review): with a mean-reduced loss_fn this divides an already
        # averaged loss by the batch size — confirm that is the intended metric.
        try: wandb.log({"train loss": loss.item()/len(y)})
        except NameError: pass
        pbar.set_description(f"epoch {epoch + 1} iter {it}: train loss {loss.item():.5f}.")
    return total_loss / len(loader)

def test(loader, model, loss_fn):
    """Return the mean per-batch loss of ``model`` over ``loader``.

    The model is put in eval mode and its forward pass runs without gradient
    tracking. Batches are moved to the module-level ``device``.
    """
    model.eval()
    running = 0
    for inputs, labels in loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        with torch.no_grad():
            logits, _ = model(inputs)
        # [batch*seq_len, vocab_size] against [batch*seq_len]
        batch_loss = loss_fn(logits.flatten(end_dim=-2), labels.flatten())
        running += batch_loss.item()
    return running / len(loader)


from torch.nn import functional as F
def generate(model, context, max_steps = 64):
    """Sample ``max_steps`` characters after ``context`` without carrying hidden state.

    Each step re-feeds the most recent ``seq_len`` characters to the model and
    samples the next character from the softmax of the last position's logits.
    Uses the module-level ``train_dataset``, ``seq_len`` and ``device``.

    NOTE: a second ``generate`` defined later in this cell rebinds the name, so
    this version is only reachable if that definition is removed or renamed.

    Returns:
        ``context`` followed by the sampled characters, as one string.
    """
    x = torch.tensor([train_dataset.stoi.get(c) for c in context], device=device)
    model.eval()
    with torch.no_grad():  # sampling never needs gradients
        for n in range(max_steps):
            x_bar = x[-seq_len:]  # whole sequence while it is still shorter than seq_len
            output = model(x_bar.unsqueeze(0)) # [1, len(x_bar), vocab_size] (+ state)
            # The models in this notebook return (logits, state); the old code
            # indexed the tuple as if it were the logits tensor.
            if isinstance(output, tuple):
                output = output[0]
            output = output[0, -1, :]  # logits of the last position only
            output = F.softmax(output, dim = -1) # vocab_size to char
            ix = torch.multinomial(output, num_samples = 1) # rand sample by output distribution
            x = torch.cat((x, ix.flatten()))
    completion = ''.join([train_dataset.itos[int(i)] for i in x])
    return completion


import torch
from torch.nn import functional as F
def generate(model, context, max_steps = 64, temperature=1):
    """Sample ``max_steps`` characters after ``context``, carrying the recurrent state.

    The whole context is fed once; afterwards only the newly sampled character
    is fed together with the state returned by the previous call. Uses the
    module-level ``train_dataset`` and ``device``.

    Args:
        model: module called as ``model(ids, state)`` returning ``(logits, state)``.
        context: non-empty prompt made of characters known to ``train_dataset``.
        max_steps: number of characters to sample.
        temperature: divides the logits before the softmax.

    Returns:
        ``context`` followed by the sampled characters, as one string.

    Raises:
        ValueError: if ``context`` is empty or contains characters that are
            not in the training vocabulary.
    """
    if not context:
        raise ValueError("context must contain at least one character")
    unknown = sorted({c for c in context if c not in train_dataset.stoi})
    if unknown:
        raise ValueError(f"context contains characters outside the vocabulary: {unknown}")
    x=ix = torch.tensor([train_dataset.stoi.get(c) for c in context], device=device).unsqueeze(0)
    model.eval()
    hidden=None
    with torch.no_grad():
        for n in range(max_steps):
            # The state is handed back exactly as the model returned it. The
            # old LSTM-only re-slicing was an identity for this batch of one
            # and crashed for models whose state is a single tensor (RNN/GRU).
            output, hidden = model(ix, hidden)
            output = output[:, -1, :] # get logit for last character
            output = output/temperature
            output = F.softmax(output, dim = -1) # vocab_size to char
            ix = torch.multinomial(output, num_samples = 1) # rand sample by output distribution
            x = torch.cat((x, ix),1)
        completion = ''.join([train_dataset.itos[int(i)] for i in x.flatten()])
        return completion

# out=generate(model, "A wi")
# print(out)



  scaler = torch.cuda.amp.GradScaler()


In [None]:
# @title run
# AdamW 1e-4 1e-3
# sgd 1e-3
# Optimizer and loss for the run below; the hyper-parameters are spelled out as keywords.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, betas=(0.9, 0.95)) # lr = 1e-4 #3e-4
# optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) # 5. , 0.001
criterion = nn.CrossEntropyLoss()
# print("lr: ", optimizer.param_groups[0]['lr'])
# optimizer.param_groups[0]['lr']=1e-3

# 30 passes over the training data, printing a short sample after each one.
for epoch in range(30):
    # train_loss = strain(model, train_loader, optimizer, criterion, scheduler=None)
    train_loss = train(loader=train_loader, model=model, loss_fn=criterion, optimizer=optimizer)
    # test_loss = test(test_loader, model, criterion)
    # print('Test Loss:', test_loss)
    print(generate(model, "this is what "))


In [None]:
# @title inference
# Prompt to continue, and the number of characters to sample after it.
context = "this is what "
#context = 'There are many things about horses that have been discovered in recent'
completion = generate(model, context, max_steps=200)
print(completion)
# print(generate(model, "this is what "))


In [None]:
# @title save
# Destination of the checkpoint. Presumably a mounted Google Drive folder —
# confirm the drive is mounted and the directory exists before running.
path = "/content/drive/MyDrive/frame/RNNS.pth"

# Only the parameters (state_dict) are written, not the model object itself.
torch.save(model.state_dict(), path)

# model.load_state_dict(torch.load(path, map_location=device))


In [None]:
# Housekeeping between experiments: ask PyTorch to drop its cached CUDA memory.
torch.cuda.empty_cache()