#### setup

In [1]:
import torch

torch.random.manual_seed(42)

cuda_available = torch.cuda.is_available()

if cuda_available:
    dev_count = torch.cuda.device_count()
    dev_current = torch.cuda.current_device()
    dev_name = torch.cuda.get_device_name(dev_current)
    print(f'Device count: {dev_count}')
    print(f'Current device: {dev_current}')
    print(f'Device name: {dev_name}')

Device count: 1
Current device: 0
Device name: NVIDIA GeForce RTX 3060


#### data path

In [2]:
import os

data_path = 'D:\\data-science\\wikipedia\\tokens'
all_files = os.listdir(data_path)
file_path = os.path.join(data_path, all_files[0])

data = torch.load(file_path)
data.shape

torch.Size([157528, 256])

In [3]:
# flatten data
data = data.view(-1)

# remove padding
data = data[data != 0]

data.shape

torch.Size([28096187])

In [16]:
tokenizer.decode(data[:100])

'[CLS] " the angel of 8th ave. " ( stylised in all lowercase ) is a song by australian alternative rock band gang of youths, released on 15 june 2021 as the lead single from their second ep, total serene ( 2021 ). the track also features on the band \' s third studio album, angel in realtime ( 2022 ). frontman david le \' aupepe said the song was inspired by " falling in love and finding a new life in a new city together.'

In [4]:
from transformers import AutoTokenizer, BertTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Functions
#### 1) labeled_sequences

In [5]:
from torch.utils.data import TensorDataset, DataLoader
import numpy as np

def labeled_sequences(words, sequence_length, batch_size):

    n_batches = len(words) // batch_size
    x, y = [], []
    words = words[:n_batches * batch_size]

    for i in range(0, len(words)-sequence_length):
        i_end = i + sequence_length

        # all words excluding last one are X
        batch_x = words[i:i_end]
        x.append(batch_x)

        # last word in sequence is y / label
        batch_y = words[i_end]
        y.append(batch_y)

    dataset = TensorDataset(torch.from_numpy(np.asarray(x)), torch.from_numpy(np.asarray(y)))
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False, drop_last=True)

    return data_loader

#### RNN()

In [7]:
from torch import nn

class RNN(nn.Module):
    def __init__(self, vocab_size, output_size, emb_dim, hidden_dim, n_layers, dropout=0.5):
        super(RNN, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hidden_dim, n_layers, dropout=dropout, batch_first=True)
        
        self.vocab_size = vocab_size
        self.output_size = output_size
        self.emb_dim = emb_dim
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        
        self.fc = nn.Linear(hidden_dim, output_size)
        
    def forward(self, x, hidden):
        batch_size = x.size(0)
        x = x.to(torch.int64)
        
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
        
        # stack layers
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        
        # output
        out = self.fc(lstm_out)
        out = out.view(batch_size, -1, self.output_size)
        out = out[:, -1]
        
        return out, hidden
    
    def init_hidden(self, batch_size):
        weights = next(self.parameters()).data
        hidden = ((weights.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda()),
                  (weights.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda()))
        
        return hidden

#### forward_back_prop()

In [8]:
def forward_back_prop(rnn, optimizer, criterion, inp, target, hidden):
    # creating variables for hidden state to prevent back-propagation
    # of historical states
    h = tuple([each.data for each in hidden])

    rnn.zero_grad()
    # move inputs, targets to GPU 
    inputs, targets = inp.cuda(), target.cuda()

    output, h = rnn(inputs, h)

    loss = criterion(output, targets)

    # perform backpropagation and optimization
    loss.backward()
    nn.utils.clip_grad_norm_(rnn.parameters(), 5)
    optimizer.step()

    # return the loss over a batch and the hidden state produced by our model
    return loss.item(), h

#### train_rnn()

In [9]:
from time import perf_counter
import numpy as np

def train_rnn(rnn, batch_size, optimizer, criterion, n_epochs, show_every_n_batches=100):
    
    batch_losses = []
    rnn.train()

    for epoch_i in range(1, n_epochs + 1):
        print(f"Epoch {epoch_i} / {n_epochs}\n")
        
        hidden = rnn.init_hidden(batch_size)

        start = perf_counter()
        for batch_i, (inputs, labels) in enumerate(train_loader, 1):
            
            # moving inputs and labels to gpu
            inputs, labels = inputs.cuda(), labels.cuda()

            # forward and backward propagation
            loss, hidden = forward_back_prop(rnn, optimizer, criterion, inputs, labels, hidden)
            
            # loss
            batch_losses.append(loss)
            
            # monitor
            if batch_i % show_every_n_batches == 0:
                end = perf_counter()
                time = np.round(end - start, 4)
                print(f'Batch: {batch_i}  Loss: {np.average(batch_losses)}  Time: {time}s\n')
                batch_losses = []
                start = perf_counter()

    # Return the trained RNN
    return rnn

#### parameters

In [10]:
num_epochs = 10
learning_rate = 0.001

vocab_size = len(tokenizer)
output_size = vocab_size
embedding_dim = 200
hidden_dim = 250
n_layers = 2
batch_size = 64

show_every_n_batches = 1_000

In [None]:
train_loader = labeled_sequences(words=data[:len(data) // 8], sequence_length=16, batch_size=batch_size)

In [11]:
rnn = RNN(vocab_size, output_size, embedding_dim, hidden_dim, n_layers, dropout=0.5)
optimizer = torch.optim.Adam(rnn.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()
rnn.cuda()

RNN(
  (embedding): Embedding(30522, 200)
  (lstm): LSTM(200, 250, num_layers=2, batch_first=True, dropout=0.5)
  (fc): Linear(in_features=250, out_features=30522, bias=True)
)

In [None]:
trained_rnn = train_rnn(rnn, batch_size, optimizer, criterion, num_epochs, show_every_n_batches)

model_path = os.path.join('./torch-model-v0.2.pt')
torch.save(trained_rnn, model_path)

In [12]:
model_rnn = torch.load('torch-model-v0.2.pt', weights_only=False)

In [61]:
tokenizer = AutoTokenizer.from_pretrained('gpt2')

In [86]:
# create text generation base on the model
model_rnn.eval()
model_rnn.cuda()

hidden = model_rnn.init_hidden(1)

input_text = 'test'
generated_text = input_text.split()

for _ in range(20):
    with torch.no_grad():
        input_tokens = torch.tensor(tokenizer.encode(input_text, return_tensors='pt')).cuda()
        output, hidden = model_rnn(input_tokens, hidden)
        predicted_idx = torch.argmax(output[-1]).item()
        
        predicted_word = tokenizer.decode(predicted_idx, ignore_special_tokens=False)
        generated_text.append(predicted_word)
        # print(generated_text)
        
generated_text = ' '.join(generated_text)
print(generated_text)

test  Cl  experience attle attle attle attle attle attle attle attle attle attle attle attle attle attle attle attle attle attle


  input_tokens = torch.tensor(tokenizer.encode(input_text, return_tensors='pt')).cuda()
