[Functions](#functions)

#### setup

In [56]:
import numpy as np
import torch
from transformers.commands import train

torch.random.manual_seed(42)

cuda_available = torch.cuda.is_available()
device = torch.device('cuda' if cuda_available else 'cpu')

if cuda_available:
    dev_count = torch.cuda.device_count()
    dev_current = torch.cuda.current_device()
    dev_name = torch.cuda.get_device_name(dev_current)
    print(f'Device count: {dev_count}')
    print(f'Current device: {dev_current}')
    print(f'Device name: {dev_name}')

Device count: 1
Current device: 0
Device name: NVIDIA GeForce RTX 3060


#### data

In [57]:
import polars as pl

data_path = 'D:\\data-science\\wikipedia\\parq\\train-00040-of-00041.parquet'

df = pl.read_parquet(data_path)

In [58]:
df = df.with_columns(
    pl.col('text')
    .str.replace_all(r'\[[^]]*\]', '')
    .str.replace_all(r'\([^]]*\)', '')
    # .str.replace_all(r'\"[^]]*\"', '')
    # .str.replace_all(r'[\.\,]', '')
    .str.replace_all(r'\n+', ' ')
    .str.replace_all(r' +', ' ')
)

df

id,url,title,text
str,str,str,str
"""67963775""","""https://en.wikipedia.org/wiki/…","""The Angel of 8th Ave.""","""""The Angel of 8th Ave"" as thei…"
"""67963778""","""https://en.wikipedia.org/wiki/…","""Hurricane Municipal Airport""","""Hurricane Municipal Airport al…"
"""67963783""","""https://en.wikipedia.org/wiki/…","""Satin berrypecker""","""The satin berrypecker is a spe…"
"""67963841""","""https://en.wikipedia.org/wiki/…","""Cassinia complanata""","""Cassinia complanata commonly k…"
"""67963851""","""https://en.wikipedia.org/wiki/…","""Monoporella""","""Monoporella is a genus of bryo…"
…,…,…,…
"""70201819""","""https://en.wikipedia.org/wiki/…","""Bianca Fernandez""","""Bianca Jolie Fernandez is a Ca…"
"""70201882""","""https://en.wikipedia.org/wiki/…","""Condons and Clangibbon""","""Condons and Clangibbon ""the wh…"
"""70201886""","""https://en.wikipedia.org/wiki/…","""2022 Chattanooga Red Wolves SC…","""The 2022 Chattanooga Red Wolve…"
"""70201947""","""https://en.wikipedia.org/wiki/…","""Nkiko Prosper""","""Turatsinze Nkiko Prosper profe…"


#### tokens

In [59]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

tokenizer.special_tokens_map

{'unk_token': '[UNK]',
 'sep_token': '[SEP]',
 'pad_token': '[PAD]',
 'cls_token': '[CLS]',
 'mask_token': '[MASK]'}

In [60]:
df_tokens = df.with_columns(
    pl.col('text').map_elements(
        lambda x: tokenizer.encode(x, add_special_tokens=False, truncation=True, padding=False, max_length=512), return_dtype=pl.List(pl.Int64)
    )
)

In [61]:
list_tokens = df_tokens['text'].to_list()

In [62]:
len(list_tokens)

157528

In [63]:
from torch.utils.data import DataLoader, TensorDataset, ConcatDataset
import numpy as np

def label_dataset(sequence, seq_len, batch_size) -> 'TensorDataset':

    # n_batches = len(sequence) // batch_size
    x, y = [], []

    for i in range(0, len(sequence) - seq_len):
        split = sequence[i:i+seq_len+1]
        
        x.append(split[:seq_len])
        y.append(split[-1])
            
    temp_data = TensorDataset(torch.as_tensor(x),
                              torch.as_tensor(y))
    
    return temp_data
    # return DataLoader(temp_data, batch_size=batch_size, shuffle=False, drop_last=True)

In [64]:
list_datasets = []

for sequence in list_tokens:
    list_datasets.append(label_dataset(sequence, seq_len=16, batch_size=64))

list_datasets = ConcatDataset(list_datasets)
train_loader = DataLoader(list_datasets, batch_size=64, shuffle=True, drop_last=True)

In [65]:
len(train_loader)

348026

In [66]:
i = 0
for batch in train_loader:
    x, y = batch
    print(len(x), y)
    break

64 tensor([ 2069,  2099,  1005,  2002,  1024,  7454,  2096,  9815,  1015, 18944,
         2015,  3900,  1996,  2185,  5182,  2009,  1997,  2233,  2095,  2060,
         7076,  3455,  2981, 24267,  2822,  2365,  3777,  1998,  5608,  2001,
         1997,  3530,  1996,  3813,  1996,  4799,  2011,  2013,  1996,  1055,
         2590,  2007,  1055,  6006,  2004,  5546,  2233,  5485,  8529,  1998,
         1998,  1996,  1017, 16798,  5307,  4733,  5279, 12350, 11638,  2099,
         1057,  2569, 13970, 10812])


<a id='functions'></a>
# functions
#### RNN

In [67]:
from torch import nn

class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers):
        super(RNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)
        
    def forward(self, x, hidden):
        x = self.embedding(x)
        output, hidden = self.lstm(x, hidden)
        output = self.fc(output)
        return output, hidden
    
    def init_hidden(self, batch_size, hidden_dim, num_layers, device):
        return (torch.zeros(num_layers, batch_size, hidden_dim).to(device),
                torch.zeros(num_layers, batch_size, hidden_dim).to(device))

#### train

In [None]:
from time import perf_counter

def train_model(model, data_loader, criterion, optimizer, n_epochs, n_layers, device):
    model.to(device)
    model.train()
    
    for epoch in range(n_epochs):
        total_loss = 0
        hidden = model.init_hidden(batch_size, hidden_dim, n_layers, device)
        
        for inputs, targets in data_loader:
            inputs, target = inputs.to(device), targets.to(device)
            
            optimizer.zero_grad()
            
            outputs, hidden = model(inputs, hidden)
            hidden = (hidden[0].detach(), hidden[1].detach())
            
            loss = criterion(outputs.view(-1, vocab_size), targets.view(-1))
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            
        avg_loss = total_loss / len(data_loader)
        print(f'Epoch {epoch + 1}/{n_epochs} | Loss: {avg_loss:.4f}')

In [None]:
vocab_size = len(tokenizer)
embed_dim = 128
hidden_dim = 256

num_layers = 2
learning_rate = 0.001
batch_size = 64
n_epochs = 10


rnn = RNN(vocab_size,
          embed_dim=embed_dim,
          hidden_dim=hidden_dim,
          num_layers=num_layers).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(rnn.parameters(), lr=learning_rate)

torch.save(rnn, 'torch-model-v0.3.pt')

In [None]:
train_model(rnn,
            data_loader=train_loader,
            criterion=criterion,
            optimizer=optimizer,
            n_epochs=n_epochs,
            n_layers=num_layers,
            device=device)