# **Loading Dataset**

Replace the value of `PATH_TO_HF` with the path to your `new_ds.hf` folder. In my case, I mounted my Google Drive, which contains the path used below.

In [None]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.10.1-py3-none-any.whl (469 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 KB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash
  Downloading xxhash-3.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 KB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0.0,>=0.2.0
  Downloading huggingface_hub-0.13.2-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.2/199.2 KB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess
  Downloading multiprocess-0.70.14-py39-none-any.whl (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.9/132.9 KB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19
  Downlo

In [None]:
from datasets import load_dataset, load_from_disk

# Folder that `load_from_disk` reads the dataset from; adjust to your own mount point.
PATH_TO_HF = "/content/drive/MyDrive/CS4248 NLP/new_ds.hf"
# Number of leading examples kept for quick experiments.
SUBSET_SIZE = 2000

dataset = load_from_disk(PATH_TO_HF)
# After flattening, the translation pair is addressed through the columns
# "translation.en" / "translation.zh"; rename them to the names used by the
# rest of the notebook.
dataset = dataset.flatten()
dataset = dataset.rename_column("translation.en", "input")
dataset = dataset.rename_column("translation.zh", "target")
# Cap the subset at the dataset size so `select` does not fail on datasets
# that hold fewer than SUBSET_SIZE rows.
small_dataset = dataset.select(range(min(SUBSET_SIZE, len(dataset))))

FileNotFoundError: ignored

# Basic Preprocessing

Since the preprocessing had not been finalised when this RNN model was created, for simplicity's sake I follow the guide in [Language Modeling with LSTMs in PyTorch](https://towardsdatascience.com/language-modeling-with-lstms-in-pytorch-381a26badcbf).

In [None]:
!pip install spacy
!pip install torchtext
!python -m spacy download en_core_web_sm
!python -m spacy download zh_core_web_sm

In [None]:
import spacy
import torchtext
import torch

# spaCy pipelines for English and Chinese; tokenize_en / tokenize_zh below
# call them to split raw text into tokens.
en_tokenizer = spacy.load("en_core_web_sm")
zh_tokenizer = spacy.load("zh_core_web_sm")

def tokenize_en(data):
  """Tokenise the English text of one example.

  The string stored under ``data["input"]`` is run through ``en_tokenizer``
  and replaced, in place, by the list of lower-cased token texts.

  Args:
    data: mapping holding the raw English text under the key "input".

  Returns:
    The same mapping, with "input" now holding the token list.
  """
  doc = en_tokenizer(data["input"])
  lowered_tokens = [token.text.lower() for token in doc]
  data["input"] = lowered_tokens
  return data
def tokenize_zh(data):
  """Tokenise the Chinese text of one example.

  The string stored under ``data["target"]`` is run through ``zh_tokenizer``
  and replaced, in place, by the list of token texts (no case folding).

  Args:
    data: mapping holding the raw Chinese text under the key "target".

  Returns:
    The same mapping, with "target" now holding the token list.
  """
  doc = zh_tokenizer(data["target"])
  token_texts = [token.text for token in doc]
  data["target"] = token_texts
  return data
def to_vector(data, vocab, column_name, max_length):
  """Convert a token list into vocabulary indices, right-padded with "<pad>".

  Args:
    data: mapping whose ``column_name`` entry is a list of tokens.
    vocab: object offering ``lookup_indices(tokens)`` and ``vocab[token]``.
    column_name: key of the column to convert.
    max_length: length to pad up to. Sequences that are already at least
      this long are converted but neither padded nor truncated.

  Returns:
    The same mapping, with ``column_name`` replaced by the index list.
  """
  tokens = data[column_name]
  indices = vocab.lookup_indices(tokens)
  missing = max_length - len(tokens)
  if missing > 0:
    # Fill the tail with the index of the padding token.
    indices = indices + [vocab["<pad>"]] * missing
  data[column_name] = indices
  return data

def build_vocab(data, in_vocab_size=None, out_vocab_size=None, num_proc=4):
  """Tokenise both language columns and build one vocabulary per language.

  Args:
    data: dataset exposing ``map`` with raw text in the "input" (English)
      and "target" (Chinese) columns.
    in_vocab_size: optional cap on the English vocabulary size, passed to
      torchtext as ``max_tokens`` (the special tokens count towards it).
    out_vocab_size: same cap for the Chinese vocabulary.
    num_proc: number of worker processes used by ``data.map``.

  Returns:
    Tuple ``(tokenized_data, en_vocab, zh_vocab)``.
  """
  # "<unk>" is appended AFTER the existing specials so that "<bos>", "<eos>"
  # and "<pad>" keep the indices 0, 1 and 2 that other code relies on.
  specials = ["<bos>", "<eos>", "<pad>", "<unk>"]

  def _make_vocab(token_lists, max_tokens):
    # Build one vocabulary and route unknown tokens to "<unk>".
    vocab = torchtext.vocab.build_vocab_from_iterator(token_lists,
                                                      max_tokens=max_tokens,
                                                      specials=specials,
                                                      special_first=True)
    # A default index of 0 would silently turn every out-of-vocabulary
    # token into "<bos>"; use the dedicated unknown token instead.
    vocab.set_default_index(vocab["<unk>"])
    return vocab

  tokenized_data = data.map(tokenize_en, num_proc=num_proc)
  tokenized_data = tokenized_data.map(tokenize_zh, num_proc=num_proc)
  en_vocab = _make_vocab(tokenized_data["input"], in_vocab_size)
  zh_vocab = _make_vocab(tokenized_data["target"], out_vocab_size)

  return tokenized_data, en_vocab, zh_vocab

small_dataset, en_vocab, zh_vocab = build_vocab(small_dataset)

# Pad each column to its longest tokenised sequence. Using len(vocab) as the
# pad length would make every example as long as the vocabulary, so the LSTMs
# would unroll over thousands of "<pad>" steps.
max_input_len = max((len(tokens) for tokens in small_dataset["input"]), default=0)
max_target_len = max((len(tokens) for tokens in small_dataset["target"]), default=0)

# NOTE(review): no "<bos>"/"<eos>" tokens are inserted here although both
# vocabularies reserve them, and EncoderDecoder.forward feeds position 0 of
# the target as the first decoder input - confirm whether that is intended.
small_dataset = small_dataset.map(lambda e: to_vector(e, en_vocab, "input", max_input_len))
small_dataset = small_dataset.map(lambda e: to_vector(e, zh_vocab, "target", max_target_len))


# RNN Model Implementation

Referenced from: https://colab.research.google.com/github/bentrevett/pytorch-seq2seq/blob/master/1%20-%20Sequence%20to%20Sequence%20Learning%20with%20Neural%20Networks.ipynb#scrollTo=Z_sJahjXaDFx

In [None]:
import random
class Encoder(torch.nn.Module):
  """LSTM encoder that turns a sequence of token indices into final LSTM states.

  Args:
    input_size: number of rows in the embedding table (source vocabulary size).
    embedding_size: dimensionality of the token embeddings.
    hidden_size: size of the LSTM hidden and cell states.
    num_layers: number of stacked LSTM layers.
    dropout: dropout probability, applied to the embeddings and passed to the LSTM.
  """

  def __init__(self, input_size, embedding_size, hidden_size, num_layers, dropout):
    super().__init__()
    # Hyper-parameters are kept as plain attributes for later inspection.
    self.hidden_size, self.embedding_size, self.num_layers = hidden_size, embedding_size, num_layers

    # The attribute names below become the keys of the saved state dict.
    self.embedding = torch.nn.Embedding(num_embeddings=input_size, embedding_dim=embedding_size)
    self.LSTM = torch.nn.LSTM(input_size=embedding_size,
                              hidden_size=hidden_size,
                              num_layers=num_layers,
                              dropout=dropout)
    self.dropout = torch.nn.Dropout(p=dropout)

  def forward(self, input_seq):
    """Encode ``input_seq`` and return the final ``(hidden, cell)`` states.

    The LSTM is built without ``batch_first``, so ``input_seq`` is expected
    in PyTorch's default sequence-first layout. The per-step LSTM outputs
    are discarded.
    """
    embedded = self.dropout(self.embedding(input_seq))
    _, final_state = self.LSTM(embedded)
    hidden, cell = final_state
    return hidden, cell
  
class Decoder(torch.nn.Module):
  """Single-step LSTM decoder that scores the next target token.

  Args:
    output_size: target vocabulary size (embedding rows and number of logits).
    embedding_size: dimensionality of the token embeddings.
    hidden_size: size of the LSTM hidden and cell states.
    num_layers: number of stacked LSTM layers.
    dropout: dropout probability, applied to the embeddings and passed to the LSTM.
  """

  def __init__(self, output_size, embedding_size, hidden_size, num_layers, dropout):
    super().__init__()

    # Hyper-parameters; ``output_size`` is read by EncoderDecoder.forward.
    self.hidden_size, self.embedding_size = hidden_size, embedding_size
    self.num_layers, self.output_size = num_layers, output_size

    # The attribute names below become the keys of the saved state dict.
    self.embedding = torch.nn.Embedding(num_embeddings=output_size, embedding_dim=embedding_size)
    self.LSTM = torch.nn.LSTM(input_size=embedding_size,
                              hidden_size=hidden_size,
                              num_layers=num_layers,
                              dropout=dropout)
    self.out = torch.nn.Linear(in_features=hidden_size, out_features=output_size)
    self.dropout = torch.nn.Dropout(p=dropout)

  def forward(self, input, hidden, cell):
    """Run one decoding step.

    Args:
      input: token indices for the current step, one per batch element.
      hidden: LSTM hidden state from the previous step (or the encoder).
      cell: LSTM cell state from the previous step (or the encoder).

    Returns:
      Tuple ``(logits, hidden, cell)`` with the vocabulary scores for this
      step and the updated LSTM states.
    """
    # Add a leading axis of length one so the LSTM sees a one-step sequence.
    step_tokens = torch.unsqueeze(input, 0)
    embedded = self.dropout(self.embedding(step_tokens))
    lstm_out, (hidden, cell) = self.LSTM(embedded, (hidden, cell))

    # Remove the length-one leading axis again before projecting to logits.
    logits = self.out(torch.squeeze(lstm_out, 0))
    return logits, hidden, cell

class EncoderDecoder(torch.nn.Module):
  """Sequence-to-sequence wrapper that drives a decoder from an encoder's final states.

  Args:
    encoder: callable returning ``(hidden, cell)`` for a source batch.
    decoder: callable ``(tokens, hidden, cell) -> (logits, hidden, cell)``
      that also exposes ``output_size``.
    device: device on which the prediction tensor is placed.
  """

  def __init__(self, encoder, decoder, device):
    super().__init__()

    self.encoder = encoder
    self.decoder = decoder
    self.device = device

  def forward(self, input, target_output, teacher_forcing_ratio=0.5):
    """Decode step by step and collect the logits of every step.

    Args:
      input: source batch handed to the encoder.
      target_output: target token indices; axis 0 is the time step and
        axis 1 the batch element.
      teacher_forcing_ratio: probability, per step, of feeding the ground
        truth token instead of the model's own prediction as next input.

    Returns:
      Tensor of shape ``(target_len, batch_size, decoder.output_size)``.
      Row 0 is never written and stays all zeros.
    """
    target_len, batch_size = target_output.shape[0], target_output.shape[1]

    predictions = torch.zeros(target_len, batch_size, self.decoder.output_size).to(self.device)
    hidden, cell = self.encoder(input)

    # The first target row seeds the decoder.
    decoder_input = target_output[0, :]

    for step in range(1, target_len):
      step_logits, hidden, cell = self.decoder(decoder_input, hidden, cell)
      predictions[step] = step_logits
      # One random draw per step decides between teacher forcing and
      # feeding back the highest-scoring token.
      if random.random() < teacher_forcing_ratio:
        decoder_input = target_output[step]
      else:
        decoder_input = step_logits.argmax(1)
    return predictions

# Training

In [None]:
import time
import math

def train(model, iterator, optimizer, criterion, clip):
    """Run one training epoch and return the mean loss per batch.

    Args:
        model: seq2seq module called as ``model(src, trg)``.
        iterator: sized iterable of batches, each a mapping with "input"
            and "target" tensors.
        optimizer: optimizer stepping the model's parameters.
        criterion: loss called as ``criterion(logits, targets)``.
        clip: maximum gradient norm used for clipping.

    Returns:
        Sum of the batch losses divided by ``len(iterator)``.
    """
    model.train()
    running_loss = 0
    for batch in iterator:
        # Swap the first two axes of both tensors before calling the model.
        source = batch["input"].transpose(0, 1)
        target = batch["target"].transpose(0, 1)

        optimizer.zero_grad()
        logits = model(source, target)

        # The first row is skipped on both sides; the remaining rows are
        # flattened so the criterion sees one prediction per row.
        vocab_size = logits.shape[-1]
        flat_logits = logits[1:].reshape(-1, vocab_size)
        flat_target = target[1:].reshape(-1)

        loss = criterion(flat_logits, flat_target)
        loss.backward()

        # Clip the gradient norm before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += loss.item()

    return running_loss / len(iterator)

def evaluate(model, iterator, criterion):
    """Compute the mean loss per batch without updating the model.

    Args:
        model: seq2seq module called as ``model(src, trg, 0)``; the third
            argument switches teacher forcing off.
        iterator: sized iterable of batches, each a mapping with "input"
            and "target" tensors.
        criterion: loss called as ``criterion(logits, targets)``.

    Returns:
        Sum of the batch losses divided by ``len(iterator)``.
    """
    model.eval()
    running_loss = 0
    with torch.no_grad():
        for batch in iterator:
            # Swap the first two axes of both tensors before calling the model.
            source = batch["input"].transpose(0, 1)
            target = batch["target"].transpose(0, 1)

            logits = model(source, target, 0)

            # The first row is skipped on both sides; the remaining rows
            # are flattened so the criterion sees one prediction per row.
            vocab_size = logits.shape[-1]
            flat_logits = logits[1:].reshape(-1, vocab_size)
            flat_target = target[1:].reshape(-1)

            running_loss += criterion(flat_logits, flat_target).item()

    return running_loss / len(iterator)

def epoch_time(start_time, end_time):
    """Split the duration between two timestamps into minutes and seconds.

    Args:
        start_time: start timestamp in seconds.
        end_time: end timestamp in seconds.

    Returns:
        Tuple ``(minutes, seconds)`` of ints, both truncated towards zero.
    """
    duration = end_time - start_time
    minutes = int(duration / 60)
    seconds = int(duration - 60 * minutes)
    return minutes, seconds

def init_weights(m):
    """Overwrite every parameter of ``m`` in place with draws from U(-0.08, 0.08).

    Args:
        m: module whose parameters (including those of its sub-modules)
            are re-initialised.
    """
    low, high = -0.08, 0.08
    for param in m.parameters():
        torch.nn.init.uniform_(param.data, low, high)
    
# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

BATCH_SIZE = 10
# Checkpoint file for the weights with the lowest validation loss so far.
MODEL_PATH = '/content/drive/MyDrive/CS4248 NLP/rnn-model.pt'

# Expose the two index columns as torch tensors on `device`, then hold out
# 20% of the examples for validation.
pt_dataset = small_dataset.with_format("torch", columns=["input", "target"], device=device)
pt_dataset = pt_dataset.train_test_split(test_size=0.2)
train_dataloader = torch.utils.data.DataLoader(pt_dataset["train"], batch_size=BATCH_SIZE)
valid_dataloader = torch.utils.data.DataLoader(pt_dataset["test"], batch_size=BATCH_SIZE)

#parameters
INPUT_DIM = len(en_vocab)
OUTPUT_DIM = len(zh_vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)
model = EncoderDecoder(enc, dec, device).to(device)
model.apply(init_weights)
optimizer = torch.optim.Adam(model.parameters())
# Padding positions must not contribute to the loss. Look the index up in the
# target vocabulary instead of hard-coding it, so it stays correct if the
# order of the special tokens ever changes.
criterion = torch.nn.CrossEntropyLoss(ignore_index=zh_vocab["<pad>"])

best_valid_loss = float('inf')
N_EPOCHS = 10
CLIP = 1
for epoch in range(N_EPOCHS):

    start_time = time.time()

    # The data loaders are iterable and sized themselves, so they can be
    # passed directly without wrapping them in iter().
    train_loss = train(model, train_dataloader, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_dataloader, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Keep only the best checkpoint, judged by validation loss.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), MODEL_PATH)

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')