In [1]:
# https://github.com/lucidrains/routing-transformer
%pip install routing_transformer
%pip install transformers

Collecting routing_transformer
  Downloading routing_transformer-1.6.1-py3-none-any.whl (16 kB)
Collecting local-attention>=1.4.0
  Downloading local_attention-1.4.3-py3-none-any.whl (5.0 kB)
Collecting einops
  Downloading einops-0.4.1-py3-none-any.whl (28 kB)
Collecting mixture-of-experts>=0.2.0
  Downloading mixture_of_experts-0.2.1-py3-none-any.whl (6.0 kB)
Collecting product-key-memory
  Downloading product_key_memory-0.1.10.tar.gz (3.5 kB)
Building wheels for collected packages: product-key-memory
  Building wheel for product-key-memory (setup.py) ... [?25l[?25hdone
  Created wheel for product-key-memory: filename=product_key_memory-0.1.10-py3-none-any.whl size=3072 sha256=05fafd08aa7be98298fb620c6a4af50689377804df95e58ff9411dd1fe109eb1
  Stored in directory: /root/.cache/pip/wheels/43/78/51/06648579a50c8e83f24ebfbdfd66462d1b88315a3491deba86
Successfully built product-key-memory
Installing collected packages: product-key-memory, mixture-of-experts, local-attention, einops, rout

In [2]:
import torch
import torch.optim as optim
import time
import json
from tqdm import tqdm
from routing_transformer import RoutingTransformerEncDec
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from transformers import RobertaTokenizer
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split

In [3]:
# CUDA Enabled GPU required, make sure Google Colab runtime is set to GPU.
gpu_info = !nvidia-smi
gpu = torch.cuda.is_available()
gpu_info = '\n'.join(gpu_info)
if gpu_info.find("failed") >= 0 and not gpu:
  print("Not connected to a GPU, Change Runtime type to GPU")
else:
  print(gpu_info)

Sat May  7 09:56:45 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0    25W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
# Mount Google Drive for running on Colab
# NOTE(review): force_remount=True is presumably there so that re-running this
# cell refreshes an existing mount - confirm against the google.colab docs.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Base directory on the mounted drive. Later cells append relative locations to
# this string to reach the dataset, the evaluation outputs and the checkpoint.
path = "/content/drive/My Drive/Senior Design/"

Mounted at /content/drive


In [5]:
# Initialize tokenizer.
# This single tokenizer instance is used both to encode queries/answers in
# QADataset and to decode token ids back to text in generate().
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

In [6]:
# Load the QA dataset: a JSON object keyed by sample id, where each entry holds
# a question, an answer and a list of retrieved passages.
data_path = path + "data/datascience_qa.json"

with open(data_path, "r", encoding='utf-8') as f:
    qa_data = json.load(f)

# Rows of [sample id, query, answer] that feed the train/validation split.
qa_data_arr = []

for sample_id, entry in qa_data.items():
  question_part = entry["question:"] + "---"
  raw_answer = entry["answer:"]

  # Concat title + body of every retrieval, then reverse their order (hurdles paper).
  passages = [ret[0] + "---" + ret[1] for ret in entry["retrievals:"]]
  passages.reverse()

  # Retrievals come first, each followed by a space; the question is
  # concatenated to the end of the retrievals (hurdles paper).
  query = ("".join(passage + " " for passage in passages) + question_part).strip()
  answer_text = str(raw_answer).strip()

  # Skip samples whose answer stringifies to "nan"
  # (presumably missing answers in the exported data - TODO confirm).
  if answer_text != "nan":
    qa_data_arr.append([sample_id, query, answer_text])

In [7]:
# Hold out 1% of the samples as the validation set. The fixed random_state keeps
# the shuffled split the same from run to run.
train_data, valid_data = train_test_split(
    qa_data_arr,
    train_size=0.99,
    random_state=47,
    shuffle=True,
)

len_valid_data = len(valid_data)
print(f"Samples in validation set: {len_valid_data}")

Samples in validation set: 138


In [9]:
# Ids of the validation samples, in the same order as valid_data.
# generate() indexes into this list to pair each generation with its sample id.
valid_i = [qa[0] for qa in valid_data]

# Persist the ids, one per line. The encoding is explicit so the file is written
# the same way as the other files this notebook reads and writes (UTF-8).
with open(path + "eval/valid_data_indexes.txt", "w", encoding="utf-8") as f:
  f.writelines(sample_id + "\n" for sample_id in valid_i)

len(valid_i)

138

In [10]:
# Constants / Model Parameters
BATCH_SIZE = 1        # 128 Mini batch size in (hurdles paper)
LEARNING_RATE = 5e-5  # (hurdles paper)
NUM_TOKENS = 65536    # original note: "~32k, possibly use 65536"
ENC_SEQ_LEN = 4096    # 8192 Max tokens in tokenized sequence. (hurdles paper)
DEC_SEQ_LEN = 2048    # Length answers are padded/truncated to, and the generation length limit
DROPOUT = .15         # (hurdles paper)
WINDOW_SIZE = 256     # 512 (RT paper)
HEADS = 8             # (RT paper)
LAYERS = 18           # 22 (RT paper)

# Disabled: these depend on the size of the training set.
#NUM_BATCHES = len_train_data//BATCH_SIZE
#EVALUATE_EVERY  = NUM_BATCHES//4

# Shape (1, 1) tensor holding token id 0, on the GPU; generate() passes it to
# model.generate as the initial decoder input.
start_token = torch.zeros((1, 1), dtype=torch.long).cuda()

In [11]:
# Each row of valid_data is [sample id, query, answer]; pull the two text
# columns out into parallel NumPy arrays.
valid_inp_data = np.array([row[1] for row in valid_data])  # Query
valid_tgt_data = np.array([row[2] for row in valid_data])  # Answer

In [12]:
# Class definition for PyTorch Dataset
# https://pytorch.org/tutorials/beginner/basics/data_tutorial.html
class QADataset(Dataset):
    """Parallel collections of query and answer strings, tokenized on access.

    Tokenization uses the module-level `tokenizer`; queries are padded or
    truncated to ENC_SEQ_LEN tokens and answers to DEC_SEQ_LEN tokens. Every
    returned tensor is moved to the GPU.
    """

    def __init__(self, input_data, target_data):
        """Store the queries (`input_data`) and their answers (`target_data`)."""
        super().__init__()
        self.input_data = input_data
        self.target_data = target_data

    def __len__(self):
        """Number of samples, taken from the query collection."""
        return len(self.input_data)

    @staticmethod
    def _encode(text, max_len):
        """Tokenize `text` to exactly `max_len` ids.

        Returns a pair (ids, mask): a long tensor of token ids and a bool
        tensor built from the tokenizer's attention mask, both on the GPU.
        """
        encoded = tokenizer.encode_plus(
            text,
            None,
            max_length=max_len,
            padding="max_length",
            truncation=True)
        ids = torch.tensor(encoded['input_ids']).long().cuda()
        mask = torch.tensor(encoded['attention_mask']).bool().cuda()
        return ids, mask

    def __getitem__(self, index):
        """Return (query ids, query mask, answer ids, answer mask) for one sample."""
        # Encode Queries
        inp, inp_mask = self._encode(self.input_data[index], ENC_SEQ_LEN)
        # Encode Answers
        tgt, tgt_mask = self._encode(self.target_data[index], DEC_SEQ_LEN)
        return inp, inp_mask, tgt, tgt_mask

In [13]:
# Wrap the validation queries/answers for batched access.
# shuffle=False keeps the batches in valid_data order, which generate() relies
# on when it looks up each sample's id by position in valid_i.
valid_dataset = QADataset(valid_inp_data, valid_tgt_data)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [14]:
# Routing transformer w/ encoder decoder stack.
# These hyperparameters must describe the same architecture as the checkpoint
# loaded later with load_state_dict, otherwise the state-dict keys/shapes will
# not line up.
model = RoutingTransformerEncDec(
    dim=512,
    enc_num_tokens=NUM_TOKENS,
    enc_depth=LAYERS // 2, # Half to enc half to dec
    enc_heads=HEADS,
    enc_max_seq_len=ENC_SEQ_LEN,
    enc_window_size=WINDOW_SIZE,
    dec_num_tokens=NUM_TOKENS,
    dec_depth=LAYERS // 2,
    dec_heads=HEADS,
    dec_max_seq_len=DEC_SEQ_LEN,
    dec_window_size=WINDOW_SIZE,
    reversible=True,
    shift_tokens=True,
    attn_dropout=DROPOUT,
    ff_dropout=DROPOUT,
    layer_dropout=DROPOUT,
    causal=True # Auto-regressive
).cuda()

# NOTE(review): this rebinds the name `optim`, which the imports cell bound to
# the torch.optim module; after this line `optim` is the Adam instance. No later
# cell in this notebook uses the optimizer.
optim = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [15]:
def format_dict(question, generation, actual):
  """Bundle one evaluation sample into the record layout used for the output JSON.

  Args:
    question: The question text.
    generation: The text produced by the model.
    actual: The reference answer.

  Returns:
    A dict with the keys "question:", "gen:" and "actual:" (in that order)
    mapped to the three arguments.
  """
  return {
      "question:": question,
      "gen:": generation,
      "actual:": actual,
  }

In [16]:
# Function for evaluating model.
def generate(model, data_loader, batch_size=BATCH_SIZE, sample_size=1):
  """Generate an answer for each sample in the next `sample_size` batches.

  Samples are matched to their ids by position: the n-th sample drawn from
  `data_loader` is assumed to be the n-th entry of the module-level `valid_i`,
  so the loader must iterate the validation set unshuffled from its start.

  Args:
    model: Model exposing `eval()` and
      `generate(inputs, start_tokens, max_len, eos_token=...)`.
    data_loader: An *iterator* over batches of
      (inp, inp_mask, tgt, tgt_mask), e.g. `iter(valid_loader)`.
    batch_size: Unused; kept so existing callers keep working.
    sample_size: Number of batches to draw from `data_loader`.

  Returns:
    Dict mapping each sample id (as str) to the record built by format_dict
    (question, generated text, reference text).

  Raises:
    StopIteration: if `data_loader` yields fewer than `sample_size` batches.
    IndexError: if more samples are drawn than there are ids in `valid_i`.
  """
  model.eval()

  gens = {}
  count = 0  # Position in valid_i of the next sample to be processed.

  for _ in range(sample_size):
    # NOTE(review): inp_mask is not forwarded to model.generate, so padding
    # positions of the query are not masked out - confirm this matches how the
    # checkpoint was trained.
    inp, inp_mask, tgt, _ = next(data_loader)

    for i in range(len(inp)):
      # Look the id up per sample (not per batch) so that every sample keeps its
      # own id and reference answer when a batch holds more than one sample.
      index = valid_i[count]

      # eos_token=102 is presumably the tokenizer's [SEP] id - TODO confirm.
      predict = model.generate(
          inp[i:i+1], start_token, DEC_SEQ_LEN, eos_token=102)

      question = qa_data[str(index)]["question:"]
      # Drop the first and last generated token before decoding.
      gen = tokenizer.decode(predict[0][1:-1])
      gen = gen.capitalize()
      # Reference answer of *this* sample, without its first token.
      actual = tokenizer.decode(tgt[i][1:])

      print("\nQuestion: ", question)
      print("\nGeneration: ", gen)
      print("\nActual: ", actual)

      gens[str(index)] = format_dict(question, gen, actual)
      count += 1

  return gens

In [17]:
# Load last checkpoint
# NOTE(review): models_path has no trailing "/", so the two strings concatenate
# to ".../models/pytorch/pytorchqa_rt153.pt" - confirm that this is the intended
# file name and not a missing path separator.
models_path = path + "models/pytorch/pytorch"
model_name = "qa_rt153.pt"
# Restore the trained weights into the model built above.
model.load_state_dict(torch.load(models_path+model_name))

<All keys matched successfully>

In [18]:
# Generate an answer for every validation sample. generate() draws one batch per
# requested sample, which covers the whole set because BATCH_SIZE is 1.
# A fresh iterator is passed so that generation starts at the first sample.
gens = generate(model, iter(valid_loader), sample_size=len_valid_data)
# Display all generated records as the cell output.
gens

Output hidden; open in https://colab.research.google.com to view.

In [19]:
# Save the generations next to the validation-id file. ensure_ascii=False writes
# non-ASCII characters as-is (the file is opened as UTF-8) instead of \u escapes.
with open(path + 'eval/rt_generations_153.json', 'w', encoding='utf-8') as f:
    json.dump(gens, f, ensure_ascii=False, indent=4)