In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sqlite3
from typing import Any, Optional
from pydantic import BaseModel, Extra, Field
from transformers import TrainingArguments, AutoModel, AutoTokenizer, AutoModelForSequenceClassification, get_scheduler
import numpy as np
import json
import datasets
import wandb
import torch
from tqdm.notebook import tqdm
import sys
sys.path.append('../../')
sys.path.append('./set_transformer/')
from subset_active_learning.subset_selection import select, preprocess
from subset_active_learning.active_learning.subset_classifier import get_df_from_db

import torch.nn as nn
from modules import SAB, PMA

2022-10-17 15:25:51.221289: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-10-17 15:25:51.221346: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [3]:
ds_name = 'sst2'
test_dataset = None
eval_mapping = '[]'
num_labels = 2
valid_split = 'validation'
test_split = 'validation'

db_path = "./temp.db"
seed = 0
pool_size = 1000
search_size = 100
warmup_runs = 500
annealing_runs = 1000
wandb_project = 'sst_search_test'
wandb_entity = 'johntzwei'

model_card = "roberta-base"
pretraining = True
max_steps = 5000
eval_steps = 500
learning_rate = 1e-4
batch_size = 32
# adam should default to correct_bias = True
adam_epsilon = 1e-6
adam_beta1 = 0.9
adam_beta2 = 0.999
max_grad_norm = 1.0
warmup_ratio = 0.0
weight_decay = 0.1

In [4]:
num_embeddings = 2000   # dataset dependent
embedding_dim = 768
enc_num_layers = 3
enc_dim = 32
enc_num_heads = 32
dec_dim = 32
dec_num_heads = 32

In [5]:
db_path = '../../scripts/active_learning/sst_random_fixed_small_validation.db'

In [6]:
params = select.SubsetTrainingArguments(model_card=model_card,
                                               num_labels=num_labels,
                                               eval_mapping=json.loads(eval_mapping),
                                               pretraining=pretraining,
                                               max_steps=max_steps,
                                               eval_steps=eval_steps,
                                               learning_rate=learning_rate,
                                               batch_size=batch_size,
                                               adam_epsilon=adam_epsilon,
                                               adam_beta1=adam_beta1,
                                               adam_beta2=adam_beta2,
                                               max_grad_norm=max_grad_norm,
                                               warmup_ratio=warmup_ratio,
                                               weight_decay=weight_decay)

In [7]:
df = get_df_from_db(db_path)
df = df.iloc[1:]
# maybe the center is too dense?

In [8]:
class DataUtilityDataset(torch.utils.data.Dataset):
    """Face Landmarks dataset."""

    def __init__(self, db_path):
        """
        Args:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.df = get_df_from_db(db_path)
        # self.df = df[(df.objective < 0.75567) | (df.objective > 0.77111) | (np.random.random(len(df)) > 0.5)]
        
        self.data = []
        for i, row in self.df.iterrows():
            if row['objective'] == 0.0:
                continue
            self.data.append((json.loads(row['indexes']), row['objective']))
        print('Subset size:', len((self.data[0][0])))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

In [9]:
dataset = DataUtilityDataset(db_path)

Subset size: 100


In [10]:
device = 'cuda' if torch.cuda.is_available() else 'cpu' 

In [11]:
class SmallSetTransformer(nn.Module):
    def __init__(self, embedding, embedding_dim, enc_num_layers, enc_dim, enc_num_heads, dec_dim, dec_num_heads):
        super().__init__()
        
        args = [embedding,
            SAB(dim_in=embedding_dim, dim_out=enc_dim, num_heads=enc_num_heads),]
        
        for i in range(enc_num_layers - 1):
            args.append(SAB(dim_in=enc_dim, dim_out=enc_dim, num_heads=enc_num_heads))
        
        self.enc = nn.Sequential(*args)
        self.dec = nn.Sequential(
            PMA(dim=dec_dim, num_heads=dec_num_heads, num_seeds=1),
            nn.Linear(in_features=dec_dim, out_features=1),
        )

    def forward(self, x):
        x = self.enc(x)
        x = self.dec(x)
        return x.squeeze(-1)

In [12]:
def train(model, train_dataset, val_dataset, tolerance=1):
    steps = 0
    epochs = 0
    best_acc = None
    patience = 0
    pbar = tqdm(total=params.max_steps)
    
    wandb_run = wandb.init(project='sst_set_transformer', entity='johntzwei', tags=[])
    
    def collate_fn(list_items):
        x = []
        y = []
        for x_, y_ in list_items:
            x.append(x_)
            y.append(y_)
        return torch.LongTensor(x), torch.Tensor(y)
    
    train_dataloader = torch.utils.data.DataLoader(train_dataset, shuffle=True, batch_size=params.batch_size, pin_memory=True, collate_fn=collate_fn)
    val_dataloader = torch.utils.data.DataLoader(val_dataset, shuffle=True, batch_size=params.batch_size, pin_memory=True, collate_fn=collate_fn)
    it = iter(train_dataloader)

    optimizer = torch.optim.AdamW(params=model.parameters(), lr=params.learning_rate, betas=(params.adam_beta1, params.adam_beta2), eps=params.adam_epsilon, weight_decay=params.weight_decay)
    criterion = nn.MSELoss().cuda()
    
    
    while steps < params.max_steps:
        # training
        model.train()
        total_loss = 0.
        try:
            batch = next(it)
        except:
            epochs += 1
            it = iter(train_dataloader)
            batch = next(it)
        steps += 1

        x, y = batch[0].to(device), batch[1].to(device)
        loss = criterion(model(x), y)
        wandb.log({'loss' : loss})
        total_loss += loss.cpu()
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), params.max_grad_norm)
        optimizer.step()
        optimizer.zero_grad()

        pbar.set_description('Epoch: %d, Avg batch loss: %.2f' % (epochs, total_loss / steps))
        pbar.update(1)

        if steps % params.eval_steps == 0:
            model.eval()
            corr, avg_loss = evaluate(model, val_dataloader, eval_mapping={})
            wandb.log({'sst:val_spearman' : corr})
            wandb.log({'sst:val_loss' : avg_loss})
            # early stopping
            if not best_acc or corr > best_acc:
                best_acc = corr
            else:
                patience += 1
            if patience >= tolerance:
                break

def evaluate(model, val_dataloader, eval_mapping: list):
    model.eval()
    val_pbar = tqdm(total=len(val_dataloader))
    criterion = nn.MSELoss().cuda()
    
    losses = []
    ys, ys_ = [], []
    for batch in val_dataloader:
        x, y = batch[0].to(device), batch[1].to(device)
        
        with torch.no_grad():
            y_ = model(x)
            loss = criterion(y_, y)
        losses.append(loss.cpu())
        
        ys.extend(y.tolist())
        ys_.extend(y_.tolist())
        val_pbar.update(1)
        
    avg_loss = np.mean(losses)
    ys_ = sum(ys_, [])
    corr = np.corrcoef(np.array(ys), np.array(ys_))[0, 1]
    val_pbar.set_description('Correlation: %.2f, Avg. loss: %.2f' % (corr, avg_loss))
    
        
    plt.figure()
    sns.scatterplot(ys, ys_)
    
    return corr, avg_loss

In [13]:
len(dataset)

2772

In [14]:
np.max(dataset[0][0])

997

In [15]:
if False:
    embedding = nn.Embedding(num_embeddings=num_embeddings, embedding_dim=embedding_dim)
else:
    embedding = torch.load('./trained-emb_downsample.pt')
    embedding.requires_grad = False

In [16]:
model = SmallSetTransformer(embedding, embedding_dim, enc_num_layers, enc_dim, enc_num_heads, dec_dim, dec_num_heads)
model.to(device)

# train_size = 1000
# train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, len(dataset)-train_size])
train_dataset = dataset[:1000]
val_dataset = dataset[1000:]

train(model, train_dataset, val_dataset, tolerance=10)

  0%|          | 0/5000 [00:00<?, ?it/s]

[34m[1mwandb[0m: Currently logged in as: [33mjohntzwei[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.13.4 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
2022-10-17 15:26:01.441919: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-10-17 15:26:01.441957: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


  0%|          | 0/56 [00:00<?, ?it/s]

  return F.mse_loss(input, target, reduction=self.reduction)


NameError: name 'plt' is not defined

In [None]:
torch.save(model, 'set_transformer.pt')