In [1]:
import os
import sqlite3
import json
import random
import seaborn as sns
from tqdm.notebook import tqdm

import torch
import numpy as np
import datasets
import transformers
import wandb
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback, DataCollator

2022-05-07 05:39:51.280024: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-10.1/lib64
2022-05-07 05:39:51.280061: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# --- Subset-search settings ---
# Seeds the train-split shuffle and torch's RNG at the start of every run.
seed = 0
# Number of stored runs over which one_run's parent-selection ratio decays to 0.
annealing_runs = 5000
# Size of the candidate pool kept from the shuffled train split.
n_downsample = 1000
# Number of pool indexes in each stored state, i.e. the training-subset size.
n_search = 100
# Stem of the SQLite results file; the database path is '<db_name>.db'.
db_name = 'sst_results'
# NOTE(review): not referenced by any cell visible in this notebook.
anneal_factor = 0.1

# --- Model and fine-tuning settings ---
model_card = "google/electra-small-discriminator"
# NOTE(review): not referenced by any cell visible in this notebook.
pretraining = True
# Upper bound on optimizer steps in train(); validation runs every eval_steps steps.
max_steps = 6000
eval_steps = 300
learning_rate = 1e-5
# Batch size shared by the train, validation and test data loaders.
batch_size = 8
# adam should default to correct_bias = True
adam_epsilon = 1e-6
adam_beta1 = 0.9
adam_beta2 = 0.999
# Gradient-norm clipping threshold applied before every optimizer step.
max_grad_norm = 1.0
# Fraction of max_steps passed to the linear scheduler as warm-up steps.
warmup_ratio = 0.1
weight_decay = 0.01

In [3]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [4]:
# Show which accelerator is in use. Guarded so the cell also runs on CPU-only
# machines, where torch.cuda.get_device_name(0) would raise.
torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'cpu'

'NVIDIA GeForce GTX 1080 Ti'

In [5]:
# Tokenizer matching model_card; tokenize_function reads this module-level name.
tokenizer = AutoTokenizer.from_pretrained(model_card)

In [6]:
# Load the 'sst' dataset with its default config. Despite the variable name,
# the 'label' column is treated as a scalar score here and only binarized
# (threshold 0.5) in a later cell.
sst2 = datasets.load_dataset('sst')

No config specified, defaulting to: sst/default
Reusing dataset sst (/home/johnny/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff)


  0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
# Display the available splits with their features and row counts.
sst2

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'tokens', 'tree'],
        num_rows: 8544
    })
    validation: Dataset({
        features: ['sentence', 'label', 'tokens', 'tree'],
        num_rows: 1101
    })
    test: Dataset({
        features: ['sentence', 'label', 'tokens', 'tree'],
        num_rows: 2210
    })
})

In [8]:
# The string literal below is disabled code (it is never executed) that
# tokenized the data without padding and measured token lengths on the train
# split. The sequence length is hard-coded underneath instead.
'''
def tokenize_function(examples, field='sentence'):
    return tokenizer(examples[field], padding=False, truncation=True)
tokenized_sst2 = sst2.map(tokenize_function, batched=True)
lengths = [len(i) for i in tokenized_sst2['train']['input_ids']]
max_length = np.quantile(lengths, 0.9)
max_length = np.max(lengths)
'''
# Fixed length that tokenize_function pads/truncates every example to.
max_length = 66
max_length

66

In [9]:
def tokenize_function(examples, field='sentence'):
    """Tokenize ``examples[field]`` to a fixed length.

    The text is truncated and padded to the module-level ``max_length`` using
    the module-level ``tokenizer``; the tokenizer's output is returned as-is.
    """
    text = examples[field]
    encoded = tokenizer(
        text,
        truncation=True,
        max_length=max_length,
        padding='max_length',
    )
    return encoded

# Tokenize every split, one example per call (batched=False).
tokenized_sst2 = sst2.map(tokenize_function, batched=False)

Loading cached processed dataset at /home/johnny/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff/cache-2490800be235e365.arrow


  0%|          | 0/1101 [00:00<?, ?ex/s]

Loading cached processed dataset at /home/johnny/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff/cache-ecdebd119929fae1.arrow


In [10]:
# Binarize the label: keep the original score under 'scalar_label' and derive
# an integer 'labels' column (0 when the score is below 0.5, otherwise 1).
# Note: this cell reassigns tokenized_sst2, so re-running it on its own fails
# because the 'label' column has already been renamed.
tokenized_sst2 = tokenized_sst2.rename_column('label', 'scalar_label')
tokenized_sst2 = tokenized_sst2.map(lambda x: {'labels' : 0 if x['scalar_label'] < 0.5 else 1})

# Spot-check: the last ten test sentences next to their binarized labels.
print(tokenized_sst2['test'][-10:]['sentence'])
print(tokenized_sst2['test'][-10:]['labels'])

Loading cached processed dataset at /home/johnny/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff/cache-cb535ba06d831740.arrow


  0%|          | 0/1101 [00:00<?, ?ex/s]

Loading cached processed dataset at /home/johnny/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff/cache-e7a743503717c0b8.arrow


['Fails to bring as much to the table .', 'A film made with as little wit , interest , and professionalism as artistically possible for a slummy Hollywood caper flick .', 'Disturbingly superficial in its approach to the material .', "If you 're not the target demographic ... this movie is one long chick-flick slog .", 'I hate this movie', 'An imaginative comedy\\/thriller .', '( A ) rare , beautiful film .', '( An ) hilarious romantic comedy .', 'Never ( sinks ) into exploitation .', '( U ) nrelentingly stupid .']
[0, 0, 0, 0, 0, 1, 1, 1, 1, 0]


In [11]:
# Switch the dataset's output format in place: rows come back as torch tensors
# restricted to the listed model-input and label columns.
tokenized_sst2.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

In [12]:
# Candidate pool for the subset search: shuffle the train split with the fixed
# seed and keep the first n_downsample rows. Stored states index into this pool.
sst2_downsample = tokenized_sst2['train'].shuffle(seed=seed).select(range(0, n_downsample))

In [13]:
# Fixed-order loaders: validation drives early stopping inside train(), and
# test accuracy is the objective that one_run() stores for each state.
val_dataloader = torch.utils.data.DataLoader(tokenized_sst2['validation'], shuffle=False, batch_size=batch_size, pin_memory=True)
test_dataloader = torch.utils.data.DataLoader(tokenized_sst2['test'], shuffle=False, batch_size=batch_size, pin_memory=True)

In [14]:
def create_hf(model_card, num_labels=2):
    """Instantiate a sequence classifier and its tokenizer from ``model_card``.

    Returns:
        ``(model, tokenizer)`` where the model has ``num_labels`` output classes.
    """
    hf_tokenizer = AutoTokenizer.from_pretrained(model_card)
    classifier = AutoModelForSequenceClassification.from_pretrained(
        model_card,
        num_labels=num_labels,
    )
    return classifier, hf_tokenizer

In [15]:
# Accuracy metric object shared by compute_metrics() and evaluate().
metric = datasets.load_metric("accuracy")

def compute_metrics(eval_pred):
    """Score a ``(logits, labels)`` pair with the module-level ``metric``.

    The predicted class is the arg-max over the last axis of the logits.
    """
    logits, labels = eval_pred
    predicted_classes = np.asarray(logits).argmax(axis=-1)
    return metric.compute(references=labels, predictions=predicted_classes)

In [16]:
# Create the results database on first use and seed it with one random state.
# The cell is idempotent: the table and index are only created when missing,
# and the seed row is only inserted while the table is still empty. Errors are
# no longer swallowed, so a broken database surfaces here instead of later.
con = sqlite3.connect('%s.db' % db_name)
try:
    cur = con.cursor()
    cur.execute('''CREATE TABLE IF NOT EXISTS states
                   (indexes text, objective real)''')
    cur.execute('''CREATE INDEX IF NOT EXISTS idx_objective
                    ON states (objective);''')

    cur.execute("SELECT COUNT(*) FROM states")
    if cur.fetchone()[0] == 0:
        # Start from a random sample. replace=False keeps the n_search pool
        # indexes distinct, so the initial training subset has no duplicates.
        random_idx = np.random.choice(np.arange(0, n_downsample), n_search, replace=False)
        cur.execute("INSERT INTO states VALUES (?, 0)", (json.dumps(random_idx.tolist()),))
    con.commit()
finally:
    con.close()

In [17]:
def get_nth_best(n, db_path=None):
    """Fetch the state ranked ``n`` (0 = best) by descending objective.

    Args:
        n: zero-based rank of the state to fetch.
        db_path: optional path of the SQLite file; defaults to
            ``'<db_name>.db'`` built from the module-level ``db_name``.

    Returns:
        ``(indexes, objective)`` where ``indexes`` is a numpy array decoded
        from the stored JSON list.

    Raises:
        IndexError: if fewer than ``n + 1`` states are stored.
    """
    path = '%s.db' % db_name if db_path is None else db_path
    # Connect outside the try block so `con` is always bound in `finally`.
    con = sqlite3.connect(path)
    try:
        cur = con.cursor()
        cur.execute(
            "SELECT indexes, objective FROM states ORDER BY objective DESC LIMIT 1 OFFSET ?",
            (int(n),),
        )
        r = cur.fetchone()
    finally:
        con.close()
    if r is None:
        raise IndexError('no stored state at rank %d' % int(n))
    return (np.array(json.loads(r[0])), r[1])

def get_num_runs(db_path=None):
    """Return how many states are stored in the results database.

    Args:
        db_path: optional path of the SQLite file; defaults to
            ``'<db_name>.db'`` built from the module-level ``db_name``.
    """
    path = '%s.db' % db_name if db_path is None else db_path
    # Connect outside the try block so `con` is always bound in `finally`.
    con = sqlite3.connect(path)
    try:
        cur = con.cursor()
        cur.execute("SELECT COUNT(objective) FROM states")
        # Read-only query: there is nothing to commit.
        r = cur.fetchone()[0]
    finally:
        con.close()
    return r

def insert_run(idx, obj, db_path=None):
    """Store one evaluated state in the results database.

    Args:
        idx: sequence or numpy array of pool indexes; stored as a JSON list.
        obj: objective value of the state; stored as a float.
        db_path: optional path of the SQLite file; defaults to
            ``'<db_name>.db'`` built from the module-level ``db_name``.
    """
    path = '%s.db' % db_name if db_path is None else db_path
    # Connect outside the try block so `con` is always bound in `finally`.
    con = sqlite3.connect(path)
    try:
        cur = con.cursor()
        # Bound parameters instead of string formatting; float() also accepts
        # numpy scalars, which sqlite3 cannot adapt on its own.
        cur.execute(
            "INSERT INTO states VALUES (?, ?)",
            (json.dumps(np.asarray(idx).tolist()), float(obj)),
        )
        con.commit()
    finally:
        con.close()

In [18]:
def transition(idx, n):
    """Propose a neighbouring state by swapping one index for an unused one.

    One randomly chosen position of ``idx`` is overwritten with an index drawn
    uniformly from ``range(n)`` that is not currently in ``idx``. The input
    array is left untouched.

    Args:
        idx: numpy array of the currently selected pool indexes.
        n: size of the pool the indexes are drawn from. The original body
            ignored this argument and read the global ``n_downsample``.

    Returns:
        A copy of ``idx`` with exactly one entry replaced.

    Raises:
        ValueError: if ``idx`` is empty or already covers the whole pool.
    """
    if len(idx) == 0:
        raise ValueError('cannot transition from an empty state')
    not_in = np.setdiff1d(np.arange(n), idx)
    if not_in.size == 0:
        raise ValueError('state already contains every index in range(%d)' % n)

    # Same RNG call order as before: numpy picks the incoming index first,
    # then `random` picks the position to overwrite.
    a = np.random.choice(not_in)
    b = random.randint(0, len(idx) - 1)
    new_idx = idx.copy()
    # Progress trace: (index removed, index added).
    print(new_idx[b], a)
    new_idx[b] = a
    return new_idx

In [19]:
def evaluate(model, val_dataloader):
    """Score ``model`` on every batch of ``val_dataloader``.

    Batches are moved to the module-level ``device``, predictions are the
    arg-max of the returned logits, and results are accumulated in the
    module-level ``metric``.

    Args:
        model: callable as ``model(**batch)`` returning an object with ``logits``.
        val_dataloader: sized iterable of dict batches that include ``"labels"``.

    Returns:
        The dict produced by ``metric.compute()`` (read here via ``'accuracy'``).
    """
    model.eval()
    val_pbar = tqdm(total=len(val_dataloader))
    try:
        for batch in val_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            with torch.no_grad():
                outputs = model(**batch)

            predictions = torch.argmax(outputs.logits, dim=-1)
            metric.add_batch(predictions=predictions, references=batch["labels"])
            val_pbar.update(1)

        eval_dict = metric.compute()
        val_pbar.set_description('Acc: %.2f' % eval_dict['accuracy'])
    finally:
        # Always release the bar; train() calls this function repeatedly.
        val_pbar.close()

    return eval_dict

In [20]:
def train(model, train_dataset, tolerance=2):
    """Fine-tune ``model`` on ``train_dataset`` with periodic validation.

    Runs at most ``max_steps`` optimizer steps, cycling through the shuffled
    data as often as needed. Every ``eval_steps`` steps the model is scored on
    the module-level ``val_dataloader``; training stops early once validation
    accuracy has failed to beat the best value seen ``tolerance`` times.
    The patience counter is cumulative: it is not reset when accuracy improves.

    Args:
        model: classifier already on ``device``; called as ``model(**batch)``
            and expected to return an object with a ``loss`` attribute.
        train_dataset: dataset to draw shuffled batches of ``batch_size`` from.
        tolerance: number of non-improving evaluations that triggers the stop.

    Raises:
        ValueError: if ``train_dataset`` is empty.
    """
    if len(train_dataset) == 0:
        raise ValueError('train_dataset is empty')

    steps = 0
    epochs = 0
    best_acc = None
    patience = 0
    # Running sum over all steps. It used to be reset inside the loop, which
    # made the displayed "average" just the last loss divided by the step count.
    total_loss = 0.
    train_dataloader = torch.utils.data.DataLoader(train_dataset, shuffle=True, batch_size=batch_size, pin_memory=True)
    it = iter(train_dataloader)

    optimizer = torch.optim.AdamW(params=model.parameters(), lr=learning_rate, betas=(adam_beta1, adam_beta2), eps=adam_epsilon, weight_decay=weight_decay)
    lr_scheduler = transformers.get_scheduler("linear", optimizer=optimizer, num_warmup_steps=warmup_ratio*max_steps, num_training_steps=max_steps)

    pbar = tqdm(total=max_steps)
    try:
        while steps < max_steps:
            # training
            model.train()

            try:
                batch = next(it)
            except StopIteration:
                # Only exhaustion of the loader starts a new epoch; any other
                # error (or a keyboard interrupt) now propagates.
                epochs += 1
                it = iter(train_dataloader)
                batch = next(it)

            steps += 1
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            # .item() yields a plain float, so neither the running sum nor the
            # logger keeps a reference to the autograd graph.
            loss_value = loss.item()
            total_loss += loss_value

            wandb.log({'loss' : loss_value})

            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

            pbar.set_description('Epoch: %d, Avg batch loss: %.2f' % (epochs, total_loss / steps))
            pbar.update(1)

            if steps % eval_steps == 0:
                eval_dict = evaluate(model, val_dataloader)
                wandb.log({'sst:val_acc' : eval_dict['accuracy']})

                # early stopping; `is None` so a best accuracy of 0.0 is not
                # mistaken for "no evaluation yet"
                if best_acc is None or eval_dict['accuracy'] > best_acc:
                    best_acc = eval_dict['accuracy']
                else:
                    patience += 1

                if patience >= tolerance:
                    break
    finally:
        pbar.close()

In [21]:
def one_run():
    """Run one step of the subset search and store its result.

    Steps: pick a parent state from the database, swap one of its indexes via
    ``transition``, fine-tune a fresh model on the resulting subset of
    ``sst2_downsample``, score it on the test loader, and insert the new state
    with its test accuracy as the objective.

    Parent choice: with ``i`` stored runs, a rank is drawn uniformly from
    ``[0, int(ratio * i)]`` where ``ratio`` falls linearly from 1 to 0 as ``i``
    approaches ``annealing_runs``; from then on the best state is always used.

    Raises:
        RuntimeError: if the results database holds no states yet.
    """
    # Torch is re-seeded identically every run. `random` and `np.random`, which
    # drive parent selection and transition sampling, are intentionally not
    # re-seeded here.
    torch.backends.cudnn.deterministic = True
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    i = get_num_runs()
    if i == 0:
        raise RuntimeError('no states stored in %s.db; run the database setup cell first' % db_name)
    ratio = (annealing_runs - i) / annealing_runs if i < annealing_runs else 0
    rank = random.randint(0, int(ratio * i))
    idx, _ = get_nth_best(rank)

    new_idx = transition(idx, n_downsample)
    train_dataset = sst2_downsample.select(new_idx)
    assert(len(train_dataset) == n_search)

    # Only the model is needed here; the data is already tokenized.
    model, _ = create_hf(model_card, num_labels=2)
    model.to(device)

    wandb.init(project="simulated_annealing-sst", entity="johntzwei", tags=[model_card])
    try:
        wandb.log({'n_downsample' : n_downsample})
        wandb.log({'n_search' : n_search})
        wandb.log({'model_card' : model_card})
        # 'indexes' keeps its original meaning (the parent state);
        # 'new_indexes' records the subset this run actually trains on.
        wandb.log({'indexes' : json.dumps(idx.tolist()),
                   'new_indexes' : json.dumps(new_idx.tolist())})

        train(model, train_dataset)
        eval_dict = evaluate(model, test_dataloader)
        eval_dict = {'sst2_test:%s' % k : v for k, v in eval_dict.items()}
        new_quality = eval_dict['sst2_test:accuracy']
        wandb.log(eval_dict)

        insert_run(new_idx, new_quality)
    except BaseException:
        # Mark the wandb run as failed instead of leaving it open.
        wandb.finish(exit_code=1)
        raise
    wandb.finish()

In [None]:
# Driver loop: keeps launching runs until the kernel is interrupted. There is
# no stop condition; once the database holds annealing_runs states, one_run
# always mutates the current best state.
while True:
    one_run()

475 633


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: wandb version 0.12.16 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
2022-05-07 05:40:25.100748: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-10.1/lib64
2022-05-07 05:40:25.100788: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


  0%|          | 0/6000 [00:00<?, ?it/s]

  0%|          | 0/138 [00:00<?, ?it/s]

  0%|          | 0/138 [00:00<?, ?it/s]

  0%|          | 0/138 [00:00<?, ?it/s]

  0%|          | 0/138 [00:00<?, ?it/s]

  0%|          | 0/138 [00:00<?, ?it/s]

  0%|          | 0/138 [00:00<?, ?it/s]

  0%|          | 0/277 [00:00<?, ?it/s]

511 550


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
loss,███▆▄▄▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
n_downsample,▁
n_search,▁
sst2_test:accuracy,▁
sst:val_acc,▁▇█▆██

0,1
indexes,"[517, 882, 82, 341, ..."
loss,0.00014
model_card,bert-base-cased
n_downsample,1000
n_search,100
sst2_test:accuracy,0.75928
sst:val_acc,0.78111


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: wandb version 0.12.16 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
2022-05-07 05:43:57.411142: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-10.1/lib64
2022-05-07 05:43:57.411181: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


  0%|          | 0/6000 [00:00<?, ?it/s]