In [1]:
%load_ext autoreload
%autoreload 2

import logging

import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from torch.nn import CrossEntropyLoss, MSELoss

from util_funcs import *
from data_processors import *
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from pytorch_pretrained_bert.modeling import BertForSequenceClassification, BertConfig, WEIGHTS_NAME, CONFIG_NAME
from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear

from tensorboardX import SummaryWriter
from distortions import *

from tqdm import tqdm_notebook as tqdm
from tqdm import trange

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [2]:
# Instantiate the 'sst-2' entry of the `processors` registry.
processor = processors["sst-2"]()

# Route INFO-and-above log records to a text file under log_dir/ whose name
# comes from get_log_name().
_log_path = f"log_dir/{get_log_name()}.txt"
logging.basicConfig(
    filename=_log_path,
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
)
logger = logging.getLogger(__name__)

In [3]:
# Run settings for this notebook. The same dict is passed to get_data() and
# get_optimizer(), and its keys are also exposed as plain variables below.
runtime_config = {
    "data_dir": "glue_data/SST-2",
    "bert_model": "bert-base-uncased",
    "output_mode": "classification",
    "max_seq_length": 64,
    "local_rank": -1,
    "batch_size": 32,
    "num_train_epochs": 3,
    "do_lower_case": True,
    "do_train": True,
    "train_batch_size": 32,
    "gradient_accumulation_steps": 1,
    "n_gpu": 1,
    "learning_rate": 5e-5,
    "logger": logger,
    "warmup_proportion": 0.1,
}

# At notebook (module) scope locals() is the global namespace, so every key
# above becomes a variable of the same name (bert_model, max_seq_length, ...).
locals().update(runtime_config)

# The two batch-size settings are expected to carry the same value.
assert batch_size == train_batch_size


In [4]:
# Unpack the six values get_data() returns for this processor and run config.
(
    label_list,
    num_labels,
    tokenizer,
    train_examples,
    num_train_optimization_steps,
    train_dataloader,
) = get_data(processor, runtime_config)

In [7]:
# Bare expression: display the num_train_optimization_steps value returned by get_data().
num_train_optimization_steps

6312

In [5]:
# Use CUDA when torch reports it as available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
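# NOTE: in the first round of experiments, pruning was applied every 10 iterations
# (presumably the distortion interval, cf. `twist_frequency` in the loop below -- TODO confirm).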

In [None]:
# Deep-twist sweep: for each SVD rank, fine-tune a fresh BERT classifier and,
# every `twist_frequency` batches, replace its weights with the output of
# `svd_compress` applied to the current state dict.

# Pick the loss once; an unknown output_mode now fails fast instead of leaving
# `loss` undefined (or stale from a previous batch) inside the training loop.
if output_mode == "classification":
    loss_fct = CrossEntropyLoss()
elif output_mode == "regression":
    loss_fct = MSELoss()
else:
    raise ValueError(f"Unsupported output_mode: {output_mode!r}")

for svd_dim in [50, 100, 150, 200, 300]:

    # Fresh pretrained model and optimizer for every rank in the sweep.
    model = BertForSequenceClassification.from_pretrained(
        bert_model, num_labels=num_labels).to(device).train()

    optimizer = get_optimizer(
        model, num_train_optimization_steps=num_train_optimization_steps,
        **runtime_config)

    tensorboard_log_dir = "tensorboard_data/"
    output_dir = f"output/deeptwist/svd{svd_dim}_{get_log_name()}/"
    # makedirs (unlike mkdir) also creates missing parents such as
    # output/deeptwist/. It still raises if this run's directory already
    # exists, so an earlier run is never silently overwritten.
    os.makedirs(output_dir)
    run_name = output_dir.replace("/", "_")

    report_frequency = 5    # batches averaged into each tensorboard point
    twist_frequency = 200   # batches between distortions; <= 0 disables them
    twist_args = dict(k=svd_dim)
    distort = svd_compress

    counter = 0             # batches processed for this rank, across epochs
    total_period_loss = 0
    tb_writer = SummaryWriter(log_dir=tensorboard_log_dir)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    try:
        for _ in tqdm(range(int(num_train_epochs)), desc="Epoch"):
            try:
                tr_loss = 0
                nb_tr_examples, nb_tr_steps = 0, 0
                for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                    batch = tuple(t.to(device) for t in batch)
                    input_ids, input_mask, segment_ids, label_ids = batch

                    # labels=None: take the raw logits and compute the loss here.
                    logits = model(input_ids, segment_ids, input_mask, labels=None)

                    if output_mode == "classification":
                        loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
                    else:  # "regression" (validated before the sweep)
                        loss = loss_fct(logits.view(-1), label_ids.view(-1))

                    if gradient_accumulation_steps > 1:
                        loss = loss / gradient_accumulation_steps

                    loss.backward()
                    tr_loss += loss.item()

                    # Log to tensorboard
                    counter += 1
                    total_period_loss += loss.item()
                    if counter % report_frequency == 0:
                        # The first window is still not reported, but the
                        # accumulator is now reset at every window boundary so
                        # each reported mean covers exactly report_frequency
                        # batches (previously the first point summed two windows).
                        if counter > report_frequency:
                            writer_callback(counter, total_period_loss / report_frequency,
                                            tb_writer, run_name=run_name)
                        total_period_loss = 0

                    nb_tr_examples += input_ids.size(0)
                    nb_tr_steps += 1
                    if (step + 1) % gradient_accumulation_steps == 0:
                        optimizer.step()
                        optimizer.zero_grad()
                        global_step += 1

                    # Deep twist. The `> 0` check comes first so that a
                    # non-positive twist_frequency disables the twist instead
                    # of raising ZeroDivisionError in the modulo.
                    if twist_frequency > 0 and counter % twist_frequency == 0:
                        state_dict = distort(model.cpu().state_dict(), **twist_args)
                        model.load_state_dict(state_dict)
                        # Return to the configured device; a hard-coded .cuda()
                        # would crash on CPU-only machines.
                        model.to(device)

            except KeyboardInterrupt:
                # Interrupting stops training for this rank only: the model is
                # still saved below and the sweep moves on to the next rank.
                break

        save_model(model, output_dir)
    finally:
        # Always release the tensorboard writer, even if training failed.
        tb_writer.close()

HBox(children=(IntProgress(value=0, description='Epoch', max=3, style=ProgressStyle(description_width='initial…

HBox(children=(IntProgress(value=0, description='Iteration', max=2105, style=ProgressStyle(description_width='…