In [1]:
%load_ext autoreload
%autoreload 2

from distortions import *

import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from torch.nn import CrossEntropyLoss, MSELoss

from util_funcs import *
from data_processors import *
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from pytorch_pretrained_bert.modeling import BertForSequenceClassification, BertConfig, WEIGHTS_NAME, CONFIG_NAME, BertEncoder
from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear

from tensorboardX import SummaryWriter
from distortions import *

from tqdm import tqdm_notebook as tqdm
from tqdm import trange

from trainer import Trainer, DeepTwistTrainer
import logging

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [2]:
# --- Task setup: data processor, file logging, run configuration, data loaders ---

# Processor for the SST-2 task, looked up in the `processors` registry.
processor = processors['sst-2']()

# Send INFO-and-above log records to a per-run text file.
# NOTE(review): `log_dir/` is assumed to exist already; opening the log file
# would fail otherwise — confirm it is created elsewhere.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
    level=logging.INFO,
    filename=f"log_dir/{get_log_name()}.txt",
)
logger = logging.getLogger(__name__)

# Single source of truth for the run. The same mapping is handed to `get_data`
# and unpacked as keyword arguments into `get_dataloader` below.
runtime_config = {
    "data_dir": "glue_data/SST-2",
    "bert_model": "bert-base-uncased",
    "output_mode": "classification",
    "max_seq_length": 64,
    "local_rank": -1,
    "batch_size": 32,
    "num_train_epochs": 32,
    "do_lower_case": True,
    "do_train": True,
    "train_batch_size": 32,
    "gradient_accumulation_steps": 1,
    "n_gpu": 1,
    "learning_rate": 5e-5,
    "logger": logger,
    "warmup_proportion": 0.1,
}

# Expose every config entry as a notebook-level variable (data_dir, bert_model,
# batch_size, ...). `globals()` is used instead of `locals()`: writing to the
# mapping returned by `locals()` is not a supported way to create variables,
# whereas `globals()` is the actual top-level namespace of the notebook.
globals().update(runtime_config)

# The two batch-size entries are kept separately in the config but must agree.
assert train_batch_size == batch_size, "train_batch_size and batch_size must match"

# Training data and everything derived from it.
(label_list, num_labels, tokenizer, train_examples,
 num_train_optimization_steps, train_dataloader) = get_data(processor, runtime_config)

# Dev-set examples and their dataloader (built with eval_data=True).
eval_examples = processor.get_dev_examples(data_dir)
eval_dataloader = get_dataloader(
    eval_examples,
    label_list,
    tokenizer,
    eval_data=True,
    **runtime_config,
)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')
loss_fn = CrossEntropyLoss()

In [3]:
# Network to be trained: loaded from the `bert_model` checkpoint with
# `num_labels` outputs, moved to `device`, and switched to training mode.
model = BertForSequenceClassification.from_pretrained(bert_model, num_labels=num_labels)
model = model.to(device)
model = model.train()

# Reference weights: a second, independent load of the same checkpoint.
# Note that `base_model` is the *state_dict* (parameter name -> tensor) of that
# load, not a module; it is never moved to `device` here.
base_model = (
    BertForSequenceClassification
    .from_pretrained(bert_model, num_labels=num_labels)
    .eval()
    .state_dict()
)

In [None]:
# (Scratch cell cleared.) It held only the bare, never-executed name `Bert`,
# which no visible statement defines; left in place it would raise NameError on
# Restart & Run All unless one of the star imports happens to provide that name.

In [5]:
# Show the parameter names stored in `base_model`.
# NOTE(review): this displays every key at once, so the rendered output is very
# long; consider clearing it before sharing the notebook.
base_model.keys()

odict_keys(['bert.embeddings.word_embeddings.weight', 'bert.embeddings.position_embeddings.weight', 'bert.embeddings.token_type_embeddings.weight', 'bert.embeddings.LayerNorm.weight', 'bert.embeddings.LayerNorm.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.output.dense.weight', 'bert.encoder.layer.0.attention.output.dense.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.0.output.dense.weight', 'bert.encoder.layer.0.output.dense.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output

In [29]:
# `to_bert` is called with `base_model` and returns a callable, kept as `decorator`.
# NOTE(review): `to_bert` and `weight_prune` are not defined in this notebook —
# presumably they come from one of the star imports (e.g. `distortions`); confirm.
decorator = to_bert(base_model)
# Apply that callable to `weight_prune`.
# NOTE(review): presumably this yields a variant of `weight_prune` adapted to
# the BERT parameters in `base_model` — verify against `to_bert`'s definition.
decorated_prune = decorator(weight_prune)