In [43]:
%load_ext autoreload
%autoreload 2

import logging

import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from torch.nn import CrossEntropyLoss, MSELoss, KLDivLoss

from util_funcs import *
from data_processors import *
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from pytorch_pretrained_bert.modeling import BertForSequenceClassification, BertConfig, WEIGHTS_NAME, CONFIG_NAME
from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear

from tensorboardX import SummaryWriter
from distortions import *

from tqdm import tqdm_notebook as tqdm
from tqdm import trange

from trainer import Trainer, KDTrainer
import json
from torch.nn import LSTM, Embedding, Linear
from torch import nn

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
processor = processors['sst-2']()
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
                    level = logging.INFO, 
                    filename=f"log_dir/{get_log_name()}.txt")
logger = logging.getLogger(__name__)

runtime_config = dict(data_dir = "glue_data/SST-2",
                      bert_model = "bert-base-uncased",
                      output_mode = "classification",
                      max_seq_length = 64,
                      local_rank = -1,
                      batch_size = 32,
                      num_train_epochs = 32,
                      do_lower_case=True,
                      do_train=True,
                      train_batch_size=32,
                      gradient_accumulation_steps = 1,
                      n_gpu = 1,
                      learning_rate = 1e-3,
                      logger=logger,
                      warmup_proportion = 0.1)
locals().update(runtime_config)
assert train_batch_size == batch_size

label_list, num_labels, tokenizer, train_examples, \
           num_train_optimization_steps, train_dataloader = get_data(processor, runtime_config)

eval_examples = processor.get_dev_examples(data_dir)
eval_dataloader = get_dataloader(
                eval_examples, label_list,
                tokenizer, eval_data=True,
                **runtime_config)

device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')
loss_fn = CrossEntropyLoss()

In [4]:
V = len(tokenizer.vocab)

In [5]:
finetuned_model_config = BertConfig("output_DEPRECATED/bert_config.json")
finetuned_model = BertForSequenceClassification(
    finetuned_model_config, num_labels
).cpu()

# with open("output_DEPRECATED/pytorch_model.bin", "rb") as f:
state_dict = torch.load("output_DEPRECATED/pytorch_model.bin")
for k, v in state_dict.items():
    v.cpu()
finetuned_model.load_state_dict(state_dict)
del state_dict

In [6]:
with open("output_DEPRECATED/bert_config.json", "r") as f:
    config = json.load(f)

config['num_hidden_layers'] = 3
config['num_attention_heads'] = 3

with open("kd_configs/kd.33.config.json", "w") as f:
    json.dump(config, f)

In [20]:
class LSTMModel(nn.Module):
    def __init__(self, size=200):
        super().__init__()
        self.embed = Embedding(num_embeddings=V, 
                               embedding_dim=size)
        self.lstm = LSTM(input_size=size, hidden_size=size)
        self.w = Linear(in_features=size, out_features=num_labels)
    def forward(self, x, *args, **kwargs):
        emb = self.embed(x).transpose(0,1)
        _, (h, _) = self.lstm(emb)
        return self.w(h).squeeze(0) 

In [95]:
model = LSTMModel().to(device)
model.embed.from_pretrained(finetuned_model.state_dict()["bert.embeddings.word_embeddings.weight"][:, :200])
input_ids, input_mask, segment_ids, label_ids = batch = batch = tuple(t.to(device) for t in next(iter(train_dataloader)))

In [96]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
model.train()

LSTMModel(
  (embed): Embedding(30522, 200)
  (lstm): LSTM(200, 200)
  (w): Linear(in_features=200, out_features=2, bias=True)
)

In [114]:
tensorboard_log_dir = "tensorboard_data/"
tb_writer = SummaryWriter(log_dir=tensorboard_log_dir)
output_dir = f"output/kd/kd_LSTM_{get_log_name()}/"
os.mkdir(output_dir)
loss_fn = MSELoss()
val_loss_fn = CrossEntropyLoss()

trainer = KDTrainer(
        model=model, data=train_dataloader, 
        val_data=eval_dataloader,
        teacher=finetuned_model,
        num_labels=num_labels,
        output_dir=output_dir,
        loss_fn=loss_fn,
        val_loss_fn=val_loss_fn,
        optimizer=optimizer,
        writer=tb_writer,
        device=device,
        **runtime_config,
    )

In [None]:
trainer.train(progress_bar='notebook', num_train_epochs=50, 
              patience=51,
              report_frequency=10, report_validation=True)

HBox(children=(IntProgress(value=0, description='Epoch', max=50, style=ProgressStyle(description_width='initia…

HBox(children=(IntProgress(value=0, description='Iteration', max=2105, style=ProgressStyle(description_width='…

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



HBox(children=(IntProgress(value=0, description='Iteration', max=2105, style=ProgressStyle(description_width='…

In [62]:
correct = 0
total = 0
eval_loss = 0

In [76]:
model.to(device)
for step, batch in enumerate(eval_dataloader):
    batch = tuple(t.to(device) for t in batch)
    input_ids, input_mask, segment_ids, label_ids = batch

    # define a new function to compute loss values for both output_modes
    with torch.no_grad():
        logits = model(input_ids, segment_ids, input_mask, labels=None)
    # No regression support
    loss = val_loss_fn(logits.view(-1, num_labels), label_ids.view(-1))
    eval_loss += loss.mean().item()   
#     nb_eval_steps += 1
    
    print(np.argmax(logits.detach().cpu().numpy(), axis=1))
    correct += (np.argmax(logits.detach().cpu().numpy(), axis=1) == label_ids.detach().cpu().numpy().flatten()).sum()
    total += len(label_ids)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
[1 1 1 1 1

In [107]:
trainer.validate()

RuntimeError: Expected object of backend CPU but got backend CUDA for argument #3 'index'

In [None]:
torch.save(model.state_dict())