The Multilingual Cased (New) model also fixes normalization issues in many languages, so it is recommended in languages with non-Latin alphabets (and is often better for most languages with Latin alphabets). When using this model, make sure to pass **--do_lower_case=false** to run_pretraining.py and other scripts.

In [3]:
import os
import pandas
import numpy as np


In [4]:
# Mount Google Drive inside the Colab VM so the dataset under "My Drive" can be
# read through the regular filesystem at /content/drive/.
from google.colab import drive
# force_remount=True asks Colab to mount again even when a mount already exists.
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


In [5]:
# Root folder of the WiC dataset on the mounted Drive; every split path used by
# the loading cells is built relative to this directory.
wic_path  = "/content/drive/My Drive/datasets/WiC_dataset"


In [6]:
# Sanity check: list the dataset folder to confirm the Drive mount worked and
# that the train/dev/test sub-folders are present.
! ls "/content/drive/My Drive/datasets/WiC_dataset"

dev  README.txt  test  train


In [7]:
# Load the WiC training split. The data file holds one tab-separated example
# per row; the gold file holds the matching label for each row.
wic_train_path_d = os.path.join(wic_path, "train/train.data.txt")
wic_train_path_g = os.path.join(wic_path, "train/train.gold.txt")

train_raw_columns = ['word', 'pos', 'position1', 'sent1', 'sent2']
train_data_df = pandas.read_csv(wic_train_path_d, sep="\t", names=train_raw_columns)
train_gold_df = pandas.read_csv(wic_train_path_g, sep="\t", names=['label'])

# The raw position field has the form "i-j": the target word's index in sent1
# and in sent2. Split it into two columns; the values remain strings.
train_position_parts = train_data_df['position1'].str.split("-", expand=True)
train_data_df['position1'] = train_position_parts[0]
train_data_df['position2'] = train_position_parts[1]
train_data_df = train_data_df[['word', 'pos', 'position1', 'position2', 'sent1', 'sent2']]

train_data_df.head()

Unnamed: 0,word,pos,position1,position2,sent1,sent2
0,carry,V,2,1,You must carry your camping gear .,Sound carries well over water .
1,go,V,2,6,Messages must go through diplomatic channels .,Do you think the sofa will go through the door ?
2,break,V,0,2,Break an alibi .,The wholesaler broke the container loads into ...
3,cup,N,8,4,He wore a jock strap with a metal cup .,Bees filled the waxen cups with honey .
4,academy,N,1,2,The Academy of Music .,The French Academy .


In [8]:
# Load the WiC dev split with the same layout as the training split: a
# tab-separated data file plus a one-label-per-row gold file.
wic_dev_path_d = os.path.join(wic_path, "dev/dev.data.txt")
wic_dev_path_g = os.path.join(wic_path, "dev/dev.gold.txt")

dev_raw_columns = ['word', 'pos', 'position1', 'sent1', 'sent2']
dev_data_df = pandas.read_csv(wic_dev_path_d, sep="\t", names=dev_raw_columns)
dev_gold_df = pandas.read_csv(wic_dev_path_g, sep="\t", names=['label'])

# Split the "i-j" position field into the target word's index in sent1 and in
# sent2; the values remain strings.
dev_position_parts = dev_data_df['position1'].str.split("-", expand=True)
dev_data_df['position1'] = dev_position_parts[0]
dev_data_df['position2'] = dev_position_parts[1]
dev_data_df = dev_data_df[['word', 'pos', 'position1', 'position2', 'sent1', 'sent2']]

dev_data_df.head()

Unnamed: 0,word,pos,position1,position2,sent1,sent2
0,board,N,2,2,Room and board .,He nailed boards across the windows .
1,circulate,V,0,4,Circulate a rumor .,This letter is being circulated among the facu...
2,hook,V,0,1,Hook a fish .,"He hooked a snake accidentally , and was so sc..."
3,recreation,N,1,9,For recreation he wrote poetry and solved cros...,Drug abuse is often regarded as a form of recr...
4,domesticity,N,4,6,Making a hobby of domesticity .,A royal family living in unpretentious domesti...


In [9]:
# Show which GPU (if any) this runtime was given, plus driver/CUDA versions and
# current memory use.
!nvidia-smi

Thu Oct 15 04:35:13 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.23.05    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   46C    P8     9W /  70W |      0MiB / 15079MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [10]:
import torch

# Choose the compute device once: CUDA when PyTorch can see a GPU, else the CPU.
# Every later `.to(device)` call relies on this variable.
gpu_is_available = torch.cuda.is_available()
device = torch.device("cuda" if gpu_is_available else "cpu")

if gpu_is_available:
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [11]:
# Install HuggingFace transformers into the Colab runtime.
# NOTE(review): the version is not pinned; the recorded run below resolved to
# transformers 3.3.1, so a fresh run may pick up a newer, differently-behaving release.
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/19/22/aff234f4a841f8999e68a7a94bdd4b60b4cebcfeca5d67d61cd08c9179de/transformers-3.3.1-py3-none-any.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 4.7MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 24.2MB/s 
Collecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 39.8MB/s 
Collecting tokenizers==0.8.1.rc2
[?25l  Downloading https://files.pythonhosted.org/packages/80/83/8b9fccb9e48eeb575ee19179e2bdde0ee9a1904f97de5f02d19016b8804f/tokenizers-0.8.1rc2-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K 

In [12]:
from transformers import BertTokenizerFast

# Load the BERT tokenizer.
# The cased multilingual checkpoint is used, so do_lower_case=False keeps the
# original casing of the text, as the note at the top of the notebook advises.
# The "Fast" tokenizer class is the one later called with
# return_offsets_mapping=True to get per-token character spans.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizerFast.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)

Loading BERT tokenizer...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=995526.0, style=ProgressStyle(descripti…




In [13]:
# Extract both sentences of every pair as plain Python lists of str, which is
# the form later handed to the tokenizer.
train_sentences_1 = train_data_df['sent1'].tolist()
train_sentences_2 = train_data_df['sent2'].tolist()
print(len(train_sentences_1))

dev_sentences_1 = dev_data_df['sent1'].tolist()
dev_sentences_2 = dev_data_df['sent2'].tolist()
print(len(dev_sentences_1))

5428
638


In [14]:
def _char_offset_of_word(sentence, word_index):
  """Return the character offset at which whitespace-token `word_index` starts.

  The offset is the summed length of all preceding tokens plus one separator
  character per token. If `word_index` is past the last token, the total over
  all tokens is returned.
  """
  offset = 0
  for idx, token in enumerate(sentence.split()):
    if idx == word_index:
      return offset
    offset += len(token) + 1
  return offset


# Character start offset of the target word in each training sentence.
train_offsets_1 = [
    _char_offset_of_word(sentence, int(position))
    for sentence, position in zip(train_sentences_1, train_data_df['position1'])
]
train_offsets_2 = [
    _char_offset_of_word(sentence, int(position))
    for sentence, position in zip(train_sentences_2, train_data_df['position2'])
]

print(len(train_sentences_2))
print(len(train_offsets_2))

5428
5428


In [15]:
# Encode every (sent1, sent2) pair as one padded tensor batch. The offset
# mapping (per-token character spans) is requested because the next cell uses
# it to find the word-piece index of each target word.
pair_encoding_options = dict(padding=True, truncation=True, return_tensors='pt',
                             return_offsets_mapping=True)

encoded_inputs_train = tokenizer(train_sentences_1, train_sentences_2, **pair_encoding_options)
print(len(encoded_inputs_train['input_ids'][0]))
print(type(encoded_inputs_train['input_ids']))
# Inspect one example: the raw sentences, then its offsets and token ids.
print(train_sentences_1[3])
print(train_sentences_2[3])
print((encoded_inputs_train['offset_mapping'][3]))
print((encoded_inputs_train['input_ids'][3]))

encoded_inputs_dev = tokenizer(dev_sentences_1, dev_sentences_2, **pair_encoding_options)
print(len(encoded_inputs_dev['input_ids'][0]))
print(type(encoded_inputs_dev['input_ids']))

70
<class 'torch.Tensor'>
He wore a jock strap with a metal cup .
Bees filled the waxen cups with honey .
tensor([[ 0,  0],
        [ 0,  2],
        [ 3,  7],
        [ 8,  9],
        [10, 13],
        [13, 14],
        [15, 18],
        [18, 20],
        [21, 25],
        [26, 27],
        [28, 33],
        [34, 37],
        [38, 39],
        [ 0,  0],
        [ 0,  4],
        [ 5, 11],
        [12, 15],
        [16, 18],
        [18, 21],
        [22, 25],
        [25, 26],
        [27, 31],
        [32, 35],
        [35, 37],
        [38, 39],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,  0],
        [ 0,

In [16]:
def find_target_token_positions(input_ids, offset_mapping, target_words_1, target_words_2, sep_token_id):
  """Locate the first word-piece of each target word in encoded sentence pairs.

  Each row is expected to look like `[CLS] sent1 [SEP] sent2 [SEP] [PAD]...`.
  Whitespace-separated words are counted from the character offsets: a token
  starts a new word when its start offset lies beyond the previous token's end
  (i.e. there is a gap in the text), while continuation pieces and attached
  punctuation share the word index of the token before them.

  Args:
    input_ids: LongTensor (num_examples, seq_len) of token ids.
    offset_mapping: tensor (num_examples, seq_len, 2) of (start, end) character
      spans, each relative to the sentence the token belongs to.
    target_words_1: sequence of int, 0-based word index of the target in sent1.
    target_words_2: sequence of int, 0-based word index of the target in sent2.
    sep_token_id: id of the separator token that ends each sentence.

  Returns:
    Tuple `(positions_1, positions_2)` of LongTensors with one token index per
    example, pointing at the first word-piece of the target word.

  Raises:
    ValueError: if a target word cannot be found in its sentence.
  """
  # Convert once to nested Python lists; indexing tensors token by token and
  # calling .numpy() on each element is very slow.
  all_token_ids = input_ids.tolist()
  all_offsets = offset_mapping.tolist()

  positions_1 = []
  positions_2 = []
  for row, (token_ids, offsets) in enumerate(zip(all_token_ids, all_offsets)):
    targets = (int(target_words_1[row]), int(target_words_2[row]))
    found = [-1, -1]          # token index of the target in sent1 / sent2
    segment = 0               # 0 while scanning sent1, 1 while scanning sent2
    word_index = 0
    prev_end = 0
    at_segment_start = True   # the first token of a sentence is always word 0

    # Index 0 is the [CLS] token, so scanning starts at 1.
    for i in range(1, len(token_ids)):
      if token_ids[i] == sep_token_id:
        if segment == 1:
          break               # end of sent2; only padding can follow
        # End of sent1: restart the word count for sent2.
        segment, word_index, prev_end, at_segment_start = 1, 0, 0, True
        continue

      start, end = offsets[i]
      if at_segment_start:
        at_segment_start = False
      elif start > prev_end:
        # A gap between tokens means whitespace, i.e. a new word begins here.
        word_index += 1
      prev_end = end

      if found[segment] < 0 and word_index == targets[segment]:
        found[segment] = i
        if segment == 1:
          break               # both targets located

    if found[0] < 0 or found[1] < 0:
      raise ValueError(
          "Could not locate target words {} in example {} (found token positions {})".format(
              targets, row, tuple(found)))
    positions_1.append(found[0])
    positions_2.append(found[1])

  return torch.LongTensor(positions_1), torch.LongTensor(positions_2)


# Word-piece indices of the words of interest, for the training and dev splits.
train_pos_1, train_pos_2 = find_target_token_positions(
    encoded_inputs_train['input_ids'],
    encoded_inputs_train['offset_mapping'],
    train_data_df['position1'].astype(int).tolist(),
    train_data_df['position2'].astype(int).tolist(),
    tokenizer.sep_token_id,
)

dev_pos_1, dev_pos_2 = find_target_token_positions(
    encoded_inputs_dev['input_ids'],
    encoded_inputs_dev['offset_mapping'],
    dev_data_df['position1'].astype(int).tolist(),
    dev_data_df['position2'].astype(int).tolist(),
    tokenizer.sep_token_id,
)

print(len(train_pos_1))
print(len(dev_pos_2))

# One row per example: (token index in sent1, token index in sent2).
train_pos = torch.stack((train_pos_1, train_pos_2), dim =1)
dev_pos = torch.stack((dev_pos_1, dev_pos_2), dim=1)
print((train_pos.size()))

5428
638
torch.Size([5428, 2])


In [17]:
# Convert the gold 'T'/'F' strings into integer labels (T -> 1, F -> 0).
# The mapping and the int64 cast are explicit so the result never depends on
# pandas silently downcasting an object column; any value other than 'T'/'F'
# becomes NaN and makes the cast raise instead of slipping through.
label_to_id = {'F' : 0, 'T' : 1}

train_gold_df_tmp = train_gold_df.assign(
    label=train_gold_df['label'].map(label_to_id).astype('int64'))
# .values on the single-column frame gives shape (num_examples, 1), matching
# the one-logit-per-example output of the classifier.
train_labels = torch.from_numpy(train_gold_df_tmp.values)
print(train_labels)

dev_gold_df_tmp = dev_gold_df.assign(
    label=dev_gold_df['label'].map(label_to_id).astype('int64'))
dev_labels = torch.from_numpy(dev_gold_df_tmp.values)
print(len(dev_labels))

tensor([[0],
        [0],
        [0],
        ...,
        [1],
        [1],
        [1]])
638


In [18]:
from torch.utils.data import TensorDataset, random_split

# Bundle the per-example tensors into datasets. Each item is the 5-tuple
# (input_ids, token_type_ids, attention_mask, target positions, label), which
# is the order the training loop unpacks by index.
encoding_fields = ('input_ids', 'token_type_ids', 'attention_mask')

train_encoded_tensors = [encoded_inputs_train[field] for field in encoding_fields]
train_dataset = TensorDataset(*train_encoded_tensors, train_pos, train_labels)

dev_encoded_tensors = [encoded_inputs_dev[field] for field in encoding_fields]
dev_dataset = TensorDataset(*dev_encoded_tensors, dev_pos, dev_labels)


In [19]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# Batch size shared by training and evaluation. For fine-tuning BERT on a
# specific task, the authors recommend a batch size of 16 or 32.
batch_size = 16

# Training examples are drawn in random order.
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler = train_sampler, batch_size = batch_size)

# Order is irrelevant for validation, so the dev set is read sequentially.
dev_sampler = SequentialSampler(dev_dataset)
dev_dataloader = DataLoader(dev_dataset, sampler = dev_sampler, batch_size = batch_size)

**CUTOFF**

In [20]:
from transformers import BertModel, AdamW, BertConfig
import torch.nn as nn

class BERTi(nn.Module):
    """Pretrained BERT encoder that returns only the last layer's hidden states.

    Args:
        options_name: name of the pretrained checkpoint to load.
        output_hidden_states: forwarded to `BertModel.from_pretrained`; the
            forward pass returns the last layer either way.
    """

    def __init__(self, options_name = "bert-base-multilingual-cased", output_hidden_states = False):
        super(BERTi, self).__init__()

        self.encoder = BertModel.from_pretrained(options_name, output_hidden_states = output_hidden_states)

    def forward(self, input_ids, token_type_ids, attention_mask):
        """Encode a batch and return the per-token activations of the final layer.

        The three arguments are passed to the encoder unchanged as keyword
        arguments of the same name.
        """
        outputs = self.encoder(input_ids = input_ids, token_type_ids = token_type_ids, attention_mask = attention_mask)

        # Take element 0 instead of tuple-unpacking: the last hidden state is
        # first whether the encoder returns a plain tuple or a dict-style model
        # output, and regardless of how many extra outputs are enabled.
        return outputs[0]

In [21]:
class Logistic_Reg(nn.Module):
  """Single linear layer producing one logit per example.

  No sigmoid is applied here; the notebook's loss (BCE-with-logits) expects raw
  logits.

  Args:
    input_dim: size of each input vector. The default 1536 is two concatenated
      768-dimensional BERT token vectors (one per target word).
  """

  def __init__(self, input_dim = 1536):
    super(Logistic_Reg, self).__init__()

    self.fc1 = nn.Linear(input_dim, 1)

  def forward(self, x):
    """Map inputs of shape (batch, input_dim) to logits of shape (batch, 1)."""
    return self.fc1(x)

In [22]:
# Build both models and move them to the selected device.
model_bert = BERTi().to(device)
model_log_reg = Logistic_Reg().to(device)

# All of the BERT model's parameters as a list of (name, tensor) tuples.
params = list(model_bert.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

# Print the name and shape of a few representative slices of the parameter list.
parameter_sections = [
    ('==== Embedding Layer ====\n', params[0:5]),
    ('\n==== First Transformer ====\n', params[5:21]),
    ('\n==== Output Layer ====\n', params[-4:]),
]
for heading, section in parameter_sections:
    print(heading)
    for param_name, param_tensor in section:
        print("{:<55} {:>12}".format(param_name, str(tuple(param_tensor.size()))))

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=625.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=714314041.0, style=ProgressStyle(descri…


The BERT model has 199 different named parameters.

==== Embedding Layer ====

encoder.embeddings.word_embeddings.weight               (119547, 768)
encoder.embeddings.position_embeddings.weight             (512, 768)
encoder.embeddings.token_type_embeddings.weight             (2, 768)
encoder.embeddings.LayerNorm.weight                           (768,)
encoder.embeddings.LayerNorm.bias                             (768,)

==== First Transformer ====

encoder.encoder.layer.0.attention.self.query.weight       (768, 768)
encoder.encoder.layer.0.attention.self.query.bias             (768,)
encoder.encoder.layer.0.attention.self.key.weight         (768, 768)
encoder.encoder.layer.0.attention.self.key.bias               (768,)
encoder.encoder.layer.0.attention.self.value.weight       (768, 768)
encoder.encoder.layer.0.attention.self.value.bias             (768,)
encoder.encoder.layer.0.attention.output.dense.weight     (768, 768)
encoder.encoder.layer.0.attention.output.dense.bias          

In [43]:
# Quick check of what `.parameters()` returns: the recorded output shows it is a
# generator, which is why the optimizer cell wraps it in list() before
# concatenating the two models' parameters.
print(type(model_bert.parameters()))

<class 'generator'>


In [44]:
# One optimizer updates both the BERT encoder and the linear head.
# `.parameters()` yields generators, so each is materialised before joining.
trainable_parameters = list(model_bert.parameters()) + list(model_log_reg.parameters())

optimizer = AdamW(
    trainable_parameters,
    lr = 1e-5,   # args.learning_rate - default is 5e-5, our notebook had 1e-5
    eps = 1e-8,  # args.adam_epsilon  - default is 1e-8.
)


In [45]:
def loss_fn(output, targets):
  """Mean binary cross-entropy between raw logits `output` and 0/1 `targets`.

  The sigmoid is applied inside the loss, so `output` must be unnormalised
  logits; `targets` must be a float tensor of the same shape.
  """
  criterion = nn.BCEWithLogitsLoss(reduction='mean')
  return criterion(output, targets)

In [46]:
from transformers import get_linear_schedule_with_warmup

# Number of passes over the training data. The BERT authors recommend between
# 2 and 4; 4 is used here, which may over-fit the training data.
epochs = 4

# The schedule is defined over optimizer steps, i.e. batches, not samples:
# total steps = [number of batches per epoch] x [number of epochs].
steps_per_epoch = len(train_dataloader)
total_steps = steps_per_epoch * epochs

# Linear decay of the learning rate with no warm-up phase
# (num_warmup_steps = 0 is the default value in run_glue.py).
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps = 0,
    num_training_steps = total_steps,
)

In [47]:
import numpy as np

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    """Accuracy for multi-class scores.

    Args:
        preds: array of shape (num_examples, num_classes) holding per-class scores.
        labels: array of integer class ids with num_examples elements (any shape).

    Returns:
        Fraction of examples whose arg-max class equals the label.
    """
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [48]:
def flat_accuracy_single_logit(preds, labels):
    """Accuracy for a binary classifier that emits one logit per example.

    A logit strictly greater than 0 counts as a positive prediction. Both
    arrays are flattened before comparison, so (N, 1) and (N,) inputs work.

    Returns:
        Fraction of examples whose thresholded prediction equals the label.
    """
    predicted_positive = (preds > 0).reshape(-1)
    true_labels = labels.reshape(-1)
    num_correct = np.sum(predicted_positive == true_labels)
    return num_correct / len(true_labels)

In [49]:
import time
import datetime

def format_time(elapsed):
    '''
    Render a duration given in seconds as an h:mm:ss string.

    The value is rounded to the nearest whole second first; durations of a day
    or more get the "N day(s), " prefix that `datetime.timedelta` produces.
    '''
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [50]:
import random
import numpy as np

# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128


def _forward_batch(batch):
    """Run one dataloader batch through BERT and the linear head.

    The batch is the 5-tuple produced by the TensorDatasets:
    (input_ids, token_type_ids, attention_mask, target positions, labels).
    Every tensor is copied to `device` first.

    Returns:
        (logits, labels): one logit per example, plus the labels on `device`.
    """
    b_input_ids = batch[0].to(device)
    b_token_type_ids = batch[1].to(device)
    b_attention_mask = batch[2].to(device)
    b_poses = batch[3].to(device)
    b_labels = batch[4].to(device)

    last_layer = model_bert(b_input_ids, b_token_type_ids, b_attention_mask)

    # b_poses holds two token indices per example. Repeat each index across the
    # hidden dimension so `gather` picks out the full activation vector of both
    # target tokens; the hidden size is read from the activations themselves.
    hidden_size = last_layer.size(-1)
    gather_index = b_poses.unsqueeze(-1).repeat(1, 1, hidden_size)
    gathered_activations = torch.gather(last_layer, 1, gather_index)

    # Concatenate the two target vectors into one feature vector per example.
    logits = model_log_reg(gathered_activations.view(gathered_activations.size()[0], -1))
    return logits, b_labels


def _evaluate(dataloader):
    """Score the current models on every batch of `dataloader`.

    Both models are switched to evaluation mode and no gradients are tracked.

    Returns:
        (accuracy, average loss): accuracy over all examples, and the loss
        averaged over batches.
    """
    model_bert.eval()
    model_log_reg.eval()

    total_loss = 0
    collected_labels = []
    collected_logits = []

    for batch in dataloader:
        # No compute graph is needed because nothing is back-propagated here.
        with torch.no_grad():
            logits, b_labels = _forward_batch(batch)
            loss = loss_fn(logits, b_labels.type_as(logits))

        total_loss += loss.item()
        collected_logits.append(logits.cpu().detach().numpy())
        collected_labels.append(b_labels.cpu().numpy())

    all_labels = np.concatenate(collected_labels, axis=0)
    all_logits = np.concatenate(collected_logits, axis=0)
    accuracy = flat_accuracy_single_logit(all_logits, all_labels)
    return accuracy, total_loss / len(dataloader)


# We'll store a number of quantities such as training and validation loss,
# validation accuracy, and timings.
training_stats = []

# Gradients are clipped jointly over the encoder and the linear head, i.e. over
# exactly the parameters the optimizer updates.
clipped_parameters = list(model_bert.parameters()) + list(model_log_reg.parameters())

# Measure the total training time for the whole run.
total_t0 = time.time()

for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_train_loss = 0

    # `train()` only switches the mode (dropout etc. behave differently during
    # training and evaluation); it does not perform any training itself.
    model_bert.train()
    model_log_reg.train()

    for step, batch in enumerate(train_dataloader):

        # Progress update every 50 batches.
        if step % 50 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Gradients accumulate across backward passes by default, so clear
        # them before each new one.
        optimizer.zero_grad()

        logits, b_labels = _forward_batch(batch)
        loss = loss_fn(logits, b_labels.type_as(logits))

        # `loss` is a single-value tensor; `.item()` extracts the Python number
        # so the epoch average can be computed afterwards.
        total_train_loss += loss.item()

        # Backward pass to compute the gradients.
        loss.backward()

        # Clip the gradient norm to 1.0 to help prevent exploding gradients.
        torch.nn.utils.clip_grad_norm_(clipped_parameters, 1.0)

        # Apply the update computed from the (clipped) gradients.
        optimizer.step()

        # The learning-rate schedule is deliberately left disabled here, so the
        # optimizer keeps its initial learning rate for the whole run.
        # scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)

    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))

    # ========================================
    #               Validation
    # ========================================
    # After each training epoch, measure performance on the dev set.

    print("")
    print("Running Validation...")

    t0 = time.time()
    avg_val_accuracy, avg_val_loss = _evaluate(dev_dataloader)
    print("  Accuracy: {0:.4f}".format(avg_val_accuracy))

    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.3f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

    # ========================================
    #      Evaluation on the training set
    # ========================================
    # Same measurement on the training data, to compare against the dev
    # numbers. The timer is restarted so only this pass is reported.

    t0 = time.time()
    train_eval_accuracy, train_eval_loss = _evaluate(train_dataloader)
    print("  Accuracy: {0:.4f}".format(train_eval_accuracy))

    train_eval_time = format_time(time.time() - t0)

    print("  Train Avg Loss: {0:.3f}".format(train_eval_loss))
    print("  Training Validation took: {:}".format(train_eval_time))


print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...
  Batch    50  of    340.    Elapsed: 0:00:07.
  Batch   100  of    340.    Elapsed: 0:00:14.
  Batch   150  of    340.    Elapsed: 0:00:21.
  Batch   200  of    340.    Elapsed: 0:00:29.
  Batch   250  of    340.    Elapsed: 0:00:36.
  Batch   300  of    340.    Elapsed: 0:00:43.

  Average training loss: 0.61
  Training epcoh took: 0:00:48

Running Validation...
  Accuracy: 0.6646
  Validation Loss: 0.647
  Validation took: 0:00:01
  Accuracy: 0.8128
  Train Avg Loss: 0.434
  Training Validation took: 0:00:13

Training...
  Batch    50  of    340.    Elapsed: 0:00:07.
  Batch   100  of    340.    Elapsed: 0:00:14.
  Batch   150  of    340.    Elapsed: 0:00:21.
  Batch   200  of    340.    Elapsed: 0:00:29.
  Batch   250  of    340.    Elapsed: 0:00:36.
  Batch   300  of    340.    Elapsed: 0:00:43.

  Average training loss: 0.45
  Training epcoh took: 0:00:48

Running Validation...
  Accuracy: 0.6693
  Validation Loss: 0.640
  Validation took: 0:00:01
  Accuracy: 0.8867
