<a href="https://colab.research.google.com/github/feixiongzhang/ChatterBot/blob/master/DydraNet_TPU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Install Pre-trained BERT model

In [1]:
# install
!pip install pytorch-pretrained-bert pytorch-nlp

Collecting pytorch-pretrained-bert
[?25l  Downloading https://files.pythonhosted.org/packages/d7/e0/c08d5553b89973d9a240605b9c12404bcf8227590de62bae27acbcfe076b/pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123kB)
[K     |████████████████████████████████| 133kB 2.8MB/s 
[?25hCollecting pytorch-nlp
[?25l  Downloading https://files.pythonhosted.org/packages/4f/51/f0ee1efb75f7cc2e3065c5da1363d6be2eec79691b2821594f3f2329528c/pytorch_nlp-0.5.0-py3-none-any.whl (90kB)
[K     |████████████████████████████████| 92kB 6.1MB/s 
Installing collected packages: pytorch-pretrained-bert, pytorch-nlp
Successfully installed pytorch-nlp-0.5.0 pytorch-pretrained-bert-0.6.2


Import Settings

In [2]:
# import settings
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt

# specify GPU device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

Using TensorFlow backend.


'Tesla K80'

Load Data

In [0]:
# reference: https://towardsdatascience.com/bert-for-dummies-step-by-step-tutorial-fb90890ffe03
df_train = pd.read_csv('/content/drive/My Drive/HydraNet/SelectionData/train_data.csv', encoding='utf-8')
df_dev = pd.read_csv('/content/drive/My Drive/HydraNet/SelectionData/dev_data.csv', encoding='utf-8')
df_test = pd.read_csv('/content/drive/My Drive/HydraNet/SelectionData/test_data.csv', encoding='utf-8')
# add [CLS] and [SEP] for each sentence
df_train['QuestionColumn'] = '[CLS] ' + df_train['Column'].astype(str) + ' [SEP] ' + df_train['Question'].astype(str) + ' [SEP]'
df_dev['QuestionColumn'] = '[CLS] ' + df_dev['Column'].astype(str) + ' [SEP] ' + df_dev['Question'].astype(str) + ' [SEP]'
df_test['QuestionColumn'] = '[CLS] ' + df_test['Column'].astype(str) + ' [SEP] ' + df_test['Question'].astype(str) + ' [SEP]'

Verify GPU Availability

Use BERT for Preparing Input Data

In [4]:
# use large uncased bert which has 340M parameters to tokenize the question-column pairs
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased', do_lower_case=True)
train_sentences = df_train['QuestionColumn'].tolist()
train_tokenized = [tokenizer.tokenize(sentence) for sentence in train_sentences]
dev_sentences = df_dev['QuestionColumn'].tolist()
dev_tokenized = [tokenizer.tokenize(sentence) for sentence in dev_sentences]
test_sentences = df_test['QuestionColumn'].tolist()
test_tokenized = [tokenizer.tokenize(sentence) for sentence in test_sentences]
# For each tokenized sentence, BERT requires input ids, a sequence of integers 
# identifying each input token to its index number in the BERT tokenizer vocabulary.
MAX_LEN = 128
train_input_ids = [tokenizer.convert_tokens_to_ids(x) for x in train_tokenized]
train_input_ids = pad_sequences(train_input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
dev_input_ids = [tokenizer.convert_tokens_to_ids(x) for x in dev_tokenized]
dev_input_ids = pad_sequences(dev_input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
test_input_ids = [tokenizer.convert_tokens_to_ids(x) for x in test_tokenized]
test_input_ids = pad_sequences(test_input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
# Create the attention masks to indicate if the element is token (1) or padding (0)
train_attention_masks = []
for seq in train_input_ids:
  seq_mask = [float(i > 0) for i in seq]
  train_attention_masks.append(seq_mask)
dev_attention_masks = []
for seq in dev_input_ids:
  seq_mask = [float(i > 0) for i in seq]
  dev_attention_masks.append(seq_mask)
test_attention_masks = []
for seq in test_input_ids:
  seq_mask = [float(i > 0) for i in seq]
  test_attention_masks.append(seq_mask)
train_labels = df_train['Label'].tolist()
dev_labels = df_dev['Label'].tolist()
test_labels = df_test['Label'].tolist()
print(train_tokenized[5])
print(train_input_ids[5])
print(train_attention_masks[5])
print(train_labels[5])

100%|██████████| 231508/231508 [00:00<00:00, 5545044.54B/s]


['[CLS]', 'notes', '[SEP]', 'tell', 'me', 'what', 'the', 'notes', 'are', 'for', 'south', 'australia', '[SEP]']
[ 101 3964  102 2425 2033 2054 1996 3964 2024 2005 2148 2660  102    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0]
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.

Prepare the train, dev and test sets

In [0]:
# convert data into torch tensors
train_input_ids = torch.tensor(train_input_ids)
train_attention_masks = torch.tensor(train_attention_masks)
train_labels = torch.tensor(train_labels)
dev_input_ids = torch.tensor(dev_input_ids)
dev_attention_masks = torch.tensor(dev_attention_masks)
dev_labels = torch.tensor(dev_labels)
test_input_ids = torch.tensor(test_input_ids)
test_attention_masks = torch.tensor(test_attention_masks)
test_labels = torch.tensor(test_labels)

batch_size = 8
# Create an iterator of our data with torch DataLoader
train_data = TensorDataset(train_input_ids, train_attention_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
validation_data = TensorDataset(dev_input_ids, dev_attention_masks, dev_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

Fine-Tuning BERT for Sentence Pair Classification

In [6]:
# we only have two classes, 0 or 1
model = BertForSequenceClassification.from_pretrained("bert-large-uncased", num_labels=2)
# send model to GPU
model.cuda()
# BERT fine-tuning parameters
param_optimizer = list(model.named_parameters())

100%|██████████| 407873900/407873900 [00:07<00:00, 53859729.61B/s]


In [7]:
# don't allow weight decay for bias, gamma and beta
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=2e-5,
                     warmup=.1)

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)
  
# Store our loss and accuracy for plotting
train_loss_set = []
# Number of training epochs 
epochs = 4

t_total value of -1 results in schedule not being applied


Training

In [8]:
# BERT training loop
for each_epoch in trange(epochs, desc="Epoch"):  
  
  ## TRAINING
  
  # Set our model to training mode
  model.train()  
  # Tracking variables
  tr_loss = 0
  nb_tr_examples, nb_tr_steps = 0, 0
  # Train the data for one epoch
  for step, batch in enumerate(train_dataloader):
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    # Clear out the gradients (by default they accumulate)
    optimizer.zero_grad()
    # Forward pass
    loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
    train_loss_set.append(loss.item())    
    # Backward pass
    loss.backward()
    # Update parameters and take a step using the computed gradient
    optimizer.step()
    # Update tracking variables
    tr_loss += loss.item()
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1
  print("Train loss: {}".format(tr_loss/nb_tr_steps))
       
  ## VALIDATION

  # Put model in evaluation mode
  model.eval()
  # Tracking variables 
  eval_loss, eval_accuracy = 0, 0
  nb_eval_steps, nb_eval_examples = 0, 0
  # Evaluate data for one epoch
  for batch in validation_dataloader:
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    # Telling the model not to compute or store gradients, saving memory and speeding up validation
    with torch.no_grad():
      # Forward pass, calculate logit predictions
      logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)    
    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    tmp_eval_accuracy = flat_accuracy(logits, label_ids)    
    eval_accuracy += tmp_eval_accuracy
    nb_eval_steps += 1
  print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))

# plot training performance
plt.figure(figsize=(15,8))
plt.title("Training loss")
plt.xlabel("Batch")
plt.ylabel("Loss")
plt.plot(train_loss_set)
plt.show()

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

KeyboardInterrupt: ignored