In [1]:
import tensorflow as tf
import torch.cuda

if torch.cuda.is_available():    
    device = torch.device("cuda")
    print(torch.cuda.get_device_name(0))
else:
  print("No GPU available.")
  device = torch.device("cpu")

Tesla P100-PCIE-16GB


In [2]:
!pip install transformers
#AKA huggingface library

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/22/97/7db72a0beef1825f82188a4b923e62a146271ac2ced7928baa4d47ef2467/transformers-2.9.1-py3-none-any.whl (641kB)
[K     |████████████████████████████████| 645kB 2.8MB/s 
[?25hCollecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/3b/88/49e772d686088e1278766ad68a463513642a2a877487decbd691dec02955/sentencepiece-0.1.90-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 15.8MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 20.6MB/s 
Collecting tokenizers==0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/14/e5/a26eb4716523808bb0a799fcfdceb6ebf77a18169d9591b2f46a9adb87d9/tokenizers-0.7.0-cp36-cp36m-manylinux1_x86_64.whl (3.8MB)
[K    

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import pandas as pd
import string
import numpy as np
from tqdm import tqdm

NEW_FILE = 'drive/My Drive/DATA_MINING/DATA/train_snli.csv'
snli_data = pd.read_csv(NEW_FILE)

In [5]:
snli_data.head()


Unnamed: 0.1,Unnamed: 0,gold_label,sentence1,sentence2
0,0,neutral,A person on a horse jumps over a broken down a...,A person is training his horse for a competition.
1,1,contradiction,A person on a horse jumps over a broken down a...,"A person is at a diner, ordering an omelette."
2,2,entailment,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse."
3,3,neutral,Children smiling and waving at camera,They are smiling at their parents
4,4,entailment,Children smiling and waving at camera,There are children present


In [6]:
snli_data.gold_label.unique()

array(['neutral', 'contradiction', 'entailment', '-'], dtype=object)

In [0]:
snli_data = snli_data.drop(snli_data[snli_data.gold_label =="-"].index)
snli_data.reset_index(drop=True, inplace=True)
snli_data.sentence1.fillna('UNKNOWN', inplace=True)
snli_data.sentence2.fillna('UNKNOWN', inplace=True)

In [8]:
snli_data.shape

(549367, 4)

In [9]:
from transformers import BertTokenizer

# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

Loading BERT tokenizer...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




**One-hot encoding labels**

In [10]:
labels = ['neutral', 'contradiction', 'entailment']
temp=[]
def num_labels(data):
  for l in data['gold_label']:
    temp.append(labels.index(l))
  data['NUM_LABEL'] = temp
num_labels(snli_data)

snli_data.head()

Unnamed: 0.1,Unnamed: 0,gold_label,sentence1,sentence2,NUM_LABEL
0,0,neutral,A person on a horse jumps over a broken down a...,A person is training his horse for a competition.,0
1,1,contradiction,A person on a horse jumps over a broken down a...,"A person is at a diner, ordering an omelette.",1
2,2,entailment,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse.",2
3,3,neutral,Children smiling and waving at camera,They are smiling at their parents,0
4,4,entailment,Children smiling and waving at camera,There are children present,2


**Define mean sentence1 + sentence2 size to get an idea of what sequence length to choose as bert input**

In [11]:
max_len = []
sentences1 = snli_data.sentence1.values
sentences2 = snli_data.sentence2.values

for i in tqdm(range(len(sentences1))):

    input_ids1 = tokenizer.encode(sentences1[i], add_special_tokens=True)
    input_ids2 = tokenizer.encode(sentences2[i], add_special_tokens=True)
    max_len.append(len(input_ids1)+ len((input_ids2)))
    

print('Mean sentence length: ', np.mean(max_len), np.percentile(max_len,50))

100%|██████████| 549367/549367 [04:30<00:00, 2030.87it/s]

Mean sentence length:  27.15927603951457 26.0





In [16]:
print(np.percentile(max_len,95))

42.0


**Tokenize the sentences to get BERT inputs**

In [17]:
input_ids = []
attention_masks = []
token_type_ids = []

for k in tqdm(range(snli_data.shape[0])):
    encoded_dict = tokenizer.encode_plus(
                        snli_data["sentence1"][k],  
                        snli_data["sentence2"][k],                    
                        add_special_tokens = True, 
                        max_length = 42,  #We want it lowest as possible but still high enough to be relevant.         
                        pad_to_max_length = True,
                        return_attention_mask = True,  
                        return_tensors = 'pt',  #pytorch tensors  
                   )
    
    
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])
    token_type_ids.append(encoded_dict['token_type_ids'])

# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(snli_data.NUM_LABEL.values)
token_type_ids = torch.cat(token_type_ids, dim=0)


100%|██████████| 549367/549367 [05:08<00:00, 1779.71it/s]


**Observe the results on an example**

In [18]:
print('Original: ', '[CLS]'+snli_data["sentence1"][0] , "[SEP]", snli_data["sentence2"][0])
print('Token IDs:', input_ids[0])
print (attention_masks[0])
print(token_type_ids[0])


Original:  [CLS]A person on a horse jumps over a broken down airplane. [SEP] A person is training his horse for a competition.
Token IDs: tensor([  101,  1037,  2711,  2006,  1037,  3586, 14523,  2058,  1037,  3714,
         2091, 13297,  1012,   102,  1037,  2711,  2003,  2731,  2010,  3586,
         2005,  1037,  2971,  1012,   102,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0])
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])


**Define the training and validation datasets**

In [0]:
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler


batch_size = 128
dataset = TensorDataset(input_ids, attention_masks, labels, token_type_ids)

#Split training data into 80/20 for validation purpose
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
train_dataloader = DataLoader(train_dataset, sampler = RandomSampler(train_dataset), batch_size = batch_size)
validation_dataloader = DataLoader(val_dataset, sampler = SequentialSampler(val_dataset), batch_size = batch_size)

**Create the model : BERT model + a classification layer on top**

In [20]:
from transformers import BertForSequenceClassification, AdamW, BertConfig


model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", #"bert-base-uncased" for the english version
    num_labels = 3, # The number of output labels--3 for our classification task. 
    output_attentions = False, 
    output_hidden_states = False, 
)

#Run the model on the GPU.
model.cuda()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [21]:
!pip install pytorch-lamb

Collecting pytorch-lamb
  Downloading https://files.pythonhosted.org/packages/43/98/3bce14a319317a2856db722f2542d329baf42845fa53563d0d749c5a2d40/pytorch_lamb-1.0.0-py3-none-any.whl
Collecting tensorboardX
[?25l  Downloading https://files.pythonhosted.org/packages/35/f1/5843425495765c8c2dd0784a851a93ef204d314fc87bcc2bbb9f662a3ad1/tensorboardX-2.0-py2.py3-none-any.whl (195kB)
[K     |████████████████████████████████| 204kB 2.6MB/s 
Installing collected packages: tensorboardX, pytorch-lamb
Successfully installed pytorch-lamb-1.0.0 tensorboardX-2.0


**Define the optimizer for our model**

In [0]:
from pytorch_lamb import Lamb 

optimizer = Lamb(model.parameters(),
                  lr = 1e-3,
                  eps = 1e-8,
                  adam = False)

# optimizer = AdamW(model.parameters(),
#                   lr = 5e-5, #learning-rate
#                   eps = 1e-8)


**Create a scheduler to update the learning rate during the training**

In [23]:
from transformers import get_linear_schedule_with_warmup

epochs = 2

num_steps = int(len(train_dataloader) * epochs)
print("Steps :", num_steps)
# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = int(num_steps/100),
                                            num_training_steps = num_steps-(int(num_steps/100)))

Steps : 6868


**Define the metrics we want to use**

In [0]:
# Function to calculate the accuracy of our predictions vs labels
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

def flat_accuracy(labels, preds):
    labels_flat = labels.flatten()
    return np.sum(preds == labels_flat) / len(labels_flat)


In [0]:
import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))


# **Model training**

In [31]:
import random
import numpy as np

# This training code is based on Chris McCormick's script which is based on "Transformers" example.

# Set the seed value all over the place to make this reproducible.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

training_stats = []

# Measure the total training time for the whole run.
total_t0 = time.time()

# For each epoch...
for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    
    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')


    # Measure how long the training epoch takes.
    t0 = time.time()
    acc = 0
    # Reset the total loss for this epoch.
    total_train_loss = 0

    model.train()

    for step, batch in enumerate(train_dataloader):

        # Progress update every 200 batches.
        if step % 200 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)          
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}. '.format(step, len(train_dataloader), elapsed))

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        b_token_type_ids = batch[3].to(device)

        
        model.zero_grad()        

        loss, logits = model(b_input_ids, 
                             token_type_ids=b_token_type_ids, 
                             attention_mask=b_input_mask, 
                             labels=b_labels)

        total_train_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()
        scheduler.step()




    avg_train_loss = total_train_loss / len(train_dataloader)            
    
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))
        
    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()

    model.eval()

    total_eval_accuracy = 0
    total_eval_precision = 0
    total_eval_recall = 0
    total_eval_f1 = 0
    total_eval_loss = 0
    

    for batch in validation_dataloader:
    
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        b_token_type_ids = batch[3].to(device)
        
        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():        

            # Forward pass, calculate logit predictions.
            (loss, logits) = model(b_input_ids, 
                                   token_type_ids=b_token_type_ids, 
                                   attention_mask=b_input_mask,
                                   labels=b_labels)
            
        # Accumulate the validation loss.
        total_eval_loss += loss.item()


        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        y_pred = np.argmax(logits, axis=1).flatten()
        # Calculate the metrics for this batch of test sentences, and
        # accumulate it over all batches.
        acc = flat_accuracy(label_ids, y_pred)
        prec = precision_score(label_ids, y_pred, average='micro')
        recall = recall_score(label_ids, y_pred, average='micro')

        total_eval_accuracy += acc
        total_eval_precision += prec
        total_eval_recall += recall


    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    avg_val_precision = total_eval_precision / len(validation_dataloader)
    avg_val_recall = total_eval_recall / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
    print("  Precision: {0:.2f}".format(avg_val_precision))
    print("  Recall: {0:.2f}".format(avg_val_recall))

    avg_val_loss = total_eval_loss / len(validation_dataloader)
    
    validation_time = format_time(time.time() - t0)
    
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...
  Batch   200  of  3,434.    Elapsed: 0:01:43. 
  Batch   400  of  3,434.    Elapsed: 0:03:27. 
  Batch   600  of  3,434.    Elapsed: 0:05:10. 
  Batch   800  of  3,434.    Elapsed: 0:06:53. 
  Batch 1,000  of  3,434.    Elapsed: 0:08:36. 
  Batch 1,200  of  3,434.    Elapsed: 0:10:20. 
  Batch 1,400  of  3,434.    Elapsed: 0:12:03. 
  Batch 1,600  of  3,434.    Elapsed: 0:13:46. 
  Batch 1,800  of  3,434.    Elapsed: 0:15:29. 
  Batch 2,000  of  3,434.    Elapsed: 0:17:12. 
  Batch 2,200  of  3,434.    Elapsed: 0:18:56. 
  Batch 2,400  of  3,434.    Elapsed: 0:20:39. 
  Batch 2,600  of  3,434.    Elapsed: 0:22:22. 
  Batch 2,800  of  3,434.    Elapsed: 0:24:06. 
  Batch 3,000  of  3,434.    Elapsed: 0:25:49. 
  Batch 3,200  of  3,434.    Elapsed: 0:27:32. 
  Batch 3,400  of  3,434.    Elapsed: 0:29:16. 

  Average training loss: 0.34
  Training epoch took: 0:29:33

Running Validation...
  Accuracy: 0.88
  Precision: 0.88
  Recall: 0.88
  Validation Loss: 0.32
  Validation