# Phase 1: Baseline model

In [1]:
# basic reqs
import pandas as pd
import torch
import numpy as np
import torch.nn.functional as F

# pre processing reqs 
from transformers import BertTokenizer
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

# model reqs
import torch.nn as nn
from transformers import BertModel
import time

## Functions

In [1]:
def pre_process(path, N, shuffle=True):
    '''
    Load a JSON-lines review file and turn it into BERT-ready batches.
    JSON -> df -> bert token ids + attention masks -> DataLoader

    @param    path: path to a JSON-lines file with `summary`, `reviewText`
                  and `sentiment` columns
    @param    N (int or None): number of leading rows to encode. `None`, or a
                  value larger than the file, encodes every row.
    @param    shuffle (bool): `True` (default) draws batches through a
                  RandomSampler; `False` keeps the rows in file order, which is
                  what you want when predictions must line up with the input.
    @return   (loader, labels): a DataLoader yielding
                  (input_ids, attention_mask, label) batches of 16, and the
                  label tensor for the encoded rows
    '''
    df = pd.read_json(path, lines=True)

    # Fill each column BEFORE concatenating: NaN + str is NaN, so filling
    # afterwards would throw away the review text whenever only the summary
    # is missing (and vice versa).
    summary = df['summary'].fillna('').astype(str)
    review = df['reviewText'].fillna('').astype(str)
    combined = (summary + ' ' + review).str.strip().str.lower()
    # Rows with neither a summary nor a review get the placeholder text
    df['concatSummaryReview'] = combined.where(combined != '', '[NA]')

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # get the labels
    sentiment_binary = {
        'positive': 1,
        'negative': 0,
    }
    df['sentiment_binary'] = df['sentiment'].map(sentiment_binary)

    # Never ask for more rows than the file holds
    n_rows = len(df) if N is None else min(N, len(df))

    # Sentiments missing from the mapping become NaN, which makes this a float tensor
    labels = torch.tensor(df['sentiment_binary'].to_numpy()[:n_rows])

    # lists to store outputs of bert tokeniser
    input_ids = []
    attention_masks = []

    MAX_LEN = 512
    BATCH_SIZE = 16 # For fine-tuning BERT, the authors recommend a batch size of 16 or 32

    # Positional slice, so a non-default DataFrame index cannot break the lookup
    for review_text in df['concatSummaryReview'].iloc[:n_rows]:
        tokens = tokenizer.tokenize(review_text)
        encoded_plus = tokenizer.encode_plus(
            text=tokens,                # Already-tokenised sentence
            add_special_tokens=True,    # Add `[CLS]` and `[SEP]`
            max_length=MAX_LEN,         # Max length to truncate/pad
            padding='max_length',       # Pad sentence to max length
            return_attention_mask=True, # Return attention mask
            truncation=True,
        )

        input_ids.append(encoded_plus.get('input_ids'))
        attention_masks.append(encoded_plus.get('attention_mask'))

    # Convert lists to tensors
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)

    # from here will be different for the test data
    # and for the data used for predictions - might need to split into 2 functions

    # Create the DataLoader to be used as input in the model
    data = TensorDataset(input_ids, attention_masks, labels)
    # With sampler=None the DataLoader iterates the dataset in order
    sampler = RandomSampler(data) if shuffle else None
    loader = DataLoader(data, sampler=sampler, batch_size=BATCH_SIZE)

    return loader, labels
    

In [3]:
# Create the BertClassifier class
class BertClassifier(nn.Module):
    '''Pre-trained BERT encoder followed by a small feed-forward classification head.
    '''
    def __init__(self, freeze_bert=False):
        '''
        @param    freeze_bert (bool): `True` turns off gradients for every BERT
                      weight so only the head is trained; `False` fine-tunes
                      BERT together with the head
        '''
        super().__init__()
        # BERT hidden size, width of the head's hidden layer, number of labels
        bert_hidden, head_hidden, num_labels = 768, 50, 2

        # Pre-trained encoder
        self.bert = BertModel.from_pretrained('bert-base-uncased')

        # Head: Linear -> ReLU -> Linear (no dropout is applied in between)
        head_layers = [
            nn.Linear(bert_hidden, head_hidden),
            nn.ReLU(),
            nn.Linear(head_hidden, num_labels),
        ]
        self.classifier = nn.Sequential(*head_layers)

        # Optionally keep the encoder weights fixed
        if freeze_bert:
            for bert_weight in self.bert.parameters():
                bert_weight.requires_grad = False

    def forward(self, input_ids, attention_mask):
        '''
        Run the inputs through BERT, then through the head, to get logits.
        @param    input_ids (torch.Tensor): an input tensor with shape (batch_size, max_length)
        @param    attention_mask (torch.Tensor): attention mask with shape
                      (batch_size, max_length)
        @return   logits (torch.Tensor): an output tensor with shape (batch_size, num_labels)
        '''
        encoder_out = self.bert(input_ids=input_ids,
                                attention_mask=attention_mask)

        # First element of the output is the last hidden state; position 0 is `[CLS]`
        cls_vector = encoder_out[0][:, 0, :]

        return self.classifier(cls_vector)

# defining optimiser
def initialize_model(epochs=4, train_dataloader=None):
    '''
    Initialize the Bert Classifier, the optimizer and the learning rate scheduler.

    @param    epochs (int): number of training epochs the schedule should span
    @param    train_dataloader: loader whose batch count sizes the schedule.
                  When omitted, the notebook-level `train_loader` is used.
    @return   (bert_classifier, optimizer, scheduler)

    Relies on the notebook-level `device` for model placement.
    '''
    from transformers import AdamW, get_linear_schedule_with_warmup

    if train_dataloader is None:
        # Keeps existing `initialize_model(epochs=...)` calls working
        train_dataloader = train_loader

    # Instantiate Bert Classifier
    bert_classifier = BertClassifier(freeze_bert=False)

    # Move the model to the configured device
    bert_classifier.to(device)

    # Create the optimizer
    optimizer = AdamW(bert_classifier.parameters(),
                      lr=5e-5,    # Default learning rate
                      eps=1e-8    # Default epsilon value
                      )

    # Total number of training steps. The scheduler is stepped once per BATCH
    # in train(), so count batches, not samples -- counting samples stretches
    # the linear decay by a factor of the batch size.
    total_steps = len(train_dataloader) * epochs

    # Set up the learning rate scheduler
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0, # Default value
                                                num_training_steps=total_steps)
    return bert_classifier, optimizer, scheduler

def set_seed(seed_value=42):
    '''
    Set seed for reproducibility.

    Seeds Python's `random` module, NumPy's global generator, and PyTorch
    (via `torch.manual_seed` and `torch.cuda.manual_seed_all`).

    @param    seed_value (int): seed handed to every generator
    '''
    # Imported here because the notebook's import cell does not load `random`
    import random

    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

# Loss function shared by train() and evaluate(): cross-entropy over the classifier's logits
LOSS_FN = nn.CrossEntropyLoss()

def train(model, train_dataloader, val_dataloader=None, epochs=4, evaluation=False):
    '''
    Train the BertClassifier model.

    @param    model: the classifier to train
    @param    train_dataloader: yields (input_ids, attention_mask, labels) batches
    @param    val_dataloader: validation batches; required when `evaluation` is set
    @param    epochs (int): number of passes over `train_dataloader`
    @param    evaluation (bool): run evaluate() on `val_dataloader` after each epoch
    @raise    ValueError: `evaluation` is set but no `val_dataloader` was given

    Reads the notebook-level `device`, `optimizer`, `scheduler` and `LOSS_FN`.
    Progress is printed as a table; nothing is returned.
    '''
    # Fail before any training happens rather than after the first full epoch
    if evaluation and val_dataloader is None:
        raise ValueError("evaluation=True requires a val_dataloader")

    # Start training loop
    print("Start training...\n")

    for epoch_i in range(epochs):
        # =======================================
        #               Training
        # =======================================
        # Print the header of the result table
        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
        print("-"*70)

        # Measure the elapsed time of each epoch
        t0_epoch, t0_batch = time.time(), time.time()

        # Reset tracking variables at the beginning of each epoch
        total_loss, batch_loss, batch_counts = 0, 0, 0

        # Put the model into the training mode
        model.train()

        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):
            batch_counts += 1

            # Move the batch to the configured device
            b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

            # Zero out any previously calculated gradients
            model.zero_grad()

            # Perform a forward pass. This will return logits.
            logits = model(b_input_ids, b_attn_mask)

            # Compute loss and accumulate the loss values
            loss = LOSS_FN(logits, b_labels)
            batch_loss += loss.item()
            total_loss += loss.item()

            # Perform a backward pass to calculate gradients
            loss.backward()

            # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and the learning rate
            optimizer.step()
            scheduler.step()

            # Print the loss values and time elapsed for every 20 batches
            if (step % 20 == 0 and step != 0) or (step == len(train_dataloader) - 1):
                # Calculate time elapsed since the last report
                time_elapsed = time.time() - t0_batch

                # Print training results
                print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")

                # Reset batch tracking variables
                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()

        # Calculate the average loss over the entire training data
        avg_train_loss = total_loss / len(train_dataloader)

        print("-"*70)
        # =======================================
        #               Evaluation
        # =======================================
        if evaluation:
            # After the completion of each training epoch, measure the model's performance
            # on our validation set.
            val_loss, val_accuracy = evaluate(model, val_dataloader)

            # Print the epoch summary: training loss plus validation metrics
            time_elapsed = time.time() - t0_epoch

            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
            print("-"*70)
        print("\n")

    print("Training complete!")


def evaluate(model, val_dataloader):
    '''
    Measure the model's loss and accuracy on a validation set.

    @param    model: the classifier to evaluate (switched to eval mode here)
    @param    val_dataloader: yields (input_ids, attention_mask, labels) batches
    @return   (val_loss, val_accuracy): mean loss per example and accuracy in
                  percent, both as floats; both are NaN for an empty loader

    Reads the notebook-level `device` and `LOSS_FN`.
    '''
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()

    # Running totals over examples, not over batches: averaging per-batch means
    # would give a short final batch the same weight as a full one.
    total_loss, total_correct, total_examples = 0.0, 0, 0

    with torch.no_grad():
        for batch in val_dataloader:
            # Move the batch to the configured device
            b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

            logits = model(b_input_ids, b_attn_mask)
            n_examples = b_labels.size(0)

            # LOSS_FN uses the default mean reduction, so scale back to a batch sum
            total_loss += LOSS_FN(logits, b_labels).item() * n_examples

            # Predicted class = index of the largest logit
            preds = torch.argmax(logits, dim=1)
            total_correct += (preds == b_labels).sum().item()
            total_examples += n_examples

    if total_examples == 0:
        return float('nan'), float('nan')

    val_loss = total_loss / total_examples
    val_accuracy = total_correct / total_examples * 100

    return val_loss, val_accuracy


In [4]:
def bert_predict(model, loader):
    '''Run the model over every batch of `loader` and return class probabilities.

    @param    model: trained classifier (switched to eval mode here)
    @param    loader: yields batches whose first two tensors are the input ids
                  and the attention mask; any further tensors are ignored
    @return   numpy array with one row per example, softmax-normalised over dim 1

    Reads the notebook-level `device`.
    '''
    # Evaluation mode disables dropout layers
    model.eval()

    batch_logits = []
    for batch in loader:
        # Move the batch to the device, then keep only ids and mask
        on_device = tuple(tensor.to(device) for tensor in batch)
        ids, mask = on_device[:2]

        # Forward pass without gradient tracking
        with torch.no_grad():
            batch_logits.append(model(ids, mask))

    # Stack the per-batch logits along the example axis
    stacked = torch.cat(batch_logits, dim=0)

    # Logits -> probabilities
    probabilities = F.softmax(stacked, dim=1)

    return probabilities.cpu().numpy()

## Pre process data

In [5]:
# Relative paths of the train / dev / test review files handed to pre_process()
TRAIN = '../data/raw/music_reviews_train.json'
DEV = '../data/raw/music_reviews_dev.json'
TEST = '../data/raw/music_reviews_test_masked.json'

In [16]:
%%time

# N = df.shape[0]
# Number of leading rows of each file to encode (presumably kept small for quick test runs)
N = 100

# NOTE(review): pre_process wraps every split in a RandomSampler, so the dev and
# test batches come back shuffled too -- predictions will not follow file order.
train_loader, train_labels = pre_process(TRAIN, N)
dev_loader, dev_labels = pre_process(DEV, N)
test_loader, test_labels = pre_process(TEST, N)

CPU times: user 3.43 s, sys: 319 ms, total: 3.75 s
Wall time: 18.8 s


## Training

In [17]:
# define which device to run this on
# The GPU auto-detection below is commented out; the notebook currently always
# runs on the CPU. The functions above read this `device` as a global.
# if torch.cuda.is_available():       
#     device = torch.device("cuda")
#     print(f'There are {torch.cuda.device_count()} GPU(s) available.')
#     print('Device name:', torch.cuda.get_device_name(0))

# else:
#     print('No GPU available, using the CPU instead.')
#     device = torch.device("cpu")
device = torch.device("cpu")

In [18]:
%%time

# Number of training epochs; used for both the scheduler length and train()
EPOCHS = 2

set_seed(42)    # Set seed for reproducibility
# `optimizer` and `scheduler` must keep these exact names: train() reads them as globals
bert_classifier, optimizer, scheduler = initialize_model(epochs=EPOCHS)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


CPU times: user 2.04 s, sys: 550 ms, total: 2.59 s
Wall time: 3.62 s




In [19]:
# Fine-tune on the training loader and evaluate on the dev loader after every epoch
train(bert_classifier, train_loader, dev_loader, epochs=EPOCHS, evaluation=True)

Start training...

 Epoch  |  Batch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
----------------------------------------------------------------------


In [14]:
# save model
import pickle

model = bert_classifier
# Pickles the whole module object. The context manager closes the file even
# if the dump fails part-way (the bare open() call never closed it).
with open('sab_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

## Prediction on test set

In [15]:
# pre process test for prediction

# predict on saved model
# NOTE: pickle.load runs arbitrary code stored in the file -- only load model
# files you created yourself.
with open('sab_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)
bert_predict(model, test_loader)

array([[0.46445876, 0.5355412 ],
       [0.47242603, 0.527574  ],
       [0.46377763, 0.53622234],
       [0.43349838, 0.56650156],
       [0.43608373, 0.5639162 ]], dtype=float32)

# Phase 2: Break it!

In [None]:
import numpy as np
import pandas as pd

# checklist for perturbations
import checklist
from checklist.editor import Editor
from checklist.perturb import Perturb

In [3]:
# load the training reviews, keeping the first 1000 texts and their labels
data_train = pd.read_json('../data/raw/music_reviews_train.json', lines=True)

# Slice first, then materialise as plain lists
input_data = list(data_train['reviewText'].iloc[:1000])
input_data_labels = list(data_train['sentiment'].iloc[:1000])

In [28]:
# Peek at one review together with its sentiment label
input_data[1], input_data_labels[1]

('This tape can hardly be understood and it was listed for sale as "very good".  It\'s VERY BAD.',
 'negative')

### Generate sentences with Checklist

In [6]:
# Expand a sentence template over the `adj` and `obj` fill-in lists
editor = Editor()
ret = editor.template('This {obj} is {adj}.',
                       adj=['great', 'terrible', 'fantastic'],
                       obj = ['book', 'song', 'album', 'product']
                       )
# Show 5 generated sentences; np.random.choice samples with replacement, so repeats can appear
np.random.choice(ret.data, 5)

array(['This album is terrible.', 'This album is terrible.',
       'This book is terrible.', 'This album is great.',
       'This song is fantastic.'], dtype='<U26')

In [7]:
# No fill-in lists are passed for {male1}, {male2} or {city}, so the values come
# from the Editor itself -- presumably checklist's built-in Danish lexicons.
editor = Editor(language='danish')
ret = editor.template('{male1} went to see {male2} in {city}.', remove_duplicates=True)
# Show 10 generated sentences drawn at random
list(np.random.choice(ret.data, 10))

['Niels went to see Søren in Viborg.',
 'Alexander went to see Ove in Esbjerg.',
 'Simon went to see Steen in Viborg.',
 'Peter went to see Johannes in Hørsholm.',
 'Aage went to see Ulrik in Esbjerg.',
 'Valdemar went to see Jonas in Rønne.',
 'Børge went to see Adam in Fredericia.',
 'Søren went to see Flemming in Helsingør.',
 'Leif went to see Carl in Kolding.',
 'Henning went to see Børge in Sønderborg.']

### Adding perturbations 
Typos and stripping punctuation.

In [None]:
import spacy
nlp = spacy.load('en_core_web_sm')
# Run every review through the spaCy pipeline; the parsed results are what
# Perturb.strip_punctuation is applied to in the next cell
pdata = list(nlp.pipe(input_data)) # need this to strip punct

In [38]:
# Typos are applied to the raw strings, punctuation stripping to the spaCy-parsed reviews
typos = Perturb.perturb(input_data, Perturb.add_typos)
punct = Perturb.perturb(pdata, Perturb.strip_punctuation)

# Each perturbed item is read as an (original, perturbed) pair
punct_df = pd.DataFrame(punct['data'], columns=['original', 'no_punct'])

pert = pd.DataFrame(typos.data, columns=['original', 'typos'])
pert['sentiment'] = input_data_labels
# NOTE(review): this assignment aligns on row index -- it assumes both
# perturbations returned one row per input, in input order; confirm.
pert['no_punct'] = punct_df['no_punct']
pert

Unnamed: 0,original,typos,sentiment,no_punct
0,"So creative! Love his music - the words, the ...","So creative! Love his music - the words, the ...",positive,"So creative! Love his music - the words, the ..."
1,This tape can hardly be understood and it was ...,This tape can hardly be understood and it was ...,negative,This tape can hardly be understood and it was ...
2,Buy the CD. Do not buy the MP3 album. Downlo...,Buy the CD. Do not buy the MP3 album. Downlo...,negative,Buy the CD. Do not buy the MP3 album. Downlo...
3,I love Dallas Holms music and voice! Thank Yo...,I lvoe Dallas Holms music and voice! Thank Yo...,positive,I love Dallas Holms music and voice! Thank Yo...
4,Great memories of my early years in Christ,Great memories of mye arly years in Christ,positive,Great memories of my early years in Christ
...,...,...,...,...
995,A friend in my husband local band back in the ...,A friend in my husband local band back in the ...,positive,A friend in my husband local band back in the ...
996,this is a New copy of a old song hated it. Wil...,this is a New copy of a old song hated it. Wil...,negative,this is a New copy of a old song hated it. Wil...
997,This is a very nice song. Well performed by Na...,This is a very nice song. Well performed by Na...,positive,This is a very nice song. Well performed by Na...
998,Great songs. Originals. No re-records.,Great songs. Origianls. No re-records.,positive,Great songs. Originals. No re-records
