# Phase 1: Baseline model

In [1]:
# basic reqs
import pandas as pd
import torch
import numpy as np
import torch.nn.functional as F

# pre processing reqs 
from transformers import BertTokenizer
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

# model reqs
import torch.nn as nn
from transformers import BertModel
import time

## Functions

In [2]:
def pre_process(path, N, max_len=512, batch_size=32, shuffle=True):
    '''
    Build a DataLoader of BERT encodings from a JSON-lines review file.
    JSON -> df -> BERT input ids / attention masks -> DataLoader

    @param    path: path to a JSON-lines file with `summary`, `reviewText`
                  and `sentiment` columns
    @param    N (int or None): number of leading rows to encode; `None`, or a
                  value larger than the file, encodes every row
    @param    max_len (int): length every sequence is truncated/padded to
    @param    batch_size (int): batch size of the returned DataLoader
                  (for fine-tuning BERT, the authors recommend 16 or 32)
    @param    shuffle (bool): `True` draws batches through a RandomSampler;
                  `False` keeps file order, which is what you need when
                  predictions must be matched back to rows
    @return   (loader, labels): a DataLoader yielding
                  (input_ids, attention_mask, label) batches, and the label
                  tensor for the encoded rows (in file order)
    '''
    df = pd.read_json(path, lines=True)

    # Clamp N so asking for more rows than the file holds cannot raise
    n_rows = len(df) if N is None else max(0, min(N, len(df)))
    df = df.iloc[:n_rows]

    # Join summary and review per row. A missing field is skipped instead of
    # blanking the whole row; only rows with neither field become '[NA]'.
    texts = []
    for summary, review in zip(df['summary'], df['reviewText']):
        parts = [str(part) for part in (summary, review) if pd.notna(part)]
        texts.append(' '.join(parts).lower() if parts else '[NA]')

    # get the labels
    sentiment_binary = {
        'positive': 1,
        'negative': 0,
    }
    label_values = df['sentiment'].map(sentiment_binary).to_numpy(dtype='float64')
    labels = torch.tensor(label_values)
    # Unmapped sentiments (e.g. masked test rows) stay NaN; a fully labelled
    # file gets integer class indices, as CrossEntropyLoss expects.
    if not torch.isnan(labels).any():
        labels = labels.long()

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    if texts:
        # One batched call tokenizes, adds `[CLS]`/`[SEP]`, truncates and pads
        encoded = tokenizer(
            texts,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        input_ids = encoded['input_ids']
        attention_masks = encoded['attention_mask']
    else:
        input_ids = torch.empty((0, max_len), dtype=torch.long)
        attention_masks = torch.empty((0, max_len), dtype=torch.long)

    # from here will be different for the test data
    # and for the data used for predictions - might need to split into 2 functions

    # Create the DataLoader to be used as input in the model
    data = TensorDataset(input_ids, attention_masks, labels)
    sampler = RandomSampler(data) if shuffle else None  # None -> sequential order
    loader = DataLoader(data, sampler=sampler, batch_size=batch_size)

    return loader, labels
    

In [3]:
# Create the BertClassifier class
class BertClassifier(nn.Module):
    '''Pre-trained BERT encoder with a small feed-forward head on top.
    '''
    def __init__(self, freeze_bert=False):
        '''
        @param    freeze_bert (bool): `True` turns off gradients for every BERT
                      weight so only the head is trained; `False` fine-tunes
                      BERT together with the head
        '''
        super().__init__()
        # BERT hidden size, hidden size of the head, number of labels
        bert_hidden, head_hidden, num_labels = 768, 50, 2

        # Encoder is created before the head (this fixes the order in which
        # the random generator is consumed when a seed is set)
        self.bert = BertModel.from_pretrained('bert-base-uncased')

        # Head: Linear -> ReLU -> Linear (dropout between them is disabled)
        head_layers = [
            nn.Linear(bert_hidden, head_hidden),
            nn.ReLU(),
            nn.Linear(head_hidden, num_labels),
        ]
        self.classifier = nn.Sequential(*head_layers)

        # Optionally keep the encoder weights fixed
        if freeze_bert:
            self.bert.requires_grad_(False)

    def forward(self, input_ids, attention_mask):
        '''
        Run the encoder, then classify from the first token's hidden state.
        @param    input_ids (torch.Tensor): token ids, shape (batch_size, max_length)
        @param    attention_mask (torch.Tensor): attention mask with the same
                      shape as `input_ids`
        @return   logits (torch.Tensor): shape (batch_size, num_labels)
        '''
        encoder_out = self.bert(input_ids=input_ids,
                                attention_mask=attention_mask)

        # First output is the per-token hidden states; position 0 is `[CLS]`
        token_states = encoder_out[0]
        cls_state = token_states[:, 0, :]

        return self.classifier(cls_state)

# Build the model, its optimizer and the learning-rate scheduler
def initialize_model(epochs=4, train_dataloader=None):
    '''
    Initialize the Bert Classifier, the optimizer and the learning rate scheduler.

    @param    epochs (int): number of passes over the training data the
                  schedule should span
    @param    train_dataloader: loader that will be trained on; when omitted
                  the notebook-level `train_loader` is used
    @return   (bert_classifier, optimizer, scheduler)

    Reads the notebook-level `device` to decide where the model lives.
    '''
    # torch's AdamW replaces the deprecated transformers.AdamW
    from torch.optim import AdamW
    from transformers import get_linear_schedule_with_warmup

    # Instantiate Bert Classifier
    bert_classifier = BertClassifier(freeze_bert=False)

    # Move the model to the selected device
    bert_classifier.to(device)

    # Create the optimizer. weight_decay=0.0 matches the default of the
    # transformers implementation this replaces (torch defaults to 0.01).
    optimizer = AdamW(bert_classifier.parameters(),
                      lr=5e-5,
                      eps=1e-8,
                      weight_decay=0.0)

    # The scheduler is stepped once per batch, so the schedule length is
    # batches-per-epoch * epochs, not number-of-samples * epochs.
    if train_dataloader is None:
        train_dataloader = train_loader
    total_steps = len(train_dataloader) * epochs

    # Set up the learning rate scheduler
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0, # Default value
                                                num_training_steps=total_steps)
    return bert_classifier, optimizer, scheduler

def set_seed(seed_value=42):
    '''
    Set seed for reproducibility.

    Seeds Python's `random`, NumPy, and PyTorch (CPU and every CUDA device).
    @param    seed_value (int): seed handed to each generator
    '''
    import random

    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

# Loss function shared by train() and evaluate(): cross-entropy computed
# from the classifier's logits and the integer class labels
LOSS_FN = nn.CrossEntropyLoss()

def train(model, train_dataloader, val_dataloader=None, epochs=4, evaluation=False):
    '''
    Train the BertClassifier model.

    Uses the notebook-level `optimizer`, `scheduler`, `device` and `LOSS_FN`,
    so `initialize_model` must have been run first.

    @param    model: the classifier to train
    @param    train_dataloader: yields (input_ids, attention_mask, labels) batches
    @param    val_dataloader: same layout; required when `evaluation` is set
    @param    epochs (int): number of passes over `train_dataloader`
    @param    evaluation (bool): evaluate on `val_dataloader` after each epoch
    @raise    ValueError: if `evaluation` is set but no `val_dataloader` is given
    '''
    # Fail before any training happens rather than after the first epoch
    if evaluation and val_dataloader is None:
        raise ValueError("evaluation=True requires a val_dataloader")

    # Start training loop
    print("Start training...\n")

    for epoch_i in range(epochs):
        # =======================================
        #               Training
        # =======================================
        # Print the header of the result table
        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
        print("-"*70)

        # Measure the elapsed time of each epoch
        t0_epoch, t0_batch = time.time(), time.time()

        # Reset tracking variables at the beginning of each epoch
        total_loss, batch_loss, batch_counts = 0, 0, 0
        num_batches = len(train_dataloader)

        # Put the model into the training mode
        model.train()

        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):
            batch_counts += 1

            # Move the batch to the selected device
            b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

            # Zero out any previously calculated gradients
            model.zero_grad()

            # Perform a forward pass. This will return logits.
            logits = model(b_input_ids, b_attn_mask)

            # Compute loss and accumulate the loss values
            loss = LOSS_FN(logits, b_labels)
            batch_loss += loss.item()
            total_loss += loss.item()

            # Perform a backward pass to calculate gradients
            loss.backward()

            # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and the learning rate
            optimizer.step()
            scheduler.step()

            # Print the loss values and time elapsed every 20 batches and on the last batch
            if (step % 20 == 0 and step != 0) or (step == num_batches - 1):
                # Time elapsed since the previous report
                time_elapsed = time.time() - t0_batch

                # Print training results
                print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")

                # Reset batch tracking variables
                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()

        # Average loss over the epoch (NaN when the loader produced no batches)
        avg_train_loss = total_loss / num_batches if num_batches else float('nan')

        print("-"*70)
        # =======================================
        #               Evaluation
        # =======================================
        if evaluation:
            # After the completion of each training epoch, measure the model's performance
            # on our validation set.
            val_loss, val_accuracy = evaluate(model, val_dataloader)

            # Time for the whole epoch, training plus evaluation
            time_elapsed = time.time() - t0_epoch

            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
            print("-"*70)
        print("\n")

    print("Training complete!")


def evaluate(model, val_dataloader):
    '''
    Measure the model's loss and accuracy on a labelled dataset.

    inputs: model and the DataLoader to evaluate it on; batches are
            (input_ids, attention_mask, labels)
    outputs: (mean loss, accuracy in percent), both averaged per example so a
             short final batch does not count more than a full one;
             (nan, nan) when the loader is empty

    Uses the notebook-level `device` and `LOSS_FN`.
    '''

    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()

    # Running totals over every example seen
    total_loss, total_correct, total_examples = 0.0, 0, 0

    # For each batch in our validation set...
    for batch in val_dataloader:
        # Move the batch to the selected device
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

        # Compute logits and loss without tracking gradients
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)
            loss = LOSS_FN(logits, b_labels)

        batch_size = b_labels.size(0)

        # LOSS_FN returns the batch mean, so scale it back to a batch sum
        total_loss += loss.item() * batch_size

        # Count the correct predictions
        preds = torch.argmax(logits, dim=1).flatten()
        total_correct += (preds == b_labels).sum().item()
        total_examples += batch_size

    if total_examples == 0:
        return float('nan'), float('nan')

    val_loss = total_loss / total_examples
    val_accuracy = 100.0 * total_correct / total_examples

    return val_loss, val_accuracy


In [4]:
def bert_predict(model, loader):
    '''Run the trained model over every batch of `loader` and return the
    class probabilities as a NumPy array with one row per example.
    '''
    # Evaluation mode: dropout layers are disabled at test time
    model.eval()

    batch_logits = []

    with torch.no_grad():
        for batch in loader:
            # Move the batch to the device; only ids and mask are needed here
            on_device = tuple(t.to(device) for t in batch)
            b_input_ids, b_attn_mask = on_device[:2]
            batch_logits.append(model(b_input_ids, b_attn_mask))

    # Stack the per-batch logits, then turn them into probabilities
    stacked = torch.cat(batch_logits, dim=0)
    return F.softmax(stacked, dim=1).cpu().numpy()

## Pre-process data

In [5]:
# JSON-lines review files handed to pre_process(); the relative paths are
# resolved against the notebook's working directory
TRAIN = '../../data/raw/music_reviews_train.json'
DEV = '../../data/raw/music_reviews_dev.json'
TEST = '../../data/raw/music_reviews_test_masked.json'

In [6]:
%%time

# Number of leading rows to encode from each file (limited for testing purposes)
# N = df.shape[0]
N = 10

# NOTE(review): pre_process wraps every dataset in a RandomSampler, so the dev
# and test batches are shuffled as well -- predictions made from test_loader
# will not follow the row order of the test file.
train_loader, train_labels = pre_process(TRAIN, N)
dev_loader, dev_labels = pre_process(DEV, N)
test_loader, test_labels = pre_process(TEST, N)

CPU times: user 1.76 s, sys: 252 ms, total: 2.02 s
Wall time: 16.4 s


## Training

In [13]:
# # define which device to run this on
# if torch.cuda.is_available():       
#     device = torch.device("cuda")
#     print(f'There are {torch.cuda.device_count()} GPU(s) available.')
#     print('Device name:', torch.cuda.get_device_name(0))

# else:
#     print('No GPU available, using the CPU instead.')
#     device = torch.device("cpu")

# gpu not working, so everything is pinned to the CPU (the auto-detection
# above is commented out). `device` is read as a global by initialize_model,
# train, evaluate and bert_predict.
device = torch.device("cpu")

In [None]:
%%time

# Number of passes over the training data; also sets the scheduler length
EPOCHS = 2

set_seed(42)    # Set seed for reproducibility
# `optimizer` and `scheduler` must keep these exact names: train() reads them
# as globals rather than taking them as arguments
bert_classifier, optimizer, scheduler = initialize_model(epochs=EPOCHS)

In [None]:
# train(bert_classifier, train_loader, dev_loader, epochs=EPOCHS, evaluation=True)

In [None]:
# # save model
# import pickle

# model = bert_classifier
# pickle.dump(model, open('sab_model_tiny.pkl', 'wb'))

## Prediction on test set

In [7]:
# pre process test for prediction

# predict on saved model
import pickle

In [16]:
# NOTE(review): pickle executes code from the file while loading, so only open
# model files you trust. A pickled model also appears tied to the library
# versions that wrote it (see the GELUActivation error below); saving
# `model.state_dict()` with torch.save would likely avoid that -- confirm.
with open('sab_model.pkl', 'rb') as model_file:  # doesn't work??
    model = pickle.load(model_file)
# model = pickle.load(open('../model_100_100.pkl', 'rb')) # works
# training script saving the model in a weird way? not sure why, code is the same
bert_predict(model, test_loader)

AttributeError: Can't get attribute 'GELUActivation' on <module 'transformers.activations' from '/home/sabrina/miniconda3/envs/torch/lib/python3.9/site-packages/transformers/activations.py'>

# Phase 2: Break it!

In [None]:
import numpy as np
import pandas as pd

# checklist for perturbations
import checklist
from checklist.editor import Editor
from checklist.perturb import Perturb

In [None]:
# Load the training reviews and keep rows 500-999 as the inputs to perturb
data_train = pd.read_json('../data/raw/music_reviews_train.json', lines=True)

sample_rows = slice(500, 1000)
input_data = list(data_train['reviewText'])[sample_rows]
input_data_labels = list(data_train['sentiment'])[sample_rows]

In [None]:
# Peek at one selected review together with its sentiment label
input_data[1], input_data_labels[1]

In [None]:
# NLTK sentence splitter; the 'punkt' resource is downloaded before
# sent_tokenize is used
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

# spaCy pipeline used to parse sentences into docs for Perturb.strip_punctuation
import spacy
nlp = spacy.load('en_core_web_sm')

In [None]:
# Perturb every review sentence by sentence: strip trailing punctuation,
# inject a typo, expand contractions, then glue the sentences back together.
mod_input_data = []

for paragraph in input_data:
    sentences = sent_tokenize(paragraph)
    back_to_p = []

    # strip_punctuation needs spaCy docs, so parse the sentences first
    for doc in nlp.pipe(sentences):
        sentence = Perturb.strip_punctuation(doc)
        if not sentence:
            # Nothing left once the punctuation is gone
            continue

        # A perturbation that cannot be applied (e.g. a typo in a
        # one-character sentence) is skipped; the sentence itself is kept so
        # the review does not silently lose content.
        for perturbation in (Perturb.add_typos, Perturb.expand_contractions):
            try:
                sentence = perturbation(sentence)
            except Exception as err:
                print(f'{perturbation.__name__} skipped: {err}')

        print(sentence)
        back_to_p.append(sentence)

    # One output paragraph per input review keeps the labels aligned
    mod_input_data.append(" ".join(back_to_p))
    

In [None]:
# Perturbed reviews, their original labels, and a tag naming the edits applied
mod_df = pd.DataFrame({
    'reviewText': mod_input_data,
    'sentiment': input_data_labels,
    'category': 'typos, expand contractions, remove punctuation',
})
mod_df.to_json('sab_mods_df.json', orient='records')

In [None]:
# Check the column names of the perturbed frame
mod_df.columns

In [None]:
# First perturbed review (row 0, column 0 = reviewText)
mod_df.iloc[0,0]

In [None]:
import json

# One dict per perturbed review, taken from the first three columns of mod_df
record_keys = ('reviewText', 'sentiment', 'category')
output = [
    dict(zip(record_keys, row))
    for row in mod_df.iloc[:, :3].itertuples(index=False, name=None)
]

# Write as JSON lines: one object per line
test_json = [json.dumps(record) + '\n' for record in output]
with open('../data/predictions/sab_mods.json', 'w') as file:
    file.writelines(test_json)

In [None]:
# Column-oriented version of the perturbed data; the category tag is repeated
# once per review so all three lists always have the same length
mod_dict = {
    'reviewText': mod_input_data,
    'sentiment': input_data_labels,
    'category': ['typos, expand contractions, remove punctuation'] * len(mod_input_data),
}

In [None]:
# Display the assembled dict
mod_dict

In [None]:
import json

# Dump the column-oriented dict as one pretty-printed JSON document; the
# `with` block makes sure the file is flushed and closed.
with open('sabrina_mods.json', "w") as out_file:
    json.dump(mod_dict, out_file, indent = 4)

### Generate sentences with Checklist

In [None]:
# Fill the template with the adjective/object lists below, then draw 50 of the
# generated sentences at random
adjectives = ['great', 'terrible', 'fantastic', 'horrible', 'horrendous', 'brilliant']
objects = ['book', 'song', 'album', 'product', 'game', 'computer']

editor = Editor()
ret = editor.template('This {obj} is {adj}.', adj=adjectives, obj=objects)
np.random.choice(ret.data, 50)

In [None]:
# Danish editor: no values are passed for {male1}, {male2} or {city}, so the
# Editor has to supply them itself -- presumably from checklist's built-in
# lexicons for the language; verify.
editor = Editor(language='danish')
ret = editor.template('{male1} went to see {male2} in {city}.', remove_duplicates=True)
# Show 10 randomly drawn generated sentences
list(np.random.choice(ret.data, 10))

### Adding perturbations 
Typos and stripping punctuation.

In [None]:
import spacy
nlp = spacy.load('en_core_web_sm')
# Parse every review into a spaCy doc; `pdata` is what gets passed to
# Perturb.strip_punctuation further down
pdata = list(nlp.pipe(input_data)) # need this to strip punct

In [None]:
# NOTE(review): `punct` is only assigned in a later cell
# (Perturb.perturb(pdata, Perturb.strip_punctuation)), so this cell fails on a
# fresh top-to-bottom run.
punct['data'][0]

In [None]:
# Quick demo of NLTK sentence splitting on a toy string
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
  
text = "Hello everyone. Welcome to GeeksforGeeks. You are studying NLP article"
sent_tokenize(text)

In [None]:
# add_typos is applied to the raw review strings; strip_punctuation is applied
# to the spaCy docs in `pdata`
typos = Perturb.perturb(input_data, Perturb.add_typos)
punct = Perturb.perturb(pdata, Perturb.strip_punctuation)

punct_df = pd.DataFrame(punct.data, columns=['original', 'no_punct'])

pert = pd.DataFrame(typos.data, columns=['original', 'typos'])
pert['sentiment'] = input_data_labels
# Punctuation-stripped variant of each review
# NOTE(review): the two frames are matched by row position; this presumably
# holds only if Perturb.perturb returns one row per input, in order -- verify.
pert['no_punct'] = punct_df['no_punct']
pert