# Part 2: Baseline Model using Pre-Trained Language Model

This Python notebook corresponds directly to Section 4.4 of the final thesis report.

### Mount Google Drive

In [None]:
# Mount Google Drive at /content/gdrive; the path variables defined further
# down all point below this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

### Load Required Libraries

In [None]:
pip install transformers

In [None]:
pip install git+https://github.com/vered1986/comet-commonsense.git

In [None]:
import os
import sys
import json
import random
import time
import re
import datetime
import pickle

from tqdm import tqdm

import numpy as np

import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from keras.preprocessing.sequence import pad_sequences

import spacy
nlp = spacy.load("en_core_web_sm")

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, recall_score, precision_score, confusion_matrix

from comet2.comet_model import PretrainedCometModel

from transformers import BertForSequenceClassification,  DistilBertForSequenceClassification, RobertaForSequenceClassification
from transformers import BertForMaskedLM, DistilBertForMaskedLM, RobertaForMaskedLM
from transformers import BertTokenizer, DistilBertTokenizer, RobertaTokenizer

from transformers import BertConfig, DistilBertConfig, RobertaConfig
from transformers import AdamW, get_linear_schedule_with_warmup 

from transformers import logging
logging.set_verbosity_error()

import warnings
warnings.filterwarnings('ignore')

### Model Configs

In [None]:
# Project locations on the mounted Google Drive. The data and model folders
# are derived from the project root so the root is spelled out only once.
basepath = '/content/gdrive/MyDrive/ljmu-ms-thesis/'
datapath = basepath + 'data/'
modelpath = basepath + 'model/'

In [None]:
# common config settings
EPOCHS = 10              # training epochs per iteration (used by run and reset_model)
TOTAL_ITERATIONS = 3     # independent train/evaluate repetitions that get averaged
TEST_SIZE = 0.2          # fraction of the data held out by train_test_split
MAX_LEN = 32             # token sequences are padded/truncated to this length
BATCH_SIZE = 16          # batch size of both data loaders
RANDOM_STATE = 2022      # random_state passed to train_test_split
LEARNING_RATE = 2e-5     # AdamW learning rate
EPS = 1e-8               # AdamW epsilon
SEED_VAL = 42            # seed applied to random, numpy and torch before training

In [None]:
# choose one model at a time
# model_name selects the tokenizer (get_tokenizer), the classifier
# (reset_model) and the names of the dataset/model files read and written.
#model_name = 'bert'
#model_name = 'roberta'
model_name = 'distilbert'

### Common Sense Extraction

In [None]:
# Pretrained COMET model queried by gather_commonsense.
# NOTE(review): device=0 presumably selects the first CUDA device - confirm
# against the comet2 package.
comet_model = PretrainedCometModel(device=0)

In [None]:
# COMET relation types that are queried for every sentence.
CATEGORIES = [
    "oReact", "oEffect", "oWant",
    "xAttr", "xEffect", "xIntent", "xNeed", "xReact", "xWant",
]


def gather_commonsense(sentence):
    """Query COMET once per relation in CATEGORIES for *sentence*.

    Returns a dict mapping each relation name to whatever
    ``comet_model.predict`` returns for it (called with ``num_beams=8``).
    """
    return {
        relation: comet_model.predict(sentence, relation, num_beams = 8)
        for relation in CATEGORIES
    }

def get_comet_data(filename):
    """Read a JSON-lines dataset and attach COMET inferences to each record.

    Every line of *filename* must be a JSON object with a ``sentence`` and a
    ``label`` key. The sentence is lower-cased, passed to
    ``gather_commonsense`` and the result stored under ``common_sense``.

    Returns:
        A list of JSON strings, one per input line, ready to be written out
        line by line.
    """
    # Count the lines up front so tqdm can show real progress. The handle is
    # opened in a ``with`` block so it is closed as soon as counting is done
    # (the previous bare ``open`` was never closed).
    with open(filename) as f:
        num_lines = sum(1 for _ in f)

    content = []
    with open(filename) as f:
        for line in tqdm(f, total=num_lines):
            linedata = json.loads(line.strip())
            entry = {}
            entry['sentence'] = linedata['sentence'].lower()
            entry['label'] = linedata['label']
            entry['common_sense'] = gather_commonsense(entry['sentence'])
            content.append(json.dumps(entry))
    return content

In [None]:
def write_content(filename, content):
    """Write each string in *content* to *filename*, one per line.

    The file is overwritten; every item is followed by a newline.
    """
    with open(filename, "w") as out:
        out.writelines(item + '\n' for item in content)

In [None]:
def extract_commonsense():
    """Run COMET over the raw Amazon dataset and save the enriched records.

    Reads ``amazon_data.json`` from ``datapath``, writes the result to
    ``amazon_<model_name>_comet.json`` in the same folder and returns the
    list of JSON strings that was written.
    """
    target_path = datapath + 'amazon_'+ model_name +'_comet.json'
    enriched = get_comet_data(datapath + 'amazon_data.json')
    write_content(target_path, enriched)
    return enriched

In [None]:
# Extract commonsense. This is the long-running part; run it just once for each model.
# content = extract_commonsense() 

### Auto complete COMET sequences

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Masked-language-model BERT used by get_sentence to predict the word that
# joins a subject to a COMET phrase.
# NOTE: `tokenizer` and `model` are module-level names that later cells
# rebind (classifier tokenizer / classifier model), so the auto-complete
# cells must run while these bindings are still in place.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased', output_attentions=True)
model.to(device)
# Inference only: switch the model to evaluation mode.
model.eval()

In [None]:
def get_sentence(trail, subject):
    """Join *subject* and *trail* with a word predicted by the masked LM.

    Builds ``[CLS] <subject> [MASK] <trail>[SEP]``, asks the module-level
    masked-LM ``model`` for the most likely token at the ``[MASK]`` position
    and returns ``"<subject> <predicted token> <trail>"``.

    Relies on the module-level ``tokenizer``, ``model`` and ``device`` as
    they are bound at call time.

    Args:
        trail: phrase that should follow the predicted word.
        subject: word(s) that should precede the predicted word.

    Returns:
        The completed sentence as a plain string.
    """
    text = '[CLS] ' + subject + ' [MASK] ' + trail +'[SEP]'
    tokenized_text = tokenizer.tokenize(text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    masked_index = tokenized_text.index('[MASK]')

    # Single-segment input: every token belongs to segment 0.
    segments_ids = [0] * len(tokenized_text)

    # Convert inputs to PyTorch tensors (batch of one).
    tokens_tensor = torch.tensor([indexed_tokens]).to(device)
    segments_tensors = torch.tensor([segments_ids]).to(device)

    # Score every vocabulary entry at every position; no gradients needed.
    with torch.no_grad():
        outputs = model(tokens_tensor, token_type_ids=segments_tensors)
    predictions = outputs[0]

    # Keep only the best-scoring token at the masked position.
    predicted_index = torch.argmax(predictions[0, masked_index]).item()
    predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]

    return subject + ' ' + predicted_token + ' ' + trail

In [None]:
def get_subject(text):
    """Return the nominal subject of the first sentence in *text*.

    The text is parsed with the module-level spaCy pipeline ``nlp``; the
    first token of the first sentence whose dependency label is ``nsubj`` is
    returned. An empty string is returned when there is no such token or
    when the parsed document yields no sentence at all (e.g. empty input),
    instead of letting ``StopIteration`` escape.
    """
    doc = nlp(text)
    sentence = next(doc.sents, None)
    if sentence is None:
        return ""
    for w in sentence:
        if w.dep_ == 'nsubj':
            return w.text
    return ""

def process_concepts(subject, concept_list):
    """Turn the first usable COMET concept into a sentence.

    Concepts equal to ``'none'`` are ignored. The first remaining concept is
    returned unchanged when *subject* is empty, otherwise it is completed
    via ``get_sentence(concept, subject)``.

    Returns ``None`` when *concept_list* holds no usable concept.
    """
    usable = [concept for concept in concept_list if concept != 'none']
    if not usable:
        return None

    first = usable[0]
    return first if subject == '' else get_sentence(first, subject)

def reprocess_dataset(filename):
    """Rewrite the COMET output of every record as completed sentences.

    Each line of *filename* is a JSON object with ``sentence``, ``label`` and
    ``common_sense`` keys. For the relations ``xWant``, ``xNeed``,
    ``xIntent`` and ``xEffect`` the list of COMET concepts is reduced to a
    single string by ``process_concepts`` using the sentence's subject; all
    other relations are dropped.

    Returns:
        A list of dicts with ``sentence``, ``label`` and the reduced
        ``common_sense`` mapping.
    """
    kept_relations = ('xWant', 'xNeed', 'xIntent', 'xEffect')

    # Count lines for the progress bar; the ``with`` block closes the handle
    # (the previous bare ``open`` leaked it).
    with open(filename) as f:
        num_lines = sum(1 for _ in f)

    content = []
    with open(filename) as f:
        for line in tqdm(f, total=num_lines):
            entry = json.loads(line.strip())
            subject = get_subject(entry['sentence'])
            content.append({
                'sentence': entry['sentence'],
                'label': entry['label'],
                'common_sense': {
                    relation: process_concepts(subject, concepts)
                    for relation, concepts in entry['common_sense'].items()
                    if relation in kept_relations
                },
            })
    return content

In [None]:
def write_to_file(filename, content):
    """Serialise every item of *content* as JSON, one object per line.

    *filename* is overwritten.
    """
    with open(filename, "w") as out:
        out.writelines(json.dumps(record) + '\n' for record in content)

In [None]:
def autocomplete_dataset():
    """Create the auto-completed COMET dataset for the current model.

    Reads ``amazon_<model_name>_comet.json`` from ``datapath``, completes the
    COMET phrases with ``reprocess_dataset`` and writes the result to
    ``amazon_<model_name>_comet_autocomplete.json``.
    """
    source_path = datapath + 'amazon_' + model_name + '_comet.json'
    target_path = datapath + 'amazon_' + model_name + '_comet_autocomplete.json'
    write_to_file(target_path, reprocess_dataset(source_path))

In [None]:
# Run the above function once for each LM
# autocomplete_dataset() 

In [None]:
def load_dataset(filename):
    """Load the auto-completed dataset from a JSON-lines file.

    Each returned entry has the ``sentence``, the ``label`` converted to
    ``int`` and a ``support`` list holding the ``xWant`` and ``xEffect``
    values of the record's ``common_sense`` mapping, in the order in which
    they appear in the file.
    """
    support_relations = ('xWant', 'xEffect')
    records = []
    with open(filename) as handle:
        for raw in handle:
            parsed = json.loads(raw.strip())
            records.append({
                'sentence': parsed['sentence'],
                'label': int(parsed['label']),
                'support': [
                    value
                    for relation, value in parsed['common_sense'].items()
                    if relation in support_relations
                ],
            })
    return records

In [None]:
# Load the auto-completed COMET dataset that belongs to the selected model_name.
read_filename = 'amazon_' + model_name + '_comet_autocomplete.json'
dataset = load_dataset(datapath + read_filename)

### Data Pre-processing

In [None]:
# Initialise tokenizer
def get_tokenizer(model_name):
    """Return the pretrained tokenizer that matches *model_name*.

    Args:
        model_name: one of ``'distilbert'``, ``'bert'`` or ``'roberta'``.

    Raises:
        ValueError: if *model_name* is not one of the supported names
            (previously this surfaced as an ``UnboundLocalError``).
    """
    if model_name == 'distilbert':
        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)
    elif model_name == 'bert':
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    elif model_name == 'roberta':
        tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)
    else:
        raise ValueError(
            "Unsupported model_name {!r}; expected 'distilbert', 'bert' or 'roberta'".format(model_name)
        )
    return tokenizer

# Rebinds the module-level `tokenizer` to the classifier's tokenizer; from
# here on get_sentence would no longer see the masked-LM BERT tokenizer.
tokenizer = get_tokenizer(model_name)

In [None]:
# Tokenization: convert raw sentences to token-id lists
def encode_all(sentences, model_name):
    """Encode every sentence with the module-level ``tokenizer``.

    Special tokens are added to each sequence. *model_name* is accepted for
    interface compatibility but is not used.

    Returns a list with one token-id sequence per input sentence.
    """
    return [tokenizer.encode(text, add_special_tokens=True) for text in sentences]

In [None]:
# adding attention mask
def get_attn(input_ids):
    """Build attention masks for padded token-id sequences.

    A position gets mask value 1 when its token id is greater than 0 and 0
    otherwise, so this is only meaningful when 0 is the padding id and no
    real token uses id 0.

    Returns a list of lists with the same layout as *input_ids*.
    """
    return [[int(token_id > 0) for token_id in row] for row in input_ids]

In [None]:
# Create Dataset
def create_dataset(dataset):
    """Tokenise, pad and mask every example of *dataset*.

    Args:
        dataset: iterable of dicts with ``sentence`` (str), ``support``
            (list of str, entries may be ``None``) and ``label``.

    Returns:
        A list of ``(entry, label)`` tuples. ``entry`` holds the raw text,
        the padded token ids and attention mask of the sentence, and the
        padded token ids and attention masks of the support sentences. All
        sequences have length ``MAX_LEN``.
    """
    # Pad with the tokenizer's own padding id. Hard-coding 0 is only correct
    # for BERT-style vocabularies; RoBERTa uses 0 for its start token.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = 0

    all_data = []
    for data in tqdm(dataset):
        # A missing support sentence (None) becomes an empty string so every
        # example keeps the same number of support rows.
        support = [s if s is not None else "" for s in data['support']]

        encoded = [tokenizer.encode(data['sentence'].lower())]
        encoded.extend(tokenizer.encode(s.lower()) for s in support)

        # Real (unpadded) length of each sequence after truncation.
        lengths = [min(len(ids), MAX_LEN) for ids in encoded]

        input_ids = pad_sequences(encoded, maxlen=MAX_LEN, dtype='long', value=pad_id, truncating='post', padding='post')
        input_ids = torch.tensor(input_ids)

        # Derive the mask from the sequence lengths rather than from the
        # token ids, so a real token that happens to have id 0 is not masked.
        attn_mask = torch.tensor([[1] * n + [0] * (MAX_LEN - n) for n in lengths])

        entry = {}
        entry['raw_sentence'] = data['sentence'] + " [SUPPORT]: " + " [SEP] ".join(support)
        entry['sentence'] = input_ids[0]
        entry['sentence_mask'] = attn_mask[0]

        entry['support'] = input_ids[1:]
        entry['support_mask'] = attn_mask[1:]

        all_data.append((entry, data['label']))
    return all_data

In [None]:
# Tokenise the whole dataset once; the result is a list of (entry, label) tuples.
dataset_list = create_dataset(dataset)

### Training & Validation Split

In [None]:
# Hold out TEST_SIZE of the examples for validation; the fixed random_state
# makes the split the same on every run.
trainset, validationset = train_test_split(dataset_list, random_state=RANDOM_STATE, test_size=TEST_SIZE)

In [None]:
def save_dataset():
    """Pickle the current train and validation splits into ``modelpath``.

    The files are named ``trainset-comet-<model_name>.data`` and
    ``validationset-comet-<model_name>.data``.
    """
    def _dump(prefix, split):
        # One pickle file per split, named after the split and the model.
        with open(modelpath + prefix + model_name + '.data', 'wb') as out:
            pickle.dump(split, out)

    _dump('trainset-comet-', trainset)
    _dump('validationset-comet-', validationset)

In [None]:
# Persist the split to disk as pickle files under modelpath.
save_dataset()

### Data Loaders

In [None]:
# Training batches are reshuffled every epoch. The validation loader keeps a
# fixed order: shuffling adds nothing to evaluation and only makes the batch
# composition differ from run to run.
train_loader = DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(validationset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### Model Initialisation

In [None]:
# Load a pretrained sequence-classification model (BERT, DistilBERT or
# RoBERTa with a classification head on top) together with a fresh optimizer
# and learning-rate schedule.

def reset_model(model_name):
    """Create a fresh classifier, optimizer and scheduler for one iteration.

    Args:
        model_name: ``'distilbert'``, ``'roberta'`` or ``'bert'``.

    Returns:
        ``(model, optimizer, scheduler)``. The model is moved to the
        module-level ``device``; the scheduler decays the learning rate
        linearly over ``len(train_loader) * EPOCHS`` steps without warm-up.

    Raises:
        ValueError: if *model_name* is not one of the supported names
            (previously this surfaced as an ``UnboundLocalError``).
    """
    if model_name == 'distilbert':
        model = DistilBertForSequenceClassification.from_pretrained(
                                                'distilbert-base-uncased',
                                                num_labels = 2,
                                                output_attentions = True,
                                                output_hidden_states = False,
                                            )
    elif model_name == 'roberta':
        model = RobertaForSequenceClassification.from_pretrained(
                                                'roberta-base',
                                                num_labels = 2,
                                                output_attentions = True,
                                                output_hidden_states = True,
                                            )
    elif model_name == 'bert':
        model = BertForSequenceClassification.from_pretrained(
                                                'bert-base-uncased',
                                                num_labels = 2,
                                                output_attentions = True,
                                                output_hidden_states = True,
                                            )
    else:
        raise ValueError(
            "Unsupported model_name {!r}; expected 'distilbert', 'roberta' or 'bert'".format(model_name)
        )
    model.to(device)

    optimizer = AdamW(model.parameters(),
                  lr = LEARNING_RATE,
                  eps = EPS
                )

    # One scheduler step is taken per training batch (see train), so the
    # schedule spans every batch of every epoch.
    total_steps = len(train_loader) * EPOCHS

    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps = 0, # Default value in run_glue.py
                                                num_training_steps = total_steps)
    return model, optimizer, scheduler

### Utility

In [None]:
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose arg-max class equals the label.

    Args:
        preds: 2-D numpy array of per-class scores, one row per example.
        labels: numpy array with one class index per example.
    """
    predicted = np.argmax(preds, axis=1).flatten()
    expected = labels.flatten()
    hits = (predicted == expected).sum()
    return hits / expected.size


def format_time(elapsed):
    """Format a duration given in seconds as ``h:mm:ss``.

    The value is rounded to the nearest whole second first; the text is
    whatever ``str(datetime.timedelta)`` produces for that many seconds.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

## Training

In [None]:
# Seed Python's, NumPy's and PyTorch's (CPU and CUDA) random number
# generators. The seeds are set once here, not once per iteration.
random.seed(SEED_VAL)
np.random.seed(SEED_VAL)
torch.manual_seed(SEED_VAL)
torch.cuda.manual_seed_all(SEED_VAL)

In [None]:
def train(model, optimizer, scheduler):
    """Train *model* for one epoch over the module-level ``train_loader``.

    Only the main sentence (``sentence`` / ``sentence_mask``) of each batch
    is fed to the model. Gradients are clipped to a norm of 1.0 and the
    scheduler is stepped after every batch.

    Returns:
        The mean training loss over all batches, or ``0.0`` when the loader
        yields no batches. (The value used to be computed and discarded.)
    """
    total_loss = 0
    model.train()
    for batch, labels in train_loader:
        b_input_ids = batch['sentence'].to(device)
        b_input_mask = batch['sentence_mask'].to(device)
        b_labels = labels.to(device)

        # Clear gradients left over from the previous batch.
        model.zero_grad()

        outputs = model(b_input_ids,
                        attention_mask=b_input_mask,
                        labels=b_labels)

        # With labels supplied, the first output is the loss.
        loss = outputs[0]
        total_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    num_batches = len(train_loader)
    return total_loss / num_batches if num_batches else 0.0

In [None]:
def test(model):
    """Evaluate *model* on the module-level ``test_loader``.

    Returns:
        ``[f1, precision, recall, accuracy]`` where F1, precision and recall
        are macro-averaged and accuracy is the fraction of correctly
        classified examples over the whole validation set.
    """
    model.eval()

    all_predictions = []
    all_labels = []

    for batch, b_labels in test_loader:
        with torch.no_grad():
            outputs = model(batch['sentence'].to(device),
                            attention_mask=batch['sentence_mask'].to(device))

        # No labels were passed, so the first output holds the logits.
        logits = outputs[0].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        all_predictions.extend(np.argmax(logits, axis=1).flatten())
        all_labels.extend(label_ids.flatten())

    # Accuracy over all examples. Averaging per-batch accuracies would give a
    # smaller final batch the same weight as a full one.
    accuracy = np.mean(np.asarray(all_predictions) == np.asarray(all_labels))

    # scikit-learn expects (y_true, y_pred). Passing them the other way round
    # swaps precision and recall.
    f1 = f1_score(all_labels, all_predictions, average = 'macro')
    precision = precision_score(all_labels, all_predictions, average = 'macro')
    recall = recall_score(all_labels, all_predictions, average = 'macro')

    return [f1, precision, recall, accuracy]

In [None]:
def save_model(filename = modelpath + model_name +'_comet.pb'):
    """Serialise the module-level ``model`` to *filename* with ``torch.save``.

    The default path is computed once, when this function is defined, from
    the values ``modelpath`` and ``model_name`` have at that moment. The
    object saved is whatever the global name ``model`` refers to at call
    time, not an argument.
    """
    torch.save(obj=model, f=filename)

### Run the model

In [None]:
def run(model, optimizer, scheduler):
    """Train for ``EPOCHS`` epochs and return the best validation scores.

    After every epoch the model is evaluated with ``test``; whenever the
    first score (F1) beats the best one so far, the scores are kept and
    ``save_model()`` is called. Note that ``save_model`` stores the
    module-level ``model``, so callers should pass that same object here.

    Returns:
        The ``[f1, precision, recall, accuracy]`` list of the best epoch, or
        ``[0, 0, 0, 0]`` if no epoch scored an F1 above zero.
    """
    started = time.time()
    best = [0, 0, 0, 0]

    for epoch_idx in range(EPOCHS):
        train(model, optimizer, scheduler)

        elapsed = format_time(time.time() - started)
        print('Epoch {:} / {:}    Elapsed: {:}.'.format(epoch_idx + 1, EPOCHS, elapsed))

        scores = test(model)
        if scores[0] > best[0]:
            best = scores
            save_model()

    return best

In [None]:
# Repeat the full train/evaluate cycle TOTAL_ITERATIONS times, each time from
# a freshly loaded pretrained model, and collect the best scores of each run.
all_results = []
t0 = time.time()

for iteration in range(TOTAL_ITERATIONS):
    it0 = time.time()
    print('-'*50)
    print('Iteration {:2d}'.format(iteration+1))
    print('-'*50)
    # `model` must stay a module-level name: save_model(), called from run(),
    # saves the global `model` rather than an argument.
    model, optimizer, scheduler = reset_model(model_name)
    result = run(model, optimizer, scheduler)
    all_results.append(result)
    print('-'*50)
    print('Result for this iteration: ', result)
    print('Time taken for this iteration: {:}'.format(format_time(time.time() - it0)))

# Final results is the average of all the iterations
# (zip(*all_results) groups the same metric across iterations).
final_results = [sum(value)/len(value) for value in zip(*all_results)]

# Metric order matches the list returned by test(): F1, precision, recall, accuracy.
print('-'*50)
print('Final Results for the COMET + ' + model_name.upper() + ' model')
print('-'*50)
print('F1-score: {0:.4f}'.format(final_results[0]))
print('Precision: {0:.4f}'.format(final_results[1]))
print('Recall: {0:.4f}'.format(final_results[2]))
print('Accuracy: {0:.4f}'.format(final_results[3]))
print('Time Taken: {:}'.format(format_time(time.time() - t0)))
print('-'*50)
