In [None]:
pip install dgl

In [None]:
pip install transformers

In [None]:
# Mount Google Drive at /content/gdrive (Colab only); the data and model paths
# configured further down live under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
import os
import sys
import json
import random
import time
import re
import datetime
import pickle

from tqdm import tqdm

import numpy as np

import torch
from torch import nn
import torch.nn.functional as F

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, recall_score, precision_score, confusion_matrix

from transformers import BertForSequenceClassification,  DistilBertForSequenceClassification, RobertaForSequenceClassification
from transformers import BertForMaskedLM, DistilBertForMaskedLM, RobertaForMaskedLM
from transformers import BertTokenizer, DistilBertTokenizer, RobertaTokenizer

from transformers import BertConfig, DistilBertConfig, RobertaConfig
from transformers import AdamW, get_linear_schedule_with_warmup 

from transformers import logging
logging.set_verbosity_error()

import warnings
warnings.filterwarnings('ignore')

In [None]:
import dgl
from dgl import DGLGraph
from dgl.data import MiniGCDataset

from dgl.nn.pytorch import conv as dgl_conv
from dgl.data import citegrh

# Load Pytorch as backend
dgl.load_backend('pytorch')

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### Dataset Configs

In [None]:
# Google Drive locations: the project root and its data/ and model/ sub-folders
basepath = '/content/gdrive/MyDrive/ljmu-ms-thesis/'
datapath = basepath + 'data/'
modelpath = basepath + 'model/'

In [None]:
# Integer code assigned to each named edge type
EDGE_TYPE = dict(
    bidirectional=1,
    input2COMET=2,
    COMET2input=3,
)

In [None]:
# common config settings
EPOCHS = 10  # number of epochs executed by run() for each training run
TOTAL_ITERATIONS = 3  # number of independent runs whose metrics are averaged at the end
TEST_SIZE = 0.2
MAX_LEN = 32  # NOTE: reassigned to 128 in the cell that defines create_dataset_cached
BATCH_SIZE = 16  # mini-batch size passed to the DataLoaders
RANDOM_STATE = 2022
LEARNING_RATE = 2e-5  # NOTE(review): reset_model() uses its own learning rate (2e-3), not this value
EPS = 1e-8
SEED_VAL = 42

In [None]:
# choose one model at a time; the name selects the tokenizer and is embedded in
# the dataset, pickle and model file names used throughout the notebook
#model_name = 'bert'
model_name = 'roberta'
#model_name = 'distilbert'

### Dataset Load

In [None]:
def load_dataset(filename):
    """Read a JSON-lines file into a list of classification entries.

    Every non-blank line must be a JSON object with the keys ``sentence``,
    ``label`` and ``common_sense`` (itself a mapping).

    Args:
        filename: Path of the JSON-lines file to read.

    Returns:
        A list with one dict per record:
        ``'sentences'`` holds the input sentence followed by the ``xWant`` and
        ``xEffect`` values of ``common_sense`` (in the order they occur in the
        record); ``'label'`` holds the record's label converted to ``int``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the required keys.
    """
    wanted_relations = ('xWant', 'xEffect')
    data = []

    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(filename, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) used to crash json.loads.
                continue

            d = json.loads(line)

            sentences = [d['sentence']]
            sentences.extend(
                value for key, value in d['common_sense'].items()
                if key in wanted_relations
            )
            data.append({'sentences': sentences, 'label': int(d['label'])})

    return data

In [None]:
# The dataset file name depends on the selected model
read_filename = 'amazon_{}_comet.json'.format(model_name)
dataset = load_dataset(datapath + read_filename)

### Pre-trained transformer model (BERT / DistilBERT / RoBERTa)

In [None]:
def get_attn(input_ids):
    """Build attention masks for a batch of padded token-id sequences.

    Args:
        input_ids: Iterable of sequences of integer token ids.

    Returns:
        A list of lists laid out like ``input_ids`` that holds ``1`` wherever
        the token id is greater than zero and ``0`` everywhere else.
    """
    # NOTE(review): id 0 is treated as padding; confirm that this matches the
    # padding id of the tokenizer that produced the ids.
    return [[1 if token_id > 0 else 0 for token_id in sent] for sent in input_ids]

In [None]:
# Select the tokenizer class and pretrained checkpoint that match the chosen model family.
def get_tokenizer(model_name):
    """Return the pretrained tokenizer that belongs to ``model_name``.

    Args:
        model_name: One of ``'distilbert'``, ``'bert'`` or ``'roberta'``.

    Returns:
        The tokenizer created by ``from_pretrained`` for the matching
        checkpoint (``do_lower_case=True`` is passed for every family, as before).

    Raises:
        ValueError: If ``model_name`` is not a supported name. Previously this
            case surfaced as an ``UnboundLocalError`` on the return statement.
    """
    tokenizers = {
        'distilbert': (DistilBertTokenizer, 'distilbert-base-uncased'),
        'bert': (BertTokenizer, 'bert-base-uncased'),
        'roberta': (RobertaTokenizer, 'roberta-base'),
    }
    if model_name not in tokenizers:
        raise ValueError(
            'Unsupported model_name {!r}; expected one of {}'.format(
                model_name, sorted(tokenizers)))

    tokenizer_cls, checkpoint = tokenizers[model_name]
    return tokenizer_cls.from_pretrained(checkpoint, do_lower_case=True)

tokenizer = get_tokenizer(model_name)

In [None]:
# Restore the previously trained comet model for the selected architecture and
# place it on the active device.
# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
load_model = '{}_comet.pb'.format(model_name)
model = torch.load(modelpath + load_model, map_location=device)
model.to(device)

## Load dataset from pickle dump

In [None]:
def get_pickle_file(filename):
    """Deserialize and return the object stored in the pickle file ``filename``.

    Only use this on trusted files: unpickling can execute arbitrary code.
    """
    with open(filename, mode='rb') as handle:
        payload = pickle.load(handle)
    return payload

In [None]:
# Sequence length used when padding/truncating token ids for feature extraction
# (overrides the MAX_LEN value from the common config settings).
MAX_LEN = 128

def create_dataset_cached(dataset):
    """Convert pre-tokenized samples into ``(graph, label, raw_sentence)`` triples.

    For every sample the main sequence and its support sequences are padded to
    ``MAX_LEN``, passed through the notebook-level transformer ``model``, and the
    last-layer hidden state at position 0 of each sequence becomes the feature
    ``'x'`` of one graph node. Node 0 is the main sequence; edges run 0 -> 1 and
    0 -> 2.

    Args:
        dataset: Iterable of ``(data, label)`` pairs where ``data`` is a mapping
            with the keys ``'sentence'``, ``'support'`` and ``'raw_sentence'``.
            ``'sentence'`` and each entry of ``'support'`` are presumably
            token-id sequences (they are fed to ``pad_sequences``) - TODO confirm.
            The fixed three-node graph presumably requires exactly two support
            sequences per sample.

    Returns:
        A list of ``(dgl graph, label, raw_sentence)`` tuples.

    Notes:
        Relies on the globals ``model``, ``device`` and ``MAX_LEN``. ``model``
        must still be the transformer here; the training loop later rebinds the
        same name to the GraphSAGE model. The function switches ``model`` to
        evaluation mode.
    """
    all_data = []

    # Pure feature extraction: make dropout inactive so features are deterministic.
    model.eval()

    for data, label in tqdm(dataset):
        token_ids = [data['sentence']]
        token_ids.extend(data['support'])

        input_ids = pad_sequences(token_ids, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")
        att_mask = torch.tensor(get_attn(input_ids))
        input_ids = torch.tensor(input_ids)

        # No gradients are needed, which avoids building an autograd graph per sample.
        with torch.no_grad():
            # Hidden states are only returned when explicitly requested; with
            # return_dict=False and no labels the tuple starts with the logits,
            # followed by the tuple of per-layer hidden states.
            outputs = model(input_ids.to(device), attention_mask=att_mask.to(device), return_dict=False, output_hidden_states=True)
        hidden_states = outputs[1]

        # Last layer, first position of every sequence. clone() keeps only this
        # slice alive instead of the whole hidden-state tensor.
        output = hidden_states[-1][:, 0, :].detach().to('cpu').clone()

        graph = dgl.graph((torch.tensor([0, 0]),
                           torch.tensor([1, 2])))  # Edge type is by default chosen as input to comet

        graph.ndata['x'] = output
        all_data.append((graph, label, data['raw_sentence']))

    # Release cached GPU blocks once, instead of synchronising after every sample.
    torch.cuda.empty_cache()
    return all_data

In [None]:
# Load the pickled train / validation splits that belong to the selected model
trainset_ = get_pickle_file('{}trainset-comet-{}.data'.format(modelpath, model_name))
validationset_ = get_pickle_file('{}validationset-comet-{}.data'.format(modelpath, model_name))

In [None]:
# Convert both splits into (graph, label, raw sentence) samples
trainset = create_dataset_cached(trainset_)
validationset = create_dataset_cached(validationset_)

### Training Data generation

In [None]:
def collate(samples):
    """Merge a list of ``(graph, label, raw_sentence)`` samples into one batch.

    Returns:
        A tuple ``(batched_graph, labels, raw_sentences)``: the ``dgl.batch`` of
        all sample graphs, a label tensor of shape ``(1, len(samples))`` and the
        list of raw sentences in sample order.
    """
    # Transpose the list of triples into three parallel lists.
    graphs, labels, raw_sentence = (list(column) for column in zip(*samples))
    label_tensor = torch.tensor([labels])
    return dgl.batch(graphs), label_tensor, raw_sentence

In [None]:
# Mini-batch size for the DataLoaders (same value as in the common config settings)
BATCH_SIZE = 16

In [None]:
# Training batches are reshuffled every epoch. Validation batches keep a fixed
# order so that evaluation is deterministic and does not consume random numbers.
train_loader = DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate)
test_loader = DataLoader(validationset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate)

## Basic SAGE Model

In [None]:
class GraphSAGEModel(nn.Module):
    """One SAGEConv layer followed by a two-stage linear classifier.

    Every input graph is expected to have ``node_count`` (3) nodes. Node
    features go through the SAGEConv layer, each node is projected to 5 values,
    the projections of a graph's nodes are concatenated and mapped to
    ``out_dim`` class scores, and log-probabilities are returned.

    Args:
        in_feats: Width of the input node features.
        n_hidden: Output width of the SAGEConv layer.
        out_dim: Number of classes.
        n_layers: Accepted for interface compatibility; a single SAGEConv layer
            is built regardless of this value.
        activation: Accepted for interface compatibility; the SAGEConv layer is
            created with ``activation=None``.
        dropout: Feature dropout rate of the SAGEConv layer.
        aggregator_type: Aggregator name passed to SAGEConv.
    """

    def __init__(self,
                 in_feats,
                 n_hidden,
                 out_dim,
                 n_layers,
                 activation,
                 dropout,
                 aggregator_type):
        super(GraphSAGEModel, self).__init__()
        self.node_count = 3
        self.layers = nn.ModuleList()
        # Not used by forward(); kept so existing attribute access keeps working.
        self.index = torch.tensor([1, 2]).to(device)

        # input layer - sized from the constructor arguments instead of the
        # previously hard-coded 768 / 64 / 2
        self.layers.append(dgl_conv.SAGEConv(in_feats, n_hidden, aggregator_type,
                                         feat_drop=dropout, activation=None))
        self.lin1 = nn.Linear(n_hidden, 5)
        self.lin2 = nn.Linear(5 * self.node_count, out_dim)

    def forward(self, g, features):
        """Return log-probabilities of shape ``(number of graphs, out_dim)``.

        Args:
            g: Batched graph whose node count is a multiple of ``node_count``.
            features: Node feature tensor with one row per node of ``g``.
        """
        h = features
        for layer in self.layers:
            h = layer(g, h)

        # Sizes are read from the layers themselves, so instances restored from
        # older pickles (which carry no extra size attributes) keep working.
        hidden_dim = self.lin1.in_features
        node_dim = self.lin1.out_features
        n_classes = self.lin2.out_features

        # Group node rows per graph, project every node, then classify the
        # concatenation of a graph's node projections.
        x = self.lin1(h.view(-1, self.node_count, hidden_dim))
        x = self.lin2(x.view(-1, node_dim * self.node_count))

        return F.log_softmax(x.view(-1, n_classes), dim=-1)

In [None]:
# default hyperparameters for the GraphSAGE classifier
n_hidden = 64
n_layers = 2  # NOTE(review): GraphSAGEModel accepts this value but builds a single SAGEConv layer
dropout = 0.5
aggregator_type = 'gcn'
n_classes = 2
# width of a node feature vector, read from the first training graph
in_feats = trainset[0][0].ndata['x'].shape[1]

def reset_model(lr=2e-3, weight_decay=5e-4):
    """Create a freshly initialised GraphSAGEModel together with its optimizer.

    Args:
        lr: Learning rate for Adam (default unchanged: 2e-3).
        weight_decay: L2 penalty for Adam (default unchanged: 5e-4).

    Returns:
        A ``(model, optimizer)`` tuple; the model has been moved to ``device``.
    """
    model = GraphSAGEModel(in_feats,
                           n_hidden,
                           n_classes,
                           n_layers,
                           F.relu,
                           dropout,
                           aggregator_type)

    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    return model, optimizer

In [None]:
# Negative log-likelihood loss; GraphSAGEModel.forward already returns
# log-probabilities (log_softmax), which is the input NLLLoss expects.
criteria = nn.NLLLoss()

### Training and evaluation

In [None]:
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    Args:
        preds: 2-D array of per-class scores, one row per sample.
        labels: Array of class ids; it is flattened before the comparison.
    """
    predicted_classes = np.asarray(preds).argmax(axis=1).flatten()
    expected_classes = labels.flatten()
    n_correct = np.sum(predicted_classes == expected_classes)
    return n_correct / len(expected_classes)

In [None]:
def train(model, optimizer, epoch):
    """Run one optimisation pass over the notebook-level ``train_loader``.

    Args:
        model: Network called as ``model(graph, node_features)``.
        optimizer: Optimizer that updates ``model``'s parameters.
        epoch: Index of the current epoch; accepted for the caller's
            convenience and not used.

    Returns:
        The mean training loss over all batches, or ``0.0`` when the loader
        yields no batch. (The function used to accumulate the loss in a
        variable named ``eval_accuracy`` and return ``None``.)
    """
    model.train()

    total_loss = 0.0
    n_batches = 0

    for g, label, _ in train_loader:
        g = g.to(device)
        output = model(g, g.ndata['x'])
        # Labels arrive with shape (1, batch) from collate(); the loss needs a flat vector.
        loss = criteria(output, label.view(-1).to(device))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        n_batches += 1

    return total_loss / n_batches if n_batches else 0.0

In [None]:
def test(model, test_loader):
    """Evaluate ``model`` on every batch of ``test_loader``.

    Args:
        model: Network called as ``model(graph, node_features)`` that returns
            per-class scores with one row per graph.
        test_loader: Iterable yielding ``(graph, labels, raw_sentences)`` batches.

    Returns:
        ``[f1, precision, recall, accuracy]``. F1, precision and recall are
        macro-averaged; accuracy is the share of correctly classified samples
        over the whole loader.

    Raises:
        ValueError: If the loader yields no samples.
    """
    model.eval()

    all_predictions = []
    all_labels = []

    for g, label, _ in test_loader:
        g = g.to(device)

        with torch.no_grad():
            output = model(g, g.ndata['x'])

        scores = output.detach().cpu().numpy()
        label_ids = label.to('cpu').numpy()

        all_predictions.extend(np.argmax(scores, axis=1).flatten())
        all_labels.extend(label_ids.flatten())

    if not all_labels:
        raise ValueError('test_loader produced no samples to evaluate')

    # Accuracy over all samples. Averaging per-batch accuracies over-weights a
    # smaller final batch.
    accuracy = np.mean(np.asarray(all_predictions) == np.asarray(all_labels))

    # scikit-learn expects (y_true, y_pred). With the arguments swapped the
    # reported precision and recall were exchanged.
    f1 = f1_score(all_labels, all_predictions, average='macro')
    precision = precision_score(all_labels, all_predictions, average='macro')
    recall = recall_score(all_labels, all_predictions, average='macro')

    # The unused confusion-matrix lookups were removed: indexing matrix[0][1]
    # fails when only a single class occurs in the labels and predictions.
    return [f1, precision, recall, accuracy]

In [None]:
def save_model(filename = modelpath + model_name +'_gcn_comet.pb', model_to_save=None):
    """Serialize a model to ``filename`` with ``torch.save``.

    Args:
        filename: Destination path; defaults to the GCN-comet file for the
            selected ``model_name`` inside ``modelpath``.
        model_to_save: Model object to store. When omitted, the notebook-level
            ``model`` is saved, which is the behavior existing callers rely on.
    """
    if model_to_save is None:
        # The global `model` names different objects over the course of the
        # notebook; pass the model explicitly to avoid saving the wrong one.
        model_to_save = model
    torch.save(model_to_save, filename)

In [None]:
def format_time(elapsed):
    """Format a duration given in seconds as ``h:mm:ss`` (rounded to whole seconds)."""
    return str(datetime.timedelta(seconds=int(round(elapsed))))


def run(model, optimizer):
    """Train for ``EPOCHS`` epochs and return the metrics of the best epoch.

    After every epoch the model is evaluated on ``test_loader``. Whenever the
    first metric (F1) beats the best value seen so far, the metrics are kept
    and ``save_model()`` is called.

    Args:
        model: Network to train.
        optimizer: Optimizer for ``model``.

    Returns:
        ``[f1, precision, recall, accuracy]`` of the best epoch, or
        ``[0, 0, 0, 0]`` when no epoch reached an F1 above zero.
    """
    best_results = [0, 0, 0, 0]
    t0 = time.time()
    for epoch in range(EPOCHS):
        train(model, optimizer, epoch)
        elapsed = format_time(time.time() - t0)
        # The loop variable is `epoch`; the former `epoch_i` was never defined.
        print('Epoch {:} / {:}    Elapsed: {:}.'.format(epoch + 1, EPOCHS, elapsed))
        results = test(model, test_loader)
        if results[0] > best_results[0]:
            best_results = results
            # NOTE: save_model() stores the notebook-level `model`; the training
            # loop binds that name to the same object that is passed in here.
            save_model()
    return best_results

In [None]:
def _elapsed_since(start):
    """Return the wall-clock time elapsed since ``start`` formatted as ``h:mm:ss``."""
    return str(datetime.timedelta(seconds=int(round(time.time() - start))))


all_results = []
t0 = time.time()

for iteration in range(TOTAL_ITERATIONS):
    it0 = time.time()

    # Seed every random number generator so a rerun reproduces the same numbers;
    # the offset gives each iteration its own initialisation and shuffling order.
    iteration_seed = SEED_VAL + iteration
    random.seed(iteration_seed)
    np.random.seed(iteration_seed)
    torch.manual_seed(iteration_seed)

    print('-'*50)
    print('Iteration {:2d}'.format(iteration+1))
    print('-'*50)
    # NOTE: this rebinds the notebook-level `model` (previously the transformer)
    # to the GraphSAGE model; save_model() relies on that binding.
    model, optimizer = reset_model()
    result = run(model, optimizer)
    all_results.append(result)
    print('-'*50)
    print('Result for this iteration: ', result)
    print('Time taken for this iteration: {:}'.format(_elapsed_since(it0)))

# Final results is the average of all the iterations
final_results = [sum(value)/len(value) for value in zip(*all_results)]

print('-'*50)
print('Final Results for the baseline model: ' + model_name)
print('-'*50)
print('F1-score: {0:.4f}'.format(final_results[0]))
print('Precision: {0:.4f}'.format(final_results[1]))
print('Recall: {0:.4f}'.format(final_results[2]))
print('Accuracy: {0:.4f}'.format(final_results[3]))
print('Time Taken: {:}'.format(_elapsed_since(t0)))
print('-'*50)
