In [None]:
from google.colab import drive
drive.mount('/gdrive')

In [None]:
! pip install transformers -q
! pip install dgl-cu100 -q
! pip install word2vec -q

In [None]:
import re
import pandas as pd
from pathlib import Path
import matplotlib.cm as cm
import numpy as np
import pandas as pd
from typing import *
from tqdm.notebook import tqdm
from sklearn.utils.extmath import softmax
from sklearn import model_selection
from sklearn.metrics import classification_report, f1_score

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import transformers
from transformers import AdamW
import dgl
import word2vec

In [None]:
def seed_all(seed = 42):
  """
  Fix seed for reproducibility

  Seeds Python's `random` module, NumPy's global RNG and PyTorch (CPU and,
  when CUDA is available, every GPU), then puts cuDNN into deterministic
  mode.

  Args:
    seed: integer seed shared by all generators (default 42).
  """
  # Imports stay local so the helper works even if this cell is executed
  # before the notebook's import cells.
  import random
  import numpy as np
  import torch

  # python RNG
  random.seed(seed)

  # numpy RNG
  np.random.seed(seed)

  # pytorch RNGs
  torch.manual_seed(seed)
  if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

  # Deterministic kernels alone are not enough: with benchmarking enabled the
  # cuDNN autotuner may still choose different algorithms from run to run.
  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = False

In [None]:
class config:
  """
  Namespace of experiment settings read by the rest of the notebook.

  Nothing instantiates this class; attributes are accessed as `config.NAME`.
  Note that evaluating the class body loads the tokenizer (see TOKENIZER).
  """
  # When True, process_data prepends the subject string to each sentence.
  ADD_SUBJ = False
  # Directory that receives model checkpoints and submission.csv.
  SAVE_DIR = 'bert_gcn'
  # Number of folds used by run_k_fold.
  KFOLD = 3
  # Fixed length that BERT ids and GCN ids are padded / truncated to.
  MAX_LEN = 96
  # Passed as random_state to the fold splitter in run_k_fold.
  # NOTE: seed_all() is called without arguments, so it uses its own default.
  SEED = 42
  # Pretrained checkpoint name used for both the tokenizer and BertFeature.
  MODEL = 'bert-base-cased'
  # Loaded once, at class-definition time, and shared by every DEFTDataset.
  TOKENIZER = transformers.BertTokenizer.from_pretrained(MODEL)
  # Number of passes over the training loader in run().
  EPOCHS = 1
  TRAIN_BATCH_SIZE = 32
  # Used for both the validation loader and the test loader in run().
  VALID_BATCH_SIZE = 32
  # CSV files read with pd.read_csv; the paths are relative to the current
  # working directory.
  TRAIN_FILE = 'task1_train.csv'
  VAL_FILE =  'task1_dev.csv'
  TEST_FILE = 'task1_test.csv'

In [None]:
import os
os.chdir('/gdrive/My Drive/DEFINITION EXTRACTION/DEFT_Updated')

In [None]:
class AverageMeter:
    """
    Tracks the latest value and the weighted running mean of a metric.
    Source : https://www.kaggle.com/abhishek/bert-base-uncased-using-pytorch/

    Attributes:
        val: most recent value passed to `update`.
        sum: sum of `val * n` over all updates since the last reset.
        count: sum of `n` over all updates since the last reset.
        avg: `sum / count`.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Set every statistic back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record `val`, weighted as `n` observations, and refresh the mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

In [None]:
class EarlyStopping:
    """
    Early stopping utility
    Source : https://www.kaggle.com/abhishek/bert-base-uncased-using-pytorch/

    Call the instance once per epoch with the validation score. The model's
    state_dict is written to `model_path` whenever the score improves by at
    least `delta`; after `patience` consecutive epochs without such an
    improvement `early_stop` becomes True. The caller is responsible for
    checking `early_stop` and leaving its training loop.

    Args:
        patience: number of non-improving epochs tolerated.
        mode: "max" if a larger score is better, "min" if smaller is better.
        delta: minimum change that counts as an improvement.
    """

    def __init__(self, patience=7, mode="max", delta=0.001):
        self.patience = patience
        self.counter = 0
        self.mode = mode
        self.best_score = None
        self.early_stop = False
        self.delta = delta
        # `np.inf` rather than the `np.Inf` alias, which NumPy 2.0 removed.
        if self.mode == "min":
            self.val_score = np.inf
        else:
            self.val_score = -np.inf

    def __call__(self, epoch_score, model, model_path):
        """Update the stopping state with this epoch's score."""
        # Internally a higher `score` is always better, so "min" mode negates.
        if self.mode == "min":
            score = -1.0 * epoch_score
        else:
            score = np.copy(epoch_score)
        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(epoch_score, model, model_path)
        elif score < self.best_score + self.delta:
            self.counter += 1
            print('EarlyStopping counter: {} out of {}'.format(self.counter, self.patience))
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(epoch_score, model, model_path)
            self.counter = 0

    def save_checkpoint(self, epoch_score, model, model_path):
        """Save `model.state_dict()` unless the score is NaN or infinite."""
        # A membership test against [np.nan, ...] only matches the np.nan
        # singleton (NaN != NaN), so a freshly computed NaN slipped through.
        # np.isfinite rejects every NaN and both infinities.
        if np.isfinite(epoch_score):
            print('Validation score improved ({} --> {}). Saving model!'.format(self.val_score, epoch_score))
            torch.save(model.state_dict(), model_path)
        self.val_score = epoch_score

In [None]:
def process_data(text, subject, tokenizer, max_len, label):
  """
  Clean one sentence and encode it as fixed-length BERT inputs.

  Args:
    text: raw sentence; an optional leading number (with optional '.') is
      stripped. Only the first line is kept because '.' in the pattern does
      not match a newline.
    subject: subject string; prepended to the text when config.ADD_SUBJ is set.
    tokenizer: object exposing `encode(text, add_special_tokens=True)` that
      returns a list of ids. If it also exposes `sep_token_id`, that id is
      preserved at the end of a truncated sequence.
    max_len: length the ids and mask are padded / truncated to.
    label: passed through unchanged.

  Returns:
    dict with keys 'text' (cleaned text), 'subject', 'ids', 'mask', 'label'.
  """

  ## remove initial numbers
  # Raw string: '\s' and '\d' in a normal literal are invalid escape sequences.
  # The pattern always matches (every part is optional), so [0] is safe.
  text = re.findall(r'^\s*\d*\s*\.?\s*(.*)', text)[0]

  ## add subject
  if config.ADD_SUBJ:
    text = subject + ' ' + text

  token_ids = tokenizer.encode(text, add_special_tokens=True)

  if len(token_ids) > max_len:
    closing_id = token_ids[-1]
    token_ids = token_ids[0:max_len]
    # Plain slicing drops the closing separator that the tokenizer appended;
    # put it back so a truncated sequence is still terminated.
    sep_id = getattr(tokenizer, 'sep_token_id', None)
    if max_len > 1 and sep_id is not None and closing_id == sep_id:
      token_ids[-1] = sep_id

  mask = [1] * len(token_ids)

  # Right-pad with id 0 and mask 0 up to max_len (no-op when already full).
  padding = max_len - len(token_ids)
  token_ids = token_ids + ([0] * padding)
  mask = mask + ([0] * padding)

  return {'text':text,
          'subject':subject,
          'ids':token_ids,
          'mask':mask,
          'label':label
          }

## The [official implementation](https://github.com/HuangLianzhe/TextLevelGCN) of Text Level GCN has been used to build the BERT GCN Joint Model.

In [None]:
class DEFTDataset:
    """
    Map-style dataset yielding BERT inputs and GCN word ids for one sentence.

    Args:
        text: indexable collection of sentences.
        subject: indexable collection of subject strings (same length).
        label: indexable collection of labels (same length).
        vocab: optional list of words defining the GCN vocabulary. When
            omitted, a vocabulary is built from `text` (words seen at least 5
            times, with 'UNK' at index 0) and also written to 'vocab.txt'.
            Pass the training set's vocabulary for validation / test data so
            that word ids agree with the ones the model was built with.
    """

    def __init__(self, text, subject, label, vocab=None):
        self.text = text
        self.subject = subject
        self.label = label
        self.tokenizer = config.TOKENIZER
        self.max_len = config.MAX_LEN

        if vocab is None:
            # The earlier code first called get_vocab(None) inside a bare
            # `except`; that call could never succeed, so the vocabulary was
            # always rebuilt. Do that directly instead of hiding errors.
            self.vocab = []
            self.build_vocab(self.text, min_count=5)
        else:
            self.vocab = vocab

        # word -> id and id -> word lookups
        self.d = {word: idx for idx, word in enumerate(self.vocab)}
        self.rev_d = {idx: word for word, idx in self.d.items()}


    def word2id(self, word):
        """Return the id of `word`, falling back to the id of 'UNK'."""
        idx = self.d.get(word)
        if idx is None:
            idx = self.d['UNK']
        return idx

    def get_vocab(self, filename):
        """Load a newline-separated vocabulary file into `self.vocab`."""
        with open(filename, encoding='utf-8') as f:
            self.vocab = f.read().split('\n')

    def build_vocab(self, content, min_count=10):
        """
        Build the vocabulary from space-split `content`.

        Words occurring fewer than `min_count` times are dropped, 'UNK' is
        placed at index 0, and the result is stored in `self.vocab` and
        written to 'vocab.txt' in the current working directory.
        """
        # Single pass with a dict: keeps first-appearance order while avoiding
        # the quadratic `word not in list` scan.
        freq = {}
        for c in content:
            for word in c.split(' '):
                freq[word] = freq.get(word, 0) + 1

        results = ['UNK'] + [word for word, n in freq.items() if n >= min_count]

        with open('vocab.txt', 'w', encoding='utf-8') as f:
            f.write('\n'.join(results))
        self.vocab = results


    def get_gcn_data(self, text, max_len):
        """
        Convert `text` into a fixed-length list of vocabulary ids.

        Returns:
            dict with 'ids' (padded with 0 or truncated to `max_len`) and
            'length' (number of real, non-padding ids in 'ids').
        """
        seq = [self.word2id(word) for word in text.split(' ')]

        padding = max_len - len(seq)
        if padding >= 0:
            length = len(seq)
            seq = seq + [0] * padding
        else:
            seq = seq[0:max_len]
            # Report the length of what is actually returned, not of the
            # untruncated sentence.
            length = len(seq)

        return {'ids': seq,
                'length': length,
                }

    def __len__(self):
        return len(self.text)

    def __getitem__(self, item):
        """Return the tensors and metadata for sample `item`."""
        data = process_data(
            self.text[item],
            self.subject[item],
            self.tokenizer,
            self.max_len,
            self.label[item],
        )

        # GCN ids are computed on the cleaned text returned by process_data.
        gcn_data = self.get_gcn_data(data['text'], self.max_len)


        return {
            'ids': torch.tensor(data["ids"], dtype=torch.long),
            'mask': torch.tensor(data["mask"], dtype=torch.long),
            'gcn_ids' : torch.tensor(gcn_data["ids"], dtype=torch.long),
            'gcn_length' : torch.tensor(gcn_data["length"], dtype=torch.long),
            'text': data['text'],
            'subject': data['subject'],
            'label': data['label'],
        }

In [None]:
def cal_PMI(helper, window_size=20):
    """
    Compute positive PMI between vocabulary words and assign edge ids.

    Args:
        helper: object that can be iterated to yield dicts with a 'text' key
            and that exposes `vocab` (list of words) and `d` (word -> id).
        window_size: co-occurrence window. For position i the neighbours are
            positions [i - window_size, i + window_size), i.e. the right edge
            is exclusive.

    Returns:
        edges_weights: float tensor of shape (count, 1); row 0 is 0.0 and row
            k holds the PMI of the edge with id k.
        edges_mappings: (V, V) int array; entry [i, j] is the edge id of the
            pair, or 0 when the pair has no positive PMI.
        count: number of edge ids including the reserved id 0.
    """
    content = [d['text'] for d in helper]

    vocab_size = len(helper.vocab)
    # co-occurence matrix
    pair_count_matrix = np.zeros((vocab_size, vocab_size), dtype=int)
    # frequency of words
    word_count = np.zeros(vocab_size, dtype=int)

    for sentence in tqdm(content):
        words = sentence.split(' ')
        for i, word in enumerate(words):
            src = helper.d.get(word)
            if src is None:
                # out-of-vocabulary words contribute nothing
                continue
            word_count[src] += 1
            start_index = max(0, i - window_size)
            end_index = min(len(words), i + window_size)
            # iterating over n-gram neighbourhood
            for j in range(start_index, end_index):
                if i == j:
                    continue
                dst = helper.d.get(words[j])
                if dst is None:
                    continue
                pair_count_matrix[src, dst] += 1

    total_count = np.sum(word_count)

    # Vectorised PMI. Pairs that never co-occur give log(0) = -inf and unseen
    # words give 0/0 = nan; both are expected and are clamped to 0 below, so
    # the warnings are silenced instead of being emitted once per cell.
    with np.errstate(divide='ignore', invalid='ignore'):
        word_prob = word_count / total_count
        pair_prob = pair_count_matrix / total_count
        pmi_matrix = np.log(pair_prob / np.outer(word_prob, word_prob))

    pmi_matrix = np.nan_to_num(pmi_matrix)
    pmi_matrix = np.maximum(pmi_matrix, 0.0)

    # map edge between two words to edge-id
    # np.nonzero walks the matrix in row-major order, which is the same order
    # a nested i/j loop would assign ids in. Id 0 is reserved for "no edge".
    rows, cols = np.nonzero(pmi_matrix)
    count = len(rows) + 1
    edges_mappings = np.zeros((vocab_size, vocab_size), dtype=int)
    edges_mappings[rows, cols] = np.arange(1, count)

    edges_weights = np.concatenate(([0.0], pmi_matrix[rows, cols]))
    edges_weights = edges_weights.reshape(-1, 1)
    edges_weights = torch.Tensor(edges_weights)

    return edges_weights, edges_mappings, count


In [None]:
class GCNModel(torch.nn.Module):
    """
    Text-level GCN: builds one small word graph per sentence and pools it.

    Args:
        hidden_size_node: dimension of the node (word) embeddings. It must
            match the dimension of the vectors in `embedding_file`, because
            they are copied into the embedding table.
        vocab: list of words; position in the list is the global word id.
        n_gram: number of neighbours on each side connected to a word.
        edges_num: number of distinct edge ids (rows of the edge-weight table).
        edges_matrix: 2-D array mapping (src word id, dst word id) -> edge id.
        max_length: sentences are cut to this many words before graph building.
        trainable_edges: if True edge weights start at 1 and are learned;
            otherwise they are frozen to `pmi`.
        pmi: tensor of shape (edges_num, 1); required when
            `trainable_edges` is False.
        cuda: kept for backward compatibility and stored as `is_cuda`.
            Tensors are created on whatever device the module's parameters
            live on, so the flag no longer forces `.cuda()`.
        embedding_file: path of the word-vector file read by `load_word2vec`.
    """

    def __init__(self,
                 hidden_size_node,
                 vocab,
                 n_gram,
                 edges_num,
                 edges_matrix,
                 max_length=350,
                 trainable_edges=True,
                 pmi=None,
                 cuda=True,
                 embedding_file='glove.6B.200d.vec.txt'
                 ):
        super(GCNModel, self).__init__()

        self.is_cuda = cuda
        self.vocab = vocab

        # Edge-weight table. The table is created exactly once: the previous
        # unconditional from_pretrained(pmi) call crashed for pmi=None even
        # when trainable edges were requested.
        self.edges_num = edges_num
        if trainable_edges:
            self.seq_edge_w = torch.nn.Embedding.from_pretrained(torch.ones(edges_num, 1), freeze=False)
        else:
            if pmi is None:
                raise ValueError('pmi weights are required when trainable_edges is False')
            self.seq_edge_w = torch.nn.Embedding.from_pretrained(pmi, freeze=True)

        # Node embeddings, initialised from pretrained vectors and fine-tuned.
        self.hidden_size_node = hidden_size_node
        self.node_hidden = torch.nn.Embedding(len(vocab), hidden_size_node)
        self.node_hidden.weight.data.copy_(torch.tensor(self.load_word2vec(embedding_file)))
        self.node_hidden.weight.requires_grad = True

        self.len_vocab = len(vocab)
        self.ngram = n_gram
        self.d = dict(zip(self.vocab, range(len(self.vocab))))
        self.max_length = max_length
        self.edges_matrix = edges_matrix

        self.dropout = torch.nn.Dropout(0.5)
        self.activation = torch.nn.ReLU()


    def word2id(self, word):
        """Return the global id of `word`, falling back to the id of 'UNK'."""
        try:
            result = self.d[word]
        except KeyError:
            result = self.d['UNK']
        return result

    def load_word2vec(self, word2vec_file):
        """
        Return an array with one pretrained vector per vocabulary word.

        Words missing from the file reuse the vector of 'the'.
        """
        model = word2vec.load(word2vec_file)
        embedding_matrix = []
        for word in self.vocab:
            try:
                embedding_matrix.append(model[word])
            except KeyError:
                embedding_matrix.append(model['the'])
        embedding_matrix = np.array(embedding_matrix)
        return embedding_matrix



    def add_seq_edges(self, doc_ids: list, old_to_new: dict):
        '''
          doc_ids : list of ids of words in a sentence (ints or 0-d tensors)
          old_to_new : the ids of the words are global; it is mapping of these ids to local ids(text-level graph)

          The function returns the edge list w.r.t. text-level graph and the corresponding global edge ids

        '''
        # int() accepts plain ints as well as 0-d tensors.
        word_ids = [int(word_id) for word_id in doc_ids]

        edges = []
        old_edge_id = []
        for index, src_word_old in enumerate(word_ids):
            src = old_to_new[src_word_old]
            window_start = max(0, index - self.ngram)
            window_end = min(index + self.ngram + 1, len(word_ids))
            for dst_word_old in word_ids[window_start:window_end]:
                dst = old_to_new[dst_word_old]

                # - first connect the new sub_graph
                edges.append([src, dst])
                # - then get the hidden from parent_graph
                old_edge_id.append(self.edges_matrix[src_word_old, dst_word_old])

            # self circle
            edges.append([src, src])
            old_edge_id.append(self.edges_matrix[src_word_old, src_word_old])
        return edges, old_edge_id

    def seq_to_graph(self, doc_ids: list, doc_length) -> dgl.DGLGraph:
        '''
            doc_ids : global ids of words in a sentence
            doc_length : the actual length of the sentence without padding

            The function returns the text-level graph for the sentence

        '''
        # Drop padding, then apply the model-level length cap.
        doc_ids = doc_ids[:min(int(doc_length), self.max_length)]
        if torch.is_tensor(doc_ids):
            word_ids = doc_ids.tolist()
        else:
            word_ids = [int(word_id) for word_id in doc_ids]

        # One node per distinct word. De-duplicate on plain ints: a set of 0-d
        # tensors hashes by object identity and therefore never merges
        # repeated words, which left unused duplicate nodes in the graph.
        local_vocab = list(dict.fromkeys(word_ids))
        # mapping of words global ids to local ids
        old_to_new = {old: new for new, old in enumerate(local_vocab)}

        # Follow the module's own device instead of assuming CUDA.
        device = self.node_hidden.weight.device
        local_vocab = torch.tensor(local_vocab, dtype=torch.long, device=device)

        # create dgl graph
        sub_graph = dgl.DGLGraph()

        sub_graph.add_nodes(len(local_vocab))
        sub_graph.ndata['h'] = self.node_hidden(local_vocab)

        edges, old_edge_id = self.add_seq_edges(word_ids, old_to_new)
        old_edge_id = torch.tensor([int(e) for e in old_edge_id], dtype=torch.long, device=device)

        srcs, dsts = zip(*edges)
        # adding edges to graph
        sub_graph.add_edges(srcs, dsts)

        try:
            seq_edges_w = self.seq_edge_w(old_edge_id)
        except RuntimeError:
            # Show the offending ids, then propagate: continuing would only
            # fail later with an unrelated NameError.
            print(old_edge_id)
            raise

        sub_graph.edata['w'] = seq_edges_w

        return sub_graph

    def forward(self, doc_ids, doc_lengths):
        """
        Return one pooled vector per sentence.

        Each sentence becomes a graph; every node receives the element-wise
        max over (source feature * edge weight) of its incoming edges, node
        features are summed per graph, then dropout and ReLU are applied.
        """
        # create corresponding text-level graph for each sentence
        sub_graphs = [self.seq_to_graph(doc, length) for doc, length in zip(doc_ids, doc_lengths)]

        batch_graph = dgl.batch(sub_graphs)
        batch_graph.update_all(
            message_func = dgl.function.src_mul_edge('h', 'w', 'weighted_message'),
            reduce_func= dgl.function.max('weighted_message', 'h')
        )

        h1 = dgl.sum_nodes(batch_graph, feat='h')
        drop1 = self.dropout(h1)
        act1 = self.activation(drop1)
        return act1


In [None]:
class BertFeature(transformers.BertPreTrainedModel):
    """
    BERT encoder wrapper that returns only the pooled sentence output.

    `features` exposes the encoder's hidden size so downstream layers can be
    sized from it. Note that the `config` argument here is the model
    configuration object, not the notebook's `config` settings class.
    """

    def __init__(self, config):
        super().__init__(config)
        self.bert = transformers.BertModel(config)
        self.init_weights()
        self.features = config.hidden_size

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
    ):
        """
        Run the encoder and return element 1 of its output (the pooled output).

        `labels` is accepted for signature compatibility and is not used.
        """
        encoder_kwargs = {
            'attention_mask': attention_mask,
            'token_type_ids': token_type_ids,
            'position_ids': position_ids,
            'head_mask': head_mask,
            'inputs_embeds': inputs_embeds,
        }
        return self.bert(input_ids, **encoder_kwargs)[1]

In [None]:
class BertGCN(torch.nn.Module):
  """
  Joint classifier over concatenated BERT pooled output and GCN sentence vector.

  Args:
    hidden: size of the GCN node embeddings / GCN output vector.
    vocabulary: word list handed to the GCN.
    n_grams: neighbourhood size used when building sentence graphs.
    edges_mappings: (word id, word id) -> edge id matrix.
    edges_weights: PMI edge weights.
    count: number of edge ids.
    num_classes: size of the output layer; 1 selects a regression (MSE) loss.
    dropout_prob: probability for `self.dropout`.

  NOTE: `dropout`, `bn1` and `bn2` are constructed but not applied in
  `forward`; they are kept so that existing checkpoints keep loading.
  """
  def __init__(self, hidden, vocabulary, n_grams, edges_mappings, edges_weights, count, num_classes, dropout_prob):
    super(BertGCN, self).__init__()
    self.bert = BertFeature.from_pretrained(config.MODEL, output_attentions=False)
    self.gcn = GCNModel(hidden_size_node = hidden,
                            vocab = vocabulary,
                            n_gram = n_grams,
                            edges_matrix = edges_mappings,
                            edges_num = count,
                            trainable_edges = True,
                            pmi = edges_weights,
                            # only request CUDA when it actually exists
                            cuda = torch.cuda.is_available()
                            )
    self.num_classes = num_classes

    self.dropout = nn.Dropout(dropout_prob)
    self.bn1 = nn.BatchNorm1d(self.bert.features)
    self.bn2 = nn.BatchNorm1d(hidden)
    self.classifier = nn.Linear(self.bert.features + hidden, self.num_classes)

  def forward(self, input_ids, attention_mask, input_gcn, sent_len, labels=None):
    """
    Returns:
      `(logits,)` when `labels` is None, otherwise `(loss, logits)`.
    """
    bert_out = self.bert(input_ids, attention_mask)
    gcn_out = self.gcn(input_gcn, sent_len)

    out = torch.cat((bert_out, gcn_out), 1)
    logits = self.classifier(out)

    outputs = (logits,)

    if labels is not None:
        if self.num_classes == 1:
            # `MSELoss` was referenced without its module and raised NameError.
            # The regression target must also share the logits' float dtype.
            loss_fct = nn.MSELoss()
            loss = loss_fct(logits.view(-1), labels.view(-1).to(logits.dtype))
        else:
            loss_fct = torch.nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_classes), labels.view(-1))
        outputs = (loss,) + outputs
    return outputs

In [None]:
def train_fn(data_loader, model, optimizer, device):
  """
  Run one training epoch.

  Every batch is moved to `device`, the model is called with the labels so
  that it returns `(loss, logits)`, and one optimizer step is taken. The
  running mean loss is shown on the progress bar. Returns nothing.
  """
  model.train()
  losses = AverageMeter()
  tk0 = tqdm(data_loader, total=len(data_loader))

  batch_keys = ('ids', 'mask', 'gcn_ids', 'gcn_length', 'label')
  for batch in tk0:
    ids, mask, gcn_ids, gcn_length, label = [
        batch[key].to(device, dtype=torch.long) for key in batch_keys
    ]

    model.zero_grad()
    loss, _logits = model(ids, mask, gcn_ids, gcn_length, label)[:2]
    loss.backward()
    optimizer.step()

    # weight the running mean by the number of samples in this batch
    losses.update(loss.item(), ids.size(0))
    tk0.set_postfix(loss=losses.avg)


In [None]:
def eval_fn(data_loader, model, device):
  """
  Evaluate the model on a labelled loader.

  Prints a classification report and returns the F1 score computed by
  `f1_score(yt, yp)`. The mean loss is only shown on the progress bar; it is
  not returned.
  """
  model.eval()
  losses = AverageMeter()
  tk0 = tqdm(data_loader, total=len(data_loader))
  yt, yp = [], []

  for batch in tk0:
    ids = batch['ids'].to(device, dtype=torch.long)
    mask = batch['mask'].to(device, dtype=torch.long)
    gcn_ids = batch['gcn_ids'].to(device, dtype=torch.long)
    gcn_length = batch['gcn_length'].to(device, dtype=torch.long)
    label = batch['label'].to(device, dtype=torch.long)

    with torch.no_grad():
      loss, logits = model(ids, mask, gcn_ids, gcn_length, label)[:2]

    # class probabilities -> predicted class index per sample
    probabilities = softmax(logits.detach().cpu().numpy())
    yp.extend(np.argmax(probabilities, axis=1).flatten().tolist())
    yt.extend(label.to('cpu').numpy().tolist())

    losses.update(loss.item(), ids.size(0))
    tk0.set_postfix(loss=losses.avg)

  print('Classification Report')
  print(classification_report(yt, yp))
  return f1_score(yt, yp)


In [None]:
def test_fn(data_loader, model, device):
  """
  Predict on an unlabelled loader.

  Returns a flat list with, for every sample, the softmax probability of
  class index 1.
  """
  model.eval()
  tk0 = tqdm(data_loader, total=len(data_loader))
  test_preds = []

  for batch in tk0:
    ids = batch['ids'].to(device, dtype=torch.long)
    mask = batch['mask'].to(device, dtype=torch.long)
    gcn_ids = batch['gcn_ids'].to(device, dtype=torch.long)
    gcn_length = batch['gcn_length'].to(device, dtype=torch.long)

    # no labels are passed, so the model returns (logits,)
    with torch.no_grad():
      outputs = model(ids, mask, gcn_ids, gcn_length)

    probabilities = softmax(outputs[0].detach().cpu().numpy())
    test_preds.extend(probabilities[:, 1].tolist())

  return test_preds

In [None]:
def run(df_train, df_val, df_test, fold=None):
  """
  Train on `df_train`, validate on `df_val` and predict on `df_test`.

  Args:
    df_train, df_val, df_test: DataFrames with `Sentence`, `Subject` and
      `Label` columns.
    fold: optional fold index; only used to name the checkpoint file
      ('model_{fold}.bin' instead of 'model.bin') inside config.SAVE_DIR.

  Returns:
    List of class-1 probabilities for the test set, taken from the best
    checkpoint when one was saved.
  """

  train_dataset = DEFTDataset(
        text = df_train.Sentence.values,
        subject = df_train.Subject.values,
        label = df_train.Label.values,
    )

  # Validation and test data must reuse the training vocabulary: the model's
  # embedding table and edge-id matrix are indexed by the training word ids.
  valid_dataset = DEFTDataset(
        text = df_val.Sentence.values,
        subject = df_val.Subject.values,
        label = df_val.Label.values,
        vocab = train_dataset.vocab,
    )

  test_dataset = DEFTDataset(
        text = df_test.Sentence.values,
        subject = df_test.Subject.values,
        label = df_test.Label.values,
        vocab = train_dataset.vocab,
    )


  train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        # shuffle so mini-batches do not follow the file / fold order
        shuffle=True,
        num_workers=4
    )

  valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=2
    )

  test_data_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=2
    )


  edges_weights, edges_mappings, count = cal_PMI(train_dataset, 5)

  model = BertGCN(hidden = 200,
            vocabulary = train_dataset.vocab,
            n_grams = 5,
            edges_mappings = edges_mappings,
            edges_weights = edges_weights,
            count = count,
            num_classes = 2,
            dropout_prob = 0.3
          )


  device = torch.device("cuda:0" if (torch.cuda.is_available()) else "cpu")
  model.to(device)

  lr = 2e-5
  param_optimizer = list(model.named_parameters())
  # Parameters whose names contain one of these substrings get no weight decay.
  no_decay = ['bias', 'gamma', 'beta', 'LayerNorm.weight']
  # The optimizer reads the per-group key 'weight_decay'; the previous
  # 'weight_decay_rate' key was silently ignored, so no decay was applied.
  optimizer_grouped_parameters = [
      {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
      'weight_decay': 0.01},
      {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
      'weight_decay': 0.0}
  ]
  optimizer = AdamW(optimizer_grouped_parameters, lr=lr)

  es = EarlyStopping(patience=3, mode="max")

  checkpoint_name = "model.bin" if fold is None else f"model_{fold}.bin"
  model_path = os.path.join(config.SAVE_DIR, checkpoint_name)

  print('Starting training....')
  for epoch in range(config.EPOCHS):
    train_fn(train_data_loader, model, optimizer, device)
    # eval_fn returns the validation F1 score (higher is better)
    valid_score = eval_fn(valid_data_loader, model, device)
    print(f'Epoch :{epoch + 1} | Validation Score :{valid_score}')
    es(valid_score, model, model_path=model_path)
    if es.early_stop:
      # patience exhausted; stop instead of running the remaining epochs
      break


  print('Predicting for test-set')
  if os.path.exists(model_path):
    # restore the best checkpoint written by EarlyStopping
    model.load_state_dict(torch.load(model_path, map_location=device))
  # otherwise no checkpoint was written; predict with the current weights
  model.to(device)

  test_predictions = test_fn(test_data_loader, model, device)
  # Callers store this as a column of submission.csv.
  return test_predictions

In [None]:
def run_train_val():
  '''
  Train model, validate and return predictions on test-set

  Reads the train / dev / test CSVs named in `config`, runs a single
  train-validate-predict cycle and writes whatever `run` returns to the
  'prob' column of submission.csv inside config.SAVE_DIR.
  '''
  seed_all()
  df_train, df_val, df_test = (
      pd.read_csv(path)
      for path in (config.TRAIN_FILE, config.VAL_FILE, config.TEST_FILE)
  )

  # every test row gets the placeholder label -1
  df_test = df_test.assign(Label=-1)

  y = run(df_train, df_val, df_test)

  scores = pd.DataFrame()
  scores['prob'] = y

  submission_path = os.path.join(config.SAVE_DIR, 'submission.csv')
  scores.to_csv(submission_path, index=False)
  


In [None]:
def run_k_fold():
  '''
    Perform k-fold cross-validation

    Train and dev CSVs are concatenated and split into config.KFOLD
    stratified folds. For each fold a model is trained on the remaining
    folds, validated on the held-out fold and used to predict the test set.
    The per-fold outputs of `run` are written as columns `prob_{i}` of
    submission.csv inside config.SAVE_DIR.
  '''

  seed_all()
  scores = pd.DataFrame()

  df_train = pd.read_csv(config.TRAIN_FILE)
  df_val = pd.read_csv(config.VAL_FILE)
  df_test = pd.read_csv(config.TEST_FILE)

  # concatenating train and validation set
  # drop=True: the old per-file index is meaningless after concatenation and
  # would otherwise be kept as an extra 'index' column.
  train = pd.concat([df_train, df_val]).reset_index(drop=True)

  # dividing folds
  # random_state only has an effect when shuffling; recent scikit-learn raises
  # ValueError for shuffle=False combined with a random_state.
  kf = model_selection.StratifiedKFold(n_splits=config.KFOLD, shuffle=True, random_state=config.SEED)
  for fold, (train_idx, val_idx) in enumerate(kf.split(X=train, y=train.Label.values)):
      # after reset_index the positional indices equal the row labels
      train.loc[val_idx, 'kfold'] = fold


  # every test row gets the placeholder label -1
  df_test['Label'] = -1

  for i in range(config.KFOLD):
    print(f'################# Fold {i} #################')
    fold_train = train[train.kfold!=i]
    fold_val = train[train.kfold==i]

    y = run(fold_train, fold_val, df_test, i)
    scores[f'prob_{i}'] = y

  scores.to_csv(os.path.join(config.SAVE_DIR, 'submission.csv'), index=False)


In [None]:
if __name__=='__main__':
    import shutil

    # Start from an empty output directory. Pure Python replaces the shell
    # `rm -rf … && mkdir …`, so the directory name is never interpolated into
    # a command line and the cell also runs outside a notebook.
    shutil.rmtree(config.SAVE_DIR, ignore_errors=True)
    os.makedirs(config.SAVE_DIR, exist_ok=True)

    # Alternative entry point: single train / dev split instead of k-fold.
    # run_train_val()
    run_k_fold()