In [None]:
# Mount Google Drive at /gdrive (Colab-only); the project folder used below lives on the Drive.
from google.colab import drive
drive.mount('/gdrive')

In [None]:
# Install Hugging Face transformers quietly (-q).
# NOTE(review): the version is not pinned, so results may change with new releases.
! pip install transformers -q

In [None]:
import re
import pandas as pd
from pathlib import Path
import matplotlib.cm as cm
import numpy as np
import pandas as pd
from typing import *
from tqdm.notebook import tqdm
from sklearn.utils.extmath import softmax
from sklearn import model_selection
from sklearn.metrics import classification_report, f1_score

In [None]:
import torch
import torch.optim as optim
import transformers
from transformers import AdamW

In [None]:
def seed_all(seed = 42):
  """
  Seed every random number generator used in this notebook.

  Seeds Python's `random`, NumPy's global RNG and PyTorch (CPU, plus all
  CUDA devices when CUDA is available), and switches cuDNN to
  deterministic mode.

  Args:
    seed: integer seed shared by all generators (default 42).
  """
  import random
  import numpy as np
  import torch

  # python + numpy RNGs
  random.seed(seed)
  np.random.seed(seed)

  # pytorch RNGs
  torch.manual_seed(seed)
  if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)
  torch.backends.cudnn.deterministic = True

In [None]:
class config:
  """
  Run settings shared by the whole notebook: file names, model choice,
  sequence length and batch/epoch sizes.
  """
  # reproducibility / cross-validation
  SEED = 42
  KFOLD = 2

  # input data (relative paths, resolved against the working directory)
  TRAIN_FILE = 'task1_train.csv'
  VAL_FILE =  'task1_dev.csv'
  TEST_FILE = 'task1_test.csv'
  ADD_SUBJ = False  # when True, process_data prepends the subject to the sentence

  # model and tokenizer (TOKENIZER is built from MODEL, so MODEL must come first)
  MODEL = 'bert-base-cased'
  FT_MODEL = './bert_ft'  # directory the classifier weights are loaded from in run()
  TOKENIZER = transformers.BertTokenizer.from_pretrained(MODEL)
  MAX_LEN = 96

  # training / output
  SAVE_DIR = 'bert_lm'
  EPOCHS = 1
  TRAIN_BATCH_SIZE = 32
  VALID_BATCH_SIZE = 32

In [None]:
import os
# Work from the project folder on the mounted Drive; the relative file names in
# `config` are resolved against this directory.
os.chdir('/gdrive/My Drive/DEFINITION EXTRACTION/DEFT_Updated')

In [None]:
class AverageMeter:
    """
    Tracks the latest value and the weighted running average of a metric.
    Source : https://www.kaggle.com/abhishek/bert-base-uncased-using-pytorch/

    Attributes:
        val: most recent value passed to `update`.
        sum: weighted sum of all values seen since the last reset.
        count: total weight (e.g. number of samples) seen since the last reset.
        avg: `sum / count`.
    """
    def __init__(self):
        self.reset()

    def reset(self):
        """Clear all statistics back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record `val` observed with weight `n` and refresh the average."""
        self.val = val
        self.sum = self.sum + val * n
        self.count = self.count + n
        self.avg = self.sum / self.count

In [None]:
class EarlyStopping:
    """
    Early stopping utility
    Source : https://www.kaggle.com/abhishek/bert-base-uncased-using-pytorch/

    Call the instance once per epoch with the validation score. The model is
    checkpointed whenever the score improves by at least `delta`; after
    `patience` consecutive epochs without such an improvement `early_stop`
    is set to True (the caller is responsible for checking it).

    Args:
        patience: number of non-improving epochs tolerated before stopping.
        mode: "max" if a higher score is better, "min" if lower is better.
        delta: minimum improvement required to count as progress.
    """

    def __init__(self, patience=7, mode="max", delta=0.001):
        self.patience = patience
        self.counter = 0
        self.mode = mode
        self.best_score = None
        self.early_stop = False
        self.delta = delta
        # Use the lowercase np.inf: the np.Inf alias was removed in NumPy 2.0.
        if self.mode == "min":
            self.val_score = np.inf
        else:
            self.val_score = -np.inf

    def __call__(self, epoch_score, model, model_path):
        """
        Register the score of one epoch.

        Args:
            epoch_score: validation score of the epoch.
            model: object exposing `state_dict()`; saved on improvement.
            model_path: file the checkpoint is written to.
        """
        # Internally "bigger is better": negate the score in "min" mode.
        if self.mode == "min":
            score = -1.0 * epoch_score
        else:
            score = np.copy(epoch_score)
        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(epoch_score, model, model_path)
        elif score < self.best_score + self.delta:
            self.counter += 1
            print('EarlyStopping counter: {} out of {}'.format(self.counter, self.patience))
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(epoch_score, model, model_path)
            self.counter = 0

    def save_checkpoint(self, epoch_score, model, model_path):
        """Write `model.state_dict()` to `model_path` unless the score is NaN or infinite."""
        # np.isfinite rejects NaN and +/-inf. A membership test against
        # [np.nan, ...] does not, because NaN never compares equal to itself.
        if np.isfinite(epoch_score):
            print('Validation score improved ({} --> {}). Saving model!'.format(self.val_score, epoch_score))
            torch.save(model.state_dict(), model_path)
        self.val_score = epoch_score

In [None]:
def process_data(text, subject, tokenizer, max_len, label):
  """
  Turn one sentence into fixed-length model inputs.

  Args:
    text: raw sentence; a leading number (optionally followed by a dot) is stripped.
    subject: subject string; prepended to the sentence when config.ADD_SUBJ is True.
    tokenizer: object whose `encode(text, add_special_tokens=True)` returns a list of token ids.
    max_len: length every returned id/mask list is padded or truncated to.
    label: passed through unchanged.

  Returns:
    dict with the cleaned 'text', the 'subject', the padded/truncated 'ids',
    the matching attention 'mask' (1 for real tokens, 0 for padding) and 'label'.
  """
  ## remove initial numbers
  # Raw string: '\s' and '\d' in a plain literal are invalid escape sequences.
  text = re.findall(r'^\s*\d*\s*\.?\s*(.*)', text)[0]
  ## add subject
  if config.ADD_SUBJ:
    text = subject + ' ' + text

  token_ids = tokenizer.encode(text, add_special_tokens=True)

  if len(token_ids) > max_len:
    if max_len > 1:
      # Keep the closing special token (the last id produced by encode) instead
      # of slicing it off, so truncated sequences are still properly terminated.
      token_ids = token_ids[:max_len - 1] + token_ids[-1:]
    else:
      token_ids = token_ids[:max_len]

  mask = [1] * len(token_ids)

  # Right-pad with id 0 (presumably the tokenizer's [PAD] id, as in BERT vocabularies — verify for other tokenizers).
  padding = max_len - len(token_ids)
  token_ids = token_ids + ([0] * padding)
  mask = mask + ([0] * padding)

  return {'text':text,
          'subject':subject,
          'ids':token_ids,
          'mask':mask,
          'label':label
          }

In [None]:
class DEFTDataset:
    """
    Map-style dataset over parallel sequences of sentences, subjects and labels.

    Each item is tokenized on access with `process_data`, using the
    tokenizer and maximum length taken from `config`.
    """
    def __init__(self, text, subject, label):
        self.text, self.subject, self.label = text, subject, label
        self.tokenizer = config.TOKENIZER
        self.max_len = config.MAX_LEN

    def __len__(self):
        return len(self.text)

    def __getitem__(self, item):
        """Return the processed sample at position `item` with ids/mask as long tensors."""
        sample = process_data(
            self.text[item],
            self.subject[item],
            self.tokenizer,
            self.max_len,
            self.label[item],
        )

        tensors = {
            key: torch.tensor(sample[key], dtype=torch.long)
            for key in ('ids', 'mask')
        }
        passthrough = {key: sample[key] for key in ('text', 'subject', 'label')}
        return {**tensors, **passthrough}

In [None]:
def train_fn(data_loader, model, optimizer, device):
  """
  Run one training pass over `data_loader`.

  Args:
    data_loader: iterable of batches, each a dict with 'ids', 'mask' and 'label' tensors.
    model: called as model(ids, attention_mask=mask, labels=label); its first
      output is used as the loss.
    optimizer: stepped once per batch.
    device: device the batch tensors are moved to.

  The running average loss (weighted by batch size) is shown on the progress bar.
  """
  model.train()
  loss_meter = AverageMeter()
  progress = tqdm(data_loader, total=len(data_loader))

  for batch in progress:
    ids = batch['ids'].to(device, dtype=torch.long)
    label = batch['label'].to(device, dtype=torch.long)
    mask = batch['mask'].to(device, dtype=torch.long)

    # Clear gradients left over from the previous batch.
    model.zero_grad()
    loss, _ = model(ids, attention_mask=mask, labels=label)[:2]

    loss.backward()
    optimizer.step()

    loss_meter.update(loss.item(), ids.size(0))
    progress.set_postfix(loss=loss_meter.avg)


In [None]:
def eval_fn(data_loader, model, device):
  """
  Evaluate `model` on `data_loader` and return the F1 score.

  Args:
    data_loader: iterable of batches, each a dict with 'ids', 'mask' and 'label' tensors.
    model: called as model(ids, attention_mask=mask, labels=label); its first
      two outputs are used as (loss, logits).
    device: device the batch tensors are moved to.

  Returns:
    f1_score(true_labels, predicted_labels) over the whole loader. A
    classification report is printed as a side effect.
  """
  model.eval()
  loss_meter = AverageMeter()
  progress = tqdm(data_loader, total=len(data_loader))
  y_true, y_pred = [], []

  for batch in progress:
    ids = batch['ids'].to(device, dtype=torch.long)
    label = batch['label'].to(device, dtype=torch.long)
    mask = batch['mask'].to(device, dtype=torch.long)

    with torch.no_grad():
      loss, logits = model(ids, attention_mask=mask, labels=label)[:2]

    # Predicted class = index of the largest softmax probability per row.
    probabilities = softmax(logits.detach().cpu().numpy())
    batch_pred = np.argmax(probabilities, axis=1).flatten()
    batch_true = label.to('cpu').numpy()

    y_true.extend(batch_true.tolist())
    y_pred.extend(batch_pred.tolist())

    loss_meter.update(loss.item(), ids.size(0))
    progress.set_postfix(loss=loss_meter.avg)

  print('Classification Report')
  print(classification_report(y_true, y_pred))
  # The F1 score, not the average loss, is what callers receive.
  return f1_score(y_true, y_pred)


In [None]:
def test_fn(data_loader, model, device):
  """
  Predict on `data_loader` without labels.

  Args:
    data_loader: iterable of batches, each a dict with 'ids' and 'mask' tensors.
    model: called as model(ids, attention_mask=mask); its first output is used as the logits.
    device: device the batch tensors are moved to.

  Returns:
    list with, for every sample, the softmax probability of column 1 of the logits.
  """
  model.eval()
  progress = tqdm(data_loader, total=len(data_loader))
  probabilities = []

  for batch in progress:
    ids = batch['ids'].to(device, dtype=torch.long)
    mask = batch['mask'].to(device, dtype=torch.long)

    with torch.no_grad():
      outputs = model(ids, attention_mask=mask,)

    batch_logits = outputs[0].detach().cpu().numpy()
    positive_probs = softmax(batch_logits)[:, 1]
    probabilities.extend(positive_probs.tolist())

  return probabilities

In [None]:
def language_model_finetuning():
  '''
    run_language_modeling.py from huggingface - https://github.com/huggingface/transformers/tree/master/examples/language-modeling
    Function to finetune language model of BERT and save new model in bert_ft directory
  '''

  print('Finetuning Language Model.......')

  df_train = pd.read_csv(config.TRAIN_FILE)
  df_val = pd.read_csv(config.VAL_FILE)
  df_test = pd.read_csv(config.TEST_FILE)

  if not os.path.exists('./ft_data'):
    os.makedirs('./ft_data')

  with open('./ft_data/deft.train.raw', 'w') as f:
    for i,row in df_train.iterrows():
      f.write(row.Sentence)
    for i,row in df_val.iterrows():
      f.write(row.Sentence)
    
  with open('./ft_data/deft.test.raw', 'w') as f:
    for i,row in df_test.iterrows():
      f.write(row.Sentence)

  ! rm -rf bert_ft && mkdir bert_ft
  ! python run_language_modeling.py \
      --output_dir=bert_ft \
      --model_type=bert \
      --model_name_or_path=bert-base-cased \
      --do_train \
      --train_data_file=./ft_data/deft.train.raw \
      --do_eval \
      --eval_data_file=./ft_data/deft.test.raw \
      --mlm\

In [None]:
def run(df_train, df_val, df_test, fold=None):
  '''
    Train model, validate and return predictions on test-set

    Args:
      df_train, df_val, df_test: data frames with `Sentence`, `Subject` and `Label` columns.
      fold: optional fold number; when given, the checkpoint is stored as
        model_{fold}.bin instead of model.bin inside config.SAVE_DIR.

    Returns:
      list of test-set probabilities produced by test_fn using the best
      checkpoint saved during training.
  '''

  train_dataset = DEFTDataset(
        text = df_train.Sentence.values,
        subject = df_train.Subject.values,
        label = df_train.Label.values,
    )

  valid_dataset = DEFTDataset(
        text = df_val.Sentence.values,
        subject = df_val.Subject.values,
        label = df_val.Label.values,
    )

  test_dataset = DEFTDataset(
        text = df_test.Sentence.values,
        subject = df_test.Subject.values,
        label = df_test.Label.values,
    )

  # Shuffle training batches every epoch; the order follows the global torch seed.
  train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        shuffle=True,
        num_workers=4
    )

  valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=2
    )

  test_data_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=2
    )

  model = transformers.BertForSequenceClassification.from_pretrained(config.FT_MODEL, num_labels= 2)
  device = torch.device("cuda:0" if (torch.cuda.is_available()) else "cpu")
  model.to(device)

  lr = 2e-5
  param_optimizer = list(model.named_parameters())
  # Biases and LayerNorm weights are excluded from weight decay. The parameter
  # names use `LayerNorm.weight`/`LayerNorm.bias`, so the legacy `gamma`/`beta`
  # patterns never matched anything.
  no_decay = ['bias', 'LayerNorm.weight']
  # The per-group key must be `weight_decay`; an unknown key such as
  # `weight_decay_rate` is ignored and leaves decay at the optimizer default.
  optimizer_grouped_parameters = [
      {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
      'weight_decay': 0.01},
      {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
      'weight_decay': 0.0}
  ]
  optimizer = AdamW(optimizer_grouped_parameters, lr=lr)

  es = EarlyStopping(patience=3, mode="max")

  # One checkpoint file per fold (or a single one without folds).
  checkpoint_name = 'model.bin' if fold is None else f'model_{fold}.bin'
  model_path = os.path.join(config.SAVE_DIR, checkpoint_name)

  print('Starting training....')
  for epoch in range(config.EPOCHS):
    train_fn(train_data_loader, model, optimizer, device)
    # eval_fn returns the validation F1 score (higher is better).
    valid_score = eval_fn(valid_data_loader, model, device)
    print(f'Epoch : {epoch + 1} | Validation Score : {valid_score}')
    es(valid_score, model, model_path=model_path)
    # Honour the early-stopping decision; otherwise patience has no effect.
    if es.early_stop:
      print('Early stopping')
      break

  print('Predicting for test-set')
  # Reload the best checkpoint rather than using the last-epoch weights.
  model.load_state_dict(torch.load(model_path, map_location=device))
  model.to(device)

  test_predictions = test_fn(test_data_loader, model, device)

  return test_predictions


In [None]:
def run_k_fold():
  '''
    Perform k-fold cross-validation

    The train and dev files are concatenated and split into config.KFOLD
    stratified folds. For every fold a model is trained on the remaining
    folds, validated on the held-out fold and used to predict the test set.
    The per-fold test probabilities are written as columns prob_0, prob_1, ...
    to submission.csv inside config.SAVE_DIR.
  '''

  seed_all()
  scores = pd.DataFrame()

  df_train = pd.read_csv(config.TRAIN_FILE)
  df_val = pd.read_csv(config.VAL_FILE)
  df_test = pd.read_csv(config.TEST_FILE)

  # concatenating train and validation set
  # drop=True: a fresh 0..n-1 index is needed for the positional fold
  # assignment below, without keeping the old index as an extra column.
  train = pd.concat([df_train, df_val]).reset_index(drop=True)

  # dividing folds
  # No random_state: with shuffle=False the split is already deterministic, and
  # recent scikit-learn versions raise a ValueError when both are given.
  kf = model_selection.StratifiedKFold(n_splits=config.KFOLD, shuffle=False)
  for fold, (train_idx, val_idx) in enumerate(kf.split(X=train, y=train.Label.values)):
      train.loc[val_idx, 'kfold'] = fold

  # The test frame needs a Label column for DEFTDataset; -1 is a placeholder.
  df_test['Label'] = -1

  for i in range(config.KFOLD):
    print(f'################# Fold {i} #################')
    df_train = train[train.kfold!=i]
    df_val = train[train.kfold==i]

    y = run(df_train, df_val, df_test, i)
    scores[f'prob_{i}'] = y

  scores.to_csv(os.path.join(config.SAVE_DIR, 'submission.csv'), index=False)


In [None]:
def run_train_val():
  """
  Train on the train file, validate on the dev file and predict the test file.

  The test-set probabilities returned by `run` are written as column `prob`
  to submission.csv inside config.SAVE_DIR.
  """
  seed_all()

  train_df, val_df, test_df = (
      pd.read_csv(path)
      for path in (config.TRAIN_FILE, config.VAL_FILE, config.TEST_FILE)
  )

  # The test frame needs a Label column for the dataset; -1 is a placeholder.
  test_df['Label'] = -1

  scores = pd.DataFrame()
  scores['prob'] = run(train_df, val_df, test_df)

  submission_path = os.path.join(config.SAVE_DIR, 'submission.csv')
  scores.to_csv(submission_path, index=False)

In [None]:
if __name__=='__main__':
    # Start from an empty output directory: remove config.SAVE_DIR, then recreate it.
    ! rm -rf {config.SAVE_DIR} && mkdir {config.SAVE_DIR}
    # Alternative entry point: single train/dev split instead of k-fold.
    # run_train_val()
    run_k_fold()