# NLP assignment 3 - finetune BERT for test sets

We use to PyTorch implementation of BERT from: https://github.com/huggingface/pytorch-pretrained-BERT
​
We have used this blog post (https://medium.com/huggingface/multi-label-text-classification-using-bert-the-mighty-transformer-69714fa3fb3d) and the supporting code (https://nbviewer.jupyter.org/github/kaushaltrivedi/bert-toxic-comments-multilabel/blob/master/toxic-bert-multilabel-classification.ipynb) as a model for implementing our classifier. We refer to this below as Trivedi 2019. 

## Imports

In [0]:
!pip install pytorch-pretrained-bert



In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import pickle
from sklearn.metrics import f1_score

In [0]:
from pytorch_pretrained_bert.tokenization import BertTokenizer, WordpieceTokenizer
from pytorch_pretrained_bert.modeling import BertForPreTraining, BertModel, BertConfig, BertForMaskedLM, BertForSequenceClassification #PretrainedBertModel
from pathlib import Path
import torch
import re
from torch import Tensor
from torch.nn import BCEWithLogitsLoss
from fastai.text import Tokenizer, Vocab
import collections
import os
import pdb
from tqdm import tqdm, trange
import sys
import random
from sklearn.model_selection import train_test_split
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)


from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from pytorch_pretrained_bert.optimization import BertAdam

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [0]:
import logging
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)
logger = logging.getLogger(__name__)

In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## Define classes

These are all from the PyTorch BERT Github - copied in for reference when we were setting up the features.

In [0]:
class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Constructs a InputExample.
        Args:
            guid: Unique id for the example.
            text_a: string. The untokenized text of the first sequence. For single
            sequence tasks, only this sequence must be specified.
            text_b: (Optional) string. The untokenized text of the second sequence.
            Only must be specified for sequence pair tasks.
            label: (Optional) string. The label of the example. This should be
            specified for train and dev examples, but not for test examples.
        """
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label

In [0]:
class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids, input_mask, segment_ids, label_ids):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_ids = label_ids

In [0]:
def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer):
    """Loads a data file into a list of `InputBatch`s."""

    label_map = {label : i for i, label in enumerate(label_list)}

    features = []
    for (ex_index, example) in enumerate(examples):
        tokens_a = tokenizer.tokenize(example.text_a)

        tokens_b = None
        if example.text_b:
            tokens_b = tokenizer.tokenize(example.text_b)
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3"
            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > max_seq_length - 2:
                tokens_a = tokens_a[:(max_seq_length - 2)]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids: 0   0  0    0    0     0       0 0    1  1  1  1   1 1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0   0   0   0  0     0 0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambigiously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
        segment_ids = [0] * len(tokens)

        if tokens_b:
            tokens += tokens_b + ["[SEP]"]
            segment_ids += [1] * (len(tokens_b) + 1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids += padding
        input_mask += padding
        segment_ids += padding

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        label_ids = label_map[example.label]
        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("tokens: %s" % " ".join(
                    [str(x) for x in tokens]))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            logger.info(
                    "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
            logger.info("label: %s (id = %d)" % (example.label, label_ids))

        features.append(
                InputFeatures(input_ids=input_ids,
                              input_mask=input_mask,
                              segment_ids=segment_ids,
                              label_ids=label_ids))
    return features

## Training functions

These functions are based on those from Trivedi 2019.

In [0]:
def warmup_linear(x, warmup=0.002):
    if x < warmup:
        return x/warmup
    return 1.0 - x

In [0]:
def fit(model, train_dataloader, device, optimizer, num_epochs):
    
    resultsdf = pd.DataFrame(columns = ['epoch', 'train loss', 'train accuracy', 'validation loss', 'validation accuracy'])
    batch_losses = []
    
    global_step = 0
    model.train()
    for i_ in (range(int(num_epochs))):

        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        for step, batch in enumerate(train_dataloader):

            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            loss = model(input_ids, segment_ids, input_mask, label_ids)
            
            batch_losses.append(loss.item())
            
            if step % 50 ==0:
              logger.info(f"Loss on batch {step}: {loss}")
                      
            if args['fp16']:
              optimizer.backward(loss)
            else:
              loss.backward()

            tr_loss += loss.item()
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1

            if (step + 1) % args['gradient_accumulation_steps'] == 0:
              if args['fp16']:
                # modify learning rate with special warm up BERT uses
                # if args.fp16 is False, BertAdam is used that handles this automatically
                lr_this_step = args['learning_rate'] * warmup_linear(global_step/num_train_optimization_steps, args['warmup_proportion'])
                for param_group in optimizer.param_groups:
                  param_group['lr'] = lr_this_step
              optimizer.step()
              optimizer.zero_grad()
              global_step += 1
            
        logger.info('Training loss after epoch {}'.format(tr_loss / nb_tr_steps))
        train_tup = eval(train_examples, train_features, model=model, device=device)
        logger.info('Training accuracy after epoch {}'.format(train_tup[0]['accuracy']))
        logger.info("***** Running evaluation *****")
        logger.info('Eval after epoch {}'.format(i_+1))
        eval_tup = eval(eval_examples, eval_features, model = model, device = device)
        logger.info(eval_tup[0])
        
        resultsdf = resultsdf.append({"epoch": i_+1, "train loss": train_tup[0]['loss'], "train accuracy": train_tup[0]['accuracy'],
                         "validation loss": eval_tup[0]['loss'], "validation accuracy": eval_tup[0]['accuracy']}, ignore_index=True)
    
    return resultsdf, batch_losses

## Evaluation functions

The functions accuracy() and eval() are based on those from Trevedi 2019.

In [0]:
def accuracy(out, labels):
    outputs = np.argmax(out, axis=1)
    return np.sum(outputs == labels)

Use this function to caluclate the accuracy on the balanced task

In [0]:
def balanced_accuracy(out, labels):
  
  #'out' should be the logits put into a softmax
  paired_pred = []
  
  for i in np.arange(0, len(out),2):
    if out[i][1] < out[i+1][1]:
      paired_pred.append(0)
      paired_pred.append(1)
    else:
      paired_pred.append(1)
      paired_pred.append(0)
  
  return np.sum(np.array(paired_pred) == labels)/len(out)

In [0]:
def softmax(A):
    """
    Calculates the value of softmax function
    
    Inputs:
    A -- N x 2 array of logits from BERT

    Output:
    probs -- N x 2 array of probabilities based on the softmax function
    """ 
    
    temp = np.exp(A)
    sumtemp = np.sum(temp, axis=1).reshape(-1,1)
    
    probs = temp/sumtemp
    
    return probs

In [0]:
def eval(eval_examples, eval_features, model, device):
        
    args['output_dir'].mkdir(exist_ok=True)

    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_ids for f in eval_features], dtype=torch.long)
    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    # Run prediction for full data
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args['eval_batch_size'])
    
    all_logits = None
    all_labels = None
    
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        label_ids = label_ids.to(device)

        with torch.no_grad():
            tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids)
            logits = model(input_ids, segment_ids, input_mask)

        logits = logits.detach().cpu().numpy()
        label_ids = label_ids.to('cpu').numpy()
        tmp_eval_accuracy = accuracy(logits, label_ids)
        
        if all_logits is None:
            all_logits = logits
        else:
            all_logits = np.concatenate((all_logits, logits), axis=0)
            
        if all_labels is None:
            all_labels = label_ids
        else:    
            all_labels = np.concatenate((all_labels, label_ids), axis=0)

        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy

        nb_eval_examples += input_ids.size(0)
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_examples
    
    result = {'loss': eval_loss,
              'accuracy': eval_accuracy}
    
    return (result, all_logits, all_labels)

## Set an output path and the default value of the arguments

In [0]:
OUTPUT_PATH = Path('gdrive/My Drive/tmp/output')
OUTPUT_PATH.mkdir(parents=True, exist_ok = True)

The default arguments are based on those from Trivedi (2019)

In [0]:
args = {
    "train_size": -1,
    "val_size": -1,
    "task_name": "sarcpol",
    "no_cuda": False,
    "bert_model": 'bert-base-uncased',
    "output_dir": OUTPUT_PATH,
    "max_seq_length": 50,
    "do_train": True,
    "do_eval": True,
    "do_lower_case": True,
    "train_batch_size": 32 ,
    "eval_batch_size": 32,
    "learning_rate": 3e-5,
    "num_train_epochs": 5,
    "warmup_proportion": 0.1,
    "no_cuda": False,
    "local_rank": -1,
    "seed": 42,
    "gradient_accumulation_steps": 1,
    "optimize_on_cpu": False,
    "fp16": False,
    "loss_scale": 128
}

In [0]:
if args["local_rank"] == -1 or args["no_cuda"]:
    device = torch.device("cuda" if torch.cuda.is_available() and not args["no_cuda"] else "cpu")
    n_gpu = torch.cuda.device_count()
#     n_gpu = 1
else:
    torch.cuda.set_device(args['local_rank'])
    device = torch.device("cuda", args['local_rank'])
    n_gpu = 1
    # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
    torch.distributed.init_process_group(backend='nccl')
logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args['local_rank'] != -1), args['fp16']))

03/11/2019 14:46:58 - INFO - __main__ -   device: cuda n_gpu: 1, distributed training: False, 16-bits training: False


## Load in the training and validation sets

In [0]:
#Select the path contained the datasets
SARC_POL = '/content/gdrive/My Drive/SARC pol/'

In [0]:
#Load in the required training sets
#traindf_100 = pd.read_csv(SARC_POL+'project_data/project_training_100.csv', index_col=0)
#traindf_50 = pd.read_csv(SARC_POL+'project_data/project_training_50.csv', index_col=0)
#traindf_25 = pd.read_csv(SARC_POL+'project_data/project_training_25.csv', index_col=0)
#traindf_12 = pd.read_csv(SARC_POL+'project_data/project_training_12.csv', index_col=0)

In [0]:
#Load in the validation set
validdf = pd.read_csv(SARC_POL+'project_data/project_validation.csv', index_col = 0)

In [0]:
#Load in the test sets
testdf_bal = pd.read_csv(SARC_POL+'project_data/balanced_test.csv', index_col = 0)
testdf_unbal = pd.read_csv(SARC_POL+'project_data/unbalanced_test.csv', index_col = 0)

## Process the training and validation sets

In [0]:
#Process the training examples
#train_examples100 = []

#for i in range(0,len(traindf.index)):
#        train_examples100.append(InputExample(str(i), traindf.loc[i,'response'], None, str(traindf.loc[i,'label'])))

In [0]:
#Process the validation examples
eval_examples = []

for i in range(0,len(validdf.index)):
        eval_examples.append(InputExample(str(i), validdf.loc[i,'response'], None, str(validdf.loc[i,'label'])))

In [0]:
#Process the balances test set examples
testbal_examples = []

for i in range(0,len(testdf_bal.index)):
        testbal_examples.append(InputExample(str(i), testdf_bal.loc[i,'response'], None, str(testdf_bal.loc[i,'label'])))

In [0]:
#Process the unbalanced test set examples
testunbal_examples = []

for i in range(0,len(testdf_unbal.index)):
        testunbal_examples.append(InputExample(str(i), testdf_unbal.loc[i,'response'], None, str(testdf_unbal.loc[i,'label'])))

In [0]:
#Create a list of labels
label_list = ['0', '1']
num_labels = len(label_list)

In [0]:
#Instantiate the tokenizer
tokenizer = BertTokenizer.from_pretrained(args['bert_model'], do_lower_case=args['do_lower_case'])

03/11/2019 14:47:13 - INFO - pytorch_pretrained_bert.tokenization -   loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /root/.pytorch_pretrained_bert/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


In [0]:
#Create the features based on the training set
#train_features = convert_examples_to_features(train_examples, label_list, args["max_seq_length"], tokenizer)

02/27/2019 07:39:01 - INFO - __main__ -   *** Example ***
02/27/2019 07:39:01 - INFO - __main__ -   guid: 0
02/27/2019 07:39:01 - INFO - __main__ -   tokens: [CLS] or anyone that ' s ever had to make an appeal . [SEP]
02/27/2019 07:39:01 - INFO - __main__ -   input_ids: 101 2030 3087 2008 1005 1055 2412 2018 2000 2191 2019 5574 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
02/27/2019 07:39:01 - INFO - __main__ -   input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
02/27/2019 07:39:01 - INFO - __main__ -   segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
02/27/2019 07:39:01 - INFO - __main__ -   label: 0 (id = 0)
02/27/2019 07:39:01 - INFO - __main__ -   *** Example ***
02/27/2019 07:39:01 - INFO - __main__ -   guid: 1
02/27/2019 07:39:01 - INFO - __main__ -   tokens: [CLS] trump is the health ##iest president to ever take office

In [0]:
#Create the features based on the validation set
eval_features = convert_examples_to_features(eval_examples, label_list, args["max_seq_length"], tokenizer)

03/11/2019 14:47:16 - INFO - __main__ -   *** Example ***
03/11/2019 14:47:16 - INFO - __main__ -   guid: 0
03/11/2019 14:47:16 - INFO - __main__ -   tokens: [CLS] and if trump builds that wall they will be stuck here . [SEP]
03/11/2019 14:47:16 - INFO - __main__ -   input_ids: 101 1998 2065 8398 16473 2008 2813 2027 2097 2022 5881 2182 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
03/11/2019 14:47:16 - INFO - __main__ -   input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
03/11/2019 14:47:16 - INFO - __main__ -   segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
03/11/2019 14:47:16 - INFO - __main__ -   label: 1 (id = 1)
03/11/2019 14:47:16 - INFO - __main__ -   *** Example ***
03/11/2019 14:47:16 - INFO - __main__ -   guid: 1
03/11/2019 14:47:16 - INFO - __main__ -   tokens: [CLS] says a voluntary survey . [SEP]
03/11/2019 14:4

In [0]:
#Create the features based on the balanced test set
testbal_features = convert_examples_to_features(testbal_examples, label_list, args["max_seq_length"], tokenizer)

03/11/2019 14:47:18 - INFO - __main__ -   *** Example ***
03/11/2019 14:47:18 - INFO - __main__ -   guid: 0
03/11/2019 14:47:18 - INFO - __main__ -   tokens: [CLS] and we ' re upset since the democrats would * never * try something as sneak ##y as this , right ? [SEP]
03/11/2019 14:47:18 - INFO - __main__ -   input_ids: 101 1998 2057 1005 2128 6314 2144 1996 8037 2052 1008 2196 1008 3046 2242 2004 13583 2100 2004 2023 1010 2157 1029 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
03/11/2019 14:47:18 - INFO - __main__ -   input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
03/11/2019 14:47:18 - INFO - __main__ -   segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
03/11/2019 14:47:18 - INFO - __main__ -   label: 1 (id = 1)
03/11/2019 14:47:18 - INFO - __main__ -   *** Example ***
03/11/2019 14:47:18 - INFO - __main__ -   guid: 1
03/11/2019 14:47:18 - INFO - _

In [0]:
#Create the features based on the unbalance test set
testunbal_features = convert_examples_to_features(testunbal_examples, label_list, args["max_seq_length"], tokenizer)

03/11/2019 14:47:20 - INFO - __main__ -   *** Example ***
03/11/2019 14:47:20 - INFO - __main__ -   guid: 0
03/11/2019 14:47:20 - INFO - __main__ -   tokens: [CLS] well yeah , but it ' ll work this time . [SEP]
03/11/2019 14:47:20 - INFO - __main__ -   input_ids: 101 2092 3398 1010 2021 2009 1005 2222 2147 2023 2051 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
03/11/2019 14:47:20 - INFO - __main__ -   input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
03/11/2019 14:47:20 - INFO - __main__ -   segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
03/11/2019 14:47:20 - INFO - __main__ -   label: 1 (id = 1)
03/11/2019 14:47:20 - INFO - __main__ -   *** Example ***
03/11/2019 14:47:20 - INFO - __main__ -   guid: 1
03/11/2019 14:47:20 - INFO - __main__ -   tokens: [CLS] o ##oo ##h baby you caught me red handed creep ##in ' on the senat

## Define function to train the model

This function is based on the training function from Trivedi (2019)

In [0]:
def train():

#Set up PyTorch options

  num_train_optimization_steps = None
  if args["do_train"]:
      num_train_optimization_steps = int(
          len(train_examples) / args['train_batch_size'] / args['gradient_accumulation_steps']) * args['num_train_epochs']
      if args["local_rank"] != -1:
          num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
  num_train_steps = int(
          len(train_examples) / args['train_batch_size'] / args['gradient_accumulation_steps'] * args['num_train_epochs'])

#  if args["local_rank"] == -1 or args["no_cuda"]:
#      device = torch.device("cuda" if torch.cuda.is_available() and not args["no_cuda"] else "cpu")
#      n_gpu = torch.cuda.device_count()
#  #     n_gpu = 1
#  else:
#      torch.cuda.set_device(args['local_rank'])
#      device = torch.device("cuda", args['local_rank'])
#      n_gpu = 1
#      # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
#      torch.distributed.init_process_group(backend='nccl')
#  logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
#          device, n_gpu, bool(args['local_rank'] != -1), args['fp16']))

  #Instantiate the model

  model = BertForSequenceClassification.from_pretrained(args["bert_model"],
            num_labels = num_labels)
  if args["fp16"]:
      model.half()
  model.to(device)
  if args["local_rank"] != -1:
      try:
          from apex.parallel import DistributedDataParallel as DDP
      except ImportError:
          raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

      model = DDP(model)
  elif n_gpu > 1:
      model = torch.nn.DataParallel(model)


  #Instantiate the optimizer
  param_optimizer = list(model.named_parameters())
  no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
  optimizer_grouped_parameters = [
      {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
      {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
      ]
  if args["fp16"]:
      try:
          from apex.optimizers import FP16_Optimizer
          from apex.optimizers import FusedAdam
      except ImportError:
          raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

      optimizer = FusedAdam(optimizer_grouped_parameters,
                            lr=args["learning_rate"],
                            bias_correction=False,
                            max_grad_norm=1.0)
      if args["loss_scale"]== 0:
          optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
      else:
          optimizer = FP16_Optimizer(optimizer, static_loss_scale=args["loss_scale"])

  else:
      optimizer = BertAdam(optimizer_grouped_parameters,
                           lr=args["learning_rate"],
                           warmup=args["warmup_proportion"],
                           t_total=num_train_optimization_steps)


  #Instantiate the PyTorch datasets and print key details
  logger.info("  Num examples = %d", len(train_examples))
  logger.info("  Batch size = %d", args['train_batch_size'])
  logger.info("  Num steps = %d", num_train_steps)
  all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
  all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
  all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
  all_label_ids = torch.tensor([f.label_ids for f in train_features], dtype=torch.long)
  train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
  if args['local_rank'] == -1:
      train_sampler = RandomSampler(train_data)
  else:
      train_sampler = DistributedSampler(train_data)
  train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args['train_batch_size'])

  resultsdf, batch_losses = fit(model = model, train_dataloader = train_dataloader, device = device, optimizer = optimizer, num_epochs = args["num_train_epochs"])
  
  return model, resultsdf, batch_losses

## 100% project training set

Best hyperparameters found in cross validation were batch size = 32, learning rate = 3e-5, training for 2 epochs. 

In [0]:
#Set the hyperparamters
args["learning_rate"] = 3e-5
args["train_batch_size"] = 32
args["num_train_epochs"] = 2

### Load in and process the training data

In [0]:
#Load in the required training sets
traindf_100 = pd.read_csv(SARC_POL+'project_data/project_training_100.csv', index_col=0)

In [0]:
#Process the training examples
train_examples = []

for i in range(0,len(traindf_100.index)):
        train_examples.append(InputExample(str(i), traindf_100.loc[i,'response'], None, str(traindf_100.loc[i,'label'])))

In [0]:
#Create the features based on the training set
train_features = convert_examples_to_features(train_examples, label_list, args["max_seq_length"], tokenizer)

03/01/2019 12:14:04 - INFO - __main__ -   *** Example ***
03/01/2019 12:14:04 - INFO - __main__ -   guid: 0
03/01/2019 12:14:04 - INFO - __main__ -   tokens: [CLS] donald did say he was interested in going to mars , didn ' t he ? [SEP]
03/01/2019 12:14:04 - INFO - __main__ -   input_ids: 101 6221 2106 2360 2002 2001 4699 1999 2183 2000 7733 1010 2134 1005 1056 2002 1029 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
03/01/2019 12:14:04 - INFO - __main__ -   input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
03/01/2019 12:14:04 - INFO - __main__ -   segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
03/01/2019 12:14:04 - INFO - __main__ -   label: 0 (id = 0)
03/01/2019 12:14:04 - INFO - __main__ -   *** Example ***
03/01/2019 12:14:04 - INFO - __main__ -   guid: 1
03/01/2019 12:14:04 - INFO - __main__ -   tokens: [CLS] so you ' re saying it ' s 

### Train the model, save the models and results

In [0]:
#Not explicitly setting seed, each run because each lopp will generate a different random number from the seed set in args
for n in range(0,5):
  model, resultsdf, batch_losses = train()
  
  with open(SARC_POL+f"models/100_{n}.pickle", 'wb') as handle:
    pickle.dump(model, handle)

03/01/2019 17:34:00 - INFO - pytorch_pretrained_bert.modeling -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at /root/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba
03/01/2019 17:34:00 - INFO - pytorch_pretrained_bert.modeling -   extracting archive file /root/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir /tmp/tmp58d106kx
03/01/2019 17:34:04 - INFO - pytorch_pretrained_bert.modeling -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size

   Acc (bal, bal)  Acc (bal, reg)  F1 (bal, reg)
0        0.810335        0.740752       0.742641


03/01/2019 17:49:03 - INFO - pytorch_pretrained_bert.modeling -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at /root/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba
03/01/2019 17:49:03 - INFO - pytorch_pretrained_bert.modeling -   extracting archive file /root/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir /tmp/tmprltaf3vw
03/01/2019 17:49:07 - INFO - pytorch_pretrained_bert.modeling -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size

   Acc (bal, bal)  Acc (bal, reg)  F1 (bal, reg)
0        0.810335        0.740752       0.742641
1        0.812096        0.733118       0.727436


03/01/2019 18:04:05 - INFO - pytorch_pretrained_bert.modeling -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at /root/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba
03/01/2019 18:04:05 - INFO - pytorch_pretrained_bert.modeling -   extracting archive file /root/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir /tmp/tmp2p38qats
03/01/2019 18:04:10 - INFO - pytorch_pretrained_bert.modeling -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size

   Acc (bal, bal)  Acc (bal, reg)  F1 (bal, reg)
0        0.810335        0.740752       0.742641
1        0.812096        0.733118       0.727436
2        0.815032        0.736348       0.734947


03/01/2019 18:19:07 - INFO - pytorch_pretrained_bert.modeling -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at /root/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba
03/01/2019 18:19:07 - INFO - pytorch_pretrained_bert.modeling -   extracting archive file /root/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir /tmp/tmp36_w5l0e
03/01/2019 18:19:12 - INFO - pytorch_pretrained_bert.modeling -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size

   Acc (bal, bal)  Acc (bal, reg)  F1 (bal, reg)
0        0.810335        0.740752       0.742641
1        0.812096        0.733118       0.727436
2        0.815032        0.736348       0.734947
3        0.806224        0.735467       0.732562


03/01/2019 18:34:09 - INFO - pytorch_pretrained_bert.modeling -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at /root/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba
03/01/2019 18:34:09 - INFO - pytorch_pretrained_bert.modeling -   extracting archive file /root/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir /tmp/tmpj1nmyr2p
03/01/2019 18:34:14 - INFO - pytorch_pretrained_bert.modeling -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size

   Acc (bal, bal)  Acc (bal, reg)  F1 (bal, reg)
0        0.810335        0.740752       0.742641
1        0.812096        0.733118       0.727436
2        0.815032        0.736348       0.734947
3        0.806224        0.735467       0.732562
4        0.820317        0.739577       0.730804


In [0]:
record_100 = pd.DataFrame()

In [0]:
for n in range(0,5):
  
  with open(SARC_POL+f"models/100_{n}.pickle", 'rb') as handle:
    model = pickle.load(handle)

  #Evaluate on the balanced test set
  tupb = eval(testbal_examples, testbal_features, model, device)
  bal_test_balacc = balanced_accuracy(softmax(tupb[1]), tupb[2])
  bal_test_acc = tupb[0]['accuracy']
  bal_test_F1 = f1_score(tupb[2], np.argmax(tupb[1], axis=1), labels=None, pos_label=1, average='binary', sample_weight=None)

  #Store the results
  record_100 = record_100.append({'Acc (bal, bal)': bal_test_balacc, 'Acc (bal, reg)': bal_test_acc, 'F1 (bal, reg)': bal_test_F1}, ignore_index=True)

print(record_100)

   Acc (bal, bal)  Acc (bal, reg)  F1 (bal, reg)
0        0.812096        0.740752       0.742641
1        0.816207        0.733118       0.727436
2        0.816794        0.736348       0.734947
3        0.808573        0.735467       0.732562
4        0.818555        0.739577       0.730804


In [0]:
with open(SARC_POL+f"test_set_results/test100.pickle", 'wb') as handle:
  pickle.dump(record_100, handle)

## 50% project training set

Best hyperparameters found in cross validation were batch size = 32, learning rate = 2e-5, training for 2 epochs. 

In [0]:
#Set the hyperparamters
args["learning_rate"] = 2e-5
args["train_batch_size"] = 32
args["num_train_epochs"] = 2

### Load in and process the training data

In [0]:
#Load in the required training sets
traindf_50 = pd.read_csv(SARC_POL+'project_data/project_training_50.csv', index_col=0)

In [0]:
#Process the training examples
train_examples = []

for i in range(0,len(traindf_50.index)):
        train_examples.append(InputExample(str(i), traindf_50.loc[i,'response'], None, str(traindf_50.loc[i,'label'])))

In [0]:
#Create the features based on the training set
train_features = convert_examples_to_features(train_examples, label_list, args["max_seq_length"], tokenizer)

03/01/2019 18:59:33 - INFO - __main__ -   *** Example ***
03/01/2019 18:59:33 - INFO - __main__ -   guid: 0
03/01/2019 18:59:33 - INFO - __main__ -   tokens: [CLS] i bet it ' s alabama or something . . . click link nailed it ! [SEP]
03/01/2019 18:59:33 - INFO - __main__ -   input_ids: 101 1045 6655 2009 1005 1055 6041 2030 2242 1012 1012 1012 11562 4957 26304 2009 999 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
03/01/2019 18:59:33 - INFO - __main__ -   input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
03/01/2019 18:59:33 - INFO - __main__ -   segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
03/01/2019 18:59:33 - INFO - __main__ -   label: 0 (id = 0)
03/01/2019 18:59:33 - INFO - __main__ -   *** Example ***
03/01/2019 18:59:33 - INFO - __main__ -   guid: 1
03/01/2019 18:59:33 - INFO - __main__ -   tokens: [CLS] and this is the exact reason

### Train the model, save the models and results

In [0]:
#Not explicitly setting seed, each run because each lopp will generate a different random number from the seed set in args
for n in range(0,5):
  model, resultsdf, batch_losses = train()
  
  with open(SARC_POL+f"models/50_{n}.pickle", 'wb') as handle:
    pickle.dump(model, handle)

03/01/2019 18:59:49 - INFO - pytorch_pretrained_bert.modeling -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at /root/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba
03/01/2019 18:59:49 - INFO - pytorch_pretrained_bert.modeling -   extracting archive file /root/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir /tmp/tmpg6_c5dcd
03/01/2019 18:59:54 - INFO - pytorch_pretrained_bert.modeling -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size

   Acc (bal, bal)  Acc (bal, reg)  F1 (bal, reg)
0        0.749266        0.701703       0.678277


03/01/2019 19:08:25 - INFO - pytorch_pretrained_bert.modeling -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at /root/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba
03/01/2019 19:08:25 - INFO - pytorch_pretrained_bert.modeling -   extracting archive file /root/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir /tmp/tmp6eq5roel
03/01/2019 19:08:30 - INFO - pytorch_pretrained_bert.modeling -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size

   Acc (bal, bal)  Acc (bal, reg)  F1 (bal, reg)
0        0.749266        0.701703       0.678277
1        0.777452        0.723429       0.713329


03/01/2019 19:17:01 - INFO - pytorch_pretrained_bert.modeling -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at /root/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba
03/01/2019 19:17:01 - INFO - pytorch_pretrained_bert.modeling -   extracting archive file /root/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir /tmp/tmpjw95ao5f
03/01/2019 19:17:06 - INFO - pytorch_pretrained_bert.modeling -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size

   Acc (bal, bal)  Acc (bal, reg)  F1 (bal, reg)
0        0.749266        0.701703       0.678277
1        0.777452        0.723429       0.713329
2        0.788608        0.710804       0.696830


03/01/2019 19:25:38 - INFO - pytorch_pretrained_bert.modeling -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at /root/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba
03/01/2019 19:25:38 - INFO - pytorch_pretrained_bert.modeling -   extracting archive file /root/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir /tmp/tmpl2lcl6t1
03/01/2019 19:25:43 - INFO - pytorch_pretrained_bert.modeling -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size

   Acc (bal, bal)  Acc (bal, reg)  F1 (bal, reg)
0        0.749266        0.701703       0.678277
1        0.777452        0.723429       0.713329
2        0.788608        0.710804       0.696830
3        0.781562        0.719025       0.718612


03/01/2019 19:34:15 - INFO - pytorch_pretrained_bert.modeling -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at /root/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba
03/01/2019 19:34:15 - INFO - pytorch_pretrained_bert.modeling -   extracting archive file /root/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir /tmp/tmpikruwp80
03/01/2019 19:34:20 - INFO - pytorch_pretrained_bert.modeling -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size

   Acc (bal, bal)  Acc (bal, reg)  F1 (bal, reg)
0        0.749266        0.701703       0.678277
1        0.777452        0.723429       0.713329
2        0.788608        0.710804       0.696830
3        0.781562        0.719025       0.718612
4        0.772167        0.716970       0.714962


In [0]:
record_50 = pd.DataFrame()

In [0]:
for n in range(0,5):
  
  with open(SARC_POL+f"models/50_{n}.pickle", 'rb') as handle:
    model = pickle.load(handle)

  #Evaluate on the balanced test set
  tupb = eval(testbal_examples, testbal_features, model, device)
  bal_test_balacc = balanced_accuracy(softmax(tupb[1]), tupb[2])
  bal_test_acc = tupb[0]['accuracy']
  bal_test_F1 = f1_score(tupb[2], np.argmax(tupb[1], axis=1), labels=None, pos_label=1, average='binary', sample_weight=None)

  #Store the results
  record_50 = record_50.append({'Acc (bal, bal)': bal_test_balacc, 'Acc (bal, reg)': bal_test_acc, 'F1 (bal, reg)': bal_test_F1}, ignore_index=True)

print(record_50)

   Acc (bal, bal)  Acc (bal, reg)  F1 (bal, reg)
0        0.746330        0.701703       0.678277
1        0.780975        0.723429       0.713329
2        0.783911        0.710804       0.696830
3        0.786260        0.719025       0.718612
4        0.774516        0.716970       0.714962


In [0]:
with open(SARC_POL+f"test_set_results/test50.pickle", 'wb') as handle:
  pickle.dump(record_50, handle)

## 25% project training set

Best hyperparameters found in cross validation were batch size = 16, learning rate = 2e-5, training for 2 epochs. 

In [0]:
#Set the hyperparamters
args["learning_rate"] = 2e-5
args["train_batch_size"] = 16
args["num_train_epochs"] = 2

### Load in and process the training data

In [0]:
#Load in the required training sets
traindf_25 = pd.read_csv(SARC_POL+'project_data/project_training_25.csv', index_col=0)

In [0]:
#Process the training examples
train_examples = []

for i in range(0,len(traindf_25.index)):
        train_examples.append(InputExample(str(i), traindf_25.loc[i,'response'], None, str(traindf_25.loc[i,'label'])))

In [0]:
#Create the features based on the training set
train_features = convert_examples_to_features(train_examples, label_list, args["max_seq_length"], tokenizer)

03/01/2019 19:50:51 - INFO - __main__ -   *** Example ***
03/01/2019 19:50:51 - INFO - __main__ -   guid: 0
03/01/2019 19:50:51 - INFO - __main__ -   tokens: [CLS] watching the go ##p imp ##lo ##de , knowing it is personally responsible for this cluster ##fu ##ck , is absolutely delicious . [SEP]
03/01/2019 19:50:51 - INFO - __main__ -   input_ids: 101 3666 1996 2175 2361 17727 4135 3207 1010 4209 2009 2003 7714 3625 2005 2023 9324 11263 3600 1010 2003 7078 12090 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
03/01/2019 19:50:51 - INFO - __main__ -   input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
03/01/2019 19:50:51 - INFO - __main__ -   segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
03/01/2019 19:50:51 - INFO - __main__ -   label: 0 (id = 0)
03/01/2019 19:50:51 - INFO - __main__ -   *** Example ***
03/01/2019 19:50:51 - INFO - __main__ -   guid

### Train the model, save the models and results

In [0]:
#Not explicitly setting seed, each run because each lopp will generate a different random number from the seed set in args
for n in range(0,5):
  model, resultsdf, batch_losses = train()
  
  with open(SARC_POL+f"models/25_{n}.pickle", 'wb') as handle:
    pickle.dump(model, handle)

03/01/2019 19:51:06 - INFO - pytorch_pretrained_bert.modeling -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at /root/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba
03/01/2019 19:51:06 - INFO - pytorch_pretrained_bert.modeling -   extracting archive file /root/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir /tmp/tmpj_1t_baw
03/01/2019 19:51:11 - INFO - pytorch_pretrained_bert.modeling -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size

   Acc (bal, bal)  Acc (bal, reg)  F1 (bal, reg)
0        0.763946        0.698767       0.684502


03/01/2019 19:57:04 - INFO - pytorch_pretrained_bert.modeling -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at /root/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba
03/01/2019 19:57:04 - INFO - pytorch_pretrained_bert.modeling -   extracting archive file /root/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir /tmp/tmprr3tamu3
03/01/2019 19:57:09 - INFO - pytorch_pretrained_bert.modeling -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size

   Acc (bal, bal)  Acc (bal, reg)  F1 (bal, reg)
0        0.763946        0.698767       0.684502
1        0.762184        0.699060       0.670100


03/01/2019 20:03:04 - INFO - pytorch_pretrained_bert.modeling -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at /root/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba
03/01/2019 20:03:04 - INFO - pytorch_pretrained_bert.modeling -   extracting archive file /root/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir /tmp/tmptn38x4g2
03/01/2019 20:03:08 - INFO - pytorch_pretrained_bert.modeling -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size

   Acc (bal, bal)  Acc (bal, reg)  F1 (bal, reg)
0        0.763946        0.698767       0.684502
1        0.762184        0.699060       0.670100
2        0.684087        0.622137       0.507085


03/01/2019 20:09:02 - INFO - pytorch_pretrained_bert.modeling -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at /root/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba
03/01/2019 20:09:02 - INFO - pytorch_pretrained_bert.modeling -   extracting archive file /root/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir /tmp/tmp1f3e1v52
03/01/2019 20:09:07 - INFO - pytorch_pretrained_bert.modeling -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size

   Acc (bal, bal)  Acc (bal, reg)  F1 (bal, reg)
0        0.763946        0.698767       0.684502
1        0.762184        0.699060       0.670100
2        0.684087        0.622137       0.507085
3        0.756312        0.691720       0.697057


03/01/2019 20:15:03 - INFO - pytorch_pretrained_bert.modeling -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at /root/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba
03/01/2019 20:15:03 - INFO - pytorch_pretrained_bert.modeling -   extracting archive file /root/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir /tmp/tmprdddfv2n
03/01/2019 20:15:08 - INFO - pytorch_pretrained_bert.modeling -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size

   Acc (bal, bal)  Acc (bal, reg)  F1 (bal, reg)
0        0.763946        0.698767       0.684502
1        0.762184        0.699060       0.670100
2        0.684087        0.622137       0.507085
3        0.756312        0.691720       0.697057
4        0.755725        0.693776       0.688564


In [0]:
record_25 = pd.DataFrame()

In [0]:
for n in range(0,5):
  
  with open(SARC_POL+f"models/25_{n}.pickle", 'rb') as handle:
    model = pickle.load(handle)

  #Evaluate on the balanced test set
  tupb = eval(testbal_examples, testbal_features, model, device)
  bal_test_balacc = balanced_accuracy(softmax(tupb[1]), tupb[2])
  bal_test_acc = tupb[0]['accuracy']
  bal_test_F1 = f1_score(tupb[2], np.argmax(tupb[1], axis=1), labels=None, pos_label=1, average='binary', sample_weight=None)
  
  #Store the results
  record_25 = record_25.append({'Acc (bal, bal)': bal_test_balacc, 'Acc (bal, reg)': bal_test_acc, 'F1 (bal, reg)': bal_test_F1}, ignore_index=True)
  
print(record_25)

   Acc (bal, bal)  Acc (bal, reg)  F1 (bal, reg)
0        0.763359        0.698767       0.684502
1        0.762772        0.699060       0.670100
2        0.678215        0.622137       0.507085
3        0.756900        0.691720       0.697057
4        0.753376        0.693776       0.688564


In [0]:
with open(SARC_POL+f"test_set_results/test25.pickle", 'wb') as handle:
  pickle.dump(record_25, handle)

## 12.5% project training set

Best hyperparameters found in cross validation were batch size = 16, learning rate = 2e-5, training for 5 epochs. 

In [0]:
#Set the hyperparamters
args["learning_rate"] = 2e-5
args["train_batch_size"] = 16
args["num_train_epochs"] = 5

### Load in and process the training data

In [0]:
#Load in the required training sets
traindf_12 = pd.read_csv(SARC_POL+'project_data/project_training_12.csv', index_col=0)

In [0]:
#Process the training examples
train_examples = []

for i in range(0,len(traindf_12.index)):
        train_examples.append(InputExample(str(i), traindf_12.loc[i,'response'], None, str(traindf_12.loc[i,'label'])))

In [0]:
#Create the features based on the training set
train_features = convert_examples_to_features(train_examples, label_list, args["max_seq_length"], tokenizer)

03/01/2019 20:27:29 - INFO - __main__ -   *** Example ***
03/01/2019 20:27:29 - INFO - __main__ -   guid: 0
03/01/2019 20:27:29 - INFO - __main__ -   tokens: [CLS] or anyone that ' s ever had to make an appeal . [SEP]
03/01/2019 20:27:29 - INFO - __main__ -   input_ids: 101 2030 3087 2008 1005 1055 2412 2018 2000 2191 2019 5574 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
03/01/2019 20:27:29 - INFO - __main__ -   input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
03/01/2019 20:27:29 - INFO - __main__ -   segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
03/01/2019 20:27:29 - INFO - __main__ -   label: 0 (id = 0)
03/01/2019 20:27:29 - INFO - __main__ -   *** Example ***
03/01/2019 20:27:29 - INFO - __main__ -   guid: 1
03/01/2019 20:27:29 - INFO - __main__ -   tokens: [CLS] trump is the health ##iest president to ever take office

### Train the model, save the models and results

In [0]:
#Not explicitly setting seed, each run because each lopp will generate a different random number from the seed set in args
for n in range(0,5):
  model, resultsdf, batch_losses = train()
  
  with open(SARC_POL+f"models/12_{n}.pickle", 'wb') as handle:
    pickle.dump(model, handle)

03/01/2019 20:27:34 - INFO - pytorch_pretrained_bert.modeling -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at /root/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba
03/01/2019 20:27:34 - INFO - pytorch_pretrained_bert.modeling -   extracting archive file /root/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir /tmp/tmp2eteo_zx
03/01/2019 20:27:39 - INFO - pytorch_pretrained_bert.modeling -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size

   Acc (bal, bal)  Acc (bal, reg)  F1 (bal, reg)
0        0.738109        0.675573       0.664643


03/01/2019 20:36:19 - INFO - pytorch_pretrained_bert.modeling -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at /root/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba
03/01/2019 20:36:19 - INFO - pytorch_pretrained_bert.modeling -   extracting archive file /root/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir /tmp/tmpd9sxfwju
03/01/2019 20:36:24 - INFO - pytorch_pretrained_bert.modeling -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size

   Acc (bal, bal)  Acc (bal, reg)  F1 (bal, reg)
0        0.738109        0.675573       0.664643
1        0.733412        0.679977       0.675595


03/01/2019 20:45:05 - INFO - pytorch_pretrained_bert.modeling -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at /root/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba
03/01/2019 20:45:05 - INFO - pytorch_pretrained_bert.modeling -   extracting archive file /root/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir /tmp/tmp2iii8uom
03/01/2019 20:45:10 - INFO - pytorch_pretrained_bert.modeling -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size

   Acc (bal, bal)  Acc (bal, reg)  F1 (bal, reg)
0        0.738109        0.675573       0.664643
1        0.733412        0.679977       0.675595
2        0.713447        0.663535       0.659537


03/01/2019 20:53:51 - INFO - pytorch_pretrained_bert.modeling -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at /root/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba
03/01/2019 20:53:51 - INFO - pytorch_pretrained_bert.modeling -   extracting archive file /root/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir /tmp/tmp8fo8u3al
03/01/2019 20:53:56 - INFO - pytorch_pretrained_bert.modeling -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size

   Acc (bal, bal)  Acc (bal, reg)  F1 (bal, reg)
0        0.738109        0.675573       0.664643
1        0.733412        0.679977       0.675595
2        0.713447        0.663535       0.659537
3        0.711098        0.668233       0.667060


03/01/2019 21:02:38 - INFO - pytorch_pretrained_bert.modeling -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at /root/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba
03/01/2019 21:02:38 - INFO - pytorch_pretrained_bert.modeling -   extracting archive file /root/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir /tmp/tmp15_9gm34
03/01/2019 21:02:43 - INFO - pytorch_pretrained_bert.modeling -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size

   Acc (bal, bal)  Acc (bal, reg)  F1 (bal, reg)
0        0.738109        0.675573       0.664643
1        0.733412        0.679977       0.675595
2        0.713447        0.663535       0.659537
3        0.711098        0.668233       0.667060
4        0.737522        0.672930       0.646798


In [0]:
record_12 = pd.DataFrame()

In [0]:
for n in range(0,5):
  
  with open(SARC_POL+f"models/12_{n}.pickle", 'rb') as handle:
    model = pickle.load(handle)

  #Evaluate on the balanced test set
  tupb = eval(testbal_examples, testbal_features, model, device)
  bal_test_balacc = balanced_accuracy(softmax(tupb[1]), tupb[2])
  bal_test_acc = tupb[0]['accuracy']
  bal_test_F1 = f1_score(tupb[2], np.argmax(tupb[1], axis=1), labels=None, pos_label=1, average='binary', sample_weight=None)
  
  #Store the results
  record_12 = record_12.append({'Acc (bal, bal)': bal_test_balacc, 'Acc (bal, reg)': bal_test_acc, 'F1 (bal, reg)': bal_test_F1}, ignore_index=True)
  
print(record_12)

   Acc (bal, bal)  Acc (bal, reg)  F1 (bal, reg)
0        0.738109        0.675573       0.664643
1        0.735173        0.679977       0.675595
2        0.715208        0.663535       0.659537
3        0.718732        0.668233       0.667060
4        0.732824        0.672930       0.646798


In [0]:
with open(SARC_POL+f"test_set_results/test12.pickle", 'wb') as handle:
  pickle.dump(record_12, handle)

## 6.25% project training set


Best hyperparameters found in cross validation were batch size = 16, learning rate = 3e-5, training for 5 epochs.

In [0]:
#Set the hyperparamters
args["learning_rate"] = 3e-5
args["train_batch_size"] = 16
args["num_train_epochs"] = 5

### Load in and process the training data

In [0]:
#Load in the required training sets
traindf_6 = pd.read_csv(SARC_POL+'project_data/project_training_6.csv', index_col=0)

In [0]:
#Process the training examples
train_examples = []

for i in range(0,len(traindf_6.index)):
        train_examples.append(InputExample(str(i), traindf_6.loc[i,'response'], None, str(traindf_6.loc[i,'label'])))

In [0]:
#Create the features based on the training set
train_features = convert_examples_to_features(train_examples, label_list, args["max_seq_length"], tokenizer)

03/11/2019 16:11:01 - INFO - __main__ -   *** Example ***
03/11/2019 16:11:01 - INFO - __main__ -   guid: 0
03/11/2019 16:11:01 - INFO - __main__ -   tokens: [CLS] the ride never ends . [SEP]
03/11/2019 16:11:01 - INFO - __main__ -   input_ids: 101 1996 4536 2196 4515 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
03/11/2019 16:11:01 - INFO - __main__ -   input_mask: 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
03/11/2019 16:11:01 - INFO - __main__ -   segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
03/11/2019 16:11:01 - INFO - __main__ -   label: 0 (id = 0)
03/11/2019 16:11:01 - INFO - __main__ -   *** Example ***
03/11/2019 16:11:01 - INFO - __main__ -   guid: 1
03/11/2019 16:11:01 - INFO - __main__ -   tokens: [CLS] the obama appointed john kirby is keeping the go ##p witch hunt alive i see . [SEP]
03/11/2019 16:11:01

### Train the model, save the models and results

In [0]:
#Not explicitly setting seed, each run because each lopp will generate a different random number from the seed set in args
for n in range(0,5):
  model, resultsdf, batch_losses = train()
  
  with open(SARC_POL+f"models/6_{n}.pickle", 'wb') as handle:
    pickle.dump(model, handle)

03/11/2019 16:19:07 - INFO - pytorch_pretrained_bert.modeling -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at /root/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba
03/11/2019 16:19:07 - INFO - pytorch_pretrained_bert.modeling -   extracting archive file /root/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir /tmp/tmp9qn5xy5y
03/11/2019 16:19:12 - INFO - pytorch_pretrained_bert.modeling -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size

In [0]:
record_6 = pd.DataFrame()

In [0]:
for n in range(0,5):
  
  with open(SARC_POL+f"models/6_{n}.pickle", 'rb') as handle:
    model = pickle.load(handle)

  #Evaluate on the balanced test set
  tupb = eval(testbal_examples, testbal_features, model, device)
  bal_test_balacc = balanced_accuracy(softmax(tupb[1]), tupb[2])
  bal_test_acc = tupb[0]['accuracy']
  bal_test_F1 = f1_score(tupb[2], np.argmax(tupb[1], axis=1), labels=None, pos_label=1, average='binary', sample_weight=None)

  #Store the results
  record_6 = record_6.append({'Acc (bal, bal)': bal_test_balacc, 'Acc (bal, reg)': bal_test_acc, 'F1 (bal, reg)': bal_test_F1}, ignore_index=True)

print(record_6)

   Acc (bal, bal)  Acc (bal, reg)  F1 (bal, reg)
0        0.706400        0.650323       0.633200
1        0.726365        0.656489       0.669492
2        0.696418        0.657957       0.646863
3        0.702290        0.654140       0.665910
4        0.710511        0.657957       0.649835


In [0]:
with open(SARC_POL+f"test_set_results/test6.pickle", 'wb') as handle:
  pickle.dump(record_6, handle)

## Show results

### 100%

In [0]:
with open(SARC_POL+f"test_set_results/test100.pickle", 'rb') as handle:
  df100 = pickle.load(handle)

In [0]:
df100

Unnamed: 0,"Acc (bal, bal)","Acc (bal, reg)","F1 (bal, reg)"
0,0.812096,0.740752,0.742641
1,0.816207,0.733118,0.727436
2,0.816794,0.736348,0.734947
3,0.808573,0.735467,0.732562
4,0.818555,0.739577,0.730804


### 50%

In [0]:
with open(SARC_POL+f"test_set_results/test50.pickle", 'rb') as handle:
  df50 = pickle.load(handle)

In [0]:
df50

Unnamed: 0,"Acc (bal, bal)","Acc (bal, reg)","F1 (bal, reg)"
0,0.74633,0.701703,0.678277
1,0.780975,0.723429,0.713329
2,0.783911,0.710804,0.69683
3,0.78626,0.719025,0.718612
4,0.774516,0.71697,0.714962


### 25%

In [0]:
with open(SARC_POL+f"test_set_results/test25.pickle", 'rb') as handle:
  df25 = pickle.load(handle)

In [0]:
df25

Unnamed: 0,"Acc (bal, bal)","Acc (bal, reg)","F1 (bal, reg)"
0,0.763359,0.698767,0.684502
1,0.762772,0.69906,0.6701
2,0.678215,0.622137,0.507085
3,0.7569,0.69172,0.697057
4,0.753376,0.693776,0.688564


### 12.5%

In [0]:
with open(SARC_POL+f"test_set_results/test12.pickle", 'rb') as handle:
  df12 = pickle.load(handle)

In [0]:
df12

Unnamed: 0,"Acc (bal, bal)","Acc (bal, reg)","F1 (bal, reg)"
0,0.738109,0.675573,0.664643
1,0.735173,0.679977,0.675595
2,0.715208,0.663535,0.659537
3,0.718732,0.668233,0.66706
4,0.732824,0.67293,0.646798


### 6.25%

In [0]:
with open(SARC_POL+f"test_set_results/test6.pickle", 'rb') as handle:
  df6 = pickle.load(handle)

In [0]:
df6

Unnamed: 0,"Acc (bal, bal)","Acc (bal, reg)","F1 (bal, reg)"
0,0.7064,0.650323,0.6332
1,0.726365,0.656489,0.669492
2,0.696418,0.657957,0.646863
3,0.70229,0.65414,0.66591
4,0.710511,0.657957,0.649835


### Compile dataframe of averages and standard deviations over the five repeats

In [0]:
m100 = np.mean(df100, axis=0)
m50 = np.mean(df50, axis=0)
m25 = np.mean(df25, axis=0)
m12 = np.mean(df12, axis=0)
m6 = np.mean(df6, axis=0)

In [0]:
meandf = pd.DataFrame()

In [0]:
meandf = meandf.append(m100, ignore_index=True)
meandf = meandf.append(m50, ignore_index=True)
meandf = meandf.append(m25, ignore_index=True)
meandf = meandf.append(m12, ignore_index=True)
meandf = meandf.append(m6, ignore_index=True)

In [0]:
meandf.index = ['100%', '50%', '25%', '12.5%', '6.25%']

In [0]:
meandf

Unnamed: 0,"Acc (bal, bal)","Acc (bal, reg)","F1 (bal, reg)"
100%,0.814445,0.737052,0.733678
50%,0.774398,0.714386,0.704402
25%,0.742924,0.681092,0.649462
12.5%,0.728009,0.672049,0.662726
6.25%,0.708397,0.655373,0.65306


In [0]:
with open(SARC_POL+f"test_set_results/mean_summary.pickle", 'wb') as handle:
  pickle.dump(meandf, handle)

In [0]:
sd100 = np.std(df100, axis=0)
sd50 = np.std(df50, axis=0)
sd25 = np.std(df25, axis=0)
sd12 = np.std(df12, axis=0)
sd6 = np.std(df6, axis=0)

In [0]:
stddf = pd.DataFrame()

In [0]:
stddf = stddf.append(sd100, ignore_index=True)
stddf = stddf.append(sd50, ignore_index=True)
stddf = stddf.append(sd25, ignore_index=True)
stddf = stddf.append(sd12, ignore_index=True)
stddf = stddf.append(sd6, ignore_index=True)

In [0]:
stddf.index = ['100%', '50%', '25%', '12.5%', '6.25%']

In [0]:
stddf

Unnamed: 0,"Acc (bal, bal)","Acc (bal, reg)","F1 (bal, reg)"
100%,0.00362,0.002777,0.005107
50%,0.014576,0.007532,0.015053
25%,0.032568,0.029613,0.07172
12.5%,0.009235,0.005711,0.009507
6.25%,0.01012,0.002885,0.013254


In [0]:
with open(SARC_POL+f"test_set_results/std_summary.pickle", 'wb') as handle:
  pickle.dump(stddf, handle)