In [0]:
# Dependency install commands, disabled by being wrapped in a string literal:
# as written, the cell only evaluates the string and installs nothing.
# Remove the surrounding quotes to actually run the two installs.
'''!pip install pytorch_pretrained_bert
!pip install transformers'''

In [0]:
import logging
import os
import random
from tqdm import tqdm, trange
import glob 
import json

import numpy as np
import pandas as pd 
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

#from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.modeling import BertForMultipleChoice
from transformers import RobertaTokenizer,BertTokenizer,XLNetTokenizer
from transformers import RobertaForMultipleChoice,XLNetForMultipleChoice

In [0]:
# Root directory that every path below is built from.
basePath = '/content/'

# Which fine-tuned model main() evaluates: 'bert' or 'roberta';
# any other value selects the XLNet branch.
base_model = 'roberta'

# Pretrained model identifiers handed to the from_pretrained() loaders.
bert_model = 'bert-base-uncased'
roberta_model = 'roberta-base'
# XLNet checkpoints are published in cased form only; the previous value
# 'xlnet-base-uncased' does not name an existing pretrained model.
xlnet_model = 'xlnet-base-cased'

# Fine-tuned checkpoints (state dicts) for each base model.
modelBE_file = basePath+'drive/My Drive/temp/models/base_model_epoch4.bin'
modelRo_file = basePath+'drive/My Drive/temp/models/epoch4_Robert_base.bin'
# TODO: define modelXL_file before selecting the XLNet branch; main() reads it.
#modelXL_file = 

# drive_dir: folder holding the zipped dataset; data_dir: extracted dataset root
# (main() reads its 'train' and 'dev' sub-directories).
drive_dir = basePath+'drive/My Drive/temp/DatasetN'
data_dir = basePath+'dataset'

# Alternative (smaller) dataset locations, kept for reference:
# drive_dir = basePath+'drive/My Drive/temp/tempDataset'
# data_dir = basePath+'tempdataset'

max_seq_length = 512   # total tokens per sequence, special tokens included
do_eval = True         # main() only runs the evaluation when this is True
do_lower_case = True   # forwarded to the tokenizer constructors
eval_batch_size = 1    # examples per forward pass during evaluation
no_cuda = False  # False = use cuda if available
local_rank = -1        # -1 = not running under torch.distributed
seed = 42              # seed for random, numpy and torch

In [8]:
# Disabled helper, kept as a string literal so the cell only evaluates the
# string. If the quotes were removed it would extract every '.zip' archive
# found in drive_dir into '/content/'.
'''import zipfile
import os
for file_name in os.listdir(drive_dir):
  if file_name.endswith('.zip'):
    with zipfile.ZipFile(drive_dir+'/'+file_name,'r') as zip_dir:
      zip_dir.extractall(path='/content/')'''

"import zipfile\nimport os\nfor file_name in os.listdir(drive_dir):\n  if file_name.endswith('.zip'):\n    with zipfile.ZipFile(drive_dir+'/'+file_name,'r') as zip_dir:\n      zip_dir.extractall(path='/content/')"

In [0]:
# Root logging setup: INFO level, timestamped "time - level - logger - message" lines.
logging.basicConfig(
    level=logging.INFO,
    datefmt='%m/%d/%Y %H:%M:%S',
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
)
# Module-level logger shared by the functions in this notebook.
logger = logging.getLogger(__name__)

In [0]:
class RaceExample(object):
    """One question of the RACE dataset together with its four answer options.

    Attribute names mapped onto RACE terms:
        race_id:          identifier of the example
        context_sentence: the article
        start_ending:     the question
        endings:          the four options, built from ending_0 .. ending_3
        label:            the true answer (None when not supplied)
    """

    def __init__(self, race_id, context_sentence, start_ending,
                 ending_0, ending_1, ending_2, ending_3, label = None):
        self.race_id = race_id
        self.context_sentence = context_sentence
        self.start_ending = start_ending
        self.endings = [ending_0, ending_1, ending_2, ending_3]
        self.label = label

    def __repr__(self):
        """Render the example as comma-separated ``name: value`` fields."""
        fields = [
            f"id: {self.race_id}",
            f"article: {self.context_sentence}",
            f"question: {self.start_ending}",
        ]
        # Exactly four options are reported, matching the constructor.
        for position in range(4):
            fields.append(f"option_{position}: {self.endings[position]}")
        # The label is only shown when one was provided.
        if self.label is not None:
            fields.append(f"label: {self.label}")
        return ", ".join(fields)

    def __str__(self):
        """Same text as ``__repr__``."""
        return self.__repr__()

In [0]:
class InputFeatures(object):
    """Model-ready features for one example.

    ``choices_features`` is given as an iterable of 4-tuples
    ``(tokens, input_ids, input_mask, segment_ids)``; the first element is
    dropped and the remaining three are stored as a dict per choice.
    """

    def __init__(self, example_id, choices_features, label):
        self.example_id = example_id

        per_choice = []
        for choice in choices_features:
            # The leading element (the token strings) is not kept.
            _unused, ids, mask, segments = choice
            per_choice.append({
                'input_ids': ids,
                'input_mask': mask,
                'segment_ids': segments,
            })
        self.choices_features = per_choice

        self.label = label

In [0]:
## paths is a list of directories (a single directory string is also accepted)
def read_race_examples(paths):
    """Read every ``*json`` file in the given directories into RaceExample objects.

    Each file is expected to hold one article with parallel ``answers``,
    ``questions`` and ``options`` lists; one example is produced per question.

    Args:
        paths: iterable of directory paths, or a single directory path.

    Returns:
        list of RaceExample. Directories are visited in the order given and the
        files inside each directory in sorted name order, so the result is
        reproducible across runs and file systems.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(paths, (str, os.PathLike)):
        paths = [paths]

    examples = []
    for path in paths:
        # glob yields matches in arbitrary order; sort for a stable example order.
        filenames = sorted(glob.glob(os.fspath(path) + "/*json"))
        for filename in filenames:
            with open(filename, 'r', encoding='utf-8') as fpr:
                data_raw = json.load(fpr)

            article = data_raw['article']
            ## one example per question of the article
            for i, answer in enumerate(data_raw['answers']):
                options = data_raw['options'][i]
                examples.append(
                    RaceExample(
                        race_id = filename+'-'+str(i),
                        context_sentence = article,
                        start_ending = data_raw['questions'][i],
                        ending_0 = options[0],
                        ending_1 = options[1],
                        ending_2 = options[2],
                        ending_3 = options[3],
                        # answer letter 'A'.. is mapped to index 0..
                        label = ord(answer) - ord('A')))

    return examples

In [0]:
def convert_examples_to_features(examples, tokenizer, max_seq_length,
                                 is_training):
    """Convert RACE examples into fixed-length inputs, one InputFeatures each.

    RACE is a multiple-choice task. Following the formatting proposed in
    "Improving Language Understanding by Generative Pre-Training" (see
    https://github.com/google-research/bert/issues/38), every answer option
    becomes its own sequence:

        <cls> Article <sep> Question + Option <sep>

    where <cls>/<sep> are the tokenizer's ``cls_token`` and ``sep_token``.
    The model scores each of the four sequences and the final decision is a
    softmax over those four scores.

    Args:
        examples: iterable of examples exposing ``context_sentence``,
            ``start_ending``, ``endings``, ``label`` and ``race_id``.
        tokenizer: object providing ``tokenize``, ``convert_tokens_to_ids``,
            ``cls_token`` and ``sep_token``.
        max_seq_length: exact length of every produced id/mask/segment list.
        is_training: only controls whether the label of the logged sample
            example is written to the log.

    Returns:
        list of InputFeatures in the same order as ``examples``.
    """
    features = []
    for example_index, example in enumerate(examples):
        article_tokens = tokenizer.tokenize(example.context_sentence)
        question_tokens = tokenizer.tokenize(example.start_ending)

        choices_features = []
        for option_text in example.endings:
            # Fresh copy per option: truncation below works in place and must
            # not shorten the article for the following options.
            article_part = list(article_tokens)
            answer_part = question_tokens + tokenizer.tokenize(option_text)

            # Three positions are reserved for the cls token and two sep tokens.
            _truncate_seq_pair(article_part, answer_part, max_seq_length - 3)

            tokens = (
                [tokenizer.cls_token]
                + article_part
                + [tokenizer.sep_token]
                + answer_part
                + [tokenizer.sep_token]
            )
            # Segment 0 covers cls + article + first sep; segment 1 the rest.
            segment_ids = [0] * (len(article_part) + 2) + [1] * (len(answer_part) + 1)

            input_ids = tokenizer.convert_tokens_to_ids(tokens)
            input_mask = [1] * len(input_ids)

            # Fill every list with zeros up to max_seq_length.
            filler = [0] * (max_seq_length - len(input_ids))
            input_ids.extend(filler)
            input_mask.extend(filler)
            segment_ids.extend(filler)

            assert len(input_ids) == max_seq_length
            assert len(input_mask) == max_seq_length
            assert len(segment_ids) == max_seq_length

            choices_features.append((tokens, input_ids, input_mask, segment_ids))

        label = example.label

        # Log the very first example in full as a sanity check.
        if example_index == 0:
            logger.info("*** Example ***")
            logger.info(f"race_id: {example.race_id}")
            for choice_idx, choice in enumerate(choices_features):
                tokens, input_ids, input_mask, segment_ids = choice
                logger.info(f"choice: {choice_idx}")
                logger.info(f"tokens: {' '.join(tokens)}")
                logger.info(f"input_ids: {' '.join(map(str, input_ids))}")
                logger.info(f"input_mask: {' '.join(map(str, input_mask))}")
                logger.info(f"segment_ids: {' '.join(map(str, segment_ids))}")
            if is_training:
                logger.info(f"label: {label}")

        features.append(
            InputFeatures(
                example_id = example.race_id,
                choices_features = choices_features,
                label = label,
            )
        )

    return features

In [0]:
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Shorten two token lists in place until together they fit ``max_length``.

    One token at a time is removed from the end of whichever list is currently
    longer (``tokens_b`` when they are equally long). Trimming the longer list
    is preferred over cutting both proportionally because a token of a short
    sequence likely carries more information than one of a long sequence.
    """
    while len(tokens_a) + len(tokens_b) > max_length:
        victim = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
        victim.pop()

In [0]:
def accuracy(out, labels):
    """Return the number of rows of ``out`` whose arg-max equals the label.

    Note this is a count of correct predictions, not a ratio.
    """
    predicted = np.argmax(out, axis=1)
    hits = predicted == labels
    return np.sum(hits)

In [0]:
def select_field(features, field):
    """Collect ``field`` from every choice of every feature.

    Returns a nested list: one inner list per feature, holding the value
    stored under ``field`` in each of that feature's ``choices_features``.
    """
    collected = []
    for feature in features:
        per_feature = []
        for choice in feature.choices_features:
            per_feature.append(choice[field])
        collected.append(per_feature)
    return collected

In [0]:
def softmax(x, axis=0):
    """Numerically stable softmax along ``axis`` (default 0, as before).

    The maximum along ``axis`` is subtracted before exponentiating. This leaves
    the result mathematically unchanged but keeps ``np.exp`` from overflowing
    to ``inf`` (and the quotient from becoming NaN) for large inputs.

    Args:
        x: array-like of scores.
        axis: axis along which the outputs sum to 1.

    Returns:
        numpy array with the same shape as ``x``.
    """
    x = np.asarray(x)
    # Nothing to normalise; np.max would raise on a zero-size reduction.
    if x.size == 0:
        return np.exp(x)
    shifted = x - np.max(x, axis=axis, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=axis, keepdims=True)

In [0]:
def _load_tokenizer_and_model():
    """Create the tokenizer and fine-tuned multiple-choice model for ``base_model``.

    'bert' and 'roberta' select their respective classes; any other value
    falls through to XLNet (unchanged from the original dispatch).

    Returns:
        (tokenizer, model) with the fine-tuned state dict applied; the model is
        left on the CPU.

    Raises:
        ValueError: the XLNet branch is selected but ``modelXL_file`` has not
            been defined at notebook level.
    """
    # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
    # map_location='cpu' lets checkpoints saved on a GPU be read on machines
    # without CUDA; the caller moves the model to the target device afterwards.
    if base_model == 'bert':
        tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=do_lower_case)
        model_state_dict = torch.load(modelBE_file, map_location='cpu')
        model = BertForMultipleChoice.from_pretrained(
            bert_model, state_dict=model_state_dict, num_choices=4)
    elif base_model == 'roberta':
        tokenizer = RobertaTokenizer.from_pretrained(roberta_model, do_lower_case=do_lower_case)
        model_state_dict = torch.load(modelRo_file, map_location='cpu')
        model = RobertaForMultipleChoice.from_pretrained(
            roberta_model, state_dict=model_state_dict)
    else:
        # The checkpoint path for XLNet may not be configured; fail with a
        # clear message instead of a bare NameError.
        xlnet_checkpoint = globals().get('modelXL_file')
        if xlnet_checkpoint is None:
            raise ValueError(
                "base_model=%r selects XLNet, but modelXL_file is not defined; "
                "set it to the fine-tuned XLNet checkpoint path." % (base_model,))
        tokenizer = XLNetTokenizer.from_pretrained(xlnet_model, do_lower_case=do_lower_case)
        model_state_dict = torch.load(xlnet_checkpoint, map_location='cpu')
        model = XLNetForMultipleChoice.from_pretrained(
            xlnet_model, state_dict=model_state_dict)
    return tokenizer, model


def main():
    """Score every example under ``data_dir``/train and ``data_dir``/dev and dump the logits.

    Loads the model chosen by ``base_model``, runs it without gradients over
    all examples, and writes ``basePath + 'dataset_' + base_model + '.csv'``
    with columns ``id``, ``A``..``D`` (raw logits per option) and
    ``correct_answer`` (label index). Returns None.
    """
    device = torch.device("cuda" if torch.cuda.is_available() and not no_cuda else "cpu")
    n_gpu = torch.cuda.device_count()

    # Seed every RNG in use so runs are repeatable.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(seed)

    tokenizer, model = _load_tokenizer_and_model()
    model.to(device)

    # Only evaluate when requested, and only on the main process when distributed.
    if not (do_eval and (local_rank == -1 or torch.distributed.get_rank() == 0)):
        return

    train_dir = os.path.join(data_dir, 'train')
    eval_dir = os.path.join(data_dir, 'dev')

    eval_examples = read_race_examples([train_dir, eval_dir])
    if not eval_examples:
        # Without this guard the column slicing below fails on empty arrays.
        logger.warning("No examples found in %s or %s; nothing to evaluate.",
                       train_dir, eval_dir)
        return

    eval_features = convert_examples_to_features(
        eval_examples, tokenizer, max_seq_length, True)

    logger.info("***** Running evaluation: train + dev *****")
    logger.info("  Num examples = %d", len(eval_examples))
    logger.info("  Batch size = %d", eval_batch_size)

    all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long)
    all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long)
    all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long)
    all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long)

    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
    # Sequential sampling keeps dataset order, so outputs line up with eval_features.
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=eval_batch_size)

    model.eval()

    # Ids are derived per feature (file name up to the first '-') rather than
    # per batch step, so they stay aligned for any eval_batch_size. Questions
    # from the same file therefore share an id.
    example_ids = [
        feature.example_id.split('/')[-1].split('-')[0]
        for feature in eval_features
    ]

    # Collect per-batch results and join once instead of re-concatenating
    # the growing arrays on every step.
    logit_chunks = []
    label_chunks = []
    for batch in eval_dataloader:
        input_ids, input_mask, segment_ids, label_ids = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            if base_model == 'bert':
                # pytorch_pretrained_bert returns the logits tensor directly.
                logits = model(input_ids, segment_ids, input_mask)
            else:
                # transformers models return a sequence whose first item is the logits.
                logits = model(input_ids = input_ids, attention_mask = input_mask)[0]

        logit_chunks.append(logits.detach().cpu().numpy())
        label_chunks.append(label_ids.to('cpu').numpy())

    all_logits = np.concatenate(logit_chunks, axis=0)
    all_labels = np.concatenate(label_chunks)

    data = {'id': example_ids,
            'A': all_logits[:, 0],
            'B': all_logits[:, 1],
            'C': all_logits[:, 2],
            'D': all_logits[:, 3],
            'correct_answer': all_labels
            }

    df = pd.DataFrame(data)
    df.to_csv(basePath+'dataset_'+base_model+'.csv',index=False)

In [0]:
# Entry point: run the evaluation defined in main() using the notebook-level settings.
main()