In [None]:
#Installing libraries
!pip install transformers
!pip install  nlp
!pip install pyarrow==0.16.0

In [2]:
import torch
import nlp
from tqdm import tqdm
from transformers import T5Tokenizer, T5Config, T5ForConditionalGeneration

In [None]:
# Before running the evaluation, the TensorFlow checkpoint has to be converted into a PyTorch model.
# See here: https://github.com/huggingface/transformers/blob/master/src/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py

# Mount Google Drive at /content/drive; the config, tokenizer, model and result
# paths used by the following cells all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the model configuration and the tokenizer from Google Drive.
# The config file can be downloaded from this link: https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-config.json

# `config` is passed to T5ForConditionalGeneration.from_pretrained when the model is loaded.
config = T5Config.from_json_file('/content/drive/My Drive/conf/finetuned_model_2M/config.json')
# `tokenizer` is used both to encode the dataset and to decode the generated ids.
tokenizer = T5Tokenizer.from_pretrained('/content/drive/My Drive/conf/finetuned_model_2M/dl4se_vocab.model')

In [6]:
# Change the prefix when you want to evaluate a different task:
# (1) generate small patch
# (2) generate medium patch
# (3) generate abt assert
# (4) generate raw assert

# For the abt/raw assert generation tasks, use example['method'].lower() as the input_text and example['assertion'].lower() as the target_text

def add_eos_to_examples(example, prefix='generate abt assert', input_key='method', target_key='assertion'):
    """Build the prompt and target strings for one dataset example.

    Two keys are written into ``example`` (the mapping is modified in place):

    * ``input_text``  -> ``"<prefix>: <lower-cased example[input_key]> </s>"``
    * ``target_text`` -> ``"<lower-cased example[target_key]> </s>"``

    The defaults reproduce the "abt assert" setup. For another task, pass a
    different ``prefix`` / column names (for instance through
    ``functools.partial`` or a small wrapper when mapping over a dataset)
    instead of editing the function body.

    Args:
        example: Mapping for a single example; ``example[input_key]`` and
            ``example[target_key]`` must be strings.
        prefix: Task prefix placed before the input, without the trailing
            colon (e.g. ``'generate small patch'``, ``'generate raw assert'``).
        input_key: Key of the field used as model input.
        target_key: Key of the field used as expected output.

    Returns:
        The same ``example`` object with ``input_text`` and ``target_text`` set.

    Raises:
        KeyError: If ``input_key`` or ``target_key`` is missing from ``example``.
    """
    # The explicit ' </s>' suffix marks the end of sequence for the T5 tokenizer.
    example['input_text'] = '%s: %s </s>' % (prefix, example[input_key].lower())
    example['target_text'] = '%s </s>' % example[target_key].lower()
    return example


def convert_to_features(example_batch):
    """Tokenize a batch of prompt/target strings into model features.

    Reads the ``input_text`` and ``target_text`` columns of ``example_batch``
    and encodes each with the notebook-level ``tokenizer``, padding to a
    maximum length of 512.

    Returns:
        dict with ``input_ids`` / ``attention_mask`` taken from the encoded
        inputs and ``target_ids`` / ``target_attention_mask`` taken from the
        encoded targets.
    """
    def _encode(column):
        # Same encoding settings for the source and the target side.
        return tokenizer.batch_encode_plus(example_batch[column], pad_to_max_length=True, max_length=512)

    source = _encode('input_text')
    target = _encode('target_text')

    return {
        'input_ids': source['input_ids'],
        'attention_mask': source['attention_mask'],
        'target_ids': target['input_ids'],
        'target_attention_mask': target['attention_mask'],
    }

In [7]:
%%capture

# The dataset loading script has to match the task being evaluated:
# upload the right script to this Colab instance and point load_dataset at it.
# Make sure to load the test set as well; otherwise, it doesn't work.

# Pipeline: load the test split, add the prompt/target strings example by example,
# then tokenize batch-wise. Caching is disabled for both map steps.
valid_dataset = (
    nlp.load_dataset('/content/assertion_dataset_script.py', split=nlp.Split.TEST)
    .map(add_eos_to_examples, load_from_cache_file=False)
    .map(convert_to_features, batched=True, load_from_cache_file=False)
)

# Expose only the tokenized columns, formatted as torch tensors.
columns = ['input_ids', 'target_ids', 'attention_mask', 'target_attention_mask']
valid_dataset.set_format(type='torch', columns=columns)


In [17]:
# The BATCH_SIZE must be set according to the available VRAM.

BATCH_SIZE = 8
# No `shuffle` argument is passed: the evaluation cell pairs the generated
# predictions with the references by position, so the batch order matters.
dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size=BATCH_SIZE)

In [18]:
# Load the ground truth from the test dataset: the lower-cased values of
# column 1 of the header-less, tab-separated test file.
import pandas as pd

df = pd.read_csv('test.tsv', header=None, sep='\t')

references = [item.lower() for item in df[1]]

# Peek at one reference as a sanity check.
references[1]

'org . junit . assert . assertequals ( expected , buf )'

In [None]:
# Set the CUDA device to leverage GPU computation
CUDA = torch.device("cuda")

# Load the weights stored on Drive with the configuration loaded earlier,
# then move the model to the GPU selected above.
model = T5ForConditionalGeneration.from_pretrained(
        '/content/drive/My Drive/conf/finetuned_model_2M/model.bin',
        config=config
        ).to(CUDA)

In [None]:
# max_length passed to model.generate depends on the task:
#   - bfp_small and bfp_medium: 128 and 256 respectively
#   - abt assert and raw assert: 512


from tqdm import tqdm

# Beam width; the same value is used for num_return_sequences, so every input
# contributes BEAM_SIZE consecutive entries to `predictions`.
BEAM_SIZE = 10

torch.cuda.empty_cache()

predictions = []

for batch in tqdm(dataloader):
    generated = model.generate(
        input_ids=batch['input_ids'].to(CUDA),
        attention_mask=batch['attention_mask'].to(CUDA),
        num_beams=BEAM_SIZE,
        max_length=512,
        num_return_sequences=BEAM_SIZE,
        early_stopping=True,
    )

    # Decode every returned sequence back to text, dropping special tokens.
    decoded = [tokenizer.decode(ids, skip_special_tokens=True) for ids in generated]
    predictions.extend(decoded)

  9%|▊         | 170/1977 [05:14<53:47,  1.79s/it]

In [12]:
def _strip_outer_quotes(text):
    """Drop one leading and one trailing double quote from ``text``.

    Strings shorter than two characters are returned untouched.
    """
    if len(text) < 2:
        return text
    if text.startswith('"'):
        text = text[1:]
    if text.endswith('"'):
        text = text[:-1]
    return text


# One refined entry per raw prediction, in the same order.
pred_refined = [_strip_outer_quotes(pred) for pred in predictions]

len(pred_refined),len(predictions)

(79050, 79050)

In [None]:
# Count the examples for which at least one of the BEAM_SIZE candidates equals
# the reference once all spaces are removed from both strings.
counter_pred = 0
mispred_list = []        # candidates that did not match (collected up to the first hit)
sanity_check_list = []   # space-stripped candidates that matched their reference

len_prediction = len(pred_refined)

idx = 0  # position of the current example in `references`

for start in range(0, len_prediction, BEAM_SIZE):

    target_item = references[idx].replace(' ', '')

    for candidate in pred_refined[start:start + BEAM_SIZE]:
        candidate_ref = candidate.replace(' ', '')
        if candidate_ref != target_item:
            mispred_list.append(candidate)
            continue
        # First perfect match: count the example once and stop scanning its beam.
        counter_pred += 1
        sanity_check_list.append(candidate_ref)
        break

    idx += 1

print('% of perfect predictions: ',(counter_pred/len(references))*100 )
print(counter_pred)

In [16]:
#SAVING RESULTS

# For every test example whose beam contains no perfect prediction, write the
# reference followed by all of its candidate predictions.
#
# The groups are rebuilt from pred_refined instead of being sliced out of
# mispred_list: that list holds a variable number of entries per example (none
# when the first candidate is a hit, k when the hit is at position k), so
# fixed-size slices of it do not correspond to examples, and labelling the n-th
# slice with references[n] pairs the candidates with the wrong target.

with open('/content/drive/My Drive/conf/results_final/assert/abt/predictions_5/mispredictions_5.txt', 'w') as f:
    for idx, start in enumerate(range(0, len(pred_refined), BEAM_SIZE)):

        items_to_analyze = pred_refined[start:start + BEAM_SIZE]

        # Same space-insensitive comparison as the evaluation cell.
        target_item = references[idx].replace(' ', '')
        if any(pred.replace(' ', '') == target_item for pred in items_to_analyze):
            continue  # perfectly predicted example: nothing to report

        f.write('\n************\n')
        f.write("tgt: %s\n" % references[idx])
        for (index, mispred) in enumerate(items_to_analyze):
            f.write('[%s]: %s\n' % (str(index), mispred))
        f.write('\n************\n')

In [14]:
# Dump every refined prediction, one per line, in generation order.
with open('/content/drive/My Drive/conf/results_final/assert/abt/predictions_5/predictions_5.txt', 'w') as f:
    f.writelines("%s\n" % item for item in pred_refined)