# GPT3 CUAD

The processing of the CUAD dataset with the GPT3 Q&A engine

Read in the questions

In [1]:
import pickle
import openai
from tqdm import tqdm
import pandas as pd

**Prepare dataset**

In [2]:
data = pd.read_pickle('../data/processed/cuad_drqa/data.pkl')
data = data.rename(columns={'id_': 'id'})
with open("../data/processed/cuad_drqa/drqa_idx2word.pickle", 'rb') as file:
    idx2word = pickle.load(file)
with open("../data/processed/cuad_drqa/drqa_word2idx.pickle", 'rb') as file:
    word2idx = pickle.load(file)

with open("../data/processed/cuad_drqa/drqa_word_vocab.pickle", 'rb') as file:
    word_vocab = pickle.load(file)

data['todrop'] = False
for idx, row in data.iterrows():
    t=row
    if not [idx2word[t.context_ids[x]] for x in range(t.label_idx[0], t.label_idx[1]+1)]:
        row.todrop=True
data=data[data['todrop']==False]
data.drop(columns=['todrop'],inplace=True)

valid_df = data[int(0.7*len(data)):]

**Fetch data from openai**

In [4]:
import os
openai.api_key = os.getenv("OPENAI_APIKEY")

res_ = {}
for idx, row in tqdm(valid_df.iterrows()):
    
    try:
        response = openai.Answer.create(
         search_model="ada",
         model="curie",
         question=row.question,
         documents=[row.context],
         examples_context="In 2017, U.S. life expectancy was 78.6 years.",
         examples=[["What is human life expectancy in the United States?","78 years."]],
         max_tokens=len(row.answer.split(' '))*3,
         stop=["\n", "<|endoftext|>"],
        )
        res_[row.id]=response.to_dict()
    except Exception as e:
        res_[row.id]={'error':True, 'ags': e.args}    


288it [01:42,  2.80it/s]


**Prepare for calculation of score**

In [5]:
valid_df['res']=valid_df.apply(lambda x: res_.get(x.id, {'error':True} ),axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_df['res']=valid_df.apply(lambda x: res_.get(x.id, {'error':True} ),axis=1)


In [6]:
valid_df['res_ans']=valid_df.res.apply(lambda x: x.get('answers',[' '])[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_df['res_ans']=valid_df.res.apply(lambda x: x.get('answers',[' '])[0])


Save results

In [8]:
valid_df.to_pickle('./GPT3CUAD_data.pkl')

In [15]:

def evaluate_single(predictions, answers, **kwargs):
    '''
    Gets a dictionary of predictions with question_id as key
    and prediction as value. The validation dataset has multiple 
    answers for a single question. Hence we compare our prediction
    with all the answers and choose the one that gives us
    the maximum metric (em or f1). 
    This method first parses the JSON file, gets all the answers
    for a given id and then passes the list of answers and the 
    predictions to calculate em, f1.


    :param dict predictions
    Returns
    : exact_match: 1 if the prediction and ground truth 
      match exactly, 0 otherwise.
    : f1_score: 
    '''
    assert len(predictions) == len(answers)
    f1 = exact_match = total = 0
    for key, value in predictions.items():
        prediction = value
        ground_truths = [answers[key]]

        exact_match += metric_max_over_ground_truths(exact_match_score, prediction, ground_truths)
        f1 += metric_max_over_ground_truths(f1_score, prediction, ground_truths)

    total = len(predictions)
    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total

    return exact_match, f1


def normalize_answer(s):
    '''
    Performs a series of cleaning steps on the ground truth and 
    predicted answer.
    '''
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    '''
    Returns maximum value of metrics for predicition by model against
    multiple ground truths.

    :param func metric_fn: can be 'exact_match_score' or 'f1_score'
    :param str prediction: predicted answer span by the model
    :param list ground_truths: list of ground truths against which
                               metrics are calculated. Maximum values of 
                               metrics are chosen.


    '''
    scores_for_ground_truths = []
    for ground_truth in ground_truths:
        score = metric_fn(prediction, ground_truth)
        scores_for_ground_truths.append(score)

    return max(scores_for_ground_truths)


def f1_score(prediction, ground_truth):
    '''
    Returns f1 score of two strings.
    '''
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1


def exact_match_score(prediction, ground_truth):
    '''
    Returns exact_match_score of two strings.
    '''
    return (normalize_answer(prediction) == normalize_answer(ground_truth))


def epoch_time(start_time, end_time):
    '''
    Helper function to record epoch time.
    '''
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs


In [16]:
import re
def evaluate_single(predictions, answers, **kwargs):
    '''
    Gets a dictionary of predictions with question_id as key
    and prediction as value. The validation dataset has multiple 
    answers for a single question. Hence we compare our prediction
    with all the answers and choose the one that gives us
    the maximum metric (em or f1). 
    This method first parses the JSON file, gets all the answers
    for a given id and then passes the list of answers and the 
    predictions to calculate em, f1.


    :param dict predictions
    Returns
    : exact_match: 1 if the prediction and ground truth 
      match exactly, 0 otherwise.
    : f1_score: 
    '''
    assert len(predictions) == len(answers)
    f1 = exact_match = total = 0
    for key, value in predictions.items():
        prediction = value
        ground_truths = [answers[key]]

        exact_match += metric_max_over_ground_truths(exact_match_score, prediction, ground_truths)
        f1 += metric_max_over_ground_truths(f1_score, prediction, ground_truths)

    total = len(predictions)
    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total

    return exact_match, f1

In [17]:
predictions={row.id:row.res_ans for _, row in valid_df.iterrows()}
answers= {row.id:row.answer for _, row in valid_df.iterrows()}

In [18]:
em, f1 = evaluate_single(predictions,answers)

In [19]:
em

14.930555555555555

In [20]:
f1

34.37516937973732