## NLP Preprocessing of CUAD dataset


Almost excactly the same process as when dealing with the drqa dataset excpet we don't utilize the normalization anymore

In [19]:
import pandas as pd
import numpy as np
import os
import re
import spacy
nlp = spacy.load('en_core_web_sm', disable=["tagger", "ner", "lemmatizer"])

In [20]:
df_1= pd.read_pickle("../data/processed/squad_drqa/drqa_valid.pkl")

We create a dataframe as in the drqa dataset. The filters applied will be to filter cases where there doesn't exists a signle answer or where the dataset has split the answer into multiple sections 

In [21]:
import json
data = []
i=0
with open("../data/raw/CUAD_v1/CUAD_v1/CUAD_v1.json", encoding="utf-8") as f:
    cuad = json.load(f)
    # Contract
    for example in cuad["data"]:
        title = example.get("title", "").strip()
        # Paragraph in contract
        # We only look at the first one
        for paragraph in example["paragraphs"]:
            context = paragraph["context"].strip()
            for qa in paragraph["qas"]:
                if qa.get("is_impossible"):
                    continue
                question = qa["question"].strip()
                id_ = qa["id"]

                answer_starts = [answer["answer_start"] for answer in qa["answers"]]
                answers = [answer["text"].strip() for answer in qa["answers"]]
                answer_end = [context.find(x)+len(x) for x in answers if context.find(x)]
                if len(answers)>1:
                    i+=1
                    continue
                if answer_end and len(context[min(answer_starts):max(answer_end)])<2000:
                    ctx_offset = max(1,min(answer_starts)-200)
                    data.append((id_, question, answers[0], [answer_starts[0]-ctx_offset, answer_end[0]-ctx_offset], context[ctx_offset:min(len(context),max(answer_end)+500)], ctx_offset))

Plot

In [None]:
df =pd.DataFrame(data, columns=["id_", "question", "answer", "label", "context", "ctx_offset"])
df = df[df['question'].isin(['Highlight the parts (if any) of this contract related to "Document Name" that should be reviewed by a lawyer. Details: The name of the contract',
                    'Highlight the parts (if any) of this contract related to "Agreement Date" that should be reviewed by a lawyer. Details: The date of the contract',
                    """Highlight the parts (if any) of this contract related to "Expiration Date" that should be reviewed by a lawyer. Details: On what date will the contract\'s initial term expire?""",
                    'Highlight the parts (if any) of this contract related to "Renewal Term" that should be reviewed by a lawyer. Details: What is the renewal term after the initial term expires? This includes automatic extensions and unilateral extensions with prior notice.',
                    """Highlight the parts (if any) of this contract related to "Effective Date" that should be reviewed by a lawyer. Details: The date when the contract is effective"""    
                    ])]

In [27]:
df.groupby('question').count().sort_values(['answer'], ascending=False).head(5)

Unnamed: 0_level_0,id_,answer,label,context,ctx_offset
question,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"Highlight the parts (if any) of this contract related to ""Agreement Date"" that should be reviewed by a lawyer. Details: The date of the contract",463,463,463,463,463
"Highlight the parts (if any) of this contract related to ""Document Name"" that should be reviewed by a lawyer. Details: The name of the contract",447,447,447,447,447
"Highlight the parts (if any) of this contract related to ""Governing Law"" that should be reviewed by a lawyer. Details: Which state/country's law governs the interpretation of the contract?",412,412,412,412,412
"Highlight the parts (if any) of this contract related to ""Expiration Date"" that should be reviewed by a lawyer. Details: On what date will the contract's initial term expire?",368,368,368,368,368
"Highlight the parts (if any) of this contract related to ""Effective Date"" that should be reviewed by a lawyer. Details: The date when the contract is effective",336,336,336,336,336


In [152]:
def gather_text_for_vocab(dfs: list):
    '''
    Gathers text from contexts and questions to build a vocabulary.

    :param dfs: list of dataframes of SQUAD dataset.
    :returns: list of contexts and questions
    '''

    text = []
    total = 0
    for df in dfs:
        unique_contexts = list(df.context.unique())
        unique_questions = list(df.question.unique())
        total += df.context.nunique() + df.question.nunique()
        text.extend(unique_contexts + unique_questions)

    assert len(text) == total

    return text

In [153]:
from collections import Counter
def build_word_vocab(vocab_text):
    '''
    Builds a word-level vocabulary from the given text.

    :param list vocab_text: list of contexts and questions
    :returns 
        dict word2idx: word to index mapping of words
        dict idx2word: integer to word mapping
        list word_vocab: list of words sorted by frequency
    '''

    words = []
    nlp = spacy.load('en_core_web_sm')
    for sent in vocab_text:
        for word in nlp(sent, disable=['parser', 'tagger', 'ner']):
            words.append(word.text)

    word_counter = Counter(words)
    word_vocab = sorted(word_counter, key=word_counter.get, reverse=True)
    print(f"raw-vocab: {len(word_vocab)}")
    word_vocab.insert(0, '<unk>')
    word_vocab.insert(1, '<pad>')
    print(f"vocab-length: {len(word_vocab)}")
    word2idx = {word: idx for idx, word in enumerate(word_vocab)}
    print(f"word2idx-length: {len(word2idx)}")
    idx2word = {v: k for k, v in word2idx.items()}

    return word2idx, idx2word, word_vocab

In [154]:
vocab_text = gather_text_for_vocab([df])

In [177]:
word2idx, idx2word, word_vocab = build_word_vocab(vocab_text)

raw-vocab: 9816
vocab-length: 9818
word2idx-length: 9818


In [157]:
def index_answer(row, idx2word):
    '''
    Takes in a row of the dataframe or one training example and
    returns a tuple of start and end positions of answer by calculating 
    spans.
    '''

    context_span = [(word.idx, word.idx + len(word.text))
                    for word in nlp(row.context, disable=['parser', 'tagger', 'ner'])]
    starts, ends = zip(*context_span)

    answer_start, answer_end = row.label
    start_idx = starts.index(answer_start)

    end_idx = ends.index(answer_end)

    ans_toks = [w.text for w in nlp(row.answer, disable=['parser', 'tagger', 'ner'])]
    ans_start = ans_toks[0]
    ans_end = ans_toks[-1]
    assert idx2word[row.context_ids[start_idx]] == ans_start
    assert idx2word[row.context_ids[end_idx]] == ans_end

    return [start_idx, end_idx]

In [158]:
def context_to_ids(text, word2idx):
    '''
    Converts context text to their respective ids by mapping each word
    using word2idx. Input text is tokenized using spacy tokenizer first.

    :param str text: context text to be converted
    :param dict word2idx: word to id mapping

    :returns list context_ids: list of mapped ids

    :raises assertion error: sanity check

    '''

    context_tokens = [w.text for w in nlp(text, disable=['parser', 'tagger', 'ner'])]
    context_ids = [word2idx[word] for word in context_tokens]

    assert len(context_ids) == len(context_tokens)
    return context_ids

In [159]:
def question_to_ids(text, word2idx):
    '''
    Converts question text to their respective ids by mapping each word
    using word2idx. Input text is tokenized using spacy tokenizer first.

    :param str text: question text to be converted
    :param dict word2idx: word to id mapping
    :returns list context_ids: list of mapped ids

    :raises assertion error: sanity check

    '''

    question_tokens = [w.text for w in nlp(text, disable=['parser', 'tagger', 'ner'])]
    question_ids = [word2idx[word] for word in question_tokens]

    assert len(question_ids) == len(question_tokens)
    return question_ids

In [160]:
def test_indices(df, idx2word):
    '''
    Performs the tests mentioned above. This method also gets the start and end of the answers
    with respect to the context_ids for each example.

    :param dataframe df: SQUAD df
    :param dict idx2word: inverse mapping of token ids to words
    :returns
        list start_value_error: example idx where the start idx is not found in the start spans
                                of the text
        list end_value_error: example idx where the end idx is not found in the end spans
                              of the text
        list assert_error: examples that fail assertion errors. A majority are due to the above errors

    '''

    start_value_error = []
    end_value_error = []
    assert_error = []
    for index, row in df.iterrows():

        answer_tokens = [w.text for w in nlp(row['answer'], disable=['parser', 'tagger', 'ner'])]

        start_token = answer_tokens[0]
        end_token = answer_tokens[-1]

        context_span = [(word.idx, word.idx + len(word.text))
                        for word in nlp(row['context'], disable=['parser', 'tagger', 'ner'])]
        if not context_span:
            assert_error.append(index)
            continue

        starts, ends = zip(*context_span)

        answer_start, answer_end = row['label']

        try:
            start_idx = starts.index(answer_start)
        except Exception:
            start_value_error.append(index)
        try:
            end_idx = ends.index(answer_end)
        except Exception:
            end_value_error.append(index)

        try:
            assert idx2word[row['context_ids'][start_idx]] == answer_tokens[0]
            assert idx2word[row['context_ids'][end_idx]] == answer_tokens[-1]
        except Exception:
            assert_error.append(index)

    return start_value_error, end_value_error, assert_error

In [161]:
def get_error_indices(df, idx2word):

    start_value_error, end_value_error, assert_error = test_indices(df, idx2word)
    err_idx = start_value_error + end_value_error + assert_error
    err_idx = set(err_idx)
    print(f"Number of error indices: {len(err_idx)}")

    return err_idx

In [164]:
df['context_ids'] = df.context.apply(context_to_ids, word2idx=word2idx)
df['question_ids'] = df.question.apply(question_to_ids, word2idx=word2idx)
err = get_error_indices(df, idx2word)
df.drop(err, inplace=True)
df_label_idx = df.apply(index_answer, axis=1, idx2word=idx2word)
df['label_idx'] = df_label_idx
l_e=get_error_indices(df, idx2word)

Number of error indices: 100
Number of error indices: 0


TODO:
* Build an evaluate function taking in predictions and assessing how good a match it is
* Normalizing answers


In [None]:
def evaluate(predictions, answers, **kwargs):
    '''
    Gets a dictionary of predictions with question_id as key
    and prediction as value. The validation dataset has multiple 
    answers for a single question. Hence we compare our prediction
    with all the answers and choose the one that gives us
    the maximum metric (em or f1). 
    This method first parses the JSON file, gets all the answers
    for a given id and then passes the list of answers and the 
    predictions to calculate em, f1.
    
    
    :param dict predictions
    Returns
    : exact_match: 1 if the prediction and ground truth 
      match exactly, 0 otherwise.
    : f1_score: 
    '''
    assert len(predictions)==len(answers)
    f1 = exact_match = total = 0
    for key, value in predictions.items():
        prediction = value
        ground_truths = [answers[key]]
        
        exact_match += metric_max_over_ground_truths(exact_match_score, prediction, ground_truths)
        f1 += metric_max_over_ground_truths(f1_score, prediction, ground_truths)
        
    
    total = len(predictions)
    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total

    return exact_match, f1