---
# SQuAD Data Preprocessing for OWQA
---

Inspired by the SQuAD preprocessing code from Stanford's "CS224N" course (Winter 2018), available on <a href="https://github.com/abisee/cs224n-win18-squad/blob/master/code/preprocessing/squad_preprocess.py">GitHub</a>.


In [1]:
import json
import numpy as np
np.random.seed(42)


# Preprocessing
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

import nltk
from nltk import word_tokenize, wordpunct_tokenize


def import_json(filename):
    """Load a JSON document from the data directory and return it.

    Args:
        filename: Name of the file, appended to ``../../data/``.

    Returns:
        The decoded JSON content, as produced by ``json.load``.
    """
    # Decode explicitly as UTF-8 rather than with the platform's default
    # encoding, so non-ASCII text is read the same way on every system.
    with open("../../data/" + filename, encoding="utf-8") as file:
        return json.load(file)


def get_data_from_file(tier):
    """Load the preprocessed arrays of one dataset split.

    Args:
        tier: Prefix of the saved files (this file writes "train" and "dev").

    Returns:
        A dict mapping each field name to the array loaded from
        ``../../data/ow_preprocessed/<tier>_<field>.npy``.
    """
    # Fields are loaded in this order, one ``.npy`` file per field.
    field_names = (
        "contexts",
        "questions",
        "contexts_tokens",
        "questions_tokens",
        "answers",
        "start_charloc",
        "end_charloc",
        "start_wordloc",
        "end_wordloc",
    )
    file_prefix = "../../data/ow_preprocessed/" + tier + "_"

    # NOTE(review): allow_pickle=True unpickles object arrays; only load
    # files that come from a trusted source.
    return {
        name: np.load(file_prefix + name + ".npy", allow_pickle=True)
        for name in field_names
    }

    
def write_data(tier, data):
    """Save every field of a preprocessed split to its own file.

    Args:
        tier: Prefix used for the file names (e.g. "train" or "dev").
        data: Dict holding the nine preprocessed fields listed below.
    """
    print("** Writing the preprocessed data in the respective files. . .")

    target_prefix = "../../data/ow_preprocessed/" + tier + "_"

    # Text fields first, then the answer locations.
    saved_fields = (
        'contexts',
        'questions',
        'contexts_tokens',
        'questions_tokens',
        'answers',
        'start_charloc',
        'end_charloc',
        'start_wordloc',
        'end_wordloc',
    )
    for field in saved_fields:
        np.save(target_prefix + field, data[field])


def max_len(sequences):
    """Return the length of the longest sequence, or 0 if there are none."""
    return max((len(sequence) for sequence in sequences), default=0)


def tokenize(text):
    """Tokenize every sequence of ``text`` into lower-cased word tokens.

    The two-character quote tokens (a pair of backticks or a pair of single
    quotes) are replaced by a plain double quote, as suggested in the
    BiDAF paper (Seo et al., 2016).

    Args:
        text: Iterable of strings.

    Returns:
        A list with one list of tokens per input string.
    """
    tokenized = []
    for sequence in text:
        normalized_tokens = []
        for token in word_tokenize(sequence):
            token = token.replace("``", '"').replace("''", '"')
            normalized_tokens.append(token.lower())
        tokenized.append(normalized_tokens)
    return tokenized


def check_indices(data):
    """Verify the character-span to token-span mapping of every sample.

    For each sample, the answer sliced out of the context by character
    offsets is compared with the answer sliced out of the context tokens by
    token offsets. Whitespace is ignored on both sides.

    Args:
        data: Dict with 'contexts', 'contexts_tokens', 'start_charloc',
            'end_charloc', 'start_wordloc' and 'end_wordloc'.

    Returns:
        The list of sample indices whose two answers differ.
    """
    mismatched = []
    for idx, context in enumerate(data['contexts']):
        char_span = context[data['start_charloc'][idx]:data['end_charloc'][idx]]
        token_span = data['contexts_tokens'][idx][
            data['start_wordloc'][idx]:data['end_wordloc'][idx]
        ]

        from_tokens = "".join(token_span)
        from_chars = "".join(char_span.split())
        if from_tokens != from_chars:
            mismatched.append(idx)

    print("Not successful char loc -> word loc mapping: ", len(mismatched))
    return mismatched


def find_one_word_answ(token_starts, token_ends):
    """Return the indices of the answers that are NOT exactly one token long.

    An answer counts as one-word when its start equals its end minus one.

    Args:
        token_starts: Token index where each answer starts.
        token_ends: Token index where each answer ends (exclusive).

    Returns:
        The list of positions whose answer is not a single token.
    """
    not_one_word_indices = [
        pos
        for pos, start in enumerate(token_starts)
        if start != token_ends[pos] - 1
    ]

    print("Number of not one-word answers: ", len(not_one_word_indices))
    return not_one_word_indices


def remove_entries(indices, data):
    """Remove the samples at the given positions from every field of ``data``.

    The fields are modified in place with ``del``, so they are expected to be
    lists (or another sequence type that supports item deletion).

    Args:
        indices: Positions of the samples to drop. They may be given in any
            order and may contain duplicates.
        data: Dict holding the nine dataset fields listed below.

    Returns:
        The same ``data`` dict, after the deletions.
    """
    print('Dataset length before: ', len(data['contexts']))

    fields = (
        'contexts',
        'questions',
        'contexts_tokens',
        'questions_tokens',
        'answers',
        'start_charloc',
        'end_charloc',
        'start_wordloc',
        'end_wordloc',
    )

    # Delete from the highest position downwards so that a deletion never
    # shifts a position that still has to be removed. Sorting and
    # de-duplicating here makes this hold for any input order.
    for index in sorted(set(indices), reverse=True):
        for field in fields:
            del data[field][index]

    print('Dataset length after: ', len(data['contexts']))
    return data


Using plaidml.keras.backend backend.


---

# Importing the Dataset

---

In [5]:
def import_dataset(json_file, contexts_max_len=None, max_size=None):
    """Read a SQuAD-style JSON file into parallel lists of samples.

    Contexts, questions and answers are lower-cased. Only the first answer of
    every question is used. A sample is skipped when its question has no
    answer, or when the answer text does not match the context slice given
    by its character offsets.

    Args:
        json_file: File name passed to ``import_json``.
        contexts_max_len: If given, contexts longer than this number of
            characters are skipped together with all their questions.
        max_size: If given, only the first ``max_size`` samples are kept.

    Returns:
        A dict of lists. 'contexts', 'questions', 'answers', 'start_charloc'
        and 'end_charloc' are filled here; the token-related fields are
        created empty.
    """
    num_spanalign_prob = 0
    num_unanswered = 0

    json_data = import_json(json_file)

    if contexts_max_len is not None:
        print('** Considering only the contexts smaller than ', contexts_max_len, ' . . .')

    data = {"contexts": [],         # contexts/paragraphs
            "questions": [],        # questions
            "contexts_tokens": [],  # contexts tokens
            "questions_tokens": [], # questions tokens
            "answers": [],          # answers
            "start_charloc": [],    # character indices indicating the begin of the answer
            "end_charloc": [],      # character indices indicating the end of the answer
            "start_wordloc": [],    # word token indices indicating the begin of the answer
            "end_wordloc": []       # word token indices indicating the end of the answer
           }

    for dt in json_data['data']:
        for paragraph in dt['paragraphs']:
            context = str(paragraph['context']).lower()

            if contexts_max_len is not None and len(context) > contexts_max_len:
                continue

            for qa in paragraph['qas']:  # list of questions
                question = str(qa['question']).lower()

                # Questions without any answer cannot provide a span; skip
                # them instead of failing on the [0] lookup.
                answers = qa['answers']
                if not answers:
                    num_unanswered += 1
                    continue

                answer = answers[0]
                answer_text = str(answer['text']).lower()  # get the answer text
                start_charloc = answer['answer_start']  # answer start loc (character count)
                end_charloc = start_charloc + len(answer_text)  # answer end loc (character count)

                # Check that the provided character spans match the provided answer text
                # Sometimes this is misaligned, mostly because Python can interpret
                # certain Unicode characters to have length 2
                # https://stackoverflow.com/questions/29109944/
                # python-returns-length-of-2-for-single-unicode-character-string
                if answer_text != context[start_charloc:end_charloc]:
                    num_spanalign_prob += 1
                    continue

                data['contexts'].append(context)
                data['questions'].append(question)
                data['answers'].append(answer_text)
                data['start_charloc'].append(start_charloc)
                data['end_charloc'].append(end_charloc)

    if max_size is not None:
        print('** Reducing the size of the dataset to ', max_size, ' . . .')
        for field in ('contexts', 'questions', 'answers', 'start_charloc', 'end_charloc'):
            data[field] = data[field][:max_size]

    # check they have all the same length
    assert len(data['contexts'])==len(data['questions'])==len(data['answers'])
    assert len(data['answers'])==len(data['start_charloc'])==len(data['end_charloc'])
    print("Number of span align problems: ", num_spanalign_prob)
    if num_unanswered:
        print("Number of questions without answers skipped: ", num_unanswered)
    return data


---
# Mapping Character Indices to Token Indices
---

In [6]:
def map_char_to_token_index(data):
    """Map the character location of each answer to its token location.

    Every context is walked character by character, side by side with its
    tokens. The token that is current when the walk reaches the answer's
    start (end) character offset becomes the answer's start (end) token
    index.

    Exactly one start and one end are produced per context, so both returned
    lists always line up with ``data['contexts']``. When an offset is never
    reached (for instance an answer that ends at the very end of the
    context, or tokens that run out early), the token index reached at the
    end of the walk is used instead.

    Args:
        data: Dict with 'contexts', 'contexts_tokens', 'start_charloc' and
            'end_charloc'.

    Returns:
        A tuple ``(word_starts, word_ends)`` of token-index lists.
    """
    print('** Mapping character location to word location of the answer . . .')
    word_starts = []  # list to store the token start locations
    word_ends = []  # list to store the token end locations

    for i, context in enumerate(data['contexts']):
        tokens = data['contexts_tokens'][i]  # single context's tokens
        start_charloc = data['start_charloc'][i]
        end_charloc = data['end_charloc'][i]

        start_token = None  # token index found for the answer start
        end_token = None  # token index found for the answer end
        token_i = 0  # Character index inside the current token
        current_token = 0  # Index of the current token word of the context we are in

        for char_i, context_char in enumerate(context):
            # Once every token has been consumed nothing else can be mapped.
            if current_token >= len(tokens):
                break

            token = tokens[current_token]
            token_char = token[token_i]  # current char in the tokens

            if start_charloc == char_i:
                start_token = current_token

            if end_charloc == char_i:
                end_token = current_token

            # Characters that differ (typically the spaces, which appear in
            # the context but not in the tokens) are skipped without
            # advancing inside the token.
            if token_char == context_char:
                if token_i == len(token) - 1:
                    # Last character of the token: move on to the next one.
                    current_token += 1
                    token_i = 0
                else:
                    token_i += 1

        # Always record one value per context. Without a fallback for the
        # start as well, a missed start would leave word_starts one entry
        # short and shift every following sample.
        word_starts.append(current_token if start_token is None else start_token)
        word_ends.append(current_token if end_token is None else end_token)

    return word_starts, word_ends


---
# Text Preprocessing
---

In [7]:
def preprocess(data, char_level=False, filter_one_word_answ=True):
    """Tokenize a dataset and derive the token-level answer locations.

    Steps: tokenize contexts and questions, map the answers' character
    offsets to token offsets, drop the samples whose mapping does not check
    out and, optionally, drop every answer that is not exactly one token.

    Args:
        data: Dict produced by ``import_dataset``.
        char_level: If true, the token fields are finally replaced by the raw
            context and question strings.
        filter_one_word_answ: If true, keep one-token answers only.

    Returns:
        The processed ``data`` dict.
    """
    print("** Tokenizing contexts and questions. . .")
    data['contexts_tokens'] = tokenize(data['contexts'])
    data['questions_tokens'] = tokenize(data['questions'])

    token_starts, token_ends = map_char_to_token_index(data)
    data["start_wordloc"] = token_starts
    data["end_wordloc"] = token_ends

    badly_mapped = check_indices(data)
    if badly_mapped:
        print('** Discarding', len(badly_mapped), 'entries due to char loc -> word loc mapping problems . . .')

        data = remove_entries(badly_mapped, data)
        # Run the check again so the count after the clean-up is reported.
        check_indices(data)

    if filter_one_word_answ:
        multi_word = find_one_word_answ(data["start_wordloc"], data["end_wordloc"])

        if multi_word:
            print('** Filtering One-Word Answers only. . .')

            data = remove_entries(multi_word, data)
            # Report the count again after filtering.
            find_one_word_answ(data["start_wordloc"], data["end_wordloc"])

    if char_level:
        # Character-level mode works on the raw strings instead of the tokens.
        data['contexts_tokens'] = data['contexts']
        data['questions_tokens'] = data['questions']

    return data



def onehot_labels(starts, ends, c_maxlen):
    """One-hot encode the answer start and end positions.

    Args:
        starts: Start position of each answer.
        ends: End position of each answer.
        c_maxlen: Width of the encoded rows.

    Returns:
        A tuple ``(labels_start, labels_end)`` of zero-filled arrays with
        ``c_maxlen`` columns, where row ``i`` has a 1 at column ``starts[i]``
        (respectively ``ends[i]``).
    """
    print('** One-hot encoding the labels in two tensors . . .')
    start_shape = (len(starts), c_maxlen)
    end_shape = (len(ends), c_maxlen)
    labels_start = np.zeros(start_shape)
    labels_end = np.zeros(end_shape)

    for row, start in enumerate(starts):
        labels_start[row, start] = 1
        labels_end[row, ends[row]] = 1

    return labels_start, labels_end



---
# Main function
---

In [8]:
def squad_preprocessing_main(contexts_max_len=None, max_size=None):
    """Run the whole SQuAD preprocessing and write the result to disk.

    The train and dev files are imported, tokenized and filtered to one-word
    answers. A single Keras ``Tokenizer`` is fitted on the tokens of both
    splits, the tokens are converted to integer sequences, and the sequences
    are padded to common lengths. Both splits are then saved with
    ``write_data``.

    Args:
        contexts_max_len: Forwarded to ``import_dataset``.
        max_size: Forwarded to ``import_dataset``.
    """
    train_data = import_dataset("squad_train.json", contexts_max_len=contexts_max_len, max_size=max_size)
    print("Train data length = %i" % len(train_data['contexts']))
    dev_data = import_dataset("squad_dev.json", contexts_max_len=contexts_max_len, max_size=max_size)
    print("Test data length = %i" % len(dev_data['contexts']))

    char_level = False
    print("\n - train -")
    train_data = preprocess(train_data, char_level=char_level, filter_one_word_answ=True)
    print("\n - dev -")
    dev_data = preprocess(dev_data, char_level=char_level, filter_one_word_answ=True)

    # filters='' keeps every token: the text is already tokenized.
    t = Tokenizer(filters='', char_level=char_level)

    # The vocabulary is built from both splits.
    t.fit_on_texts(train_data['contexts_tokens'])
    t.fit_on_texts(train_data['questions_tokens'])
    t.fit_on_texts(dev_data['contexts_tokens'])
    t.fit_on_texts(dev_data['questions_tokens'])

    vocabulary_size = len(t.word_index)
    print('Vocabulary size: ', vocabulary_size)
    print('** Converting the tokens into sequences . . .')
    train_data['contexts'] = t.texts_to_sequences(train_data['contexts_tokens'])
    train_data['questions'] = t.texts_to_sequences(train_data['questions_tokens'])
    dev_data['contexts'] = t.texts_to_sequences(dev_data['contexts_tokens'])
    dev_data['questions'] = t.texts_to_sequences(dev_data['questions_tokens'])

    # Calculate the maximum length of the contexts and the questions.
    # The context length is taken from the furthest answer end, not from the
    # longest context: anything beyond it is truncated when padding below.
    train_cmaxlen = np.max(train_data['end_wordloc'])+1
    dev_cmaxlen = np.max(dev_data['end_wordloc'])+1

    train_qmaxlen = max_len(train_data['questions'])
    dev_qmaxlen = max_len(dev_data['questions'])

    c_maxlen = max(train_cmaxlen, dev_cmaxlen)
    q_maxlen = max(train_qmaxlen, dev_qmaxlen)

    print(' -- Ends max length train/dev: ', train_cmaxlen-1, dev_cmaxlen-1)
    print(' -- Contexts max length: ', c_maxlen)
    print(' -- Questions max length: ', q_maxlen)

    print('** Padding the sequences . . .')
    train_data['contexts'] = pad_sequences(train_data['contexts'],
                                           maxlen=c_maxlen, padding='post', truncating='post')
    train_data['questions'] = pad_sequences(train_data['questions'],
                                            maxlen=q_maxlen, padding='post')
    dev_data['contexts'] = pad_sequences(dev_data['contexts'],
                                         maxlen=c_maxlen, padding='post', truncating='post')
    dev_data['questions'] = pad_sequences(dev_data['questions'],
                                          maxlen=q_maxlen, padding='post')

    assert max_len(train_data['contexts']) == max_len(dev_data['contexts'])
    assert max_len(train_data['questions']) == max_len(dev_data['questions'])

    # Sanity checks: every padded context is long enough to hold the furthest
    # answer end, and every answer is exactly one token. The per-split
    # maximum computed above is reused instead of being recomputed with
    # np.max on every iteration.
    for i, padded_context in enumerate(train_data['contexts']):
        assert(len(padded_context) >= train_cmaxlen)
        assert(train_data['start_wordloc'][i] == train_data['end_wordloc'][i]-1)
    for i, padded_context in enumerate(dev_data['contexts']):
        assert(len(padded_context) >= dev_cmaxlen)
        assert(dev_data['start_wordloc'][i] == dev_data['end_wordloc'][i]-1)

    # write to file: context, question tensors, answers, numbers for char/word start/end locations.
    write_data("train", train_data)
    write_data("dev", dev_data)
    
