# SQuAD Preprocessing

In [1]:
import os
import sys
import spacy
import re
import random
from ftfy import fix_text
from unidecode import unidecode
from tqdm import tqdm
from collections import Counter
import numpy as np


# Make the project's source directory (../src, resolved against the current
# working directory) importable, adding it to sys.path only once.
module_path = os.path.abspath('../src')
if module_path not in sys.path:
    sys.path.append(module_path)

from preprocessing.squad import preprocess
from util.Tokenizer import Tokenizer
from util.util import load_json
from util.squad import extract_contexts

## Setup

In [2]:
# Input files, given relative to the notebook's working directory.
train_path = '../data/squad/train-v1.1.json'
dev_path = '../data/squad/dev-v1.1.json'
vocab_path = '../data/glove/vocab.json'
# String handed to the Tokenizer as its `oov_token`.
oov_token = '<oov>'
# Words handed to the Tokenizer as `trainable_words` (only the OOV token here).
trainable_words = [oov_token]
# Vocabulary size setting; the Tokenizer below is built with max_words + 1.
max_words = 150000
# Length limits used while building examples: paragraphs / questions whose
# token count exceeds these are skipped.
context_limit = 400
question_limit = 50
# Largest allowed (end token index - start token index) for a kept answer.
ans_limit = 30

In [3]:
# Parsed SQuAD JSON for the two splits; the functions below read
# ['data'][i]['paragraphs'][j]['context' / 'qas'] from these objects.
train = load_json(train_path)
dev = load_json(dev_path)
# The loaded vocabulary is converted to a set before being given to the Tokenizer.
vocab = set(load_json(vocab_path))

## Preprocessing

In [4]:
# Dims
word_embed_dims = 300
# char_embed_dims = 200
char_embed_dims = 64
# Other params
# NOTE(review): these duplicate `context_limit` / `question_limit` from the
# setup cell; none of the code shown in this notebook reads the upper-case names.
CONTEXT_LIMIT = 400
QUESTION_LIMIT = 50
# Regexes
# `apostrophe` matches two consecutive single quotes, `apostrophe_like` two
# consecutive backticks. Both are replaced with '" ' (a double quote plus a
# space), which keeps the text the same length.
apostrophe = re.compile(r"('')")
apostrophe_like = re.compile(r"(``)")


def convert_idx(text, tokens):
    """Locate every token inside ``text`` and return its character span.

    Tokens are searched left to right, each search starting where the previous
    match ended, so a repeated token maps to successive occurrences.

    Args:
        text: The string the tokens were produced from.
        tokens: Token strings in their order of appearance in ``text``.

    Returns:
        A list with one ``(start, end)`` pair per token; ``end`` is exclusive.

    Raises:
        ValueError: If a token does not occur at or after the end of the
            previous match (the offending token is printed first).
    """
    offsets = []
    cursor = 0
    for word in tokens:
        start = text.find(word, cursor)
        if start == -1:
            print("Token {} cannot be found".format(word))
            raise ValueError('Could not find token.')
        cursor = start + len(word)
        offsets.append((start, cursor))
    return offsets


def fit_on_squad(data_set, tokenizer):
    """Fit ``tokenizer`` on every context and question of a SQuAD-format data set.

    Contexts are passed through ``fix_text`` and then have doubled single
    quotes and doubled backticks replaced with '" '. Questions get the same
    two replacements first and ``fix_text`` afterwards.

    Args:
        data_set: Parsed SQuAD JSON with a top-level ``'data'`` list whose
            items hold ``'paragraphs'`` (each with ``'context'`` and ``'qas'``).
        tokenizer: Object exposing ``fit_on_texts(text)``.

    Returns:
        The same ``tokenizer`` object that was passed in.
    """
    for article in tqdm(data_set['data']):
        for paragraph in article['paragraphs']:
            passage = fix_text(paragraph['context'])
            for pattern in (apostrophe, apostrophe_like):
                passage = pattern.sub('" ', passage)
            tokenizer.fit_on_texts(passage)

            for entry in paragraph['qas']:
                raw_question = entry['question'].replace("''", '" ')
                raw_question = raw_question.replace("``", '" ')
                tokenizer.fit_on_texts(fix_text(raw_question))

    return tokenizer


def pre_process(data_set, tokenizer, context_limit, question_limit, answer_limit=None):
    """Turn a SQuAD-format data set into shuffled, index-encoded examples.

    Each paragraph's context is cleaned (``fix_text``, then doubled single
    quotes and doubled backticks become '" '), encoded with ``tokenizer`` and
    mapped to per-token character spans. Each question is cleaned and encoded
    the same way, and every answer is mapped to the indices of the tokens
    whose span overlaps ``[answer_start, answer_start + len(answer_text))``.

    Args:
        data_set: Parsed SQuAD JSON with a top-level ``'data'`` list whose
            items hold ``'paragraphs'`` (each with ``'context'`` and ``'qas'``).
        tokenizer: Object exposing ``tokenize(text)`` and
            ``texts_to_sequences(text, max_words=..., numpy=False, pad=False)``;
            the latter must return ``(words, chars, lengths)``, each indexable
            with ``[-1]`` to get the entry for ``text``.
        context_limit: Paragraphs whose reported length exceeds this are
            skipped together with all their questions.
        question_limit: Questions whose reported length exceeds this are skipped.
        answer_limit: Largest allowed ``end - start`` token-index difference
            for the last listed answer; questions above it are skipped.
            ``None`` (the default) falls back to the notebook-level
            ``ans_limit``, which is what this function always used before the
            parameter existed.

    Returns:
        ``(indexed, contexts, answers)`` where ``indexed`` is a shuffled list
        of example dicts, ``contexts`` maps ``context_id`` to the cleaned
        context and its word spans, and ``answers`` maps ``answer_id`` to the
        SQuAD question id, context id and all answer texts.

    Raises:
        ValueError: Propagated from ``convert_idx`` when a token cannot be
            located in the cleaned context.
        IndexError: If an answer overlaps no token span, or a question has no
            answers at all.
    """
    if answer_limit is None:
        # Keep the historical behaviour: read the limit from the notebook config.
        answer_limit = ans_limit

    indexed = []
    answers = {}
    contexts = {}
    context_id = 1
    answer_id = 1

    for data in tqdm(data_set['data']):
        for question_answer in data['paragraphs']:
            context = fix_text(question_answer['context'])
            # Both replacements are two characters long, like what they replace.
            context = apostrophe.sub('" ', context)
            context = apostrophe_like.sub('" ', context)
            context_words, context_chars, context_length = tokenizer.texts_to_sequences(
                context, max_words=context_limit, numpy=False, pad=False)
            # Tokenizer wraps in a list (len==1) so we need the last entry
            context_words = context_words[-1]
            context_chars = context_chars[-1]

            # Skip if outside of context limit
            if context_length[-1] > context_limit:
                continue

            spans = convert_idx(context, tokenizer.tokenize(context))

            for qa in question_answer['qas']:
                question = fix_text(qa['question'].replace("''", '" ').replace("``", '" '))
                question_words, question_chars, question_length = tokenizer.texts_to_sequences(
                    question, max_words=question_limit, numpy=False, pad=False)
                # Tokenizer wraps in a list (len==1) so we need the last entry
                question_words = question_words[-1]
                question_chars = question_chars[-1]

                # Skip if its outside of the limits.
                if question_length[-1] > question_limit:
                    continue

                answer_starts, answer_ends = [], []
                answer_texts = []

                for answer in qa['answers']:
                    answer_text = fix_text(answer['text'])
                    # NOTE(review): 'answer_start' indexes the raw context, while
                    # `spans` index the fix_text-cleaned one; if fix_text changes
                    # the string length the offsets could drift - confirm.
                    answer_start = answer['answer_start']
                    answer_end = answer_start + len(answer_text)
                    answer_texts.append(answer_text)
                    answer_span = []

                    # Collect every token whose span overlaps the answer range.
                    for i, span in enumerate(spans):
                        if not (answer_end <= span[0] or answer_start >= span[1]):
                            answer_span.append(i)

                    answer_starts.append(answer_span[0])
                    answer_ends.append(answer_span[-1])

                # Only the last listed answer is checked against the limit and
                # stored as the training target.
                if (answer_ends[-1] - answer_starts[-1]) > answer_limit:
                    continue

                indexed.append({
                    'context_words': context_words,
                    'context_chars': context_chars,
                    'question_words': question_words,
                    'question_chars': question_chars,
                    'answer_starts': answer_starts[-1],
                    'answer_ends': answer_ends[-1],
                    'answer_id': answer_id,
                })

                answers[answer_id] = {
                    'id': qa['id'],
                    'answer_id': answer_id,
                    'context_id': context_id,
                    'answers': answer_texts,
                }

                answer_id += 1

            # Skipped paragraphs never reach this point, so context ids stay contiguous.
            contexts[context_id] = {
                'id': context_id,
                'context': context,
                'word_spans': spans,
            }

            context_id += 1

    random.shuffle(indexed)
    print("{} questions in total".format(len(indexed)))
    return indexed, contexts, answers

In [5]:
# Build the project Tokenizer from the setup-cell configuration.
# NOTE(review): min_word_occurrence / min_char_occurrence of -1 presumably
# disable frequency-based filtering - confirm against util.Tokenizer.
tokenizer = Tokenizer(max_words=max_words + 1,
                          vocab=vocab,
                          lower=False,
                          oov_token=oov_token,
                          min_word_occurrence=-1,
                          min_char_occurrence=-1,
                          trainable_words=trainable_words,
                          filters=set())  # Only filter stray whitespace

print('Fitting...')
# The tokenizer is fitted on both splits, so dev-set text also contributes to
# whatever statistics fit_on_texts collects.
tokenizer = fit_on_squad(train, tokenizer)
tokenizer = fit_on_squad(dev, tokenizer)

Fitting...


100%|████████████████████████████████████████████████████████████████████████████████| 442/442 [02:07<00:00,  3.46it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 48/48 [00:14<00:00,  3.24it/s]


In [6]:
print('Processing...')
# Index both splits with the fitted tokenizer. Each call prints the number of
# questions kept after the context / question / answer length filters.
train_indexed, train_contexts, train_answers = pre_process(train, tokenizer,
                                                                           context_limit=context_limit,
                                                                           question_limit=question_limit)
dev_indexed, dev_contexts, dev_answers = pre_process(dev, tokenizer,
                                                                   context_limit=context_limit,
                                                                   question_limit=question_limit)

Processing...


100%|████████████████████████████████████████████████████████████████████████████████| 442/442 [03:40<00:00,  2.01it/s]


87353 questions in total


100%|██████████████████████████████████████████████████████████████████████████████████| 48/48 [00:25<00:00,  1.90it/s]


10482 questions in total


In [7]:
# Blank English spaCy pipeline; word_tokenize below only reads token texts from it.
nlp = spacy.blank("en")


def word_tokenize(sent):
    """Split ``sent`` into token strings using the notebook-level ``nlp`` pipeline.

    Args:
        sent: Text to tokenize.

    Returns:
        A list holding the ``.text`` of every token ``nlp`` yields for ``sent``.
    """
    return [piece.text for piece in nlp(sent)]


def process_file(data, word_counter, char_counter):
    """Build spaCy-tokenized examples from a SQuAD-format data set.

    Reads the notebook-level ``context_limit``, ``question_limit`` and
    ``ans_limit``. Paragraphs with more tokens than ``context_limit`` are
    dropped with all their questions; questions with more tokens than
    ``question_limit``, or whose last answer spans more than ``ans_limit``
    token indices, are dropped individually.

    Args:
        data: Parsed SQuAD JSON with a top-level ``"data"`` list whose items
            hold ``"paragraphs"`` (each with ``"context"`` and ``"qas"``).
        word_counter: Counter updated in place. Context tokens are added once
            per question attached to the paragraph; question tokens once.
        char_counter: Counter updated in place with the characters of the
            same tokens, using the same weights.

    Returns:
        ``(examples, eval_examples)``: a shuffled list of example dicts, and a
        dict keyed by ``str(id)`` holding the context, its spans, all answer
        texts and the SQuAD question id.
    """
    examples, eval_examples = [], {}
    total = 0

    for article in tqdm(data["data"]):
        for para in article["paragraphs"]:
            context = para["context"].replace("''", '" ').replace("``", '" ')
            context_tokens = word_tokenize(context)
            context_chars = [list(tok) for tok in context_tokens]

            # Over-long paragraphs are skipped before any counting happens.
            if len(context_tokens) > context_limit:
                continue

            spans = convert_idx(context, context_tokens)

            for tok in context_tokens:
                word_counter[tok] += len(para["qas"])
                for ch in tok:
                    char_counter[ch] += len(para["qas"])

            for qa in para["qas"]:
                # `total` counts every question of a kept paragraph, including
                # the ones skipped below, and doubles as the example id.
                total += 1
                question = qa["question"].replace("''", '" ').replace("``", '" ')
                ques_tokens = word_tokenize(question)
                ques_chars = [list(tok) for tok in ques_tokens]

                # Skip if its outside of the limits.
                if len(ques_tokens) > question_limit:
                    continue

                for tok in ques_tokens:
                    word_counter[tok] += 1
                    for ch in tok:
                        char_counter[ch] += 1

                starts, ends, answer_texts = [], [], []
                for answer in qa["answers"]:
                    text = answer["text"]
                    begin = answer['answer_start']
                    finish = begin + len(text)
                    answer_texts.append(text)
                    # Indices of the tokens whose span overlaps [begin, finish).
                    hits = [idx for idx, bounds in enumerate(spans)
                            if finish > bounds[0] and begin < bounds[1]]
                    first, last = hits[0], hits[-1]
                    starts.append(first)
                    ends.append(last)

                # Only the last listed answer decides whether the question is kept.
                if ends[-1] - starts[-1] > ans_limit:
                    continue

                examples.append({
                    "context_tokens": context_tokens,
                    "context_chars": context_chars,
                    "ques_tokens": ques_tokens,
                    "ques_chars": ques_chars,
                    "y1s": starts[-1],
                    "y2s": ends[-1],
                    "id": total,
                })
                eval_examples[str(total)] = {
                    "context": context,
                    "spans": spans,
                    "answers": answer_texts,
                    "uuid": qa["id"],
                }

    random.shuffle(examples)
    print("{} questions in total".format(len(examples)))
    return examples, eval_examples


def prepro():
    """Run ``process_file`` on the train split, then on the dev split.

    Both calls share one word counter and one character counter. The examples
    returned by ``process_file`` are not kept; the visible effect is the
    progress and "questions in total" output of each call.

    Returns:
        None.
    """
    word_counts = Counter()
    char_counts = Counter()
    process_file(train, word_counts, char_counts)
    process_file(dev, word_counts, char_counts)

In [8]:
# Run the spaCy-based pipeline on both splits; the printed totals can be
# compared with the ones printed by pre_process.
prepro()

100%|████████████████████████████████████████████████████████████████████████████████| 442/442 [02:01<00:00,  3.65it/s]


87358 questions in total


100%|██████████████████████████████████████████████████████████████████████████████████| 48/48 [00:13<00:00,  3.46it/s]


10482 questions in total


In [17]:
from operator import itemgetter
from collections import OrderedDict, Counter

In [10]:
# Scratch data for the dict / OrderedDict / SimpleNamespace experiments that follow.
test_dict = {
    'a': 5,
    'b': 3,
    'c': 16,
    'z': 1,
}

In [32]:
# Sorting the (key, value) pairs orders them by key, since the keys are unique.
sorted(test_dict.items())

[('a', 5), ('b', 3), ('c', 16), ('z', 1)]

In [14]:
# OrderedDict keeps whatever order test_dict.items() yields.
# NOTE(review): the saved output is not in literal order, which suggests the
# kernel's plain dicts were not insertion-ordered - confirm the Python version.
OrderedDict([(k,v) for k,v in test_dict.items()])

OrderedDict([('a', 5), ('c', 16), ('z', 1), ('b', 3)])

In [18]:
# Fresh, empty Counter for the experiments in the following cells.
test = Counter()

In [19]:
# Counter returns 0 for a missing key, so += works without initialising 'a'.
test['a'] += 1

In [28]:
# NOTE(review): the saved output contains keys ('d', 'z') that no visible cell
# sets, so it depends on cells that were edited or removed; a fresh
# Restart & Run All will show a different Counter.
test

Counter({'a': 1, 'd': 35, 'v': 14, 'z': 21})

In [27]:
# Not idempotent: every re-run of this cell adds another 14 to 'v'.
test['v'] += 14

In [31]:
# Asking for more entries than the Counter holds returns all of them, ordered
# from highest to lowest count (150000 is the same value as max_words).
test.most_common(150000)

[('d', 35), ('z', 21), ('v', 14), ('a', 1)]

In [33]:
# Plain dicts do not expose keys as attributes, so this lookup raises
# AttributeError (see the saved output); it will stop a Run All at this cell.
test_dict.a

AttributeError: 'dict' object has no attribute 'a'

In [34]:
from types import SimpleNamespace

In [36]:
# SimpleNamespace turns the dict's keys into attributes, which is the
# attribute-style access a plain dict does not offer.
test_4 = SimpleNamespace(**test_dict)

In [37]:
# Display the namespace built from test_dict.
test_4

namespace(a=5, b=3, c=16, z=1)

In [38]:
# Attribute access works on the namespace and returns the value stored under 'a'.
test_4.a

5