In [3]:
import nltk

from constants import *
import json 
import os
import pickle
import re

import numpy as np

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

In [4]:
def get_answer_indices(answers):
    """Translate answer letters into zero-based option positions.

    Args:
        answers: iterable of answer letters, each one of 'A', 'B', 'C' or 'D'.

    Returns:
        A list holding the position (0-3) of every letter, in input order.

    Raises:
        KeyError: if an entry is not one of the four known letters.
    """
    letter_to_position = dict(zip('ABCD', range(4)))
    positions = []
    for letter in answers:
        positions.append(letter_to_position[letter])
    return positions
    
def fill_in_extra_blanks(sentence, answer_indices, options, blank_index):
    """Mask the first blank of a sentence and fill later blanks with their answers.

    Every '_' in ``sentence`` is treated as a blank. The first blank becomes the
    literal token 'MASK'; each following blank is replaced by the correct option
    for that blank, looked up as ``options[k][answer_indices[k]]`` where ``k``
    counts up from ``blank_index + 1``.

    Args:
        sentence: text that may contain '_' placeholders.
        answer_indices: per-blank position of the correct option.
        options: per-blank list of candidate words.
        blank_index: index (into ``options`` / ``answer_indices``) of the first
            blank in this sentence.

    Returns:
        A ``(sentence, num_blanks)`` tuple: the rewritten sentence and the number
        of blanks that were replaced. When the sentence holds more underscores
        than there are remaining answers, the surplus underscores are left
        untouched and are not counted (previously this raised ``IndexError``).
    """
    num_blanks = sentence.count('_')
    # A blank can only be filled when both its option list and its answer exist.
    available = min(len(options), len(answer_indices))

    # The first blank is always masked, whatever the answer tables contain.
    replacements = ['MASK'] if num_blanks else []
    for offset in range(1, num_blanks):
        position = blank_index + offset
        if position >= available:
            # No answer left for this (or any later) underscore: keep it as is.
            break
        replacements.append(options[position][answer_indices[position]])

    remaining = iter(replacements)

    def replace_with(match):
        """Return the next replacement, or the underscore itself once exhausted."""
        return next(remaining, match.group(0))

    sentence = re.sub("_", replace_with, sentence)

    return sentence, len(replacements)
    
def create_training_example(sentence, options_for_sentence, correct_answer):
    """Bundle one masked sentence with its candidates and correct answer.

    Runs of whitespace in ``sentence`` are collapsed to single spaces and
    leading/trailing whitespace is dropped before it is stored.

    Args:
        sentence: the sentence text.
        options_for_sentence: candidate words for the masked blank.
        correct_answer: the candidate that correctly fills the blank.

    Returns:
        A dict with the keys 'sentence', 'answer' and 'candidates'.
    """
    words = sentence.split()
    return {
        'sentence': " ".join(words),
        'answer': correct_answer,
        'candidates': options_for_sentence,
    }
    
    
def create_processed_file(data_split, data_type, source_filename):
    """Convert one raw JSON article into a pickled list of training examples.

    The source file is read from ``RAW_PATH + data_split + data_type`` and must
    be a JSON object with the keys 'answers', 'options' and 'article'. The
    article is split into sentences; every sentence that contains at least one
    blank yields one example (built by ``create_training_example``) for its
    first blank. The resulting list is pickled to the matching location under
    ``CLEANED_PATH``, with the file extension replaced by '.pickle'.

    Args:
        data_split: path fragment selecting the data split; concatenated
            verbatim, so it must carry its own path separator.
        data_type: path fragment selecting the data type; concatenated verbatim.
        source_filename: name of the raw JSON file inside that directory.

    Returns:
        None. The examples are written to disk.
    """
    source_filepath = RAW_PATH + data_split + data_type + source_filename
    # Swap the extension instead of slicing a fixed number of characters, so a
    # name that does not end in exactly "json" still maps to "<stem>.pickle".
    dest_filename = os.path.splitext(source_filename)[0] + '.pickle'
    dest_filepath = CLEANED_PATH + data_split + data_type + dest_filename

    # Parse the source before touching the destination: opening the pickle
    # first would leave an empty file behind whenever the JSON is unreadable.
    # JSON is read as UTF-8 rather than the platform's default encoding.
    with open(source_filepath, 'r', encoding='utf-8') as json_file:
        data = json.load(json_file)

    answer_indices = get_answer_indices(data['answers'])
    options = data['options']
    article = data['article']

    examples = []
    blank_index = 0  # index of the next unanswered blank across the whole article
    for sentence in nltk.sent_tokenize(article):
        if blank_index >= len(answer_indices):
            # Every answer has been consumed; later sentences carry no blanks to learn from.
            break
        sentence, num_blanks = fill_in_extra_blanks(sentence, answer_indices, options, blank_index)
        if num_blanks > 0:
            # Only the first blank of the sentence becomes the prediction target.
            options_for_sentence = options[blank_index]
            correct_answer = options_for_sentence[answer_indices[blank_index]]
            examples.append(create_training_example(sentence, options_for_sentence, correct_answer))
        blank_index += num_blanks

    with open(dest_filepath, "wb") as pickle_file:
        pickle.dump(examples, pickle_file)
            
 

In [5]:
def process_all_files():
    """Run ``create_processed_file`` on every raw JSON file of every split and type.

    Walks each ``RAW_PATH + data_split + data_type`` directory for the
    combinations of ``DATA_SPLITS`` and ``DATA_TYPES`` and converts the JSON
    files found there. Entries whose name does not end in '.json' are skipped.

    Returns:
        None.
    """
    for data_split in DATA_SPLITS:
        for data_type in DATA_TYPES:
            path = RAW_PATH + data_split + data_type
            # os.listdir returns entries in arbitrary order; sort for reproducible runs.
            for filename in sorted(os.listdir(path)):
                # Ignore stray entries such as .DS_Store or .ipynb_checkpoints,
                # which would otherwise abort the whole run when parsed as JSON.
                if not filename.lower().endswith('.json'):
                    continue
                create_processed_file(data_split, data_type, filename)
            

In [6]:
# Convert the raw files of every data split and data type into pickled examples.
process_all_files()

In [7]:
# Spot-check one processed file: load it back and show the examples it holds.
with open('data/cleaned/train/high/high0.pickle', 'rb') as pickle_file:
    content = pickle.load(pickle_file)
# The file is already closed here; displaying the data does not need it open.
print(content)

[{'sentence': 'My heart MASK when I was asked to the back room by the immigration officer.', 'answer': 'sank', 'candidates': ['ached', 'beat', 'sank', 'rose']}, {'sentence': 'My MASK , with his very American last name, had no trouble at all.', 'answer': 'husband', 'candidates': ['son', 'daughter', 'friend', 'husband']}, {'sentence': "In fact, I am MASK American born and raised, but they weren't quite ready to let me in yet.", 'answer': 'also', 'candidates': ['still', 'also', 'already', 'never']}, {'sentence': "The only reason was MASK they thought my name looked like the one of someone who's on their wanted list and I had to wait till they checked me out with Washington.", 'answer': 'that', 'candidates': ['that', 'because', 'why', 'whether']}, {'sentence': 'Time passed MASK .', 'answer': 'slowly', 'candidates': ['quickly', 'carefully', 'dangerously', 'slowly']}, {'sentence': 'One hour, one hour and a half...I could not wait any longer and MASK my cellphone out to call the friend I had 