In [22]:
import nltk

from constants import *
import json 
import os
import pickle
import re

import numpy as np

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [24]:
def get_answer_indices(answers): 
    mapping = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
    return [mapping[entry] for entry in answers]
    
def fill_in_extra_blanks(sentence, answer_indices, options, blank_index): 
    index = sentence.find('_');
    num_blanks = sentence.count('_')
    replacements = []
    for i in range(num_blanks): 
        word = options[blank_index + i][answer_indices[blank_index + i]] if i > 0 else '[MASK]'
        replacements.append(word)
        
    def replace_with(_):
        """Returns first value of substr and removes it."""
        return replacements.pop(0)
    
    sentence = re.sub("_",replace_with,sentence)

    return sentence, num_blanks
    
def create_training_example(sentence, options_for_sentence, correct_answer): 
    example = {}
    sentence = " ".join(sentence.split())
    example['sentence'] = sentence 
    example['answer'] = correct_answer 
    example['candidates'] = options_for_sentence
    return example
    
    
def create_processed_file(data_split, data_type, source_filename): 
    source_filepath = RAW_PATH + data_split + data_type + source_filename 
    dest_filename = source_filename[:-4] + 'pickle'
    dest_filepath = CLEANED_PATH + data_split + data_type + dest_filename
    with open(source_filepath, 'r') as json_file, open(dest_filepath, "wb") as pickle_file: 
        examples = []
        data = json.load(json_file)
        answers = data['answers']
        answer_indices = get_answer_indices(answers)
        options = data['options']
        article = data['article']
        blank_index = 0
        for sentence in nltk.sent_tokenize(article): 
            if blank_index >= len(answer_indices): 
                break
            sentence, num_blanks = fill_in_extra_blanks(sentence, answer_indices, options, blank_index)
            if num_blanks > 0: 
                options_for_sentence = options[blank_index]
                correct_answer = options_for_sentence[answer_indices[blank_index]]
                example = create_training_example(sentence, options_for_sentence, correct_answer)
                examples.append(example)
            blank_index += num_blanks 
        pickle.dump(examples, pickle_file)
            
 

In [32]:
def process_all_files(): 
    for data_split in DATA_SPLITS: 
        for data_type in DATA_TYPES:
            path = RAW_PATH + data_split + data_type 
            filenames = os.listdir(path)
            for filename in filenames: 
                filename_path = path + filename
                create_processed_file(data_split, data_type, filename)
            

In [33]:
process_all_files()

['high2384.json', 'high1662.json', 'high2274.json', 'high29.json', 'high936.json', 'high543.json', 'high1638.json', 'high617.json', 'high2299.json', 'high646.json', 'high2976.json', 'high1229.json', 'high1324.json', 'high1977.json', 'high89.json', 'high129.json', 'high2226.json', 'high167.json', 'high1869.json', 'high726.json', 'high2987.json', 'high1104.json', 'high2743.json', 'high327.json', 'high605.json', 'high2318.json', 'high3068.json', 'high2150.json', 'high1987.json', 'high3101.json', 'high2840.json', 'high1705.json', 'high2952.json', 'high2341.json', 'high610.json', 'high1738.json', 'high1097.json', 'high2841.json', 'high655.json', 'high1318.json', 'high68.json', 'high2780.json', 'high1471.json', 'high1916.json', 'high633.json', 'high1059.json', 'high2416.json', 'high2125.json', 'high2420.json', 'high279.json', 'high2079.json', 'high2412.json', 'high195.json', 'high3012.json', 'high1068.json', 'high881.json', 'high924.json', 'high819.json', 'high1347.json', 'high3021.json', 'h

In [21]:
with open('data/cleaned/test/high/high4063.pickle', 'rb') as pickle_file:
    content = pickle.load(pickle_file)
    print(content)

[{'sentence': 'A teacher shows three toys to a student .Then she asks the student to find out the [MASK] .', 'answer': 'differences', 'candidates': ['differences', 'usage', 'weaknesses', 'categories']}, {'sentence': 'All the three toys seem to be [MASK] in their shape, size and material.', 'answer': 'identical', 'candidates': ['excellent', 'simple', 'identical', 'difficult']}, {'sentence': 'After careful [MASK] , the student sees holes in the toys.', 'answer': 'observation', 'candidates': ['imagination', 'discussion', 'observation', 'selection']}, {'sentence': 'The first toy has holes in the [MASK] .', 'answer': 'ears', 'candidates': ['mouth', 'nose', 'eyes', 'ears']}, {'sentence': 'Then with the [MASK] of a needle, which is put into the holes in one of the ears of the three toys, the student realizes the importance of the company of the people who are trustworthy .', 'answer': 'help', 'candidates': ['invention', 'help', 'discovery', 'company']}, {'sentence': 'The first toy represents 