In [151]:
import json
import spacy
import pandas as pd
import gender_guesser.detector as gender

In [2]:
questions_train = pd.read_csv('../data/train.tsv', delimiter='\t')
questions_dev = pd.read_csv('../data/dev.tsv', delimiter='\t')

In [3]:
nlp = spacy.load("en_core_web_lg")

In [85]:
questions_train[questions_train.answer == 'Yes']

Unnamed: 0,dataset,example_uid,question,answer,turker_answer,rule-based
26046,SQuAD,5726d28b708984140094d269,Queen 's sound has been described as a mix of ...,Yes,Queen 's sound has been described as a mix of ...,Queen 's sound has been described as a mix of ...
52847,movieQA,7339,Did Jason survive when he fell into the river ...,Yes,"Yes , Jason did survive when he fell into the ...",Did Jason survive yes he fell into the river a...


In [5]:
squad2_train = json.load(open('../data/train-v2.0.json'))
squad2_dev = json.load(open('../data/dev-v2.0.json'))

In [6]:
squad2_train['data'][0]['paragraphs'][0]['qas']

[{'question': 'When did Beyonce start becoming popular?',
  'id': '56be85543aeaaa14008c9063',
  'answers': [{'text': 'in the late 1990s', 'answer_start': 269}],
  'is_impossible': False},
 {'question': 'What areas did Beyonce compete in when she was growing up?',
  'id': '56be85543aeaaa14008c9065',
  'answers': [{'text': 'singing and dancing', 'answer_start': 207}],
  'is_impossible': False},
 {'question': "When did Beyonce leave Destiny's Child and become a solo singer?",
  'id': '56be85543aeaaa14008c9066',
  'answers': [{'text': '2003', 'answer_start': 526}],
  'is_impossible': False},
 {'question': 'In what city and state did Beyonce  grow up? ',
  'id': '56bf6b0f3aeaaa14008c9601',
  'answers': [{'text': 'Houston, Texas', 'answer_start': 166}],
  'is_impossible': False},
 {'question': 'In which decade did Beyonce become famous?',
  'id': '56bf6b0f3aeaaa14008c9602',
  'answers': [{'text': 'late 1990s', 'answer_start': 276}],
  'is_impossible': False},
 {'question': 'In what R&B group

In [7]:
num_past_questions = 4

all_ids_dict = {}
for item in squad2_train['data']:
    for paragraph in item['paragraphs']:
        questions = [qa['question'].strip() for qa in paragraph['qas']]
        answers = [qa['answers'][0]['text'].strip() if qa['answers'] else 'unknown' for qa in paragraph['qas']]
        for index, qa in enumerate(paragraph['qas']):
            text = '\n'.join(['Q: ' + q
                             + '\nA: ' + a 
                             for q, a in zip(questions[max(0, index-num_past_questions):index+1], 
                                             answers[max(0, index-num_past_questions):index+1]) 
                  ])
            all_ids_dict[qa['id']] = text

In [8]:
all_ids = set()
for item in squad2_train['data'] + squad2_dev['data']:
    for paragraph in item['paragraphs']:
        for qa in paragraph['qas']:
            all_ids.add(qa['id'])

In [9]:
len(all_ids_dict)

130319

In [10]:
len(all_ids)

142192

In [12]:
def clean_line(line):
    line = line.replace('-LRB-', '(')
    line = line.replace('-RRB-', ')')
    line = line.replace('-LSB-', '[')
    line = line.replace('-RSB-', ']')
    line = line.replace('( ', '(')
    line = line.replace(' )', ')')
    line = line.replace('[ ', '[')
    line = line.replace(' ]', ']')
    line = line.replace(' .', '.')
    line = line.replace(' , ', ', ')
    line = line.replace(' ; ', '; ')
    line = line.replace(' : ', ': ')
    return line

In [178]:
import random
gender_detector = gender.Detector()

unknown_subj_gender = random.choice(['he', 'she'])
unknown_obj_gender = random.choice(['him', 'her'])

subj_gender_dict = {'male': 'he', 
                    'female': 'she', 
                    'mostly_male': unknown_subj_gender, 
                    'mostly_female': unknown_subj_gender,
                    'andy': unknown_subj_gender, 
                    'unknown': unknown_subj_gender}
obj_gender_dict = {'male': 'him', 
                   'female': 'her', 
                   'mostly_male': unknwon_obj_gender, 
                   'mostly_female': unknwon_obj_gender, 
                   'andy': unknown_obj_gender, 
                   'unknown': unknown_obj_gender}


def add_synthetic_coreference(discussion, entities_dict):
    lines = discussion.split('\n')
    new_lines = [lines[0]]
    prior_substitutions = {}
    for line_index, line in enumerate(lines[1:]):
        dep_dict = {}
        tuples = nlp(line[3:])
        for token in tuples:
            dep_dict.setdefault(token.dep_, [])
            dep_dict[token.dep_].append(token.text)

        new_line = line
        for text, label in entities_dict.items():
            #if random.uniform(0, 1) < 0.1:
            #    continue
            pronoun = ''
            if text not in line:
                continue
            if line[:2] == 'A:':
                continue
            if label == 'PERSON' and 'nsubj' in dep_dict and any([item in text for item in dep_dict['nsubj']]):
                if text in prior_substitutions and prior_substitutions[text] in ['he', 'him']:
                    pronoun = 'he'
                elif text in prior_substitutions and prior_substitutions[text] in ['she', 'her']:
                    pronoun = 'she'
                else:
                    pronoun = subj_gender_dict[gender_detector.get_gender(text)]
            if label == 'PERSON' and 'dobj' in dep_dict and any([item in text for item in dep_dict['dobj']]):
                if text in prior_substitutions and prior_substitutions[text] in ['he', 'him']:
                    pronoun = 'him'
                elif text in prior_substitutions and prior_substitutions[text] in ['she', 'her']:
                    pronoun = 'her'
                else:
                    pronoun = obj_gender_dict[gender_detector.get_gender(text)]
            if label in ['GPE', 'FAC', 'PRODUCT']:
                pronoun = 'it'
            if not pronoun:
                continue
            text_appears_before = False
            for prior_lines in lines[:line_index]:
                if text in prior_lines:
                    text_appears_before = True
            if not text_appears_before:
                continue
            new_line = new_line.replace(text, pronoun)
            prior_substitutions[text] = pronoun
            new_line.replace('a it', 'it')
            new_line.replace('the it', 'that')
            new_line = new_line[:3] + new_line[3].upper() + new_line[4:]

        new_lines.append(new_line)

    return '\n'.join(new_lines)

In [179]:
from tqdm import tqdm

def get_text_list(data):
    all_texts = []
    for index, row in tqdm(data.iterrows(), total=data.shape[0]):
        qid, answer = row['example_uid'], row['turker_answer']
        answer = answer[0].upper() + answer[1:]
        answer = clean_line(answer)
        tuples = nlp(answer)
        entities_dict = {ent.text: ent.label_ for ent in tuples.ents}
        if qid not in all_ids_dict:
            continue
        discussion = all_ids_dict[qid]
        discussion = add_synthetic_coreference(discussion, entities_dict)
        text = 'Discussion:\n'
        text += discussion
        text += '\n\n'
        text += 'Statement:\n'
        text += answer
        all_texts.append(text)
        
    return all_texts

In [180]:
train_texts_list = get_text_list(questions_train)
dev_texts_list = get_text_list(questions_dev)

100%|██████████| 60710/60710 [26:46<00:00, 37.79it/s]
100%|██████████| 10344/10344 [04:03<00:00, 42.52it/s]


In [181]:
with open('../data/question_to_statement_train.json', 'w') as f:
    json.dump(train_texts_list, f)
with open('../data/question_to_statement_dev.json', 'w') as f:
    json.dump(dev_texts_list, f)

In [182]:
print(dev_texts_list[0])

Discussion:
Q: Who built the famous decorated havelis in Rajasthan?
A: Rajput kings
Q: Jaipur is also known as what city?
A: the Pink City
Q: What are the notable houses in it made from?
A: a type of sandstone dominated by a pink hue

Statement:
Notable houses in Jaipur made from a type of sandstone dominated by a pink hue
