In [1]:
import os
import json

In [2]:
def _read_jsonl(path):
    """Parse a JSON-Lines file into a list with one decoded object per line.

    The file is opened in a ``with`` block so the handle is closed as soon as
    parsing finishes, and it is iterated lazily instead of being copied into
    memory with ``readlines()`` first. Blank lines are skipped.
    """
    with open(path, encoding='utf8') as jsonl_file:
        return [json.loads(row) for row in jsonl_file if row.strip()]


# One dict per claim; the record layout is shown by the `train_dicts[0]` cell.
train_dicts = _read_jsonl('../data/FEVER/train.jsonl')
dev_dicts = _read_jsonl('../data/FEVER/paper_dev.jsonl')

In [3]:
def clean_line(line):
    """Extract and de-tokenise the sentence from one wiki-page line.

    Assumes the tab-separated layout ``<index>\\t<sentence>\\t<links...>``
    (TODO confirm against the wiki-pages dump). The sentence is taken as the
    second tab-separated field, so indices with any number of digits and
    lines without trailing link fields are handled. A line with no sentence
    field yields ``''``.

    A sentence-final ``" ."`` is removed because callers re-join sentences
    with ``'. '``. Bracket placeholders (``-LRB-`` etc.) are restored and the
    spaces that tokenisation put around brackets and punctuation are dropped.

    Args:
        line: One raw line from a page's ``lines`` field.

    Returns:
        The cleaned sentence text, without its final period.
    """
    fields = line.split('\t')
    # fields[0] is the sentence index; anything after fields[1] is link data.
    sentence = fields[1] if len(fields) > 1 else ''

    # Strip only an actual tokenised full stop; other endings are kept intact.
    if sentence.endswith(' .'):
        sentence = sentence[:-2]

    # Order matters: placeholders must become real brackets before the
    # bracket-spacing rules can match them.
    replacements = (
        ('-LRB-', '('),
        ('-RRB-', ')'),
        ('-LSB-', '['),
        ('-RSB-', ']'),
        ('( ', '('),
        (' )', ')'),
        ('[ ', '['),
        (' ]', ']'),
        (' .', '.'),
        (' , ', ', '),
        (' ; ', '; '),
        (' : ', ': '),
    )
    for old, new in replacements:
        sentence = sentence.replace(old, new)
    return sentence

In [4]:
# Peek at one record: each evidence entry ends with (wiki page id, sentence number).
train_dicts[0]

{'id': 75397,
 'verifiable': 'VERIFIABLE',
 'label': 'SUPPORTS',
 'claim': 'Nikolaj Coster-Waldau worked with the Fox Broadcasting Company.',
 'evidence': [[[92206, 104971, 'Nikolaj_Coster-Waldau', 7],
   [92206, 104971, 'Fox_Broadcasting_Company', 0]]]}

In [5]:
# Map each wiki page id to the list of its cleaned sentences.
wiki_lines_dict = {}
_wiki_folder = '../data/FEVER/wiki-pages'
for file in os.listdir(_wiki_folder):
    # `with` closes every shard as soon as it has been read, and iterating
    # the handle streams it line by line instead of loading it all at once.
    with open(f'{_wiki_folder}/{file}', encoding='utf8') as wiki_file:
        for item in wiki_file:
            wiki_dict = json.loads(item)
            wiki_id = wiki_dict['id']
            # 'lines' holds the whole page as newline-separated sentence rows.
            lines = [clean_line(line) for line in wiki_dict['lines'].split('\n')]
            wiki_lines_dict[wiki_id] = lines

In [6]:
def populate_texts_for_label(data_dicts, label):
    """Build "Evidence / Claim" training texts for every item with ``label``.

    One text is produced per evidence set of each matching item. The evidence
    part is the full text of every wiki page the set references, looked up in
    the module-level ``wiki_lines_dict``. The sentence number stored in each
    evidence entry is not used. Pages missing from ``wiki_lines_dict`` are
    skipped, and evidence sets that end up with no text produce nothing.

    Args:
        data_dicts: Iterable of claim records with ``'label'``, ``'claim'``
            and ``'evidence'`` keys.
        label: Only records whose ``'label'`` equals this value are used.

    Returns:
        List of strings shaped
        ``'Evidence:\\n<text>.\\n\\n\\nClaim:\\n<claim>'``.
    """
    all_texts = []

    for item in data_dicts:
        if item['label'] != label:
            continue
        claim = item['claim']

        for paragraph in item['evidence']:
            text = ''
            for line in paragraph:
                # The page id is the second-to-last field of an evidence entry.
                wiki_id = line[-2]
                if wiki_id not in wiki_lines_dict:
                    continue
                to_add = '. '.join(
                    sentence for sentence in wiki_lines_dict[wiki_id] if sentence.strip()
                )
                # Substring check: skips pages already included (and empty
                # pages, since '' is contained in every string).
                if to_add not in text:
                    # Separate pages like sentences; without a separator the
                    # last word of one page fuses with the first of the next.
                    text = f'{text}. {to_add}' if text else to_add

            text = text.strip()
            if text:
                all_texts.append('Evidence:\n'
                                 + text + '.\n' + '\n\n'
                                 + 'Claim:\n'
                                 + claim
                                 )
    return all_texts

In [7]:
# Build the SUPPORTS and REFUTES text lists for each split, in that order.
supporting_texts, refuting_texts = [
    populate_texts_for_label(train_dicts, wanted_label)
    for wanted_label in ('SUPPORTS', 'REFUTES')
]

dev_supporting_texts, dev_refuting_texts = [
    populate_texts_for_label(dev_dicts, wanted_label)
    for wanted_label in ('SUPPORTS', 'REFUTES')
]

In [8]:
# Number of SUPPORTS texts built from the training split.
len(supporting_texts)

160305

In [9]:
# Number of REFUTES texts built from the training split.
len(refuting_texts)

59868

In [10]:
# Show one formatted example; print() renders the embedded newlines.
print(supporting_texts[8])

Evidence:
The Ten Commandments is a 1956 American biblical epic film produced, directed, and narrated by Cecil B. DeMille, shot in VistaVision (color by Technicolor), and released by Paramount Pictures. The film is based on Prince of Egypt by Dorothy Clarke Wilson, Pillar of Fire by J.H. Ingraham, On Eagle 's Wings by A.E. Southon, and the Book of Exodus. The Ten Commandments dramatizes the biblical story of the life of Moses, an adopted Egyptian prince who becomes the deliverer of his real brethren, the enslaved Hebrews, and therefore leads the Exodus to Mount Sinai, where he receives, from God, the Ten Commandments. The film stars Charlton Heston in the lead role, Yul Brynner as Rameses, Anne Baxter as Nefretiri, Edward G. Robinson as Dathan, Yvonne De Carlo as Sephora, Debra Paget as Lilia, and John Derek as Joshua; and features Sir Cedric Hardwicke as Sethi, Nina Foch as Bithiah, Martha Scott as Yoshebel, Judith Anderson as Memnet, and Vincent Price as Baka, among others. Filmed on

In [11]:
import spacy

# spaCy pipeline used by invert_claim, which reads each token's tag and lemma.
nlp = spacy.load("en_core_web_lg")

In [12]:
def invert_claim(claim):
    """Negate a claim by negating its first verb.

    The claim is parsed with the module-level spaCy pipeline ``nlp``. The
    first token tagged VB, VBZ, VBD or VBP is negated and every other token
    is copied through unchanged:

    * base form (VB): ``not`` is placed before it ("can swim" -> "can not swim");
    * forms of "be" and auxiliaries: ``not`` is placed after them
      ("was tall" -> "was not tall", "has eaten" -> "has not eaten");
    * any other finite verb gets do-support plus its lemma
      ("had a car" -> "did not have a car").

    The claim's own spacing is kept by re-joining tokens with their original
    trailing whitespace, so "Coster-Waldau" or a final "." are not split by
    extra spaces.

    Contractions such as "doesn't" are not handled specially.

    Args:
        claim: The claim sentence to negate.

    Returns:
        The claim with its first verb negated, or the claim unchanged if no
        token carries one of the verb tags above.
    """
    do_support = {'VBP': 'do not', 'VBZ': 'does not', 'VBD': 'did not'}
    pieces = []
    negated = False

    for token in nlp(claim):
        tag = token.tag_
        if negated or tag not in ('VB', 'VBZ', 'VBD', 'VBP'):
            # Not a negation target, or the single negation is already done.
            pieces.append(token.text_with_ws)
            continue

        negated = True
        if tag == 'VB':
            pieces.append('not ' + token.text_with_ws)
        elif token.lemma_ == 'be' or token.dep_ in ('aux', 'auxpass'):
            # "be" and auxiliaries negate in place; do-support would give
            # ungrammatical output such as "did not be".
            pieces.append(token.text + ' not' + token.whitespace_)
        else:
            pieces.append(do_support[tag] + ' ' + token.lemma_ + token.whitespace_)

    return ''.join(pieces)

In [13]:
# Sanity check: only the first verb ("had") should be negated; later verbs stay as they are.
invert_claim('Alice had a car that she used to drive to the church where she sang')

'Alice did not have a car that she used to drive to the church where she sang'

In [14]:
def populate_negative_texts_for_label(data_dicts, label):
    """Build "Evidence / Claim" texts whose claims have been negated.

    Works like ``populate_texts_for_label`` for the items carrying ``label``,
    except that every claim is passed through ``invert_claim`` first. The
    formatting is delegated to ``populate_texts_for_label`` so both builders
    always emit the same layout.

    Each claim is negated once per item, not once per evidence set. An item
    is negated even if none of its evidence pages turn out to be available.

    Args:
        data_dicts: Iterable of claim records (dicts) with ``'label'``,
            ``'claim'`` and ``'evidence'`` keys.
        label: Only records whose ``'label'`` equals this value are used.

    Returns:
        List of formatted strings, in the same order and shape as
        ``populate_texts_for_label`` would return for those records.
    """
    # Shallow copies keep the caller's records untouched. The generator is
    # consumed lazily, so claims are negated as the records are processed.
    negated_items = (
        {**item, 'claim': invert_claim(item['claim'])}
        for item in data_dicts
        if item['label'] == label
    )
    return populate_texts_for_label(negated_items, label)

In [15]:
import random

# A dedicated, seeded generator makes the 10% subsets (and therefore the
# written dataset files) identical on every Restart & Run All, without
# touching the global random state.
_sample_rng = random.Random(42)
sampled_train = _sample_rng.sample(train_dicts, len(train_dicts)//10)
sampled_dev = _sample_rng.sample(dev_dicts, len(dev_dicts)//10)

In [16]:
# Negated claims switch sides: negated SUPPORTS claims are added to the
# refuting lists and negated REFUTES claims to the supporting lists.
# NOTE: the lists are extended in place, so re-running this cell appends the
# same examples again.
refuting_texts.extend(populate_negative_texts_for_label(sampled_train, 'SUPPORTS'))
supporting_texts.extend(populate_negative_texts_for_label(sampled_train, 'REFUTES'))

dev_refuting_texts.extend(populate_negative_texts_for_label(sampled_dev, 'SUPPORTS'))
dev_supporting_texts.extend(populate_negative_texts_for_label(sampled_dev, 'REFUTES'))

In [17]:
# Write the training texts; `with` guarantees each file is flushed and closed.
with open('../data/supporting.json', 'w') as out_file:
    json.dump(supporting_texts, out_file)
with open('../data/refuting.json', 'w') as out_file:
    json.dump(refuting_texts, out_file)

In [18]:
# Write the dev texts; `with` guarantees each file is flushed and closed.
with open('../data/dev_supporting.json', 'w') as out_file:
    json.dump(dev_supporting_texts, out_file)
with open('../data/dev_refuting.json', 'w') as out_file:
    json.dump(dev_refuting_texts, out_file)