This notebook is an alternative way of generating labelled data for textual entailment.
Unlike `0_fever_decode_wikipage_IDs_to_sentences`, this one takes the FEVER data directly from the data already processed by the KGAT team.

This is better for one main reason: where the other notebook created *a single, completely random sentence* for the NOT ENOUGH INFO cases, this one is able to draw from pre-selected sentences that, *while related to the claim, neither support nor refute it*.

This is very important: with the former approach the data was very easy to perform well on, because the sentences were completely unrelated to the claim and therefore much easier to classify as "not argumentative" with respect to it.

In [1]:
import json
import pandas as pd
import random
import numpy as np
import sqlite3 as sql
import os
from pathlib import Path
import re

# Default location of the user's home directory on the original machine.
HOME = Path('/home/k20036346/')
# Directory holding the KernelGAT json files read by this notebook.
# Set the KERNELGAT_DATA environment variable to run the notebook on another
# machine without editing this cell; the default is unchanged.
KERNELGAT_DATA = Path(os.environ.get('KERNELGAT_DATA', HOME / 'Repos/KernelGAT/data'))

In [2]:
from datasets import load_dataset, Features, Value

# Column name -> datasets dtype for every column this notebook may load.
features = Features({
    column: Value(dtype)
    for column, dtype in [
        ('id', 'int64'),
        ('label', 'string'),
        ('claim', 'string'),
        ('evidence', 'string'),
        ('is_multihop', 'bool'),
    ]
})

# Partition names, used to build file names and as dictionary keys.
PARTITIONS = ['train', 'dev', 'test']

In [3]:
from pandas import read_json
from datasets import Dataset
from datasets import DatasetDict

dataset_dict = {}
for partition in PARTITIONS:
    # Each partition is read from its own JSON-lines file.
    df = read_json(KERNELGAT_DATA / f'bert_{partition}.json', lines=True)

    # The declared feature type for 'evidence' is a plain string, so the
    # column is stringified before building the Dataset.
    if 'evidence' in df.columns:
        df['evidence'] = df['evidence'].astype(str)

    # Restrict the schema to the columns actually present in this file.
    partition_features = Features({column: features[column] for column in df.columns})
    dataset_dict[partition] = Dataset.from_pandas(df, features=partition_features)

data_bert = DatasetDict(dataset_dict)
data_bert

KeyboardInterrupt: 

In [None]:
_RE_COMBINE_WHITESPACE = re.compile(r"\s+")


def rep_double_whitespace(x):
    """Collapse every run of whitespace in ``x`` into one space and trim both ends."""
    return _RE_COMBINE_WHITESPACE.sub(" ", x).strip()

# Attach the golden evidence and a multihop flag from all_<partition>.json to
# the matching partition of data_bert.
for partition in PARTITIONS:
    # No golden file is read for the test partition.
    if partition == 'test':
        continue

    golden_rows = []
    # JSON text is read as UTF-8 explicitly rather than with the platform default.
    with open(KERNELGAT_DATA / f'all_{partition}.json', 'r', encoding='utf-8') as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            row = json.loads(line)
            # Rename the fields so they cannot be confused with the columns
            # already present in data_bert.
            row['golden_evidence'] = row.pop('evidence')
            row['golden_id'] = row.pop('id')
            row['golden_claim'] = row.pop('claim')
            row['golden_label'] = row.pop('label')
            # A claim is flagged multihop when any golden evidence entry has 2 in
            # its fourth field.
            # NOTE(review): presumably 2 marks evidence that is only valid together
            # with other sentences - confirm against the KernelGAT data format.
            row['is_multihop'] = any(e[3] == 2 for e in row['golden_evidence'])
            golden_rows.append(row)

    gd_all_df = pd.DataFrame(golden_rows)

    # Both files must list exactly the same claims in the same order, otherwise
    # the columns added below would be attached to the wrong rows.
    assert list(data_bert[partition]['id']) == gd_all_df['golden_id'].tolist(), \
        f'{partition}: ids differ between bert_ and all_ files'
    assert list(data_bert[partition]['claim']) == gd_all_df['golden_claim'].apply(rep_double_whitespace).tolist(), \
        f'{partition}: claims differ between bert_ and all_ files'
    assert list(data_bert[partition]['label']) == gd_all_df['golden_label'].tolist(), \
        f'{partition}: labels differ between bert_ and all_ files'

    # Stored as a string, like the 'evidence' column; parsed back later.
    data_bert[partition] = data_bert[partition].add_column(
        'golden_evidence', gd_all_df['golden_evidence'].astype(str).tolist()
    )
    data_bert[partition] = data_bert[partition].add_column(
        'is_multihop', gd_all_df['is_multihop'].tolist()
    )

data_bert

In [None]:
# Per-partition summary: label distribution (where shown) and row count.
for partition in PARTITIONS:
    split = data_bert[partition]
    print(partition.upper(), '\n')

    # The label breakdown is skipped for the test partition.
    if partition != 'test':
        print('Labels:')
        print(pd.Series(split['label']).value_counts())
    print(f'Total: {len(split)}')
    print()

In [None]:
# AFTER REMOVING MULTIHOPS
# Drops every claim flagged as multihop from the non-test partitions and reports
# how much of each partition was lost.
for partition in PARTITIONS:

    print(partition.upper(), '\n')
    original_size = len(data_bert[partition])
    if partition != 'test':
        # The flag column is removed once it has been used, so guard on its
        # presence: re-running this cell is then a no-op instead of an error.
        if 'is_multihop' in data_bert[partition].column_names:
            data_bert[partition] = (
                data_bert[partition]
                .filter(lambda x: not x['is_multihop'])
                .remove_columns(['is_multihop'])
            )
        print('Labels:')
        print(pd.Series(data_bert[partition]['label']).value_counts())

    remaining = len(data_bert[partition])
    # Avoid a ZeroDivisionError on an empty partition.
    loss = round(100 - 100 * remaining / original_size, 3) if original_size else 0.0
    print(f'Total: {remaining}, loss of {loss}%')
    print()

In [None]:
'''
FROM THIS POINT ON WE WORK WITH DATAFRAMES DIRECTLY INSTEAD OF DATASET CLASS
MUCH EASIER
'''

# One pandas DataFrame per partition, keyed by partition name.
data_bert_df = {}
for partition in PARTITIONS:
    data_bert_df[partition] = data_bert[partition].to_pandas()

In [None]:
''' 
To create the support detection dataset, the first strategy is:
    For each SUPPORTS claim-evidence pair, we keep the pair if the evidence_score is 1 (i.e. it is gold truth);
    For each REFUTES claim-evidence pair, we do the same as above;
    For each NOT ENOUGH INFO claim-evidence pair, we keep every evidence sentence that received a
        score over 0 from BERT sentence retrieval
'''

from ast import literal_eval as leval
from unicodedata import normalize as uninorm

def filter_for_support_detection(row):
    """Parse, normalise and filter the evidence of one claim row.

    The 'evidence' and 'golden_evidence' fields, when present, are parsed from
    their string form into lists of 4-item entries. Items 0 and 2 of every
    entry are NFKC-normalised; items 1 and 3 are kept as they are.

    Filtering of row['evidence'] depends on row['label']:
      * no 'label' field: nothing is filtered;
      * 'NOT ENOUGH INFO': an entry is kept when its item 3 is greater than 0,
        or when a golden entry with the same items 0 and 1 has item 3 equal to 1;
      * any other label: only entries whose item 3 equals 1 are kept.

    Args:
        row: a mapping-like record supporting ``in``, item read and item
            assignment (a pandas Series row or a dict). It is modified in place.

    Returns:
        The same ``row`` object, after modification.
    """
    for ev in ['evidence', 'golden_evidence']:
        if ev not in row:
            continue
        entries = row[ev]
        # Only parse string input, so that calling this function again on an
        # already processed row does not fail inside literal_eval.
        if isinstance(entries, str):
            entries = leval(entries)
        row[ev] = [
            [
                uninorm('NFKC', v[0]),
                v[1],
                uninorm('NFKC', v[2]),
                v[3]
            ] for v in entries
        ]

    if 'label' not in row:
        return row

    if row['label'] == 'NOT ENOUGH INFO':
        # A row without golden evidence simply has no golden entries to match.
        golden = row['golden_evidence'] if 'golden_evidence' in row else []
        kept = []
        for evidence in row['evidence']:
            if evidence[3] > 0.0:
                kept.append(evidence)
            elif any(
                (evidence[0], evidence[1]) == (gd[0], gd[1]) and gd[3] == 1
                for gd in golden
            ):
                # This is to cover the case where bert_train.json gave score <=0 to an evidence
                # that is actually golden data. Since bert_train.json is built on all_train.json,
                # this should not happen, so the row is printed as a diagnostic.
                print(row)
                kept.append(evidence)
        row['evidence'] = kept
    else:
        # KEEP ALL EVIDENCE WITH GD=1 (TRUE EVIDENCE)
        # Both evidence list from all_train.json and bert_train.json have e[3] == 1 for golden data
        row['evidence'] = [e for e in row['evidence'] if e[3] == 1]

    return row

# Run the row-level filter over every row of every partition.
data_bert_df = {
    partition_name: frame.apply(filter_for_support_detection, axis=1)
    for partition_name, frame in data_bert_df.items()
}

In [None]:
# golden_evidence has served its purpose, so it is dropped (partitions that do
# not have the column are left untouched). The evidence list is then exploded
# so that every evidence entry gets a row of its own.
data_bert_df = {
    name: (
        frame
        .drop(columns='golden_evidence', errors='ignore')
        .explode('evidence')
        .reset_index(drop=True)
    )
    for name, frame in data_bert_df.items()
}

In [None]:
# Some unverifiable claims do not have evidence recovered by the document retrieval.
# Only rows whose 'evidence' value is a list are kept.
with_evidence = {}
for name, frame in data_bert_df.items():
    is_list = frame['evidence'].apply(lambda x: type(x) is list)
    with_evidence[name] = frame[is_list].reset_index(drop=True)
data_bert_df = with_evidence

In [None]:
# Split each evidence entry into four separate columns.
evidence_fields = ['evidence_page', 'evidence_line', 'evidence_text', 'evidence_score']
for split_name in data_bert_df:
    expanded = pd.DataFrame(data_bert_df[split_name]['evidence'].tolist())
    data_bert_df[split_name][evidence_fields] = expanded

In [12]:
def process_sent(sentence):
    """Rewrite the bracket and quote tokens found in an evidence sentence.

    Substitutions are applied one after another, in the order listed:
    spans from ``LSB`` to the nearest ``RSB`` are removed, ``LRB RRB `` is
    removed, ``LRB`` / ``RRB`` become `` ( `` / `` )``, ``--`` becomes ``-``
    and both ```` `` ```` and ``''`` become a double quote. Characters around
    the tokens (such as hyphens) are left in place.
    """
    # Order matters: e.g. the empty "LRB RRB " pair must go before the
    # individual LRB / RRB rewrites.
    substitutions = (
        ("LSB.*?RSB", ""),
        ("LRB RRB ", ""),
        ("LRB", " ( "),
        ("RRB", " )"),
        ("--", "-"),
        ("``", '"'),
        ("''", '"'),
    )
    for pattern, replacement in substitutions:
        sentence = re.sub(pattern, replacement, sentence)

    return sentence

def process_wiki_title(title):
    """Turn an encoded wiki page title into readable text.

    Underscores become spaces, `` -LRB-`` becomes `` ( ``, ``-RRB-`` becomes
    `` )`` and ``-COLON-`` becomes ``:``, applied in that order.
    """
    # All four patterns are plain literals, so str.replace is equivalent here.
    return (
        title
        .replace("_", " ")
        .replace(" -LRB-", " ( ")
        .replace("-RRB-", " )")
        .replace("-COLON-", ":")
    )

#for k,v in data_bert_df.items():
#    data_bert_df[k]['evidence_page'] =\
#        data_bert_df[k]['evidence_page'].apply(process_wiki_title)
#    
#for k,v in data_bert_df.items():
#    data_bert_df[k]['evidence_text'] =\
#        data_bert_df[k]['evidence_text'].apply(process_sent)

In [13]:
# Label distribution of the final train and dev frames, separated by one blank line.
for position, partition_name in enumerate(['train', 'dev']):
    if position:
        print()
    print(partition_name.upper())
    print(data_bert_df[partition_name]['label'].value_counts())

TRAIN
SUPPORTS           81583
NOT ENOUGH INFO    59114
REFUTES            33263
Name: label, dtype: int64

DEV
NOT ENOUGH INFO    11117
REFUTES             7438
SUPPORTS            6839
Name: label, dtype: int64


In [14]:
# Write one CSV per partition. The directory is created first so that a fresh
# checkout does not fail on the first to_csv call.
output_dir = Path('./data/support_data_v2')
output_dir.mkdir(parents=True, exist_ok=True)
for partition in PARTITIONS:
    data_bert_df[partition].to_csv(
        output_dir / f'{partition}_support_from_bert_SPECIAL_CHARS_CODED.csv',
        index=False,  # do not write the row index as a column
    )