This notebook is an alternative way of generating labelled data for textual entailment.
Unlike `0_fever_decode_wikipage_IDs_to_sentences`, this notebook takes the FEVER data directly from the data already processed by the KGAT team.

This is better for one main reason: where the other notebook created *a single completely random sentence* for the NOT ENOUGH INFO cases, this one is able to draw from pre-selected sentences that, *while related to the claim, neither support nor refute it*.

This is very important: with the former approach the data was very easy to perform well on, because the sentences were completely unrelated to the claim and therefore much easier to identify as "not argumentative" towards it.

In [1]:
import json
import pandas as pd
import random
import numpy as np
import sqlite3 as sql

In [2]:
# Each line of the file is one JSON record (id, evidence, claim, label);
# parse them all and build the frame in a single step.
with open('kgat_proc_data/bert_train.json','r') as f:
    all_df = pd.DataFrame([json.loads(line) for line in f])
all_df

Unnamed: 0,id,evidence,claim,label
0,75397,"[[Fox_Broadcasting_Company, 0, The Fox Broadca...",Nikolaj Coster-Waldau worked with the Fox Broa...,SUPPORTS
1,150448,"[[Roman_Atwood, 3, He also has another YouTube...",Roman Atwood is a content creator.,SUPPORTS
2,214861,"[[History_of_art, 2, The subsequent expansion ...","History of art includes architecture, dance, s...",SUPPORTS
3,156709,"[[Adrienne_Bailon, 0, Adrienne Eliza Houghton ...",Adrienne Bailon is an accountant.,REFUTES
4,83235,"[[System_of_a_Down, 6, The group briefly disba...",System of a Down briefly disbanded in limbo.,NOT ENOUGH INFO
...,...,...,...,...
145444,75062,"[[Led_Zeppelin_-LRB-album-RRB-, 0, Led Zeppeli...",Led Zeppelin released an eponymous debut album...,REFUTES
145445,149256,"[[Taal_-LRB-film-RRB-, 0, Taal LRB English : R...",Taal was romantic.,SUPPORTS
145446,13287,"[[Rooney_Mara, 0, Patricia Rooney Mara LRB LSB...",Her stars American actress Rooney Mara.,SUPPORTS
145447,13114,"[[Gimli_-LRB-Middle-earth-RRB-, 1, A dwarf war...",J. R. R. Tolkien created Gimli.,SUPPORTS


In [3]:
gd_all_df = []
are_multihop = []
with open('kgat_proc_data/all_train.json','r') as f:
    for record in map(json.loads, f):
        # Keep only the evidence and the id, renamed with a `golden_` prefix so
        # they do not clash with the columns of `all_df` when concatenated.
        record['golden_evidence'] = record.pop('evidence')
        record['golden_id'] = record.pop('id')
        del record['claim'], record['label']
        gd_all_df.append(record)
        # A claim is recorded as multi-hop when any of its golden evidence
        # entries carries the value 2 in its fourth field.
        if any(ev[3] == 2 for ev in record['golden_evidence']):
            are_multihop.append(record['golden_id'])
gd_all_df = pd.DataFrame(gd_all_df)
gd_all_df

Unnamed: 0,golden_evidence,golden_id
0,"[[Waldau, 0, Waldau may refer to :, 0], [Walda...",75397
1,"[[Content_-LRB-media-RRB-, 0, In publishing , ...",150448
2,"[[Architecture, 0, Architecture LRB Latin arch...",214861
3,"[[Adrienne_Bailon, 0, Adrienne Eliza Houghton ...",156709
4,"[[System_of_a_Down_-LRB-album-RRB-, 0, System ...",83235
...,...,...
145444,"[[1960, 0, It is also known as the `` Year of ...",75062
145445,"[[Taal_-LRB-film-RRB-, 0, Taal LRB English : R...",149256
145446,"[[Her, 0, Her is the objective form of the pro...",13287
145447,"[[Gimli, 0, Gimli may refer to :, 0], [Gimli, ...",13114


In [4]:
# Put the golden evidence next to the retrieved evidence, column-wise.
all_df = pd.concat([all_df, gd_all_df], axis='columns')
# Both files must list the claims in the same order, otherwise the
# side-by-side concatenation would pair evidence with the wrong claim.
assert (all_df['id'] == all_df['golden_id']).all()
all_df = all_df.drop(columns='golden_id')
all_df

Unnamed: 0,id,evidence,claim,label,golden_evidence
0,75397,"[[Fox_Broadcasting_Company, 0, The Fox Broadca...",Nikolaj Coster-Waldau worked with the Fox Broa...,SUPPORTS,"[[Waldau, 0, Waldau may refer to :, 0], [Walda..."
1,150448,"[[Roman_Atwood, 3, He also has another YouTube...",Roman Atwood is a content creator.,SUPPORTS,"[[Content_-LRB-media-RRB-, 0, In publishing , ..."
2,214861,"[[History_of_art, 2, The subsequent expansion ...","History of art includes architecture, dance, s...",SUPPORTS,"[[Architecture, 0, Architecture LRB Latin arch..."
3,156709,"[[Adrienne_Bailon, 0, Adrienne Eliza Houghton ...",Adrienne Bailon is an accountant.,REFUTES,"[[Adrienne_Bailon, 0, Adrienne Eliza Houghton ..."
4,83235,"[[System_of_a_Down, 6, The group briefly disba...",System of a Down briefly disbanded in limbo.,NOT ENOUGH INFO,"[[System_of_a_Down_-LRB-album-RRB-, 0, System ..."
...,...,...,...,...,...
145444,75062,"[[Led_Zeppelin_-LRB-album-RRB-, 0, Led Zeppeli...",Led Zeppelin released an eponymous debut album...,REFUTES,"[[1960, 0, It is also known as the `` Year of ..."
145445,149256,"[[Taal_-LRB-film-RRB-, 0, Taal LRB English : R...",Taal was romantic.,SUPPORTS,"[[Taal_-LRB-film-RRB-, 0, Taal LRB English : R..."
145446,13287,"[[Rooney_Mara, 0, Patricia Rooney Mara LRB LSB...",Her stars American actress Rooney Mara.,SUPPORTS,"[[Her, 0, Her is the objective form of the pro..."
145447,13114,"[[Gimli_-LRB-Middle-earth-RRB-, 1, A dwarf war...",J. R. R. Tolkien created Gimli.,SUPPORTS,"[[Gimli, 0, Gimli may refer to :, 0], [Gimli, ..."


In [5]:
# Number of claims per label
all_df.label.value_counts()

SUPPORTS           80035
NOT ENOUGH INFO    35639
REFUTES            29775
Name: label, dtype: int64

In [6]:
# Only ~14% of the (train) claims are flagged as needing multi-hop evidence
print(100*len(are_multihop)/all_df.shape[0])

# Filter out the multi-hop claims.
# `isin` performs a hashed, vectorised membership test; the previous
# `x not in are_multihop` scanned the whole list once per row.
all_df = all_df[~all_df['id'].isin(are_multihop)].reset_index(drop=True)
all_df

13.830964805533212


Unnamed: 0,id,evidence,claim,label,golden_evidence
0,150448,"[[Roman_Atwood, 3, He also has another YouTube...",Roman Atwood is a content creator.,SUPPORTS,"[[Content_-LRB-media-RRB-, 0, In publishing , ..."
1,214861,"[[History_of_art, 2, The subsequent expansion ...","History of art includes architecture, dance, s...",SUPPORTS,"[[Architecture, 0, Architecture LRB Latin arch..."
2,156709,"[[Adrienne_Bailon, 0, Adrienne Eliza Houghton ...",Adrienne Bailon is an accountant.,REFUTES,"[[Adrienne_Bailon, 0, Adrienne Eliza Houghton ..."
3,83235,"[[System_of_a_Down, 6, The group briefly disba...",System of a Down briefly disbanded in limbo.,NOT ENOUGH INFO,"[[System_of_a_Down_-LRB-album-RRB-, 0, System ..."
4,149579,"[[Beautiful_-LRB-Christina_Aguilera_song-RRB-,...",Beautiful reached number two on the Billboard ...,NOT ENOUGH INFO,"[[Beautiful, 0, Beautiful , an adjective used ..."
...,...,...,...,...,...
125327,225357,"[[Absolute_Beginners_-LRB-film-RRB-, 1, The fi...",Absolute Beginners starred David Bowie.,SUPPORTS,"[[Absolute_Beginners_-LRB-novel-RRB-, 0, Absol..."
125328,116046,"[[Neil_Young, 7, Young 's often distorted elec...",Neil Young is not a singer-songwriter.,REFUTES,"[[Neil_Young_-LRB-disambiguation-RRB-, 0, Neil..."
125329,149256,"[[Taal_-LRB-film-RRB-, 0, Taal LRB English : R...",Taal was romantic.,SUPPORTS,"[[Taal_-LRB-film-RRB-, 0, Taal LRB English : R..."
125330,13114,"[[Gimli_-LRB-Middle-earth-RRB-, 1, A dwarf war...",J. R. R. Tolkien created Gimli.,SUPPORTS,"[[Gimli, 0, Gimli may refer to :, 0], [Gimli, ..."


In [10]:
def _has_golden_annotation(evidence, golden_evidence):
    """Return True if `evidence` (matched on page and line) appears in
    `golden_evidence` with the value 1 in its fourth field."""
    page_and_line = (evidence[0], evidence[1])
    return any(
        (gd_evidence[0], gd_evidence[1]) == page_and_line and gd_evidence[3] == 1
        for gd_evidence in golden_evidence
    )


def filter_for_support_detection(row):
    """Keep only the evidence that is usable for the support detection dataset.

    Each evidence item is a sequence whose fields 0, 1 and 3 are used here
    (page, line and score respectively; field 2 is the sentence text).

    Strategy:
        * For SUPPORTS and REFUTES claims (any label other than
          'NOT ENOUGH INFO'), keep an evidence item only if its score is
          exactly 1 (i.e. it is gold truth).
        * For NOT ENOUGH INFO claims, keep an evidence item if its score is
          greater than 0.0, or if the same (page, line) appears in
          `row['golden_evidence']` with the value 1 in its fourth field.

    Parameters
    ----------
    row : mapping (e.g. one DataFrame row as a pandas Series, or a dict)
        Must provide 'label' and 'evidence'; 'golden_evidence' is read only
        for NOT ENOUGH INFO rows that contain evidence with score <= 0.0.

    Returns
    -------
    A shallow copy of `row` whose 'evidence' entry holds the filtered list
    (possibly empty). The object passed in is left untouched, because
    `DataFrame.apply` does not support functions that mutate their argument.
    """
    row = row.copy()
    if row['label'] == 'NOT ENOUGH INFO':
        row['evidence'] = [
            evidence for evidence in row['evidence']
            # The golden evidence is only consulted when the score alone does
            # not already qualify the sentence.
            if evidence[3] > 0.0
            or _has_golden_annotation(evidence, row['golden_evidence'])
        ]
    else:
        row['evidence'] = [e for e in row['evidence'] if e[3] == 1]

    return row

# Run the evidence filter on every claim (one call per row)
all_df = all_df.apply(filter_for_support_detection, axis='columns')

In [8]:
# Expand evidence into singular cases
all_df = all_df.drop('golden_evidence', axis=1)
expl_all_df = all_df.explode('evidence').reset_index(drop=True)
expl_all_df

Unnamed: 0,id,evidence,claim,label
0,150448,"[Roman_Atwood, 3, He also has another YouTube ...",Roman Atwood is a content creator.,SUPPORTS
1,150448,"[Roman_Atwood, 1, He is best known for his vlo...",Roman Atwood is a content creator.,SUPPORTS
2,214861,"[History_of_art, 2, The subsequent expansion o...","History of art includes architecture, dance, s...",SUPPORTS
3,156709,"[Adrienne_Bailon, 0, Adrienne Eliza Houghton L...",Adrienne Bailon is an accountant.,REFUTES
4,83235,"[System_of_a_Down, 6, The group briefly disban...",System of a Down briefly disbanded in limbo.,NOT ENOUGH INFO
...,...,...,...,...
179660,13114,"[Gimli_-LRB-Middle-earth-RRB-, 1, A dwarf warr...",J. R. R. Tolkien created Gimli.,SUPPORTS
179661,13114,"[Gimli_-LRB-Middle-earth-RRB-, 0, Gimli is a f...",J. R. R. Tolkien created Gimli.,SUPPORTS
179662,152180,"[Susan_Sarandon, 1, She is an Academy Award an...",Susan Sarandon is an award winner.,SUPPORTS
179663,152180,"[Susan_Sarandon, 8, She has also won the BAFTA...",Susan Sarandon is an award winner.,SUPPORTS


In [9]:
# Some unverifiable claims do not have evidence recovered by the document retrieval.
# Such rows hold a non-list value in `evidence`, so a row is usable only
# when its evidence is an actual list.
has_evidence = expl_all_df['evidence'].apply(lambda ev: type(ev) is list)

# Rows without evidence, as a percentage of the NOT ENOUGH INFO claims.
# NOTE(review): the numerator is not restricted by label, so it also counts
# any SUPPORTS/REFUTES row left without evidence - confirm this is intended.
no_evidence = expl_all_df[~has_evidence]
print(100*no_evidence.shape[0]/all_df['label'].value_counts()['NOT ENOUGH INFO'])

expl_all_df = expl_all_df[has_evidence].reset_index(drop=True)
expl_all_df

# This reaches 10% if picking only those with sentence retrieval score > 0.0 OR with golden annotation = 1.
# NOTE(review): the saved output of this cell shows ~16%, not 10% - confirm which figure is current.
# It could be that this is enough. If not, try picking at random for the "empty" NOT ENOUGH INFO cases.

16.00774432503718


Unnamed: 0,id,evidence,claim,label
0,150448,"[Roman_Atwood, 3, He also has another YouTube ...",Roman Atwood is a content creator.,SUPPORTS
1,150448,"[Roman_Atwood, 1, He is best known for his vlo...",Roman Atwood is a content creator.,SUPPORTS
2,214861,"[History_of_art, 2, The subsequent expansion o...","History of art includes architecture, dance, s...",SUPPORTS
3,156709,"[Adrienne_Bailon, 0, Adrienne Eliza Houghton L...",Adrienne Bailon is an accountant.,REFUTES
4,83235,"[System_of_a_Down, 6, The group briefly disban...",System of a Down briefly disbanded in limbo.,NOT ENOUGH INFO
...,...,...,...,...
173955,13114,"[Gimli_-LRB-Middle-earth-RRB-, 1, A dwarf warr...",J. R. R. Tolkien created Gimli.,SUPPORTS
173956,13114,"[Gimli_-LRB-Middle-earth-RRB-, 0, Gimli is a f...",J. R. R. Tolkien created Gimli.,SUPPORTS
173957,152180,"[Susan_Sarandon, 1, She is an Academy Award an...",Susan Sarandon is an award winner.,SUPPORTS
173958,152180,"[Susan_Sarandon, 8, She has also won the BAFTA...",Susan Sarandon is an award winner.,SUPPORTS


In [10]:
# Split each evidence list into its four fields, one column per field

evidence_columns = ['evidence_page','evidence_line','evidence_text','evidence_score']
evidence_fields = pd.DataFrame(expl_all_df['evidence'].tolist())
expl_all_df[evidence_columns] = evidence_fields
expl_all_df

Unnamed: 0,id,evidence,claim,label,evidence_page,evidence_line,evidence_text,evidence_score
0,150448,"[Roman_Atwood, 3, He also has another YouTube ...",Roman Atwood is a content creator.,SUPPORTS,Roman_Atwood,3,He also has another YouTube channel called `` ...,1.000000
1,150448,"[Roman_Atwood, 1, He is best known for his vlo...",Roman Atwood is a content creator.,SUPPORTS,Roman_Atwood,1,"He is best known for his vlogs , where he post...",1.000000
2,214861,"[History_of_art, 2, The subsequent expansion o...","History of art includes architecture, dance, s...",SUPPORTS,History_of_art,2,The subsequent expansion of the list of princi...,1.000000
3,156709,"[Adrienne_Bailon, 0, Adrienne Eliza Houghton L...",Adrienne Bailon is an accountant.,REFUTES,Adrienne_Bailon,0,Adrienne Eliza Houghton LRB née Bailon ; born ...,1.000000
4,83235,"[System_of_a_Down, 6, The group briefly disban...",System of a Down briefly disbanded in limbo.,NOT ENOUGH INFO,System_of_a_Down,6,The group briefly disbanded in August 2006 and...,0.999814
...,...,...,...,...,...,...,...,...
173955,13114,"[Gimli_-LRB-Middle-earth-RRB-, 1, A dwarf warr...",J. R. R. Tolkien created Gimli.,SUPPORTS,Gimli_-LRB-Middle-earth-RRB-,1,"A dwarf warrior , he is the son of Glóin LRB a...",1.000000
173956,13114,"[Gimli_-LRB-Middle-earth-RRB-, 0, Gimli is a f...",J. R. R. Tolkien created Gimli.,SUPPORTS,Gimli_-LRB-Middle-earth-RRB-,0,Gimli is a fictional character from J. R. R. T...,1.000000
173957,152180,"[Susan_Sarandon, 1, She is an Academy Award an...",Susan Sarandon is an award winner.,SUPPORTS,Susan_Sarandon,1,She is an Academy Award and BAFTA Award winner...,1.000000
173958,152180,"[Susan_Sarandon, 8, She has also won the BAFTA...",Susan Sarandon is an award winner.,SUPPORTS,Susan_Sarandon,8,She has also won the BAFTA Award for Best Actr...,1.000000


In [11]:
# Label counts pasted from a previous output
# (presumably an earlier run or selection strategy - TODO confirm):
#NOT ENOUGH INFO    11117
#REFUTES             7438
#SUPPORTS            6839
#Name: label, dtype: int64

# Number of claim-evidence pairs per label in the final dataset
expl_all_df['label'].value_counts()

SUPPORTS           81583
NOT ENOUGH INFO    59114
REFUTES            33263
Name: label, dtype: int64

In [12]:
# Persist the dataset without writing the row index as a column
expl_all_df.to_csv('support_train.csv', index=False)