## Explore FEVER data

In [1]:
import json
import os
import pandas as pd
import csv

In [2]:
def print_item(item):
    """Pretty-print ``item`` as JSON text, indented by four spaces."""
    rendered = json.dumps(item, indent=4)
    print(rendered)

In [3]:
def load_data(jsonl_file):
    """Load a JSON Lines file into a pandas DataFrame.

    Each non-blank line of the file is parsed as one JSON record; the number
    of parsed records is printed as a quick sanity check.

    Parameters
    ----------
    jsonl_file : str or path-like
        Path to a ``.jsonl`` file holding one JSON object per line.

    Returns
    -------
    pandas.DataFrame
        One row per record, one column per top-level key.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default locale encoding.
    with open(jsonl_file, 'r', encoding='utf-8') as f:
        # Iterate the file lazily and skip blank lines (e.g. a trailing
        # empty line), which json.loads would otherwise reject.
        fever_lst = [json.loads(line) for line in f if line.strip()]
    print(f'{len(fever_lst)} item loaded')
    return pd.DataFrame.from_records(fever_lst)

In [58]:
# Alternative location of the same splits (original naacl2018-fever checkout):
# root = "../naacl2018-fever/data/fever-data/"
root = "../data_2023_06_02/raw/FEVER/"
# Load the three splits in order (train, dev, test) with a single unpacking.
train_df, dev_df, test_df = (
    load_data(os.path.join(root, split_file))
    for split_file in ('train.jsonl', 'paper_dev.jsonl', 'paper_test.jsonl')
)

145449 item loaded
9999 item loaded
9999 item loaded


In [59]:
# Label distribution of the training split (the saved output shows SUPPORTS is the largest class).
train_df['label'].value_counts()

SUPPORTS           80035
NOT ENOUGH INFO    35639
REFUTES            29775
Name: label, dtype: int64

In [60]:
# Label distribution of the dev split (the saved output shows 3333 examples per label).
dev_df['label'].value_counts()

NOT ENOUGH INFO    3333
SUPPORTS           3333
REFUTES            3333
Name: label, dtype: int64

In [61]:
# Label distribution of the test split (the saved output shows 3333 examples per label).
test_df['label'].value_counts()

NOT ENOUGH INFO    3333
SUPPORTS           3333
REFUTES            3333
Name: label, dtype: int64

In [62]:
# Preview the first training rows; columns are id, verifiable, label, claim, evidence.
train_df.head()

Unnamed: 0,id,verifiable,label,claim,evidence
0,75397,VERIFIABLE,SUPPORTS,Nikolaj Coster-Waldau worked with the Fox Broa...,"[[[92206, 104971, Nikolaj_Coster-Waldau, 7], [..."
1,150448,VERIFIABLE,SUPPORTS,Roman Atwood is a content creator.,"[[[174271, 187498, Roman_Atwood, 1]], [[174271..."
2,214861,VERIFIABLE,SUPPORTS,"History of art includes architecture, dance, s...","[[[255136, 254645, History_of_art, 2]]]"
3,156709,VERIFIABLE,REFUTES,Adrienne Bailon is an accountant.,"[[[180804, 193183, Adrienne_Bailon, 0]]]"
4,83235,NOT VERIFIABLE,NOT ENOUGH INFO,System of a Down briefly disbanded in limbo.,"[[[100277, None, None, None]]]"


In [45]:
# Preview the first dev rows; same columns as the training split.
dev_df.head()

Unnamed: 0,id,verifiable,label,claim,evidence
0,91198,NOT VERIFIABLE,NOT ENOUGH INFO,Colin Kaepernick became a starting quarterback...,"[[[108548, None, None, None]]]"
1,194462,NOT VERIFIABLE,NOT ENOUGH INFO,Tilda Swinton is a vegan.,"[[[227768, None, None, None]]]"
2,137334,VERIFIABLE,SUPPORTS,Fox 2000 Pictures released the film Soul Food.,"[[[289914, 283015, Soul_Food_-LRB-film-RRB-, 0..."
3,166626,NOT VERIFIABLE,NOT ENOUGH INFO,Anne Rice was born in New Jersey.,"[[[191656, None, None, None], [191657, None, N..."
4,111897,VERIFIABLE,REFUTES,Telemundo is a English-language television net...,"[[[131371, 146144, Telemundo, 0]], [[131371, 1..."


In [53]:
# Spot-check one arbitrary NOT ENOUGH INFO example (position 66 within that subset) from the test split.
test_df.loc[test_df['label'] == 'NOT ENOUGH INFO'].iloc[66]

id                                                       117176
verifiable                                       NOT VERIFIABLE
label                                           NOT ENOUGH INFO
claim         Jamie Lee Curtis starred in John Carpenter's t...
evidence                         [[[137546, None, None, None]]]
Name: 161, dtype: object

In [30]:
# NOTE(review): this rebinds the global `root`, which the FEVER-split cell and the
# fever.db connection cell also read; the directory they see depends on cell execution order.
root = "../naacl2018-fever/data/fever/"
# `df` is not referenced by any other cell visible in this notebook.
df = load_data(os.path.join(root, 'dev.ns.rand.jsonl'))

19998 item loaded


### Wiki Dump

In [17]:
# Directory from which the wiki-001.jsonl shard is read below.
# Note it lives under ../data_old, not the ../data_2023_06_02 tree used for the claim splits.
wiki_path = "../data_old/FEVER/wiki-pages/wiki-pages"

In [19]:
# Parse one wiki shard into a list of page records (one JSON object per line).
# UTF-8 is requested explicitly so decoding does not depend on the platform's
# default locale; blank lines are skipped because json.loads would reject them.
with open(os.path.join(wiki_path, 'wiki-001.jsonl'), 'r', encoding='utf-8') as f:
    wiki = [json.loads(line) for line in f if line.strip()]

In [29]:
# Inspect one arbitrary page record; the saved output shows the keys 'id', 'text' and 'lines'.
wiki[471]

{'id': '1965_Oregon_Webfoots_football_team',
 'text': 'The 1965 Oregon Webfoots football team represented University of Oregon in the 1965 college football season as a member of the Athletic Association of Western Universities -LRB- AAWU -RRB- . The Webfoots were led by head coach Len Casanova in his fifteenth season and finished with a record of four wins , five losses and one tie -LRB- 4 -- 5 -- 1 overall , 0 -- 5 in the SWC -RRB- . ',
 'lines': '0\tThe 1965 Oregon Webfoots football team represented University of Oregon in the 1965 college football season as a member of the Athletic Association of Western Universities -LRB- AAWU -RRB- .\tUniversity of Oregon\tUniversity of Oregon\t1965 college football season\t1965 college football season\tAthletic Association of Western Universities\tAthletic Association of Western Universities\n1\tThe Webfoots were led by head coach Len Casanova in his fifteenth season and finished with a record of four wins , five losses and one tie -LRB- 4 -- 5 -

In [30]:
# print() renders the \t and \n escapes in 'lines', so each numbered sentence appears on its own row.
print(wiki[471]['lines'])

0	The 1965 Oregon Webfoots football team represented University of Oregon in the 1965 college football season as a member of the Athletic Association of Western Universities -LRB- AAWU -RRB- .	University of Oregon	University of Oregon	1965 college football season	1965 college football season	Athletic Association of Western Universities	Athletic Association of Western Universities
1	The Webfoots were led by head coach Len Casanova in his fifteenth season and finished with a record of four wins , five losses and one tie -LRB- 4 -- 5 -- 1 overall , 0 -- 5 in the SWC -RRB- .	Len Casanova	Len Casanova
2	


## FEVER DB

In [11]:
import sqlite3

In [13]:
# Open the SQLite database file `fever.db` located under the current value of `root`.
# NOTE(review): `root` is assigned in two different cells (two different directories);
# confirm which one actually holds fever.db before relying on Run All.
conn = sqlite3.connect(os.path.join(root, 'fever.db'))

In [41]:
# Peek at a single row of the DOCUMENTS table without consuming the whole result set.
cursor = conn.execute("SELECT * FROM DOCUMENTS")
row = next(cursor, None)
if row is not None:
    print(row)

('Snakebite_-LRB-album-RRB-', "Snakebite is the first official release by the British hard rock band Whitesnake . The original EP initially featured only four tracks and was released in the UK in June 1978 and never published in the US . Snakebite was re-released in September 1978 as a Double Extended Play containing four extra studio tracks taken from David Coverdale 's second solo album Northwinds . The EP sleeve is entitled David Coverdale 's Whitesnake and features photographs of the live band in concert . All tracks from the original EP also were used as bonus tracks on the 2006 remaster of Whitesnake 's debut studio album Trouble . ", "0\tSnakebite is the first official release by the British hard rock band Whitesnake .\tWhitesnake\tWhitesnake\thard rock\thard rock\n1\tThe original EP initially featured only four tracks and was released in the UK in June 1978 and never published in the US .\tEP\tExtended play\n2\tSnakebite was re-released in September 1978 as a Double Extended Pl

In [42]:
# Evidence annotations of the fifth dev example (select the column first, then the position).
dev_df['evidence'].iloc[4]

[[[131371, 146144, 'Telemundo', 0]],
 [[131371, 146148, 'Telemundo', 1]],
 [[131371, 146150, 'Telemundo', 4],
  [131371, 146150, 'Hispanic_and_Latino_Americans', 0]],
 [[131371, 146151, 'Telemundo', 5]]]

In [43]:
# Look up a page title taken from the dev evidence shown above.
# The name `doc_id` avoids shadowing the builtin `id`, and the value is bound
# through a `?` placeholder rather than interpolated into the SQL text, so
# titles containing quotes cannot break (or inject into) the query.
doc_id = "Hispanic_and_Latino_Americans"
cursor = conn.execute("SELECT * FROM DOCUMENTS WHERE id = ?", (doc_id,))

In [54]:
# Materialize whatever rows remain on the cursor.
# NOTE(review): the saved output is an empty list. A cursor can only be iterated once and the
# execution counts jump between the query cell and this one, so this may reflect an
# already-consumed cursor rather than a missing page; re-run the query cell first to confirm.
list(cursor)

[]