In [1]:
import json
import os
import csv

import pandas as pd
import numpy as np

In [2]:
# Every dataset has an input folder under raw/ and an output folder under
# preprocessed/, both inside the same data snapshot directory.
root = '../data_2023_06_02'

climate_raw, climate_out = (
    os.path.join(root, f'{stage}/CLIMATE-FEVER') for stage in ('raw', 'preprocessed')
)

pubhealth_raw, pubhealth_out = (
    os.path.join(root, f'{stage}/PUBHEALTH') for stage in ('raw', 'preprocessed')
)

fever_raw, fever_out = (
    os.path.join(root, f'{stage}/FEVER') for stage in ('raw', 'preprocessed')
)

## Climate Fever
- drop DISPUTED; only keep SUPPORTS, REFUTES and NOT_ENOUGH_INFO

In [50]:
# load data
# Explicit UTF-8 so decoding does not depend on the platform's locale; the file
# is streamed line by line and blank lines (e.g. a trailing newline) are skipped
# instead of crashing json.loads.
with open(os.path.join(climate_raw, 'climate-fever-dataset-r1.jsonl'), 'r', encoding='utf-8') as f:
    climate_lst = [json.loads(item) for item in f if item.strip()]

climate_df = pd.DataFrame.from_records(climate_lst)
print(climate_df.shape)

(1535, 4)


In [51]:
# Label distribution before filtering: number of claims per claim_label value.
climate_df['claim_label'].value_counts()

SUPPORTS           654
NOT_ENOUGH_INFO    474
REFUTES            253
DISPUTED           154
Name: claim_label, dtype: int64

In [52]:
# filter: keep every row whose claim_label is anything other than DISPUTED
climate_df = climate_df.loc[climate_df['claim_label'].ne('DISPUTED')].copy()
print(climate_df.shape)

(1381, 4)


In [53]:
# output
# makedirs (rather than mkdir) also creates a missing parent folder such as
# `preprocessed/`, and exist_ok makes the cell safe to re-run.
os.makedirs(climate_out, exist_ok=True)
with open(os.path.join(climate_out, 'climate-fever.jsonl'), 'w', encoding='utf-8') as f:
    f.write(climate_df.to_json(orient='records', lines=True))

## PubHealth

In [3]:
from operator import itemgetter

from sentence_transformers import SentenceTransformer
from nltk.tokenize import sent_tokenize
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


### extract

In [4]:
# Read the three tab-separated PUBHEALTH splits in one pass.
pub_train_df, pub_dev_df, pub_test_df = (
    pd.read_csv(os.path.join(pubhealth_raw, f'{split}.tsv'), sep="\t")
    for split in ('train', 'dev', 'test')
)

print(pub_train_df.shape, pub_dev_df.shape, pub_test_df.shape)

(9832, 9) (1221, 9) (1235, 10)


In [17]:
# Sanity check (test split only): shape of the rows whose main_text is present.
pub_test_df[pub_test_df['main_text'].notnull()].shape

(1235, 10)

### transform

#### filter
- drop data points w/o main text

In [18]:
# Rows without a main_text cannot supply evidence sentences, so drop them.
pub_train_df = pub_train_df.dropna(subset=['main_text'])
pub_dev_df = pub_dev_df.dropna(subset=['main_text'])
pub_test_df = pub_test_df.dropna(subset=['main_text'])

print(pub_train_df.shape, pub_dev_df.shape, pub_test_df.shape)

(9806, 9) (1217, 9) (1235, 10)


#### evidence selection
- select top 5 evidence sentences from main_text using SBERT (https://github.com/neemakot/Health-Fact-Checking/blob/master/src/load_data.py)

In [33]:
def select_evidence_sentences(df, k=5, model=None):
    """Attach the k sentences of ``main_text`` most similar to ``claim``.

    For every row, ``main_text`` is split into sentences with ``sent_tokenize``,
    the claim and the sentences are embedded with a SentenceTransformer model,
    and each sentence is scored by its cosine similarity to the claim.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns ``claim`` and ``main_text`` (both strings; rows
        with a missing ``main_text`` must be dropped beforehand).
    k : int, default 5
        Maximum number of sentences kept per row.
    model : SentenceTransformer, optional
        Already loaded encoder, so several splits can share one model. When
        omitted, 'bert-base-nli-mean-tokens' is loaded.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with an extra ``top_k`` column. Each cell is a list of
        at most ``k`` distinct sentences, most similar first; it is empty when
        ``main_text`` yields no sentences.
    """
    if model is None:
        # No hard-coded .to("cuda"): SentenceTransformer selects the GPU itself
        # when one is available and otherwise falls back to the CPU.
        model = SentenceTransformer('bert-base-nli-mean-tokens')

    corpus = df.copy()
    selected = []

    for claim, main_text in zip(corpus['claim'], corpus['main_text']):
        sentences = sent_tokenize(main_text)
        if not sentences:
            # Nothing to rank; cosine_similarity would reject an empty matrix.
            selected.append([])
            continue

        # Encode claim and sentences in one batch; row 0 is the claim.
        embeddings = model.encode([claim] + sentences)
        # Score sentence i with embedding i + 1, so that sentences and their
        # embeddings stay aligned (the claim itself is never a candidate).
        scores = cosine_similarity(embeddings[:1], embeddings[1:])[0]

        # Keyed by sentence text so repeated sentences are only kept once.
        score_by_sentence = dict(zip(sentences, scores))
        # Highest similarity first: "top k" means the most similar sentences.
        ranked = sorted(score_by_sentence.items(), key=itemgetter(1), reverse=True)
        selected.append([sentence for sentence, _ in ranked[:k]])

    # Assign the whole column at once; object dtype keeps each list in one cell.
    corpus['top_k'] = pd.Series(selected, index=corpus.index, dtype=object)

    return corpus

In [34]:
# Training split: add the `top_k` evidence-sentence column (default k=5).
pub_train_df = select_evidence_sentences(pub_train_df)

In [35]:
# Dev split: add the `top_k` evidence-sentence column (default k=5).
pub_dev_df = select_evidence_sentences(pub_dev_df)

In [36]:
# Test split: add the `top_k` evidence-sentence column (default k=5).
pub_test_df = select_evidence_sentences(pub_test_df)

### load

In [48]:
# makedirs (rather than mkdir) also creates a missing parent folder such as
# `preprocessed/`, and exist_ok makes the cell safe to re-run.
os.makedirs(pubhealth_out, exist_ok=True)

# One JSON-lines file per split: train.jsonl, dev.jsonl, test.jsonl.
for split_name, split_df in (('train', pub_train_df),
                             ('dev', pub_dev_df),
                             ('test', pub_test_df)):
    with open(os.path.join(pubhealth_out, f'{split_name}.jsonl'), 'w', encoding='utf-8') as f:
        f.write(split_df.to_json(orient='records', lines=True))

## FEVER

#### load fever data

In [95]:
# read as dataframe
def load_data(jsonl_file):
    """Read a JSON-lines file into a DataFrame, one row per JSON object.

    Parameters
    ----------
    jsonl_file : str or path-like
        Path to a UTF-8 encoded file holding one JSON object per line.
        Blank lines are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per record, with columns taken from the record keys.

    Prints the number of records that were read.
    """
    # Explicit UTF-8 so decoding does not depend on the platform's locale; the
    # file is streamed line by line instead of being copied into a list first.
    with open(jsonl_file, 'r', encoding='utf-8') as f:
        fever_lst = [json.loads(item) for item in f if item.strip()]
    print(f'{len(fever_lst)} item loaded')
    return pd.DataFrame.from_records(fever_lst)

# These are the original data files (kept for reference, not loaded here)
# fever_train_df = load_data(os.path.join(fever_raw, 'train.jsonl'))
# fever_dev_df = load_data(os.path.join(fever_raw, 'paper_dev.jsonl'))
# fever_test_df = load_data(os.path.join(fever_raw, 'paper_test.jsonl'))

# These are the original data files + randomly sampled evidence for the NOT ENOUGH INFO class
fever_train_df = load_data(os.path.join(fever_raw, 'train.ns.rand.jsonl'))
fever_dev_df = load_data(os.path.join(fever_raw, 'dev.ns.rand.jsonl'))
fever_test_df = load_data(os.path.join(fever_raw, 'test.ns.rand.jsonl'))

145449 item loaded
9999 item loaded
9999 item loaded


In [None]:
# read as dict

# import csv
# import json

# class Reader:
#     def __init__(self,encoding="utf-8"):
#         self.enc = encoding

#     def read(self,file):
#         with open(file,"r",encoding = self.enc) as f:
#             return self.process(f)

#     def process(self,f):
#         pass

# class JSONLineReader(Reader):
#     def process(self,fp):
#         data = []
#         for line in fp.readlines():
#             data.append(json.loads(line.strip()))
#         return data
    
# reader = JSONLineReader()
# t = reader.read(os.path.join(fever_raw, 'test.ns.rand.jsonl'))

#### extract evidence text from fever.db


In [None]:
"""
Open questions
- Should the evidence sentences be combined into a single string? (src/rte/parikh/reader.py --> FEVERReader) (src/retrieval/reader.py --> FEVERSentenceReader)
    - " ".join(evidences)
- How is the evidence text extracted for NA cases? - PYTHONPATH=src python src/scripts/dataset/neg_sample_evidence.py data/fever/fever.db
"""

In [96]:
# reference 1: https://github.com/facebookresearch/DrQA/blob/main/drqa/retriever/doc_db.py
# reference 2: src/retrieval/fever_doc_db.py

import unicodedata
import sqlite3

def normalize(text):
    """Return ``text`` in Unicode NFD form, the form used for document-id lookups."""
    return unicodedata.normalize('NFD', text)


class DocDB():
    """Sqlite backed document storage.

    Reads from a ``documents`` table using its ``id``, ``text`` and ``lines``
    columns. Implements get_doc_text(doc_id) and get_doc_lines(doc_id).
    Usable as a context manager; the connection is closed on exit.
    """

    def __init__(self, db_path=None):
        """Open a connection to the sqlite file at ``db_path``.

        Raises:
            ValueError: if no ``db_path`` is given (there is no default database).
        """
        if not db_path:
            raise ValueError("db_path is required: no default database path is configured")
        # Kept under a private name so it does not shadow the `path` property.
        self._path = db_path
        self.connection = sqlite3.connect(self._path, check_same_thread=False)

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()

    @property
    def path(self):
        """Return the path to the file that backs this database."""
        return self._path

    def close(self):
        """Close the connection to the database."""
        self.connection.close()

    def _fetch_value(self, sql, params):
        """Run ``sql`` and return the first column of the first row, or None if no row matches."""
        cursor = self.connection.cursor()
        try:
            cursor.execute(sql, params)
            row = cursor.fetchone()
        finally:
            # Release the cursor even when the query raises.
            cursor.close()
        return None if row is None else row[0]

    def _fetch_column(self, sql):
        """Run ``sql`` and return the first column of every result row as a list."""
        cursor = self.connection.cursor()
        try:
            cursor.execute(sql)
            rows = cursor.fetchall()
        finally:
            cursor.close()
        return [row[0] for row in rows]

    def get_doc_ids(self):
        """Fetch all ids of docs stored in the db."""
        return self._fetch_column("SELECT id FROM documents")

    def get_doc_text(self, doc_id):
        """Fetch the raw text of the doc for 'doc_id' (None if the id is unknown)."""
        return self._fetch_value(
            "SELECT text FROM documents WHERE id = ?",
            (normalize(doc_id),)
        )

    def get_doc_lines(self, doc_id):
        """Fetch the `lines` field of the doc for 'doc_id' (None if the id is unknown)."""
        return self._fetch_value(
            "SELECT lines FROM documents WHERE id = ?",
            (normalize(doc_id),)
        )

    def get_non_empty_doc_ids(self):
        """Fetch the ids of all docs whose text is not empty or whitespace-only."""
        return self._fetch_column(
            "SELECT id FROM documents WHERE length(trim(text)) > 0"
        )

In [97]:
# Open the sqlite document store at <fever_raw>/fever.db; get_doc_line reads this module-level `db`.
db = DocDB(os.path.join(fever_raw, 'fever.db'))

#### Extract evidence text for modelling

In [73]:
import random
import numpy as np

class SimpleRandom():
    """Seeded random-integer source with a lazily created shared instance."""

    # Shared instance; stays None until get_instance() is first called.
    instance = None

    def __init__(self,seed):
        self.random = random.Random(seed)
        self.seed = seed

    def next_rand(self,a,b):
        """Return a random integer N such that a <= N <= b."""
        generator = self.random
        return generator.randint(a,b)

    @staticmethod
    def get_seed():
        """Return the seed from the RANDOM_SEED environment variable (12459 if unset)."""
        configured = os.getenv("RANDOM_SEED", 12459)
        return int(configured)

    @staticmethod
    def get_instance():
        """Return the shared instance, creating it from get_seed() on first use."""
        if SimpleRandom.instance is not None:
            return SimpleRandom.instance
        SimpleRandom.instance = SimpleRandom(SimpleRandom.get_seed())
        return SimpleRandom.instance

    @staticmethod
    def set_seeds():
        """Seed numpy's and the stdlib's global random generators with get_seed()."""
        for seed_fn in (np.random.seed, random.seed):
            seed_fn(SimpleRandom.get_seed())

def get_doc_line(doc,line):
    """Return the text of one line of page `doc`, read from the module-level `db`.

    The page's stored `lines` value is split on newlines, and each row on tabs;
    the second tab-separated field of a row is returned as its text.

    Parameters:
        doc: id of the page to look up.
        line: index of the wanted row; a negative value means "pick a random
            non-empty row" using the shared SimpleRandom instance.

    Raises:
        KeyError: if `doc` is not present in the database.
        ValueError: if a random row is requested but the page has no non-empty row.
        IndexError: if `line` is beyond the page's rows or the row has no text field.
    """
    lines = db.get_doc_lines(doc)
    if lines is None:
        # Fail with the offending id instead of an AttributeError on None.
        raise KeyError(f"document {doc!r} not found in the database")

    rows = lines.split("\n")
    if line > -1:
        return rows[line].split("\t")[1] #get specific line in wiki page

    # Random pick: only rows that have a non-blank text field are candidates.
    non_empty_lines = []
    for row in rows:
        fields = row.split("\t")
        if len(fields) > 1 and fields[1].strip():
            non_empty_lines.append(fields[1])
    if not non_empty_lines:
        raise ValueError(f"document {doc!r} has no non-empty lines to sample from")
    return non_empty_lines[SimpleRandom.get_instance().next_rand(0,len(non_empty_lines)-1)]

In [103]:
def extract_evidence(evidence):
    """Resolve a claim's evidence references to the set of referenced sentences.

    Parameters:
        evidence: iterable of evidence groups; each group is an iterable of
            records whose items at index 2 and 3 are passed to get_doc_line as
            the page id and the line number.

    Returns:
        set of str: the distinct sentence texts returned by get_doc_line
        (empty when there is no evidence).

    On any failure the offending `evidence` value and the error are printed,
    and the original exception is re-raised.
    """
    try:
        # Flatten all groups into (page, line) pairs before any lookup happens.
        pages = [(ev[2], ev[3]) for evidence_group in evidence for ev in evidence_group]
        # NOTE(review): callers store this set in a `premise` column; joining the
        # sentences into one string is still an open question in this notebook.
        return set(get_doc_line(page, line_no) for page, line_no in pages)
    except Exception as e:
        # Print the input that failed; the real exception propagates unchanged.
        print(evidence)
        print(e)
        raise

In [None]:
# Test split: resolve each row's evidence references to sentence text in a new `premise` column.
fever_test_df['premise'] = fever_test_df['evidence'].apply(extract_evidence)

In [104]:
# Dev split: resolve each row's evidence references to sentence text in a new `premise` column.
fever_dev_df['premise'] = fever_dev_df['evidence'].apply(extract_evidence)

In [105]:
# Training split: resolve each row's evidence references to sentence text in a new `premise` column.
fever_train_df['premise'] = fever_train_df['evidence'].apply(extract_evidence)

### load

In [113]:
# makedirs (rather than mkdir) also creates a missing parent folder such as
# `preprocessed/`, and exist_ok makes the cell safe to re-run.
os.makedirs(fever_out, exist_ok=True)

# One JSON-lines file per split: <split>_preprocessed.ns.rand.jsonl.
for split_name, split_df in (('train', fever_train_df),
                             ('dev', fever_dev_df),
                             ('test', fever_test_df)):
    with open(os.path.join(fever_out, f'{split_name}_preprocessed.ns.rand.jsonl'), 'w', encoding='utf-8') as f:
        f.write(split_df.to_json(orient='records', lines=True))