In [1]:
import json
import os
import csv

import pandas as pd
import numpy as np

In [2]:
# Root of the dated data snapshot; every dataset directory below hangs off it.
root = '../data_2023_06_02'

# CLIMATE-FEVER: raw input directory and preprocessed output directory.
climate_raw, climate_out = (
    os.path.join(root, rel_dir)
    for rel_dir in ('raw/CLIMATE-FEVER', 'preprocessed/CLIMATE-FEVER')
)

# PUBHEALTH: raw input directory and preprocessed output directory.
pubhealth_raw, pubhealth_out = (
    os.path.join(root, rel_dir)
    for rel_dir in ('raw/PUBHEALTH', 'preprocessed/PUBHEALTH')
)
# TODO: fever_raw is still undefined.

## Climate Fever
- drop DISPUTED; only keep SUPPORTS, REFUTES and NOT_ENOUGH_INFO

In [50]:
# load data
# The release is a JSON Lines file: one JSON object per line.
climate_path = os.path.join(climate_raw, 'climate-fever-dataset-r1.jsonl')
# Decode explicitly as UTF-8 so parsing does not depend on the platform's
# default locale encoding, and skip blank lines, which json.loads rejects.
with open(climate_path, 'r', encoding='utf-8') as f:
    climate_lst = [json.loads(line) for line in f if line.strip()]

climate_df = pd.DataFrame.from_records(climate_lst)
print(climate_df.shape)

(1535, 4)


In [51]:
# Class balance of the claim labels before any filtering.
claim_labels = climate_df['claim_label']
claim_labels.value_counts()

SUPPORTS           654
NOT_ENOUGH_INFO    474
REFUTES            253
DISPUTED           154
Name: claim_label, dtype: int64

In [52]:
# filter
# Keep every claim whose label is anything other than DISPUTED.
not_disputed = climate_df['claim_label'].ne('DISPUTED')
climate_df = climate_df.loc[not_disputed].copy()
print(climate_df.shape)

(1381, 4)


In [53]:
# output
# makedirs creates missing parent directories too (os.mkdir raises when the
# parent of climate_out does not exist yet) and exist_ok makes the cell
# safe to re-run.
os.makedirs(climate_out, exist_ok=True)
# One JSON object per line (JSON Lines).
with open(os.path.join(climate_out, 'climate-fever.jsonl'), 'w') as f:
    f.write(climate_df.to_json(orient='records', lines=True))

## PubHealth

In [3]:
from operator import itemgetter

from sentence_transformers import SentenceTransformer
from nltk.tokenize import sent_tokenize
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


### extract

In [4]:
# Read the three tab-separated PUBHEALTH splits in train / dev / test order.
pub_train_df, pub_dev_df, pub_test_df = (
    pd.read_csv(os.path.join(pubhealth_raw, split_file), sep="\t")
    for split_file in ('train.tsv', 'dev.tsv', 'test.tsv')
)

print(pub_train_df.shape, pub_dev_df.shape, pub_test_df.shape)

(9832, 9) (1221, 9) (1235, 10)


In [17]:
# Shape of the test split restricted to rows that have a main_text value.
has_main_text = pub_test_df['main_text'].notna()
pub_test_df.loc[has_main_text].shape

(1235, 10)

### transform

#### filter
- drop data points without main text (rows where `main_text` is null)

In [18]:
# Drop rows whose main_text is null from every split; .copy() detaches each
# result from the frame it was sliced from.
pub_train_df, pub_dev_df, pub_test_df = (
    split_df.loc[split_df['main_text'].notna()].copy()
    for split_df in (pub_train_df, pub_dev_df, pub_test_df)
)

print(pub_train_df.shape, pub_dev_df.shape, pub_test_df.shape)

(9806, 9) (1217, 9) (1235, 10)


#### evidence selection
- select the top 5 evidence sentences from `main_text`, ranked by SBERT cosine similarity to the claim (adapted from https://github.com/neemakot/Health-Fact-Checking/blob/master/src/load_data.py)

In [33]:
def select_evidence_sentences(df, k=5, model=None):
    """Attach the ``k`` sentences of ``main_text`` most similar to each claim.

    Each row's ``main_text`` is split into sentences with ``sent_tokenize``.
    The claim and the sentences are embedded together, and the sentences are
    ranked by cosine similarity between their embedding and the claim's.

    Args:
        df: DataFrame with string columns ``claim`` and ``main_text``.
            It is not modified.
        k: Maximum number of sentences to keep per row. Values ``<= 0``
            yield empty lists.
        model: Optional pre-loaded encoder exposing ``encode(list_of_str)``
            that returns one embedding per input string. When ``None``, the
            ``'bert-base-nli-mean-tokens'`` SentenceTransformer is loaded.
            Pass a model explicitly to reuse it across several calls.

    Returns:
        A copy of ``df`` with an extra ``top_k`` column. Each cell is a list
        of at most ``k`` distinct sentences, most similar to the claim first.
    """
    corpus = df.copy()
    if model is None:
        # No hard-coded .to("cuda"): the library selects a GPU when one is
        # available and falls back to CPU otherwise.
        model = SentenceTransformer('bert-base-nli-mean-tokens')

    top_k_per_row = []
    for _, row in corpus.iterrows():
        # De-duplicate while preserving order so a repeated sentence cannot
        # take more than one of the k slots.
        sentences = list(dict.fromkeys(sent_tokenize(row['main_text'])))
        if not sentences or k <= 0:
            top_k_per_row.append([])
            continue

        # Encode claim and sentences in one call; position 0 is the claim,
        # positions 1.. line up one-to-one with `sentences`.
        embeddings = np.asarray(
            model.encode([row['claim']] + sentences), dtype=float)
        claim_embedding = embeddings[0]
        sentence_embeddings = embeddings[1:]

        # Cosine similarity of every sentence to the claim; a zero-norm
        # embedding gets similarity 0 instead of dividing by zero.
        dots = sentence_embeddings @ claim_embedding
        norms = (np.linalg.norm(sentence_embeddings, axis=1)
                 * np.linalg.norm(claim_embedding))
        similarities = np.divide(
            dots, norms, out=np.zeros_like(dots), where=norms > 0)

        # Highest similarity first; the stable sort keeps document order
        # among ties.
        ranked = np.argsort(-similarities, kind='stable')[:k]
        top_k_per_row.append([sentences[i] for i in ranked])

    # An object-dtype Series keeps one Python list per cell.
    corpus['top_k'] = pd.Series(
        top_k_per_row, index=corpus.index, dtype=object)
    return corpus

In [34]:
# Add the top_k evidence column to the train split (5 sentences per claim).
pub_train_df = select_evidence_sentences(df=pub_train_df, k=5)

In [35]:
# Add the top_k evidence column to the dev split (5 sentences per claim).
pub_dev_df = select_evidence_sentences(df=pub_dev_df, k=5)

In [36]:
# Add the top_k evidence column to the test split (5 sentences per claim).
pub_test_df = select_evidence_sentences(df=pub_test_df, k=5)

### Load

In [48]:
# makedirs creates missing parent directories too (os.mkdir raises when the
# parent of pubhealth_out does not exist yet) and exist_ok makes the cell
# safe to re-run.
os.makedirs(pubhealth_out, exist_ok=True)

# Write each split as JSON Lines (one record per line).
for out_name, split_df in (
    ('train.jsonl', pub_train_df),
    ('dev.jsonl', pub_dev_df),
    ('test.jsonl', pub_test_df),
):
    with open(os.path.join(pubhealth_out, out_name), 'w') as f:
        f.write(split_df.to_json(orient='records', lines=True))

## FEVER