In [14]:
import json
import os
import csv

import pandas as pd
import numpy as np

In [4]:
# Dataset snapshot root; every raw / preprocessed directory below is resolved against it.
root = '../data_2023_06_02'

# CLIMATE-FEVER: (raw input directory, preprocessed output directory)
climate_raw, climate_out = (
    os.path.join(root, subdir)
    for subdir in ('raw/CLIMATE-FEVER', 'preprocessed/CLIMATE-FEVER')
)

# PUBHEALTH: (raw input directory, preprocessed output directory)
pubhealth_raw, pubhealth_out = (
    os.path.join(root, subdir)
    for subdir in ('raw/PUBHEALTH', 'preprocessed/PUBHEALTH')
)
# TODO: fever_raw is not set yet

## Climate Fever
- drop DISPUTED; only keep SUPPORTS, REFUTES and NOT_ENOUGH_INFO

In [14]:
# load data
# The CLIMATE-FEVER file is JSON Lines: one JSON object per line.
# Decode explicitly as UTF-8 instead of relying on the platform default encoding,
# and skip blank lines, which json.loads would reject.
with open(os.path.join(climate_raw, 'climate-fever-dataset-r1.jsonl'), 'r', encoding='utf-8') as f:
    climate_lst = [json.loads(line) for line in f if line.strip()]

# one row per parsed record
climate_df = pd.DataFrame.from_records(climate_lst)
print(climate_df.shape)

(1535, 4)


In [15]:
# Label distribution before filtering; DISPUTED rows are removed by the filter step that follows.
climate_df['claim_label'].value_counts()

SUPPORTS           654
NOT_ENOUGH_INFO    474
REFUTES            253
DISPUTED           154
Name: claim_label, dtype: int64

In [17]:
# filter: keep every claim whose label is not DISPUTED;
# .copy() makes the result an independent frame rather than a slice of the original
climate_df = climate_df.loc[climate_df['claim_label'].ne('DISPUTED')].copy()
print(climate_df.shape)

(1381, 4)


In [24]:
# output
# makedirs also creates missing parent directories (os.mkdir raises FileNotFoundError
# when the parent of climate_out does not exist yet), and exist_ok=True makes
# re-running the cell safe without a separate isdir check.
os.makedirs(climate_out, exist_ok=True)
# write the filtered claims back out as JSON Lines (one record per line)
with open(os.path.join(climate_out, 'climate-fever.jsonl'), 'w') as f:
    f.write(climate_df.to_json(orient='records', lines=True))

## PubHealth

In [19]:
from operator import itemgetter

from sentence_transformers import SentenceTransformer
from nltk.tokenize import sent_tokenize
from sklearn.metrics.pairwise import cosine_similarity

#### extract

In [7]:
# Read the three PUBHEALTH splits (tab-separated) in train / dev / test order.
# NOTE(review): the recorded output shows 10 columns for test vs 9 for train/dev —
# confirm whether test.tsv carries an extra column before concatenating splits.
pub_train_df, pub_dev_df, pub_test_df = (
    pd.read_csv(os.path.join(pubhealth_raw, split_file), sep="\t")
    for split_file in ('train.tsv', 'dev.tsv', 'test.tsv')
)

print(pub_train_df.shape, pub_dev_df.shape, pub_test_df.shape)

(9832, 9) (1221, 9) (1235, 10)


#### transform
- For each claim, select the 5 sentences of `main_text` that are most similar to the claim, using SBERT sentence embeddings (approach from https://github.com/neemakot/Health-Fact-Checking/blob/master/src/load_data.py)

In [20]:
# Select evidence sentences,
# following the approach in https://github.com/neemakot/Health-Fact-Checking/blob/master/src/load_data.py
# k is the number of evidence sentences kept per claim (the slice bound used by the selection loop).
k: int = 5

In [17]:
# Work on a copy so pub_train_df itself is left unmodified.
corpus = pub_train_df.copy()
# Sentence encoder used to embed claims and article sentences.
sentence_transformer_model = SentenceTransformer('bert-base-nli-mean-tokens')
# Initialise the output column with real empty strings. np.empty(..., dtype=str)
# returns uninitialised memory, so its contents are not guaranteed to be ''.
corpus['top_k'] = ''

In [25]:
# For every article, rank the sentences of main_text by cosine similarity to the
# claim and store the k most similar ones, space-joined, in corpus['top_k'].
for index, row in corpus.iterrows():
    claim = row['claim']
    main_text = row['main_text']

    # A missing claim or text (e.g. NaN, which is a float) cannot be tokenised or
    # encoded; record an empty evidence string instead of crashing mid-loop.
    if not isinstance(claim, str) or not isinstance(main_text, str):
        corpus.at[index, 'top_k'] = ''
        continue

    sentences = sent_tokenize(main_text)
    if not sentences:
        corpus.at[index, 'top_k'] = ''
        continue

    # Encode the claim together with the sentences: row 0 is the claim and
    # row i + 1 belongs to sentences[i].
    embeddings = sentence_transformer_model.encode([claim] + sentences)
    claim_embedding = embeddings[:1]
    sentence_embeddings = embeddings[1:]

    # cosine_similarity(X, Y) returns an (n_X, n_Y) matrix; with a single claim row,
    # [0] is the similarity of the claim to each sentence, aligned with `sentences`.
    # Scores are kept per position (not keyed by sentence text) so repeated
    # sentences do not overwrite each other.
    similarities = cosine_similarity(claim_embedding, sentence_embeddings)[0]

    # Highest similarity first, so the slice keeps the k MOST similar sentences.
    # sorted() is stable, so tied sentences keep their document order.
    ranked = sorted(range(len(sentences)),
                    key=lambda i: similarities[i], reverse=True)
    corpus.at[index, 'top_k'] = ' '.join(sentences[i] for i in ranked[:k])