#### 1. Prepare climate FEVER dataframe

In [18]:
%pprint on

Pretty printing has been turned OFF


In [19]:
import pandas as pd
from pprint import pprint

# if you have pyarrow or fastparquet installed, pandas will pick one automatically
climate_fever = pd.read_parquet("climate_fever/test-00000-of-00001.parquet")

In [20]:
climate_fever.head(10)

Unnamed: 0,claim_id,claim,claim_label,evidences
0,0,Global warming is driving polar bears toward e...,0,[{'evidence_id': 'Extinction risk from global ...
1,5,The sun has gone into ‘lockdown’ which could c...,0,"[{'evidence_id': 'Famine:386', 'evidence_label..."
2,6,The polar bear population has been growing.,1,"[{'evidence_id': 'Polar bear:1332', 'evidence_..."
3,9,Ironic' study finds more CO2 has slightly cool...,1,"[{'evidence_id': 'Atmosphere of Mars:131', 'ev..."
4,10,Human additions of CO2 are in the margin of er...,1,[{'evidence_id': 'Carbon dioxide in Earth's at...
5,11,They tell us that we are the primary forces co...,0,"[{'evidence_id': 'Carbon dioxide:183', 'eviden..."
6,14,The Great Barrier Reef is experiencing the mos...,0,"[{'evidence_id': 'Coral bleaching:52', 'eviden..."
7,18,it’s not a pollutant that threatens human civi...,1,"[{'evidence_id': 'Air pollution:12', 'evidence..."
8,19,"If CO2 was so terrible for the planet, then in...",1,[{'evidence_id': 'Carbon dioxide in Earth's at...
9,21,"Sea level rise has been slow and a constant, p...",1,"[{'evidence_id': 'Russia:153', 'evidence_label..."


In [22]:
print(climate_fever.to_dict().iloc[0])

AttributeError: 'dict' object has no attribute 'iloc'

In [23]:
print(climate_fever.iloc[0]['evidences'])

[{'evidence_id': 'Extinction risk from global warming:170', 'evidence_label': 2, 'article': 'Extinction risk from global warming', 'evidence': '"Recent Research Shows Human Activity Driving Earth Towards Global Extinction Event".', 'entropy': 0.6931471824645996, 'votes': array(['SUPPORTS', 'NOT_ENOUGH_INFO', None, None, None], dtype=object)}
 {'evidence_id': 'Global warming:14', 'evidence_label': 0, 'article': 'Global warming', 'evidence': 'Environmental impacts include the extinction or relocation of many species as their ecosystems change, most immediately the environments of coral reefs, mountains, and the Arctic.', 'entropy': 0.0, 'votes': array(['SUPPORTS', 'SUPPORTS', None, None, None], dtype=object)}
 {'evidence_id': 'Global warming:178', 'evidence_label': 2, 'article': 'Global warming', 'evidence': 'Rising temperatures push bees to their physiological limits, and could cause the extinction of bee populations.', 'entropy': 0.6931471824645996, 'votes': array(['SUPPORTS', 'NOT_ENO

In [24]:
import json

def export_claims_with_supports(df, json_path):
    """
    From a DataFrame `df` with columns ['claim_id','claim','evidences'...],
    build a list of dicts containing only:
      - claim_id
      - claim
      - evidences: [ { title, content, supports_pct }, … ]
    and write it to `json_path` as pretty JSON.
    """
    output = []
    for _, row in df.iterrows():
        simple_evs = []
        for ev in row["evidences"]:
            # remove None votes, calculate SUPPORTS pct
            votes = [v for v in ev.get("votes", []) if v is not None]
            pct   = (votes.count("SUPPORTS") / len(votes) * 100) if votes else 0
            simple_evs.append({
                "title":        ev["article"],
                "content":      ev["evidence"],
                "supports_pct": pct
            })
        output.append({
            "claim_id":   row["claim_id"],
            "claim":      row["claim"],
            "label":      bool(row["claim_label"]),
            "evidences":  simple_evs
        })

    # write to JSON
    with open(json_path, "w") as f:
        json.dump(output, f, indent=2, ensure_ascii=False)

In [25]:
export_claims_with_supports(climate_fever, "prepared_datasets/climate_fever_with_evidence_details_labels.json")

#### 2. Prepare COVIDFACT

In [23]:
import pandas as pd
import json

# 1. Read the JSONL file (one JSON object per line)
df = pd.read_json('covid_fact/COVIDFACT_dataset.jsonl', lines=True)

# 2. Drop the unwanted columns (ignore if they’re missing)
df = df.drop(columns=['flair', 'gold_source'], errors='ignore')

# 3. Convert to a list of dicts
records = df.to_dict(orient='records')

# 4. (Optional) Write out to a normal JSON file
with open('prepared_datasets/covid_fact.json', 'w') as f:
    json.dump(records, f, indent=2)

#### 3. Prepare Politihop

In [14]:
import pandas as pd
import json
from collections import defaultdict

def extract_and_group(train_tsv, valid_tsv, test_tsv, output_path):
    """
    1) Reads the three TSV splits.
    2) Extracts only the truly validated evidence sentences (using `annotated_evidence`).
    3) Concatenates each record’s evidence sentences into one string.
    4) Groups by (statement, dataset), collecting those concatenated strings into lists.
    5) Writes the grouped records to a single JSON file.
    """
    def load_split(path, split_name, start_id, raw_recs):
        df = pd.read_csv(path, sep='\t', dtype=str)
        uid = start_id
        for _, row in df.iterrows():
            # full ruling sentences
            rulings = json.loads(row['ruling'])
            # annotated_evidence: chain → [sentence IDs]
            try:
                chains = json.loads(row.get('annotated_evidence', '{}'))
            except json.JSONDecodeError:
                chains = {}
            # gather all sentence IDs
            ids = set()
            for sid_list in chains.values():
                for sid in sid_list:
                    if isinstance(sid, str) and sid.isdigit():
                        ids.add(int(sid))
                    elif isinstance(sid, int):
                        ids.add(sid)
            # filter valid and sort
            valid_ids = sorted(i for i in ids if 0 <= i < len(rulings))
            # extract and concatenate
            evidence_texts = [rulings[i] for i in valid_ids]
            concatenated = " ".join(evidence_texts).strip()
            raw_recs.append({
                'statement': row['statement'],
                'politifact_label': row['politifact_label'],
                'annotated_label': row['annotated_label'],
                'dataset': split_name,
                'evidence': concatenated
            })
            uid += 1
        return uid

    # step 1–3: extract and concatenate
    raw_records = []
    counter = 1
    counter = load_split(train_tsv, 'train', counter, raw_records)
    counter = load_split(valid_tsv, 'validation', counter, raw_records)
    counter = load_split(test_tsv,  'test',       counter, raw_records)

    # step 4: group by (statement, dataset)
    grouped = {}
    for rec in raw_records:
        key = (rec['statement'], rec['dataset'])
        if key not in grouped:
            grouped[key] = {
                'politifact_label': rec['politifact_label'],
                'annotated_label': rec['annotated_label'],
                'evidence': []
            }
        grouped[key]['evidence'].append(rec['evidence'])

    # step 5: build the grouped list and write out
    grouped_records = []
    for (statement, dataset), data in grouped.items():
        grouped_records.append({
            'statement': statement,
            'politifact_label': data['politifact_label'],
            'annotated_label': data['annotated_label'],
            'dataset':   dataset,
            'evidence':  data['evidence']
        })

    with open(output_path, 'w') as f:
        json.dump(grouped_records, f, indent=2)
    print(f"Wrote {len(grouped_records)} grouped records to {output_path}")

In [15]:
extract_and_group(
    train_tsv='politi_hop/politihop_train.tsv',
    valid_tsv='politi_hop/politihop_valid.tsv',
    test_tsv= 'politi_hop/politihop_test.tsv',
    output_path='prepared_datasets/politihop_combined_and_grouped_ruling.json'
)


Wrote 497 grouped records to prepared_datasets/politihop_combined_and_grouped_ruling.json


#### 4. prepare HoVer Dev set

In [1]:
import json

def preprocess_dataset(input_path: str, output_path: str) -> None:
    """
    Reads a JSON file containing dataset examples, extracts claim, matching evidences,
    and label for each entry, then writes the results as a JSON list.

    Each output entry will have:
      - 'claim': the claim text
      - 'evidences': list of context contents whose titles appear in supporting_facts
      - 'label': the example label (e.g., SUPPORTED, REFUTED)
    """
    # Load the dataset
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    processed = []
    for example in data:
        claim = example.get('claim', '')
        label = example.get('label', '')

        # Build a map from context title to its content
        context = example.get('context', [])  # list of [title, content]
        context_map = {title: content for title, content in context}

        # Extract evidences by matching titles in supporting_facts
        evidences = []
        for title, _ in example.get('supporting_facts', []):
            content = context_map.get(title)
            if content:
                evidences.append(content)

        # Remove duplicates while preserving order
        seen = set()
        unique_evidences = []
        for ev in evidences:
            if ev not in seen:
                seen.add(ev)
                unique_evidences.append(ev)

        processed.append({
            'claim': claim,
            'evidences': unique_evidences,
            'label': label
        })

    # Write the processed list to output JSON file
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(processed, f, indent=2, ensure_ascii=False)

In [2]:

preprocess_dataset('hover/data/hover/doc_retrieval/hover_dev_doc_retrieval.json', 'prepared_datasets/hover_dev_full_docs.json')


In [3]:
preprocess_dataset('hover/data/hover/doc_retrieval/hover_train_doc_retrieval.json', 'prepared_datasets/hover_train_full_docs.json')