#### 1. Prepare the Climate-FEVER dataframe

In [11]:
# `%pprint` is a toggle and ignores any argument, so `%pprint on` would switch
# pretty printing OFF on a fresh kernel (IPython enables it by default).
# Setting the formatter flag explicitly makes this cell idempotent under
# Restart & Run All.
get_ipython().display_formatter.formatters['text/plain'].pprint = True

Pretty printing has been turned ON


In [2]:
import pandas as pd
from pprint import pprint

# Load the Climate-FEVER test split. pandas selects the parquet engine on its
# own (pyarrow or fastparquet, whichever is installed).
climate_fever = pd.read_parquet(path="climate_fever/test-00000-of-00001.parquet")

In [3]:
# Preview the first ten claims to sanity-check the loaded columns.
climate_fever.head(n=10)

Unnamed: 0,claim_id,claim,claim_label,evidences
0,0,Global warming is driving polar bears toward e...,0,[{'evidence_id': 'Extinction risk from global ...
1,5,The sun has gone into ‘lockdown’ which could c...,0,"[{'evidence_id': 'Famine:386', 'evidence_label..."
2,6,The polar bear population has been growing.,1,"[{'evidence_id': 'Polar bear:1332', 'evidence_..."
3,9,Ironic' study finds more CO2 has slightly cool...,1,"[{'evidence_id': 'Atmosphere of Mars:131', 'ev..."
4,10,Human additions of CO2 are in the margin of er...,1,[{'evidence_id': 'Carbon dioxide in Earth's at...
5,11,They tell us that we are the primary forces co...,0,"[{'evidence_id': 'Carbon dioxide:183', 'eviden..."
6,14,The Great Barrier Reef is experiencing the mos...,0,"[{'evidence_id': 'Coral bleaching:52', 'eviden..."
7,18,it’s not a pollutant that threatens human civi...,1,"[{'evidence_id': 'Air pollution:12', 'evidence..."
8,19,"If CO2 was so terrible for the planet, then in...",1,[{'evidence_id': 'Carbon dioxide in Earth's at...
9,21,"Sea level rise has been slow and a constant, p...",1,"[{'evidence_id': 'Russia:153', 'evidence_label..."


In [14]:
# Show the first row as a plain dict. Select the row first and convert it
# afterwards: `DataFrame.to_dict()` returns an ordinary dict, which has no
# `.iloc` accessor.
print(climate_fever.iloc[0].to_dict())

{'claim_id': '0', 'claim': 'Global warming is driving polar bears toward extinction', 'claim_label': 0, 'evidences': array([{'evidence_id': 'Extinction risk from global warming:170', 'evidence_label': 2, 'article': 'Extinction risk from global warming', 'evidence': '"Recent Research Shows Human Activity Driving Earth Towards Global Extinction Event".', 'entropy': 0.6931471824645996, 'votes': array(['SUPPORTS', 'NOT_ENOUGH_INFO', None, None, None], dtype=object)},
       {'evidence_id': 'Global warming:14', 'evidence_label': 0, 'article': 'Global warming', 'evidence': 'Environmental impacts include the extinction or relocation of many species as their ecosystems change, most immediately the environments of coral reefs, mountains, and the Arctic.', 'entropy': 0.0, 'votes': array(['SUPPORTS', 'SUPPORTS', None, None, None], dtype=object)},
       {'evidence_id': 'Global warming:178', 'evidence_label': 2, 'article': 'Global warming', 'evidence': 'Rising temperatures push bees to their physi

In [19]:
# Inspect the raw evidence records attached to the first claim.
print(climate_fever["evidences"].iloc[0])

[{'evidence_id': 'Extinction risk from global warming:170', 'evidence_label': 2, 'article': 'Extinction risk from global warming', 'evidence': '"Recent Research Shows Human Activity Driving Earth Towards Global Extinction Event".', 'entropy': 0.6931471824645996, 'votes': array(['SUPPORTS', 'NOT_ENOUGH_INFO', None, None, None], dtype=object)}
 {'evidence_id': 'Global warming:14', 'evidence_label': 0, 'article': 'Global warming', 'evidence': 'Environmental impacts include the extinction or relocation of many species as their ecosystems change, most immediately the environments of coral reefs, mountains, and the Arctic.', 'entropy': 0.0, 'votes': array(['SUPPORTS', 'SUPPORTS', None, None, None], dtype=object)}
 {'evidence_id': 'Global warming:178', 'evidence_label': 2, 'article': 'Global warming', 'evidence': 'Rising temperatures push bees to their physiological limits, and could cause the extinction of bee populations.', 'entropy': 0.6931471824645996, 'votes': array(['SUPPORTS', 'NOT_ENO

In [20]:
import json

def export_claims_with_supports(df, json_path):
    """
    Export every claim with a simplified view of its evidences as pretty JSON.

    From a DataFrame `df` with columns ['claim_id','claim','evidences'...],
    build a list of dicts containing only:
      - claim_id
      - claim
      - evidences: [ { title, content, supports_pct }, … ]
    and write it to `json_path` as pretty JSON.

    Parameters
    ----------
    df : pandas.DataFrame
        Each row's "evidences" cell must be an iterable of dicts that provide
        the keys "article" and "evidence", and optionally "votes".
    json_path : str or path-like
        Destination file. Missing parent directories are created.

    Notes
    -----
    `supports_pct` is the percentage (0-100) of non-None votes that equal
    "SUPPORTS"; it is 0 when an evidence has no usable votes.
    """
    import os  # local import so the function does not depend on another cell

    output = []
    for _, row in df.iterrows():
        simple_evs = []
        for ev in row["evidences"]:
            raw_votes = ev.get("votes")
            # Explicit None check rather than `or []`: votes may be a NumPy
            # array, whose truth value is ambiguous.
            if raw_votes is None:
                raw_votes = []
            # remove None votes, calculate SUPPORTS pct
            votes = [v for v in raw_votes if v is not None]
            pct = (votes.count("SUPPORTS") / len(votes) * 100) if votes else 0
            simple_evs.append({
                "title":        ev["article"],
                "content":      ev["evidence"],
                "supports_pct": pct
            })
        output.append({
            "claim_id":   row["claim_id"],
            "claim":      row["claim"],
            "evidences":  simple_evs
        })

    # Create the destination directory if needed so a fresh checkout can run
    # this without manual setup.
    if isinstance(json_path, (str, bytes, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(json_path))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    # ensure_ascii=False emits raw non-ASCII characters, so the file encoding
    # must be pinned to UTF-8 instead of relying on the platform default.
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2, ensure_ascii=False)

In [21]:
# Write the simplified Climate-FEVER claims (with per-evidence support share)
# to disk.
export_claims_with_supports(
    df=climate_fever,
    json_path="prepared_datasets/climate_fever_with_evidence_details.json",
)

#### 2. Prepare COVIDFACT

In [None]:
import pandas as pd
import json

# 1. Read the JSONL file (one JSON object per line)
df = pd.read_json('covid_fact/COVIDFACT_dataset.jsonl', lines=True)

# 2. Drop the unwanted columns (ignore if they're missing)
df = df.drop(columns=['flair', 'gold_source'], errors='ignore')

# 3. Convert to a list of dicts
records = df.to_dict(orient='records')

# 4. Write out to a normal JSON file. COVIDFACT gets its own file name so the
#    export cannot be mistaken for (or overwrite) a Climate-FEVER export.
with open('prepared_datasets/covid_fact.json', 'w', encoding='utf-8') as f:
    json.dump(records, f, indent=2)

# Preview only the first few records; printing the whole dataset would flood
# the notebook output.
print(json.dumps(records[:3], indent=2))