In [1]:
# Dependencies

import pandas as pd
import numpy as np
import re

from clean_data import clean_claim, clean_evidence, clean_dataframe, validate_dataframe, validate_evidence

In [2]:
# Load Data

# Location of the raw climate-fever CSV, relative to the notebook's working directory.
RAW_DATA_PATH = "../../data/raw/climate-fever.csv"

df = pd.read_csv(RAW_DATA_PATH)

In [3]:
# Drop Entries

# Identifier, text and label of each claim.
core_columns = ['claim_id', 'claim', 'claim_label']

# Every per-evidence column except the vote and entropy fields.
evidence_columns = [
    name
    for name in df.columns
    if name.startswith('evidences/')
    and not ('/votes/' in name or '/entropy' in name)
]

columns_to_keep = [*core_columns, *evidence_columns]

# Keep only the selected columns, then discard any row with a missing value in them.
df = df.loc[:, columns_to_keep].dropna()

In [4]:
# Text Columns

claim_cols = ['claim']

# Evidence sentences and their source-article columns. The names are drawn
# straight from df.columns, so each one is present in the frame by construction.
evidence_cols = [
    name for name in df.columns
    if name.endswith(('/evidence', '/article'))
]

In [5]:
# Validate (show invalid chars)

# Run the project validator over the claim and evidence/article text columns.
# The recorded output below lists each reported character with its code point.
text_cols = [*claim_cols, *evidence_cols]
validate_dataframe(df, text_cols)

ʂ: 642
у: 1091
п: 1087
‑: 8209
̂: 770
⋅: 8901
ɲ: 626
е: 1077
ɑ: 593
”: 8221
∼: 8764
—: 8212
Р: 1056
·: 183
р: 1088
⋯: 8943
ɪ: 618
∪: 8746
“: 8220
‐: 8208
¦: 166
с: 1089
К: 1050
а: 1072
˚: 730
‘: 8216
–: 8211
∩: 8745
ɛ: 603
⁄: 8260
≡: 8801
∑: 8721
ό: 972
ʒ: 658
ɔ: 596
₹: 8377
я: 1103
ə: 601
×: 215
₂: 8322
′: 8242
⇌: 8652
…: 8230
ˈ: 712
и: 1080
н: 1085
−: 8722
ț: 539
д: 1076
″: 8243
²: 178
έ: 941
ː: 720
С: 1057
ʊ: 650
’: 8217


In [6]:
# Clean Text

# Hand the text columns to the project's cleaning helper and rebind `df` to
# its result.
# NOTE(review): presumably this normalises the characters reported by the
# validation step -- confirm against clean_data.clean_dataframe.
df = clean_dataframe(
    df,
    claim_columns=claim_cols,
    evidence_columns=evidence_cols,
)

In [7]:
# Save Cleaned Data

# Write the cleaned wide table (one row per claim), without the index column.
CLEANED_DATA_PATH = "../../data/processed/climate-fever-cleaned.csv"

df.to_csv(CLEANED_DATA_PATH, index=False)

In [8]:
# Claims

# Independent copy of the claim-level fields, so later edits to `claims`
# cannot touch `df`.
claim_fields = ['claim_id', 'claim', 'claim_label']
claims = df.loc[:, claim_fields].copy()

In [9]:
# Evidences

def _iter_evidence_records(frame, slots=5):
    """Yield one evidence dict per populated evidence slot, row by row.

    For each row of ``frame`` the columns ``evidences/<slot>/evidence_id``,
    ``evidences/<slot>/evidence`` and ``evidences/<slot>/article`` are read for
    ``slot`` in ``range(slots)``. A slot is skipped when its id or its text is
    missing (NaN/None, or the column is absent).
    """
    for _, claim_row in frame.iterrows():
        for slot in range(slots):
            ev_id = claim_row.get(f'evidences/{slot}/evidence_id')
            ev_text = claim_row.get(f'evidences/{slot}/evidence')

            if pd.isna(ev_id) or pd.isna(ev_text):
                continue

            yield {
                'evidence_id': ev_id,
                'evidence': ev_text,
                'article': claim_row.get(f'evidences/{slot}/article'),
            }


# Flatten the wide per-claim evidence columns into one record per evidence.
evidence_records = list(_iter_evidence_records(df))

# An evidence id may appear under several claims; keep its first occurrence.
evidences = pd.DataFrame(evidence_records).drop_duplicates(subset=['evidence_id'])

In [10]:
# Filter Bad Evidences

# True for every evidence text accepted by the project's validate_evidence helper.
valid_mask = evidences['evidence'].apply(validate_evidence)

# Report how many evidences are rejected before discarding them.
rejected_count = len(evidences) - valid_mask.sum()
print(f"Filtered Out: {rejected_count}")

evidences = evidences.loc[valid_mask]

Filtered Out: 99


In [11]:
# Mapping

# Build the claim -> evidence link table: one record per (claim, evidence slot)
# carrying the label stored for that evidence slot.
mapping_records = []

for idx, row in df.iterrows():
    claim_id = row['claim_id']

    for i in range(5):
        evidence_id = row.get(f'evidences/{i}/evidence_id')
        evidence_label = row.get(f'evidences/{i}/evidence_label')

        if pd.notna(evidence_id):
            mapping_records.append({
                'claim_id': claim_id,
                'evidence_id': evidence_id,
                'label': evidence_label
            })

# Naming the columns keeps the schema stable even if no record was collected.
mappings = pd.DataFrame(mapping_records, columns=['claim_id', 'evidence_id', 'label'])

# `evidences` has already had invalid entries filtered out, while the records
# above come from every row of `df`. Without this step the saved mapping would
# reference evidence ids that do not exist in the saved evidences table.
has_evidence = mappings['evidence_id'].isin(evidences['evidence_id'])
print(f"Dropped Mappings: {len(mappings) - has_evidence.sum()}")
mappings = mappings[has_evidence].reset_index(drop=True)

In [12]:
# Save Data

# Destination file for each of the three derived tables.
processed_outputs = {
    "../../data/processed/claims.csv": claims,
    "../../data/processed/evidences.csv": evidences,
    "../../data/processed/mappings.csv": mappings,
}

# Write each table without its index column, in the order listed above.
for output_path, table in processed_outputs.items():
    table.to_csv(output_path, index=False)