In [22]:
# Dependencies

import pandas as pd
import numpy as np
import re

from clean_data import clean_claim, clean_evidence, clean_dataframe, validate_dataframe, validate_evidence

In [23]:
# Load Data
# Read the raw climate-fever CSV (path is relative to this notebook) into `df`.

df = pd.read_csv(filepath_or_buffer="../../data/raw/climate-fever.csv")

In [24]:
# Drop Entries
#
# Keep the claim fields plus the per-evidence fields (every 'evidences/...'
# column except the '/votes/' and '/entropy' ones), then remove rows that are
# incomplete or carry an unexpected claim label.

# Labels accepted in `claim_label`; any other value marks a malformed row.
VALID_CLAIM_LABELS = ['SUPPORTS', 'REFUTES', 'NOT_ENOUGH_INFO', 'DISPUTED']

core_columns = ['claim_id', 'claim', 'claim_label']
evidence_columns = [
    col for col in df.columns
    if col.startswith('evidences/') and '/votes/' not in col and '/entropy' not in col
]
columns_to_keep = core_columns + evidence_columns

df = df[columns_to_keep]
# Label distribution before any row is removed.
print(df['claim_label'].value_counts())

# Any missing value in a kept column makes the row unusable downstream.
df = df.dropna()

# The raw file contains a row whose claim_label is a number
# (0.6931471805599453). dropna() only removes it because that row also has a
# missing field, so reject unknown labels explicitly instead of relying on that.
# .copy() yields an independent frame for the later in-place style cleaning.
df = df[df['claim_label'].isin(VALID_CLAIM_LABELS)].copy()

# Label distribution after filtering.
print(df['claim_label'].value_counts())

claim_label
SUPPORTS              654
NOT_ENOUGH_INFO       472
REFUTES               252
DISPUTED              154
0.6931471805599453      1
Name: count, dtype: int64
claim_label
SUPPORTS           654
NOT_ENOUGH_INFO    471
REFUTES            252
DISPUTED           154
Name: count, dtype: int64


In [25]:
# Text Columns
# Names of the free-text columns: the claim column, plus every column whose
# name ends in '/evidence' or '/article'.

claim_cols = ['claim']
# Built from df.columns, so every entry is guaranteed to exist in df.
evidence_cols = [col for col in df.columns if col.endswith(('/evidence', '/article'))]

In [26]:
# Validate (show invalid chars)
# Run the validator over all text columns: the claim column first, then the
# evidence/article columns.

validate_dataframe(df, [*claim_cols, *evidence_cols])

⋅: 8901
ɔ: 596
·: 183
′: 8242
₹: 8377
ʊ: 650
ʒ: 658
₂: 8322
έ: 941
ɲ: 626
р: 1088
п: 1087
∩: 8745
‐: 8208
ə: 601
ʂ: 642
с: 1089
Р: 1056
“: 8220
ɛ: 603
≡: 8801
˚: 730
″: 8243
̂: 770
⋯: 8943
ț: 539
ό: 972
∪: 8746
д: 1076
¦: 166
−: 8722
²: 178
∑: 8721
×: 215
ˈ: 712
ɪ: 618
…: 8230
⇌: 8652
ː: 720
у: 1091
‑: 8209
н: 1085
С: 1057
и: 1080
”: 8221
К: 1050
’: 8217
⁄: 8260
‘: 8216
е: 1077
ɑ: 593
—: 8212
я: 1103
–: 8211
а: 1072
∼: 8764


In [27]:
# Clean Text
# Replace `df` with the cleaned frame; claim and evidence columns are passed
# separately to clean_dataframe.

df = clean_dataframe(
    df,
    claim_columns=claim_cols,
    evidence_columns=evidence_cols,
)

In [28]:
# Save Cleaned Data
# Write the cleaned wide table (one row per claim) without the index column.

df.to_csv(path_or_buf="../../data/processed/climate-fever-cleaned.csv", index=False)

In [29]:
# Claims
# One row per claim (id, text, label), excluding claims labelled 'DISPUTED'.

claims = df.loc[
    df['claim_label'].ne('DISPUTED'),
    ['claim_id', 'claim', 'claim_label'],
].copy()

In [30]:
# Evidences
# Flatten the per-claim evidence slots (0-4) into one record per evidence,
# then keep the first occurrence of each evidence_id.

evidence_records = []

for _, claim_row in df.iterrows():
    for slot in range(5):
        evidence_id = claim_row.get(f'evidences/{slot}/evidence_id')
        evidence_text = claim_row.get(f'evidences/{slot}/evidence')

        # Skip slots that lack either an id or the evidence text.
        if pd.isna(evidence_id) or pd.isna(evidence_text):
            continue

        evidence_records.append({
            'evidence_id': evidence_id,
            'evidence': evidence_text,
            'article': claim_row.get(f'evidences/{slot}/article'),
        })

evidences = pd.DataFrame(evidence_records).drop_duplicates(subset=['evidence_id'])

In [31]:
# Filter Bad Evidences
# Keep only the evidence rows for which validate_evidence returns a truthy
# result, and report how many rows were removed.

valid_mask = evidences['evidence'].apply(validate_evidence)
n_filtered = len(evidences) - valid_mask.sum()
print(f"Filtered Out: {n_filtered}")
evidences = evidences.loc[valid_mask]

Filtered Out: 99


In [32]:
# Mapping
#
# Build the claim -> evidence link table: one row per (claim_id, evidence_id)
# pair found in evidence slots 0-4, carrying that slot's evidence_label.

mapping_records = []

for idx, row in df.iterrows():
    claim_id = row['claim_id']

    for i in range(5):
        evidence_id = row.get(f'evidences/{i}/evidence_id')
        evidence_label = row.get(f'evidences/{i}/evidence_label')

        if pd.notna(evidence_id):
            mapping_records.append({
                'claim_id': claim_id,
                'evidence_id': evidence_id,
                'label': evidence_label
            })

# Explicit columns keep the schema stable even when no record was collected.
mappings = pd.DataFrame(mapping_records, columns=['claim_id', 'evidence_id', 'label'])

# Referential integrity: `claims` no longer contains DISPUTED claims and
# `evidences` has lost the rows rejected by validate_evidence, but the loop
# above walks all of `df`. Drop links whose claim or evidence is absent from
# those tables so the saved mapping never points at a missing row.
has_claim = mappings['claim_id'].isin(claims['claim_id'])
has_evidence = mappings['evidence_id'].isin(evidences['evidence_id'])
keep_mask = has_claim & has_evidence
print(f"Dropped Orphan Mappings: {len(mappings) - keep_mask.sum()}")
mappings = mappings[keep_mask].reset_index(drop=True)

In [33]:
# Save Data
# Write the three processed tables, each without the index column.

for frame, path in (
    (claims, "../../data/processed/claims.csv"),
    (evidences, "../../data/processed/evidences.csv"),
    (mappings, "../../data/processed/mappings.csv"),
):
    frame.to_csv(path, index=False)