In [37]:
# Dependencies

import re
from pathlib import Path

import numpy as np
import pandas as pd

from clean_text import clean_text, clean_dataframe, validate_dataframe

In [38]:
# Load Data
# Read the raw climate-fever CSV into the working frame `df`.

raw_csv_path = "../../data/raw/climate-fever.csv"
df = pd.read_csv(raw_csv_path)

In [39]:
# Drop Entries
# Keep the claim fields plus every `evidences/...` column that is not a
# votes or entropy column, then discard rows with any missing value.

core_columns = ['claim_id', 'claim', 'claim_label']

evidence_columns = []
for col in df.columns:
    # Only per-evidence columns are candidates.
    if not col.startswith('evidences/'):
        continue
    # Vote and entropy columns are excluded.
    if '/votes/' in col or '/entropy' in col:
        continue
    evidence_columns.append(col)

columns_to_keep = [*core_columns, *evidence_columns]

df = df[columns_to_keep].dropna()

In [40]:
# Text Columns
# Collect the free-text columns: the claim itself plus every column whose
# name ends in '/evidence' or '/article'. Names absent from df are dropped.

text_suffixes = ('/evidence', '/article')
candidate_columns = ['claim']
candidate_columns.extend(name for name in df.columns if name.endswith(text_suffixes))

text_columns = []
for name in candidate_columns:
    if name in df.columns:
        text_columns.append(name)

In [41]:
# Validate (show invalid chars)
# Runs the validate_dataframe helper (imported from clean_text) over the text
# columns selected above. In the recorded run the cell output lists single
# characters, each followed by a number that matches the character's Unicode
# code point (e.g. "—: 8212").
# NOTE(review): the rule for what counts as "invalid" lives in
# clean_text.validate_dataframe — presumably anything outside plain ASCII;
# confirm against that module.

validate_dataframe(df, text_columns)

—: 8212
 : 8201
−: 8722
ʊ: 650
ʒ: 658
​: 8203
ɔ: 596
“: 8220
‐: 8208
и: 1080
Р: 1056
₂: 8322
ə: 601
≡: 8801
: 146
‘: 8216
′: 8242
⁄: 8260
 : 8197
…: 8230
 : 8202
″: 8243
ț: 539
–: 8211
∑: 8721
‑: 8209
∼: 8764
̂: 770
⋅: 8901
⇌: 8652
ό: 972
’: 8217
: 128
 : 160
∩: 8745
ʂ: 642
ː: 720
 : 8239
у: 1091
п: 1087
₹: 8377
¦: 166
ˈ: 712
⁠: 8288
﻿: 65279
⋯: 8943
е: 1077
ɑ: 593
С: 1057
д: 1076
×: 215
н: 1085
·: 183
с: 1089
˚: 730
р: 1088
я: 1103
­: 173
ɪ: 618
έ: 941
К: 1050
: 148
а: 1072
ɲ: 626
²: 178
∪: 8746
ɛ: 603
”: 8221


In [42]:
# Clean Text
# Rebinds df to whatever clean_dataframe (imported from clean_text) returns
# for the current frame and the selected text columns.
# NOTE(review): presumably this normalises or removes the characters reported
# by the validation step — confirm in clean_text.clean_dataframe.

df = clean_dataframe(df, text_columns)

In [43]:
# Save Cleaned Data
# Persist the cleaned wide-format frame (without the index).

cleaned_csv_path = Path("../../data/processed/climate-fever-cleaned.csv")

# to_csv raises OSError when the parent directory is missing (e.g. on a fresh
# checkout), so make sure the output directory exists first.
cleaned_csv_path.parent.mkdir(parents=True, exist_ok=True)

df.to_csv(cleaned_csv_path, index=False)

In [44]:
# Claims
# Independent copy of the three claim-level columns.

claim_fields = ['claim_id', 'claim', 'claim_label']
claims = df.loc[:, claim_fields].copy()

In [45]:
# Evidences
# Flatten the wide per-claim evidence columns into one row per evidence
# sentence, keeping the first occurrence of each evidence_id.

evidence_fields = ['evidence_id', 'evidence', 'article']

# Discover the evidence slots from the column names instead of assuming there
# are exactly five. Slot tokens stay as strings so the column names rebuilt
# below match the originals exactly; they are ordered numerically.
slot_pattern = re.compile(r'evidences/(\d+)/evidence_id')
slot_matches = (slot_pattern.fullmatch(str(col)) for col in df.columns)
evidence_slots = sorted((found.group(1) for found in slot_matches if found), key=int)

evidence_records = []

# to_dict('records') yields one plain dict per row: cheaper than iterrows and
# free of the per-row dtype coercion that building a Series can introduce.
for row in df.to_dict('records'):
    for slot in evidence_slots:
        evidence_id = row.get(f'evidences/{slot}/evidence_id')
        evidence_text = row.get(f'evidences/{slot}/evidence')
        article = row.get(f'evidences/{slot}/article')

        # Skip slots that lack either the identifier or the sentence text.
        if pd.notna(evidence_id) and pd.notna(evidence_text):
            evidence_records.append({
                'evidence_id': evidence_id,
                'evidence': evidence_text,
                'article': article
            })

# Passing the columns explicitly keeps the schema when no records were
# collected; without it drop_duplicates would raise KeyError on 'evidence_id'.
evidences = pd.DataFrame(evidence_records, columns=evidence_fields)
evidences = evidences.drop_duplicates(subset=['evidence_id'])

In [46]:
# Mapping
# Build the claim -> evidence link table: one row per (claim, evidence slot)
# carrying that slot's evidence label.

mapping_fields = ['claim_id', 'evidence_id', 'label']

# Discover the evidence slots from the column names instead of assuming there
# are exactly five. Slot tokens stay as strings so the column names rebuilt
# below match the originals exactly; they are ordered numerically.
slot_pattern = re.compile(r'evidences/(\d+)/evidence_id')
slot_matches = (slot_pattern.fullmatch(str(col)) for col in df.columns)
evidence_slots = sorted((found.group(1) for found in slot_matches if found), key=int)

mapping_records = []

# to_dict('records') yields one plain dict per row: cheaper than iterrows and
# free of the per-row dtype coercion that building a Series can introduce.
for row in df.to_dict('records'):
    claim_id = row['claim_id']

    for slot in evidence_slots:
        evidence_id = row.get(f'evidences/{slot}/evidence_id')
        evidence_label = row.get(f'evidences/{slot}/evidence_label')

        # A slot without an identifier cannot be linked to anything.
        if pd.notna(evidence_id):
            mapping_records.append({
                'claim_id': claim_id,
                'evidence_id': evidence_id,
                'label': evidence_label
            })

# Passing the columns explicitly keeps the schema (and therefore the CSV
# header) even when no mapping records were collected.
mappings = pd.DataFrame(mapping_records, columns=mapping_fields)

In [47]:
# Save Data
# Write the three normalised tables (claims, evidences, claim-evidence
# mappings) as CSV files without the index.

processed_dir = Path("../../data/processed")

# to_csv raises OSError when the target directory is missing (e.g. on a fresh
# checkout), so make sure it exists before writing.
processed_dir.mkdir(parents=True, exist_ok=True)

tables_to_save = {
    "claims.csv": claims,
    "evidences.csv": evidences,
    "mappings.csv": mappings,
}
for filename, table in tables_to_save.items():
    table.to_csv(processed_dir / filename, index=False)