# Preprocessing
In this notebook, we drop claims without a review date, remove unnecessary columns, and merge the rating labels into just `true` and `false`.

In [1]:
import pandas as pd
import sys
import os

# Append the parent of the current working directory to the import path,
# presumably so that the project-local `src` package imported below resolves
# when the notebook is run from its own folder.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

from src.util import load_claims_from_file, convert_claims_to_dataframe

In [95]:
# Load the raw claims export and convert it into the working DataFrame
# using the project helpers from src.util.
raw_claims_path = '../data/raw/fact_claims_1739933287.json'
claims = load_claims_from_file(raw_claims_path)
df = convert_claims_to_dataframe(claims)

# Preview the first rows to check the columns that were loaded.
display(df.head(5))

Unnamed: 0,text,claimant,claimDate,publisherName,publisherSite,reviewUrl,reviewTitle,reviewDate,textualRating
0,Pennsylvania Gov. Josh Shapiro is “being charg...,Facebook post,2025-02-10 00:00:00+00:00,PolitiFact,politifact.com,https://www.politifact.com/factchecks/2025/feb...,"No, Pennsylvania Gov. Josh Shapiro wasn’t char...",2025-02-14 00:00:00+00:00,Pants on Fire!
1,Former USAID Administrator Samantha Power’s ne...,Mario Nawfal,2025-02-09 00:00:00+00:00,PolitiFact,politifact.com,https://www.politifact.com/factchecks/2025/feb...,Former USAID head Samantha Power’s wealth didn...,2025-02-14 00:00:00+00:00,False
2,"""We found fraud and abuse, I would say those t...",President Donald Trump,2025-02-11 00:00:00+00:00,PolitiFact,politifact.com,https://www.politifact.com/article/2025/feb/13...,"Trump, Musk claim government 'fraud' without s...",2025-02-13 00:00:00+00:00,"So far, the Trump White House has not shown ev..."
3,“Breaking news: Taylor Swift officially banned...,Facebook posts,2025-02-04 00:00:00+00:00,PolitiFact,politifact.com,https://www.politifact.com/factchecks/2025/feb...,You need to calm down. Taylor Swift has not be...,2025-02-07 00:00:00+00:00,False
4,Early reports of four survivors show “this DC ...,Facebook posts,2025-01-30 00:00:00+00:00,PolitiFact,politifact.com,https://www.politifact.com/factchecks/2025/feb...,Initial reports that people survived Potomac p...,2025-02-07 00:00:00+00:00,False


## Drop rows with no review date

In [112]:
n_rows_before = len(df)
print(f"Number of rows in the dataframe: {n_rows_before}")

# Keep only the rows whose reviewDate is present; .copy() detaches the result
# from df so that later column assignments do not touch the original frame.
has_review_date = df['reviewDate'].notna()
df_filtered = df[has_review_date].copy()

print(f"Number of rows in the filtered dataframe: {len(df_filtered)}")

display(df_filtered.head(5))

Number of rows in the dataframe: 52408
Number of rows in the filtered dataframe: 41159


Unnamed: 0,text,claimant,claimDate,publisherName,publisherSite,reviewUrl,reviewTitle,reviewDate,textualRating
0,Pennsylvania Gov. Josh Shapiro is “being charg...,Facebook post,2025-02-10 00:00:00+00:00,PolitiFact,politifact.com,https://www.politifact.com/factchecks/2025/feb...,"No, Pennsylvania Gov. Josh Shapiro wasn’t char...",2025-02-14 00:00:00+00:00,Pants on Fire!
1,Former USAID Administrator Samantha Power’s ne...,Mario Nawfal,2025-02-09 00:00:00+00:00,PolitiFact,politifact.com,https://www.politifact.com/factchecks/2025/feb...,Former USAID head Samantha Power’s wealth didn...,2025-02-14 00:00:00+00:00,False
2,"""We found fraud and abuse, I would say those t...",President Donald Trump,2025-02-11 00:00:00+00:00,PolitiFact,politifact.com,https://www.politifact.com/article/2025/feb/13...,"Trump, Musk claim government 'fraud' without s...",2025-02-13 00:00:00+00:00,"So far, the Trump White House has not shown ev..."
3,“Breaking news: Taylor Swift officially banned...,Facebook posts,2025-02-04 00:00:00+00:00,PolitiFact,politifact.com,https://www.politifact.com/factchecks/2025/feb...,You need to calm down. Taylor Swift has not be...,2025-02-07 00:00:00+00:00,False
4,Early reports of four survivors show “this DC ...,Facebook posts,2025-01-30 00:00:00+00:00,PolitiFact,politifact.com,https://www.politifact.com/factchecks/2025/feb...,Initial reports that people survived Potomac p...,2025-02-07 00:00:00+00:00,False


## Pick Date & Remove Columns

In [113]:
# Collapse the two date columns into a single `date` column holding the
# earlier of the two (the row-wise minimum skips missing values by default),
# then drop the source date columns together with the publisher site and URL.
date_source_cols = ['claimDate', 'reviewDate']
unused_cols = date_source_cols + ['publisherSite', 'reviewUrl']

earliest_date = df_filtered[date_source_cols].min(axis=1)
df_filtered = df_filtered.assign(date=earliest_date).drop(columns=unused_cols)

display(df_filtered.head(5))

Unnamed: 0,text,claimant,publisherName,reviewTitle,textualRating,date
0,Pennsylvania Gov. Josh Shapiro is “being charg...,Facebook post,PolitiFact,"No, Pennsylvania Gov. Josh Shapiro wasn’t char...",Pants on Fire!,2025-02-10 00:00:00+00:00
1,Former USAID Administrator Samantha Power’s ne...,Mario Nawfal,PolitiFact,Former USAID head Samantha Power’s wealth didn...,False,2025-02-09 00:00:00+00:00
2,"""We found fraud and abuse, I would say those t...",President Donald Trump,PolitiFact,"Trump, Musk claim government 'fraud' without s...","So far, the Trump White House has not shown ev...",2025-02-11 00:00:00+00:00
3,“Breaking news: Taylor Swift officially banned...,Facebook posts,PolitiFact,You need to calm down. Taylor Swift has not be...,False,2025-02-04 00:00:00+00:00
4,Early reports of four survivors show “this DC ...,Facebook posts,PolitiFact,Initial reports that people survived Potomac p...,False,2025-01-30 00:00:00+00:00


## Merge Ratings

In [114]:
# Tally the claims per rating, comparing ratings case-insensitively.
normalized_ratings = df_filtered['textualRating'].str.lower()
rating_counts = (
    normalized_ratings
    .value_counts()
    .rename_axis('Rating')
    .reset_index(name='Number of Claims')
)

print(f"Number of rows in rating_counts: {len(rating_counts)}")

# Show the ten most frequent ratings
display(rating_counts.head(10))

Number of rows in rating_counts: 7385


Unnamed: 0,Rating,Number of Claims
0,false,13980
1,true,3819
2,mixture,1639
3,mostly false,1477
4,mostly true,1155
5,labeled satire,980
6,unproven,911
7,miscaptioned,863
8,misleading,806
9,pants on fire,739


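As seen above, there are 7385 distinct rating strings, and many of them are free-form sentences rather than concise labels. We map the common short labels to `true` or `false` below and leave all other ratings out.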

In [115]:
# Lookup from lower-cased rating label to the binary class it is merged into.
# Only 'true'-like labels (true, correct, accurate, mostly/largely ...) map to
# 'true'; partial-truth labels (mixture, half true, partly true, ...) and
# lack-of-evidence labels (unproven, unsupported, ...) are all counted as 'false'.
# Each key appears exactly once: a repeated key in a dict literal is silently
# overridden by its last occurrence.
rating_merge_dict = {
    'mixture': 'false',
    'mostly false': 'false',
    'pants on fire': 'false',
    'mostly true': 'true',
    'misleading': 'false',
    'altered': 'false',
    'four pinocchios': 'false',
    'inaccurate': 'false',
    'distorts the facts': 'false',
    'incorrect': 'false',
    'three pinocchios': 'false',
    'half true': 'false',
    'partly false': 'false',
    'correct.': 'true',
    'correct': 'true',
    'incorrect.': 'false',
    'true.': 'true',
    'false.': 'false',
    'wrong': 'false',
    'this is misleading': 'false',
    'pants on fire!': 'false',
    'mostly_accurate': 'true',
    'this is accurate': 'true',
    'this is not true.': 'false',
    'mis- leading': 'false',
    'partially_correct': 'false',
    'spins the facts': 'false',
    'spinning the facts': 'false',
    'twists the facts': 'false',
    'largely correct': 'true',
    'not true': 'false',
    'not true.': 'false',
    'one pinocchio': 'false',
    'two pinocchios': 'false',
    'this is wrong': 'false',
    'this is correct.': 'true',
    'partly true': 'false',
    'accurate': 'true',
    'not accurate': 'false',
    'not the whole story': 'false',
    'mostly accurate': 'true',
    'unproven': 'false',
    'unsupported': 'false',
    'no evidence': 'false',
    'unfounded': 'false',
    'false: no evidence': 'false',
    'lacks evidence': 'false',
    'largely accurate': 'true',
    'misleading.': 'false',
    'there is no evidence for this.': 'false',
    'this is misleading.': 'false',
    'not proven': 'false',
    'mostly_correct': 'true',
    'that’s correct.': 'true',
}

# Work on a copy so df_filtered keeps its original columns.
df_filtered_merged = df_filtered.copy()

# Lower-case the ratings once, then replace every label found in the lookup.
# Labels without an entry map to NaN and fall back to their own lower-cased
# text, so unmapped ratings remain visible in the new column.
ratings_lower = df_filtered_merged['textualRating'].str.lower()
df_filtered_merged['mergedTextualRating'] = ratings_lower.map(rating_merge_dict).fillna(ratings_lower)

In [116]:
# Recount the claims per rating, this time on the merged rating column.
merged_ratings = df_filtered_merged['mergedTextualRating'].str.lower()
rating_counts = (
    merged_ratings
    .value_counts()
    .rename_axis('Rating')
    .reset_index(name='Number of Claims')
)

print(f"Number of rows in rating_counts: {len(rating_counts)}")

# Show the ten most frequent ratings after merging
display(rating_counts.head(10))

Number of rows in rating_counts: 7331


Unnamed: 0,Rating,Number of Claims
0,false,23430
1,true,5160
2,labeled satire,980
3,miscaptioned,863
4,correct attribution,578
5,missing context,474
6,fake,323
7,misattributed,216
8,legend,214
9,scam,203


In [117]:
# Keep only the claims whose merged rating is one of the two binary labels;
# every other rating is discarded.
binary_labels = ['true', 'false']
is_binary = df_filtered_merged['mergedTextualRating'].isin(binary_labels)
df_filtered_merged = df_filtered_merged[is_binary]

print(f"Number of rows in df_filtered_merged: {len(df_filtered_merged)}")

display(df_filtered_merged.head(10))

Number of rows in df_filtered_merged: 28590


Unnamed: 0,text,claimant,publisherName,reviewTitle,textualRating,date,mergedTextualRating
0,Pennsylvania Gov. Josh Shapiro is “being charg...,Facebook post,PolitiFact,"No, Pennsylvania Gov. Josh Shapiro wasn’t char...",Pants on Fire!,2025-02-10 00:00:00+00:00,False
1,Former USAID Administrator Samantha Power’s ne...,Mario Nawfal,PolitiFact,Former USAID head Samantha Power’s wealth didn...,False,2025-02-09 00:00:00+00:00,False
3,“Breaking news: Taylor Swift officially banned...,Facebook posts,PolitiFact,You need to calm down. Taylor Swift has not be...,False,2025-02-04 00:00:00+00:00,False
4,Early reports of four survivors show “this DC ...,Facebook posts,PolitiFact,Initial reports that people survived Potomac p...,False,2025-01-30 00:00:00+00:00,False
6,Video shows President Donald Trump saying Mexi...,Instagram posts,PolitiFact,Did Donald Trump say Mexicans call him ‘peachy...,False,2025-01-30 00:00:00+00:00,False
7,From the U.S. Agency for International Develop...,"U.S. Rep. Brian Mast, R-Fla.",PolitiFact,Does as little as 10% of USAID go to help peop...,False,2025-02-02 00:00:00+00:00,False
8,Photo shows Black Hawk helicopter pilot in fat...,Instagram post,PolitiFact,"No, this photo does not show fallen Army pilot...",False,2025-02-02 00:00:00+00:00,False
9,“Pilot of Blackhawk helicopter that crashed in...,Facebook posts,PolitiFact,Social media posts misidentify transgender pil...,Pants on Fire!,2025-01-31 00:00:00+00:00,False
10,"“As a Californian, we have given more to the r...","U.S. Sen. Adam Schiff, D-Calif.",PolitiFact,Does California give more than it gets on disa...,Mostly True,2025-01-26 00:00:00+00:00,True
11,The U.S. has an egg shortage because the “Bide...,Karoline Leavitt,PolitiFact,Karoline Leavitt blames Biden for egg shortage...,Half True,2025-01-28 00:00:00+00:00,False


We keep 28590 claim reviews, of which 23430 are labeled false and 5160 are labeled true.