In [None]:
import httpcore
# Put a placeholder `SyncHTTPTransport` attribute (the builtin `any`) on the
# httpcore module before the remaining imports run.
# NOTE(review): presumably a compatibility shim so that googletrans (imported
# below) can be imported against an httpcore release that no longer exposes
# this name — TODO confirm and pin the dependency versions instead.
setattr(httpcore, 'SyncHTTPTransport', any)

from concurrent.futures import ThreadPoolExecutor
from tenacity import retry, stop_after_attempt, wait_fixed
import pandas as pd
from tqdm import tqdm
import numpy as np
import os
import ast
from urllib.parse import urlparse
from nltk.tokenize import word_tokenize
import pickle
import glob
import requests
import uuid
from langdetect import detect, DetectorFactory
import re
from googletrans import Translator
from tldextract import extract
from langdetect import detect_langs
import matplotlib.pyplot as plt
import json
import html

tqdm.pandas()

In [None]:
# Matches twitter.com URLs (with or without "www.") and t.co short links.
pattern = r'(https?://(?:www\.|)twitter\.com/[\w\d_/]+|https?://t\.co/[\w\d]+)'


def find_twitter_links(text):
    """Collect the Twitter / t.co links found in ``text``.

    Args:
        text: Value to scan; anything falsy or not a ``str`` yields no links.

    Returns:
        dict: 1-based position of each match (in order of appearance) mapped
        to the matched URL. Empty when nothing matches.
    """
    if not (text and isinstance(text, str)):
        return {}
    links = {}
    for position, link in enumerate(re.findall(pattern, text), start=1):
        links[position] = link
    return links

In [None]:
# LOAD FACTCHECKS
# Download the ClaimReview feed and flatten it into one frame.
link = "https://storage.googleapis.com/datacommons-feeds/factcheck/latest/data.json"
r = requests.get(link)
factchecks = r.json()

columns = ["datePublished","claimReviewed","author"]


def _author_attr(key):
    """Build an accessor that reads ``key`` from dict-shaped authors (None otherwise)."""
    def _read(author):
        if isinstance(author, dict):
            return author.get(key, None)
        return None
    return _read


# One DataFrame per feed element, stacked and reduced to the columns of interest.
item_frames = [pd.DataFrame(element["item"]) for element in factchecks["dataFeedElement"]]
temp_df = pd.concat(item_frames)
temp_df = temp_df[columns]

temp_df["name"] = temp_df["author"].apply(_author_attr("name"))
temp_df["url"] = temp_df["author"].apply(_author_attr("url"))
temp_df["twitter_urls"] = temp_df["claimReviewed"].apply(find_twitter_links)
temp_df["@type"] = temp_df["author"].apply(_author_attr("@type"))
temp_df = temp_df.drop(columns=["author"])

# Keep rows that carry a claim, then strip every http... token from the claim text.
temp_df = temp_df[~pd.isna(temp_df.claimReviewed)]
temp_df["claimReviewed"] = temp_df["claimReviewed"].apply(lambda claim: re.sub(r'http\S+', '', claim))
temp_df = temp_df.reset_index(drop=True)
google_dump = temp_df

In [None]:
# LOAD FACTCHECKS
# Download the ClaimReview feed and build `google_dump`: one row per review
# item with publisher name, author URL, verdict, article URL and Twitter links.
link = "https://storage.googleapis.com/datacommons-feeds/factcheck/latest/data.json"
# A timeout keeps a stalled connection from hanging the notebook, and
# raise_for_status() surfaces HTTP errors instead of failing later in r.json().
r = requests.get(link, timeout=60)
r.raise_for_status()
factchecks = r.json()
columns = ["datePublished","claimReviewed","name", "verdict","author_url", "url","twitter_urls"]

# One DataFrame per DataFeedItem, concatenated once at the end
dfs = []
for data_item in tqdm(factchecks["dataFeedElement"]):
    temp_df = pd.DataFrame(data_item["item"])

    if 'author' in temp_df.columns and "claimReviewed" in temp_df.columns:
        # `author` is expected to be a dict; anything else yields None
        temp_df["name"] = temp_df["author"].apply(lambda x: x.get("name",None) if isinstance(x,dict) else None)
        temp_df["author_url"] = temp_df["author"].apply(lambda x: x.get("url",None) if isinstance(x,dict) else None)
        temp_df["@type"] = temp_df["author"].apply(lambda x: x.get("@type",None) if isinstance(x,dict) else None)
        temp_df["twitter_urls"] = temp_df["claimReviewed"].apply(lambda x: find_twitter_links(x))
        if "reviewRating" in temp_df.columns:
            temp_df["verdict"] = temp_df["reviewRating"].apply(lambda x: x["alternateName"] if isinstance(x,dict) and "alternateName" in x else None)
        else:
            temp_df["verdict"] = None
        temp_df.drop(columns=["author"],inplace=True)

    temp_df["url"] = data_item["url"]  # URL of the DataFeedItem, repeated on each of its rows
    dfs.append(temp_df)

df = pd.concat(dfs, ignore_index=True)

# Keep only rows that carry a claim
df = df[~pd.isna(df.claimReviewed)]
df.reset_index(drop=True,inplace=True)

# NOTE(review): unlike the scraped-file loader, this cell does not strip
# http... tokens from claimReviewed — confirm whether that is intended.
# .copy() makes google_dump an independent frame, so adding a column below is
# not a chained assignment on a slice of `df`.
google_dump = df[columns].copy()
google_dump["inLanguage"] = None
print(google_dump.shape)

In [None]:
# Heuristic:
# Use the explicit claimReviewed field when it is present. Otherwise fall back
# to the headline and then to the body text, but only when the candidate is
# short enough to plausibly be a claim (at most the mean claim length plus
# 1.96 standard deviations).
def decide(x):
    """
    Decides which part of the Fact-Check is a claim.
    Preference Order: Claim > Headline > Body

    Args:
        x: Row-like object exposing ``claimReviewed``, ``headline`` and
            ``text`` attributes.

    Returns:
        The ``claimReviewed`` value when it is present; otherwise the first of
        headline / text that is a non-empty string no longer than the length
        threshold; otherwise NaN.
    """
    claim = x.claimReviewed
    # NaN is truthy, so a plain `if claim:` would treat a missing claim as
    # present and never reach the fallbacks (NaN is the only value != itself).
    claim_is_nan = isinstance(claim, float) and claim != claim
    if claim and not claim_is_nan:
        return claim

    # Threshold from the module-level statistics of the reference claims.
    max_length = average_length_claim_reviewd + 1.96 * var_claim_reviewd ** 0.5
    for candidate in (x.headline, x.text):
        if candidate and isinstance(candidate, str) and len(candidate) <= max_length:
            return candidate
    return float("nan")

# Mean and variance of the claim length (in characters) over the Google dump;
# decide() reads both as module-level values for its length threshold.
_claim_lengths = google_dump["claimReviewed"].map(len)
average_length_claim_reviewd = _claim_lengths.mean()
var_claim_reviewd = _claim_lengths.var()

In [None]:
def fix_and_load_json(file_path):
    """Load a newline-delimited JSON file, skipping lines that do not parse.

    Args:
        file_path: Path of the file to read; it is decoded as UTF-8.

    Returns:
        list: The parsed JSON value of every valid line, in file order.
    """
    valid_json_objects = []
    # Explicit UTF-8: the platform default encoding is not guaranteed to
    # decode non-ASCII text.
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            # Blank lines (e.g. after a trailing newline) are separators, not
            # malformed records, so they are skipped without a warning.
            if not line.strip():
                continue
            try:
                valid_json_objects.append(json.loads(line))
            except json.JSONDecodeError:
                print("Found an invalid JSON object, skipping...")

    return valid_json_objects


In [None]:
# Read every *.json file in ../Data/Factcheck/, normalise both file layouts to
# the column set in `columns`, and concatenate them into `scraped_factchecks`.
all_dfs = []
count = 0  # total number of rows read across all files (before column selection)
# Columns to extract
columns = ["datePublished","claimReviewed","name","author_url", "url","twitter_urls","inLanguage","verdict", "claim_date","text"]

# Loop through all files in the fact-check directory
for f in tqdm(os.listdir("../Data/Factcheck/")):
    if ".json" in f:
        if "2024" in f:
            # Files with "2024" in the name: one JSON object per line, with the
            # record stored under "_source" (objects without that key are dropped).
            temp_json = fix_and_load_json("../Data/Factcheck/{}".format(f))
            temp_df = [x["_source"] for x in temp_json if "_source" in x]
            temp_df = pd.DataFrame(temp_df)

            # Twitter links are taken from the body before URLs are stripped from the text
            temp_df["twitter_urls"] = temp_df["claim_review_body"].apply(lambda x: find_twitter_links(x))
            # In this layout claimReviewed and headline are both derived from
            # claim_review_headline; text comes from claim_review_body.
            temp_df["claimReviewed"] = temp_df["claim_review_headline"].apply(lambda x: re.sub(r'http\S+', '', x) if isinstance(x,str) else x)
            temp_df["text"] = temp_df["claim_review_body"].apply(lambda x: re.sub(r'http\S+', '', x) if isinstance(x,str) else x)
            temp_df["headline"] = temp_df["claim_review_headline"].apply(lambda x: re.sub(r'http\S+', '', x) if isinstance(x,str) else x)

            # Rename the layout-specific fields to the shared column names
            temp_df["verdict"] = temp_df["claim_review_result"]
            temp_df["author_url"] = temp_df["author_link"]
            temp_df["name"] = temp_df["author"]
            temp_df["url"] = temp_df["claim_review_url"]
            temp_df["datePublished"] = pd.to_datetime(temp_df["created_at"], errors='coerce')
            temp_df["inLanguage"] = temp_df["language"]
            try:
                # raw_claim_review may be a JSON string; claimDate is read from the decoded dict
                raw_claim_review = temp_df["raw_claim_review"].apply(lambda x: json.loads(x) if isinstance(x,str) else x)
                temp_df["claim_date"] = raw_claim_review.apply(lambda x: x.get("claimDate",None) if isinstance(x,dict) else None)
            except Exception as e:
                # Any failure (missing column, invalid JSON) leaves claim_date empty for the whole file
                print(f"\033[91mError in New: {e}\033[0m")
                temp_df["claim_date"] = None

            # NOTE(review): these two assignments run after name/author_url were
            # copied and neither column is part of `columns`, so they do not reach
            # the exported frame. "".join(x) on a str returns x unchanged.
            temp_df["author"] = temp_df["author"].apply(lambda x: ", ".join(x) if isinstance(x,list) else str(x))
            temp_df["author_link"] = temp_df["author_link"].apply(lambda x: "".join(x) if isinstance(x,str) else str(x))
        else:
            # All other files: a JSON document pandas can read directly, with
            # ClaimReview-style columns and a nested "raw" record per row.
            temp_json = pd.read_json("../Data/Factcheck/{}".format(f))
            temp_df = pd.DataFrame(temp_json)

            temp_df["twitter_urls"] = temp_df["raw"].apply(lambda x: find_twitter_links(x["claim_review_body"]))
            temp_df["claimReviewed"] = temp_df["claimReviewed"].apply(lambda x: re.sub(r'http\S+', '', x) if isinstance(x,str) else x)
            temp_df["text"] = temp_df["text"].apply(lambda x: re.sub(r'http\S+', '', x) if isinstance(x,str) else x)
            temp_df["headline"] = temp_df["headline"].apply(lambda x: re.sub(r'http\S+', '', x) if isinstance(x,str) else x)

            temp_df["verdict"] = temp_df["reviewRating"].apply(lambda x: x.get("alternateName",None) if isinstance(x,dict) else None)

            if "author" in temp_df.columns:
                temp_df["author_url"] = temp_df.author.apply(lambda x: x.get("url",None) if isinstance(x,dict) else None)
                temp_df["name"] = temp_df["author"].apply(lambda x: x.get("name",None) if isinstance(x,dict) else None)
            else:
                temp_df["author_url"] = None
                temp_df["name"] = None

            # NOTE(review): the "url" and "inLanguage" self-assignments below are
            # no-ops on the data; they only raise KeyError if the column is missing.
            temp_df["url"] = temp_df["url"]
            temp_df["datePublished"] = pd.to_datetime(temp_df["datePublished"], errors='coerce')
            temp_df["inLanguage"] = temp_df["inLanguage"]
            # raw["raw_claim_review"] is decoded from JSON ("{}" when absent) to read claimDate
            temp_df["raw_claim_review"] = temp_df.apply(lambda x: json.loads(x.get("raw",{}).get("raw_claim_review","{}")),axis=1)
            temp_df["claim_date"] = temp_df["raw_claim_review"].apply(lambda x: x.get("claimDate", None) if x else None)
        
        count += temp_df.shape[0]

        # Pick the claim text for every row via decide() (claimReviewed, headline or text)
        for i,r in temp_df.iterrows():
            temp_df.loc[i,"claimReviewed"] = decide(r)
        
        temp_df = temp_df[columns]    
        temp_df = temp_df.reset_index(drop=True)
        all_dfs.append(temp_df)
        
scraped_factchecks = pd.concat(all_dfs)
scraped_factchecks = scraped_factchecks.reset_index(drop = True)
print(count)

### Analysis of Duration to Publish Fact-check - We should do this when we have the clusters.

In [None]:
# Parse both timestamps as timezone-aware UTC datetimes; unparseable values become NaT.
scraped_factchecks["claim_date"] = pd.to_datetime(scraped_factchecks["claim_date"], errors='coerce', utc=True)
scraped_factchecks["datePublished"] = pd.to_datetime(scraped_factchecks["datePublished"], errors='coerce', utc=True)

# Absolute gap between claim and fact-check in seconds. The vectorised .dt
# accessor keeps the column float64 with NaN for missing rows, whereas a
# per-element apply that returns NaT would produce an object column mixing
# floats and NaT.
time_gap = (scraped_factchecks["claim_date"] - scraped_factchecks["datePublished"]).abs()
scraped_factchecks["timedifference"] = time_gap.dt.total_seconds()
# Gaps shorter than one hour are treated as missing.
scraped_factchecks.loc[scraped_factchecks["timedifference"] < 3600,"timedifference"] = np.nan
scraped_factchecks["timedifference_hours"] = scraped_factchecks["timedifference"] // 3600
scraped_factchecks["timedifference_days"] = scraped_factchecks["timedifference"] // (3600 * 24)
# Lists of author URLs are joined with ", "; every other value is stringified
# (so missing values become the strings "None" / "nan").
scraped_factchecks["author_url"] = scraped_factchecks["author_url"].apply(lambda x: ", ".join(x) if isinstance(x,list) else str(x))


In [None]:
# Analysis of Ratings:
# - Lowercase every verdict string and strip punctuation: the regex removes
#   each character that is neither a word character nor whitespace, so spaces
#   are kept.
# - `all_verdicts` holds only the ratings that occur more than 50 times.
scraped_factchecks["verdict"] = scraped_factchecks["verdict"].apply(lambda x: x.lower() if isinstance(x,str) else x)
scraped_factchecks["verdict"] = scraped_factchecks["verdict"].apply(lambda x: re.sub(r'[^\w\s]','',x) if isinstance(x,str) else x)
all_verdicts = scraped_factchecks["verdict"].value_counts()
all_verdicts = all_verdicts[all_verdicts > 50]
# Lookup table from verdict strings (lowercase, without punctuation, in many
# languages) to one of five labels: "True", "Mostly True", "Mostly False",
# "False" or "NA".
# NOTE(review): the keys look like they were collected from the normalised
# `verdict` column, so they only match after that normalisation — confirm
# before editing keys by hand.
verdict_mapping = {"false": "False", "falso": "False", "misleading": "Mostly False", 
    "مضلل": "Mostly False", "mostly false": "Mostly False", "half true": "Mostly True", 
    "yanlış": "False", "زائف": "False", "true": "True", "خطأ": "False", 
    "fake": "False", "錯誤": "False", "faux": "False", "misleading content": "Mostly False",
    "engañoso": "Mostly False", "fałsz": "False", "verdadeiro": "True",
    "false context": "False", "errado": "False", "yanlis": "False", 
    "salah misleading content": "Mostly False", "mentira": "False", 
    "mixture": "NA", "fabricated content": "False", "pants on fire": "False", 
    "partly false": "Mostly False", "incorrect": "False", "other": "NA",
    "manipulated content": "False", "keliru": "False", "salah false context": "False",
    "2 false context": "False", "falsch": "False", "missing context": "Mostly False", 
    "notizia falsa": "False", "2 misleading content": "Mostly False", 
    "predominantemente falso": "False", "prawda": "True", "salah": "False", 
    "enganoso": "Mostly False", "παραπληροφόρηση": "Mostly False", "fals": "False", 
    "verdadeiro mas": "Mostly True", "fact crescendo rating false": "False",
    "mostly true": "Mostly True", "yanliş": "False", "verdadero": "True", 
    "salah fabricated content": "False", "satire": "NA", "doğru": "True", 
    "unproven": "NA", "部分錯誤": "Mostly False", "sesat": "False", 
    "impostor content": "False", "fuori contesto": "Mostly False", "nepravda": "False", 
    "不實": "False", "2 manipulated content": "False", "salah manipulated content": "False",
    "2 fabricated content": "False", "enganyós": "Mostly False", "ψευδές": "False", 
    "salah impostor content": "False", "impreciso": "Mostly False", "inconclusive": "NA",
    "pimenta na língua": "NA", "صحيح": "True", "falsk": "False", "netačno": "False",
    "kısmen yanlış": "Mostly False", "trompeur": "Mostly False", "manipulacja": "False",
    "correct": "True", "nì": "NA", "إثارة": "NA", "miscaptioned": "False", 
    "correct attribution": "True", "clarification": "NA", "cuestionable": "NA", 
    "inaccurate": "Mostly False", "altered image": "False", "cest faux": "False",
    "misplaced context": "Mostly False", "vero": "True", "helt feil": "False", 
    "sin registro": "NA", "ساخر": "NA", "altered": "False", "pinocchio andante": "False",
    "tak benar": "False", "事實釐清": "NA", "karma": "NA", "enganador": "Mostly False", 
    "meiaverdade": "NA", "ceri quasi": "NA", "irreführend": "Mostly False",
    "no evidence": "NA", "distorcido": "Mostly False", "مركب": "False", 
    "checked": "NA", "four pinocchios": "False", "false  content that has no basis in fact": "False", 
    "needs context": "NA", "fabricated": "False", "false connection": "False",
    "عنوان مضلل": "False", "raczej fałsz": "Mostly False", "ikke dokumenteret": "NA", 
    "tvrdnja je netočna": "False", "rrenë": "False", "گمراہ کن دعوی": "NA", 
    "exagerado": "NA", "фейк": "False", "neistina": "False", "falta contexto": "Mostly False", 
    "unsupported": "NA", "правда": "True", "فرضی دعوی": "NA", "манипуляция": "False",
    "كذب": "False", "ψευδής ισχυρισμός": "False", "انتقائي": "Mostly False", 
    "suspicious": "NA", "manipulated": "False", "مركبة": "False", "5 clarification": "NA", 
    "scam": "False", "mostly correct": "Mostly True", "półprawda": "Mostly True", 
    "chequeo múltiple": "NA", "sem contexto": "Mostly False", "تضليل": "Mostly False",
    "भरमक": "NA", "distorts the facts": "Mostly False", "salah satire": "False", "3": "NA", 
    "three pinocchios": "Mostly False", "verdadero pero": "Mostly True", 
    "commotion": "NA", "parcialmente falso": "Mostly False", "explainer": "NA",
    "verdadeiro em partes": "Mostly True", "nieweryfikowalne": "NA", 
    "false  the primary claims of the content are factually inaccurate": "False",
    "mix": "NA", "hard to categorise": "NA", "notizia vera": "True", 
    "real": "True", "false and misleading": "False", "selective": "Mostly False", 
    "مشكوك فيه": "NA", "ложь": "False", "misleading and false": "False", 
    "زائف جزئيا": "Mostly False", "ψευδοεπιστήμη": "NA", "manipulated media": "False",
    "partially true": "Mostly True", "misvisende": "Mostly False", "rrenë e kryptë": "False", 
    "verificamos": "NA", "verdad a medias": "Mostly True", 
    "λείπει θεματικό περιεχόμενο": "NA", "hetimet vazhdojnë": "NA",
    "two pinocchios": "Mostly False", "نادرست": "False", "descontextualizado": "Mostly False", 
    "sem registro": "NA", "misattributed": "False", "labeled satire": "NA", 
    "לא נכון": "False", "2 false connection": "False", "hoax": "False", 
    "falsk men": "Mostly False", "false headline": "False", "partiellement faux": "Mostly False", 
    "panzana pazzesca": "False", "5050": "NA", "outdated": "NA", 
    "montaje": "False", "غير صحيح": "False", "immagine modificata": "False", 
    "خرافة": "False", "false claim": "False", "flipflop": "NA", "очень спорно": "NA", 
    "not the whole story": "Mostly False", "certo": "True", "1 true": "True",
    "falsa": "False", "mostly true  mostly accurate but there is a minor error or problem": "Mostly True"
}

## Extract Twitter Links

In [None]:
# Article URL -> {position: link} for every scraped fact-check that contains
# at least one Twitter / t.co link.
url_scraped_dict = {
    article_url: links
    for article_url, links in scraped_factchecks.set_index("url")["twitter_urls"].items()
    if len(links) > 0
}
# Every distinct link across all articles (flattened, duplicates removed).
all_urls = list(set(
    link
    for links in url_scraped_dict.values()
    for link in links.values()
))

@retry(stop=stop_after_attempt(3), wait=wait_fixed(1), reraise=True)
def _follow_redirects(url):
    """Fetch ``url`` and return the final URL after redirects.

    Retried up to three times, one second apart; the last error is re-raised.
    """
    return requests.get(url, timeout=5).url


def get_final_url(url):
    """Resolve a t.co short link to the URL it redirects to.

    The retry policy lives on the helper that performs the request: retries
    only trigger when an exception escapes the decorated function, so catching
    the request errors inside that same function would disable them.

    Args:
        url: Link to resolve. Links that do not contain "t.co" are returned
            unchanged without any network access.

    Returns:
        tuple: ``(url, resolved_url)``; ``resolved_url`` equals ``url`` when
        the link is not a t.co link or could not be resolved.
    """
    if "t.co" not in url:
        return url, url
    try:
        return url, _follow_redirects(url)
    except (requests.exceptions.RequestException, UnicodeDecodeError):
        # Best effort: after the final failed attempt keep the original link.
        return url, url

# Resolve all links concurrently; get_final_url yields (original, resolved) pairs.
with ThreadPoolExecutor(max_workers=10) as executor:
    resolved_pairs = tqdm(executor.map(get_final_url, all_urls), total=len(all_urls))
    url_mapping = dict(resolved_pairs)


# Article URL -> {position: {"Resolved": ..., "Original": ...}}
url_scraped_dict_resolved = {}
for key in tqdm(url_scraped_dict):
    for number, link in url_scraped_dict[key].items():
        target = url_mapping.get(link, link)
        # A resolution is only used for t.co links that led to a different,
        # non-t.co URL; in every other case the original link is kept.
        resolved_elsewhere = "t.co" in link and target != link and "t.co" not in target
        url_scraped_dict_resolved.setdefault(key, {})[number] = {
            "Resolved": target if resolved_elsewhere else link,
            "Original": link,
        }

# Flatten the nested {article: {position: link info}} mapping into one record per link
rows_list = [
    {
        'Article_Url': article_url,
        'Index': idx,
        'Original': url_dict['Original'],
        'Resolved': url_dict['Resolved'],
    }
    for article_url, urls in url_scraped_dict_resolved.items()
    for idx, url_dict in urls.items()
]

# One row per link
url_df = pd.DataFrame(rows_list)


def _status_id(resolved_url):
    """Return the last path segment of a twitter.com status URL, else None."""
    # NOTE(review): a trailing query string or extra path segment ends up in
    # the returned value — confirm this is acceptable for the length filter
    # applied afterwards.
    if "twitter.com" in resolved_url and "status" in resolved_url:
        return resolved_url.split("/")[-1]
    return None


url_df["Twitter_Link"] = url_df["Resolved"].apply(lambda resolved_url: "twitter.com" in resolved_url)
url_df["Twitter_Id"] = url_df["Resolved"].apply(_status_id)

# Keep only links whose Twitter ID is a 19-character string.
has_valid_id = url_df["Twitter_Id"].apply(lambda x: isinstance(x, str) and len(x) == 19)
url_df = url_df[has_valid_id]

# For each article keep the first link with an associated Twitter ID, i.e. the
# row whose Index is the smallest within the article. Comparing against the
# per-article minimum row by row avoids collapsing the per-row flags into a
# dict keyed by article, which keeps only the last flag per article and so
# drops every article that has more than one link.
is_first = url_df["Index"] == url_df.groupby("Article_Url")["Index"].transform("min")
url_df = url_df[is_first]

# Export to CSV
url_df.to_csv("../Data/URLs.csv", index=False)

In [None]:
def geturl(x):
    """Return ``"<domain>.<suffix>"`` for ``x``, as split by ``extract``."""
    parts = extract(x)
    return parts.domain + "." + parts.suffix

# Add inLanguage to Google Dump (the feed rows get an empty language column)
google_dump["inLanguage"] = None

# Merge Google Dump with other Factchecks
df = pd.concat([scraped_factchecks,google_dump])

# Remove all Rows with no associated claimReviewed (na or None or empty string)
df = df[~pd.isna(df.claimReviewed)]
df = df[df.claimReviewed != ""]

# Remove all Empty Dates
df = df[~pd.isna(df.datePublished)]
df = df[df.datePublished != ""]


# Parse the publication date as a UTC datetime. errors='coerce' turns values
# that cannot be parsed into NaT; they are not repaired, and the date-range
# filter further down removes them because comparisons with NaT are False.
df.datePublished = pd.to_datetime(df.datePublished, utc = True, errors='coerce')
df.reset_index(drop=True,inplace=True)

# Sort by date so that drop_duplicates keeps the earliest row per claim text
df = df.sort_values(by="datePublished")
df.drop_duplicates(subset="claimReviewed",inplace=True)
df.reset_index(drop=True,inplace=True)

# Random hex identifier per row, used as the index
df["id"] = df.claimReviewed.apply(lambda x: uuid.uuid4().hex)
df.set_index("id",inplace=True)

# Derive the domain part of the article URL (None for non-string URLs)
get_domain = lambda x: extract(x).domain
df["domain"] = df.url.apply(lambda x: get_domain(x) if isinstance(x,str) else None)

# For the rows where name is a list join with ", "
df["name"] = df.name.apply(lambda x: ", ".join(x) if isinstance(x,list) else x)

# Where there is no name take domain
df.loc[pd.isna(df.name),"name"] = df.loc[pd.isna(df.name),"domain"]
df.loc[df.name == "","name"] = df.loc[df.name == "","domain"]

# Drop rows without a usable domain.
# NOTE(review): isna_name and isempty_name are computed but not used in this cell.
isna_name = pd.isna(df.name)
isempty_name = df.name == ""
isna_domain = pd.isna(df.domain)
isempty_domain = df.domain == ""
df = df[~((isna_domain | isempty_domain))]

# Keep publication dates strictly between 2000-12-30 and 2024-03-21 (UTC)
df = df[(df["datePublished"] < pd.to_datetime("2024-03-21", utc = True)) & (df["datePublished"] > pd.to_datetime("2000-12-30", utc = True))]

# Overwrite author_url with "<domain>.<suffix>" of the article URL
df.author_url = df.url.apply(lambda x: geturl(x) if isinstance(x,str) else x)

In [None]:
print(f"There are {pd.isna(df.inLanguage).sum()} rows with no language")
# Back-fill missing languages per domain: when a domain's labelled rows agree
# on one language for more than 95% of them, that language is assigned to the
# domain's unlabelled rows.
domain_counts = df.domain.value_counts()
for domain in domain_counts.index:
    in_domain = df.domain == domain
    languages = df.loc[in_domain, "inLanguage"]
    # Skip domains where more than half of the rows have no language
    if pd.isna(languages).sum() / languages.shape[0] > 0.5:
        continue
    languages = languages[~pd.isna(languages)]
    # Require more than 50 labelled rows before trusting the majority
    if languages.shape[0] <= 50:
        continue
    language_counts = languages.value_counts()
    if language_counts.iloc[0] / languages.shape[0] > 0.95:
        df.loc[in_domain & pd.isna(df.inLanguage), "inLanguage"] = language_counts.index[0]

def safe_detect_langs(text):
    """Return the language code of the top ``detect_langs`` candidate for ``text``.

    Detection is best effort: any ordinary error raised while detecting
    yields ``"unknown"``.

    Args:
        text: Value handed to ``detect_langs``.

    Returns:
        str: The ``lang`` attribute of the first candidate, or ``"unknown"``.
    """
    try:
        return detect_langs(text)[0].lang
    except Exception:
        # `except Exception` rather than a bare `except:` so that
        # KeyboardInterrupt / SystemExit still stop a long-running apply.
        return "unknown"

# langdetect is non-deterministic unless its factory is seeded; fixing the seed
# makes the detected languages reproducible across runs.
DetectorFactory.seed = 0
df["detected_language"] = df.claimReviewed.progress_apply(safe_detect_langs)


df["detected_language"] = df["detected_language"].str.lower()
df["inLanguage"] = df["inLanguage"].str.lower()
# Compare detected and declared language only where a language was declared
has_declared_language = df["inLanguage"].notna()
languages_match = df["detected_language"] == df["inLanguage"]
df["detected_language_same"] = languages_match.where(has_declared_language)

# Share of agreement among the rows that declare a language. Dividing by
# `sum(column != None)` would divide by the total row count, because that
# comparison is True for every element.
agreement = languages_match[has_declared_language].mean()
print(f"{agreement} of the detected languages are the same as the inLanguage")

# `language` takes the declared language and falls back to the detected one
df["language"] = df["inLanguage"]
df.loc[pd.isna(df.language),"language"] = df.loc[pd.isna(df.language),"detected_language"]
df = df.drop(columns=["detected_language","detected_language_same","inLanguage"])

# Keep only rows whose language code appears in this list.
# NOTE(review): langdetect reports Simplified Chinese as "zh-cn", which is not
# in the list — confirm whether it should be mapped to "zh" before filtering.
supported_languages_labse = ["zh-tw","af","ar","as","az","bn","bo", "bs", "ca", "ceb","co", "cs", "cy", "da", "de", "el","en", "eo", "es", "et", "eu", "fa", "fi", "fr", "fy", "ga", "gd", "gl", "gu", "ha", "haw","he","hi", "hmn","hr", "ht", "hu", "hy", "id", "ig", "is", "it", "ja", "jv", "ka", "kk", "km", "kn", "ko", "ku", "ky", "la", "lb", "lo", "lt", "lv", "mg", "mi", "mk", "ml","mn", "mr", "ms", "mt", "my", "ne", "nl", "no", "ny", "or", "pa", "pl", "pt", "ro", "ru", "rw", "si", "sk", "sl", "sm", "sn", "so", "sq", "sr", "st", "su", "sv", "sw", "ta", "te", "tg", "th","tk", "tl", "tr", "tt", "ug", "uk", "ur","uz", "vi","wo", "xh", "yi", "yo", "zh", "zu"]
print(f"{100*df.language.isin(supported_languages_labse).mean():.2f}% of the data is in a supported language")
print(df[~df.language.isin(supported_languages_labse)].language.value_counts().index.to_list())
df = df[df.language.isin(supported_languages_labse)]

## Clean ClaimReviewed

In [None]:
# Decode HTML entities in the claim text without touching anything that is not an entity.
def decode_html(s):
    """Unescape HTML entities in ``s`` and normalise special spacing characters.

    Non-breaking, em, en and thin spaces become a plain space; zero-width
    joiner and non-joiner characters are removed.

    Args:
        s: Text that may contain HTML entities.

    Returns:
        str: The decoded, normalised text.
    """
    spacing_table = str.maketrans({
        '\xa0': ' ',     # Non-breaking space
        '\u2003': ' ',   # Em space
        '\u2002': ' ',   # En space
        '\u2009': ' ',   # Thin space
        '\u200c': None,  # Zero width non-joiner (deleted)
        '\u200d': None,  # Zero width joiner (deleted)
    })
    return html.unescape(s).translate(spacing_table)

# Sanity checks: entities are decoded while text in other scripts is left untouched.
_decode_html_cases = [
    ("你好 &amp; 你好", "你好 & 你好"),
    ("Caf&eacute; &amp; croissant", "Café & croissant"),
    ("আমি ভালোবাসি &lt;3", "আমি ভালোবাসি <3"),
    ("Привет, &quot;мир&quot;!", 'Привет, "мир"!'),
    ("مرحبا &gt; مرحبتين", "مرحبا > مرحبتين"),
    ("&amp;", "&"),
    ("&lt;", "<"),
    ("&gt;", ">"),
    ("&nbsp;", " "),
    ("&quot;", '"'),
]
for _encoded, _expected in _decode_html_cases:
    assert decode_html(_encoded) == _expected

df["claimReviewed"] = df.claimReviewed.progress_apply(decode_html)

## Remove Duplicates based on minimal ClaimReviewed

In [None]:
# Detect duplicates by mapping each text to its lowest-resolution form; this has to work for every language.

def map_minimal(s):
    """Reduce ``s`` to a minimal key for duplicate detection.

    Every character that is not a Unicode word character (letter, digit or
    underscore) is removed — whitespace, punctuation and emoji included — and
    the remainder is lowercased.

    Args:
        s: Text to reduce.

    Returns:
        str: The lowercased word characters of ``s``, concatenated.
    """
    # \W already covers whitespace, so no separate whitespace alternative or
    # trailing space removal is needed.
    return re.sub(r'\W', '', s).lower()

# Sanity checks for map_minimal across scripts, punctuation, whitespace and emoji.
_map_minimal_cases = [
    ("Hello? is this real", "helloisthisreal"),
    ("Hello!", "hello"),
    ("Hello 123", "hello123"),
    ("こんにちは、私の名前はボブです。", "こんにちは私の名前はボブです"),
    ("שלום עולם", "שלוםעולם"),
    ("你好，我叫小明。", "你好我叫小明"),
    ("Привет, меня зовут Андрей.", "приветменязовутандрей"),
    ("안녕하세요. 제 이름은 홍길동입니다.", "안녕하세요제이름은홍길동입니다"),
    ("مرحبا، اسمي أحمد.", "مرحبااسميأحمد"),
    ("Hello#World", "helloworld"),
    ("Hello Wor*ld", "helloworld"),
    ("Hello\nWorld", "helloworld"),
    ("", ""),
    (" ", ""),
    ("👍 This is a test 😀 of emojis! 👋", "thisisatestofemojis"),
    ("[Square brackets], {curly braces}, and <angle brackets> too!", "squarebracketscurlybracesandanglebracketstoo"),
    ("「你好嗎？」", "你好嗎"),
    ("【】「」Hello", "hello"),
]
for _raw, _expected in _map_minimal_cases:
    assert map_minimal(_raw) == _expected

# Keep only claims longer than 5 characters
df = df[df.claimReviewed.str.len() > 5]
df["claim_minimal"] = df["claimReviewed"].progress_apply(map_minimal)

# Shuffle with a fixed seed before dropping duplicates, so the row kept for
# each minimal key is a seeded random pick rather than always the first one.
np.random.seed(1)
shuffled = df.sample(frac=1)
deduplicated = shuffled.drop_duplicates(subset="claim_minimal", keep="first")
df = deduplicated.sort_index().reset_index(drop = True)

# Collapse verdicts onto the labels in verdict_mapping; anything unmapped becomes "NA"
df["verdict"] = df.verdict.map(lambda x: verdict_mapping.get(x, "NA"))

In [None]:
# Write the cleaned dataset as a gzip-compressed CSV; index=False leaves the
# row index out of the file.
df.to_csv("../Data/Cleaned_FactCheckData_nopreprocess_local.csv.gz", compression="gzip", index=False)