In [None]:
from transformers import AutoTokenizer
import pandas as pd
from tqdm import tqdm
import string
import os 
import re
import httpcore
setattr(httpcore, 'SyncHTTPTransport', any)
from googletrans import Translator
import re
import numpy as np
translator = Translator()

# Register Series.progress_apply so the per-row steps below show a progress bar.
tqdm.pandas()
df = pd.read_csv("../Data/Cleaned_FactCheckData_nopreprocess_local.csv.gz", compression="gzip")

# Tokenize every claim with the LaBSE tokenizer and keep the token strings.
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/LaBSE')


def _claim_tokens(claim):
    """Return the list of token strings the tokenizer produces for one claim."""
    return tokenizer(claim).tokens()


df["tokens"] = df["claimReviewed"].progress_apply(_claim_tokens)

# Collapse sparse domains: any domain with fewer than 100 fact-checks, and any
# missing domain, is relabelled "Unknown/Other".
domains = df["domain"].value_counts()
domains = domains.loc[domains < 100].index
rare_or_missing = df["domain"].isin(domains) | df["domain"].isna()
df.loc[rare_or_missing, "domain"] = "Unknown/Other"

In [None]:
def _detokenize(tokens):
    """Join tokens back into text and drop the [CLS] / [SEP] marker strings."""
    text = tokenizer.convert_tokens_to_string(tokens)
    for marker in ("[CLS]", "[SEP]"):
        text = text.replace(marker, "")
    return text


# Rebuild each claim from its tokens, then trim the whitespace left at the ends.
df["claimReviewed"] = df["tokens"].progress_apply(_detokenize)
df["claimReviewed"] = df["claimReviewed"].progress_apply(lambda claim: claim.strip())

In [None]:
# Count every 6-token n-gram per domain (the window size is the second argument
# to ngram_generator below), keyed by the detokenized n-gram text.
from nltk.util import ngrams as ngram_generator
grams = {}

for tokens, domain in tqdm(zip(df["tokens"], df["domain"]), total=len(df)):
    for gram in ngram_generator(tokens, 6):
        gram_text = tokenizer.convert_tokens_to_string(list(gram))
        # A domain only gets an entry once it has produced at least one n-gram.
        per_domain = grams.setdefault(domain, {})
        per_domain[gram_text] = per_domain.get(gram_text, 0) + 1

# Turn raw counts into "occurrences per fact-check of that domain" and order
# each domain's n-grams from most to least frequent.
domain_size = df["domain"].value_counts().to_dict()

for domain_name, counts in grams.items():
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    grams[domain_name] = {text: count / domain_size[domain_name] for text, count in ranked}


In [None]:
# Number of distinct (domain, n-gram) pairs overall.
total_ngrams = sum(len(domain_grams) for domain_grams in grams.values())
print(f"Total number of ngrams: {total_ngrams}") # 3,917,254 (presumably the count from the author's run)

# Number of (domain, n-gram) pairs whose normalized frequency exceeds each threshold.
# Labels are kept as strings so the printed text keeps its two decimals.
for label in ("0.01", "0.05", "0.10", "0.50"):
    threshold = float(label)
    n_above = sum(
        1
        for domain_grams in grams.values()
        for share in domain_grams.values()
        if share > threshold
    )
    print(f"Total number of ngrams with value > {label}: {n_above}")

In [None]:
# Collect every (domain, n-gram) pair whose normalized frequency exceeds 0.1
# into a review table; "remove" and "remove_partial" start out blank.
ngrams = pd.DataFrame(
    [
        [domain_name, text, share]
        for domain_name, shares in grams.items()
        for text, share in shares.items()
        if share > 0.1
    ],
    columns=["domain", "ngram", "value"],
).assign(remove=False, remove_partial="")

def translate(text):
    """Translate ``text`` to English, falling back to the input on failure.

    Uses the module-level ``translator`` object.

    Parameters
    ----------
    text : the value to translate.

    Returns
    -------
    The ``.text`` of the translation result, or ``text`` unchanged if the
    translation call raised an ordinary exception.
    """
    try:
        return translator.translate(text, dest="en").text
    except Exception:
        # Best-effort: a failed lookup should not abort the whole pass.
        # Only Exception is caught so that KeyboardInterrupt / SystemExit
        # still stop a long-running translation run.
        return text

# Add an English rendering of each frequent n-gram so it can be judged for
# removal, then write the review table to disk.
ngrams["translation"] = ngrams["ngram"].progress_apply(translate)
ngrams.to_csv("../Data/ngrams.csv", index=False)

In [None]:
# Boilerplate fragments to strip from claims (fact-checker labels, site names,
# verdict tags). Spacing around punctuation is part of each entry.
# Every line ends with a comma: adjacent string literals without a comma are
# silently concatenated by Python, which previously merged three pairs of
# entries into strings that could never match.
remove_set = set()
remove_set.update(["Fact Check:","Fact-Check:", "FACT CHECK :","Fact Check.","[ Fakta atau Hoaks ]",
                   "FactCheck:", "[SALAH]", "[No Ceará]", "【 錯 誤 】","- Fakenews. pl","- Fakenews.pl",
                   "#Verificamos :", "# Verificamos:","VERA FILES  ","இடம் - முபாரக்பூர்",": # Real", "# Real","#Real",
                   "# Verificamos :", "#Verificamos:",
                   "Fact Check :", "Fact-Check :","[ SALAH ]","SATIRE :", "FAKE NEWS :","Satire :", "SATIRE:","Satire:",
                   "É # FAKE","Fake News :","網 傳 影 片", "網 傳 圖 片", "網 傳 影 片 稱", "網 傳 圖 片 搭 配 訊 息",
                   "Фейк : ","Манипуляция :", "Фотофейк :","फैक्ट चेक :","Claim : ","[ SPAM ]",
                   "Analysis |","Fake Video : ","É falso que","MUSIC - CHECK","WHATSAPP - CHECK",
                   "Veja o que é # FATO ou # FAKE nas ","Non, cette vidéo ne montre pas",
                   "Non, cette photo ne montre pas","[ Fakta atau Hoax ]","| Agência Lupa","- ELLINIKA HOAX",
                   "- Vishvas News","Fact Check | Misbar","- Malumatfuruş","| Doğruluk Pay","Full Fact","Fake :",
                   "- Cek Fakta","- Vera Files","- teyit. org","| مسبار","- factly","- FACTLY","Fact - check :","[ Fakta atau Hoaks ]","Quick  ","Saryusz - ","| Misbar",
                   "| فحص الحقائق","Fact Check NI","|მითების დეტექტორი","FALSE :","( COM VÍDEO )","Fact Check NI","|მითების დეტექტორი","| Fact Check","| فحص الحقائق", "NEWS ",
                   "Fact check","Fact Check", "Video show", "هذه الصورة", "Es falso que","فیکٹ چیک", "- Cekfakta Tempo. co","CEKFAKTA", ". CO",
                   ])


# Strip every boilerplate n-gram from every claim. Longer n-grams are tried
# first so that a long entry is removed before any shorter entry it contains.
# remove_set is not modified inside the loop, so the ordering is computed once
# here rather than once per row.
ordered_ngrams = sorted(remove_set, key=len, reverse=True)

claims = []
# Only the claim text is needed, so iterate the column directly instead of
# materialising a full row per iteration with iterrows().
for claim_string in df["claimReviewed"]:
    for ngram in ordered_ngrams:
        if ngram in claim_string:
            if "[CLS]" in ngram or "[SEP]" in ngram:
                # For n-grams carrying tokenizer markers, remove the text
                # without the markers.
                cleaned_ngram = ngram.replace("[CLS]", "").replace("[SEP]", "").strip()
                claim_string = claim_string.replace(cleaned_ngram, "")
            else:
                claim_string = claim_string.replace(ngram, "")
    claims.append(claim_string)

df["claimReviewed"] = claims
del claims

def _tidy_claim(claim, tag_pattern=re.compile(r"【.*?】")):
    """Remove 【...】 tags, trim whitespace, and drop one leading colon.

    The colon is what typically remains after a prefix such as
    "Fact-check :" has been removed from the start of a claim.
    """
    # Non-greedy match: each 【...】 tag is removed on its own.
    claim = tag_pattern.sub("", claim).strip()
    if claim.startswith(":"):
        claim = claim[1:].strip()
    return claim


df["claimReviewed"] = df["claimReviewed"].map(_tidy_claim)

# Unwrap claims that are entirely enclosed in quotation marks, or that start and
# end with the same non-alphanumeric character.
# Note: the literal below also contains the spaces between the CJK brackets, so
# a space counts as an allowed trailing character in clean_punt.
quotation_marks = """"“'“”‘„‚«»„“‹›‘’“‘「 」『 』"""
punctuation = string.punctuation


def clean_punt(x):
    """Strip a wrapping pair of quotes / identical punctuation from a claim.

    Rules, applied in order:
    1. Empty string: returned unchanged.
    2. First and last character are identical and not alphanumeric: both
       are dropped.
    3. First and last character are both in ``quotation_marks``: both are
       dropped.
    4. First character is in ``quotation_marks`` but the last is not: look
       for the last occurrence of that same opening character; if everything
       after it is punctuation or a ``quotation_marks`` character, return
       the text between the opening character and that last occurrence.
    Anything else is returned unchanged.
    """
    if len(x) == 0:
        return x

    first, last = x[0], x[-1]

    if first == last and not first.isalnum():
        return x[1:-1]

    if first not in quotation_marks:
        return x

    if last in quotation_marks:
        return x[1:-1]

    # Opening quote without a quote at the very end: only unwrap when the text
    # after the last occurrence of the opening character is pure punctuation.
    closing_at = x.rfind(first)
    allowed_tail = punctuation + quotation_marks
    if all(ch in allowed_tail for ch in x[closing_at + 1:]):
        return x[1:closing_at]
    return x

# Unwrap enclosing quotes / matching punctuation on every claim.
df["claimReviewed"] = df["claimReviewed"].map(clean_punt)

In [None]:
# After removing the n-grams, claims are reduced to a minimal form again so
# that near-identical claims can be detected as duplicates.
_MINIMAL_STRIP = re.compile(r'[^\w]|[\s\n]')


def map_minimal(s):
    """Return ``s`` lowercased with every non-word character removed.

    Word characters are whatever ``\\w`` matches, so digits and underscores
    are kept; whitespace and punctuation are dropped.
    NOTE(review): combining marks in some scripts may not count as ``\\w``
    and would then be stripped as well; verify on non-Latin claims.
    """
    return _MINIMAL_STRIP.sub('', s).lower()

df["claimReviewed_mini"] = df["claimReviewed"].map(map_minimal)
n_duplicates = df["claimReviewed_mini"].duplicated().sum()
print(f"Number of claims after removing ngrams: {df.shape[0]} We are removing {n_duplicates} claims")

# Seed the global NumPy RNG so the shuffle below is repeatable.
np.random.seed(1)
# Shuffle, keep the first row of each duplicate group (i.e. a random one),
# restore the original row order, then discard the helper column.
df = (
    df.sample(frac=1)
    .drop_duplicates(subset="claimReviewed_mini", keep="first")
    .sort_index()
    .reset_index(drop=True)
    .drop("claimReviewed_mini", axis=1)
)

# Drop empty claims, then missing claims, then claims of 10 characters or fewer.
df = df[df["claimReviewed"] != ""]
df = df[df["claimReviewed"].notna()]
df = df[df["claimReviewed"].map(len) > 10]
df = df.reset_index(drop=True)

In [None]:
# Sanity checks before saving: report the shape, then require that no claim is
# missing, empty, or shorter than 5 characters.
print(df.shape)
final_claims = df["claimReviewed"]
assert not final_claims.isna().any(), "NAs"
assert not (final_claims == "").any(), "Empties"
assert not (final_claims.map(len) < 5).any(), "Length < 5"

# Write the cleaned, de-duplicated data set.
df.to_csv("../Data/minimal_FactCheckData_local.csv.gz", index=False, compression="gzip")