In [1]:
from transformers import BartTokenizer
from tqdm import tqdm

import pandas as pd
import spacy
import os
import re

In [2]:
# spaCy English pipeline; its named-entity annotations (doc.ents) are what
# preprocess_text inspects to find candidate entities.
nlp = spacy.load("en_core_web_sm")
# BART tokenizer loaded from the "facebook/perturber" checkpoint. In this
# notebook it is only used to truncate each text to a fixed token budget and
# decode it back to a string before entity extraction.
tokenizer = BartTokenizer.from_pretrained("facebook/perturber")

In [3]:
# Perturbation dimensions. Each name selects a "<dimension>_swaps.csv" swap
# list and the sub-directory of PROCESSED_PATH the results are written to.
dimensions = ["nationality", "country", "religion"]
# spaCy entity labels considered for swapping: NORP (nationalities, religious
# or political groups) and GPE (countries, cities, states).
entity_types = ["NORP", "GPE"]

In [None]:
# Root directory that is walked recursively for *train.csv / *test.csv inputs.
# NOTE(review): relative to the notebook's working directory — confirm it is
# run from the expected folder.
RAW_PATH = "../../data/raw"
# Root directory for the perturbed outputs (one sub-directory per dimension).
PROCESSED_PATH = "../../data/processed"

In [5]:
def preprocess_text(text, target_entities):
    """Return the first named entity in ``text`` that occurs in the swap list.

    ``text`` is parsed with the module-level spaCy pipeline ``nlp``. Entities
    whose label is listed in ``entity_types`` are checked in document order
    against the first column of ``target_entities``; an entity matches when
    its lower-cased text is a substring of any lower-cased entry of that
    column.

    Args:
        text: The text to scan for entities.
        target_entities: DataFrame whose first column holds the terms an
            entity has to match.

    Returns:
        The text of the first matching entity, or ``None`` when the text has
        no entity of the requested types that matches the list.
    """
    target_key = target_entities.columns[0]
    # Lower-case the candidate terms once instead of once per entity, and skip
    # non-string cells (e.g. NaN from blank CSV rows), which have no .lower().
    candidates = [
        term.lower()
        for term in target_entities[target_key].to_list()
        if isinstance(term, str)
    ]

    for ent in nlp(text).ents:
        if ent.label_ not in entity_types:
            continue
        needle = ent.text.lower()
        # Only the first match is ever used, so stop as soon as one is found.
        if any(needle in term for term in candidates):
            return ent.text
    return None

In [6]:
def replace_entity(text, entity, df, search_column, return_column):
    """Swap ``entity`` in ``text`` for its counterpart from a swap table.

    The first row of ``df`` whose ``search_column`` contains ``entity``
    (literal, case-insensitive substring match — the same rule
    ``preprocess_text`` uses to select the entity) provides the replacement
    taken from ``return_column``. In ``text``, every occurrence of the last
    whitespace-separated token of ``entity`` is replaced, together with any
    run of capitalised words or dotted initials directly in front of it, so a
    multi-word entity is substituted as a whole.

    Args:
        text: The text to perturb.
        entity: Entity string as it appears in ``text``.
        df: Swap table.
        search_column: Column of ``df`` searched for ``entity``.
        return_column: Column of ``df`` holding the replacement term.

    Returns:
        The perturbed text, or ``text`` unchanged when ``entity`` is empty,
        is not found in the table, or has no usable replacement.
    """
    words = entity.split() if entity else []
    if not words:
        return text

    # regex=False: the entity is plain text, not a pattern (e.g. "U.S.").
    # case=False: str.capitalize() would lower-case "South Korean" to
    # "South korean" and miss the table entry.
    # na=False: blank cells must yield False, not NaN, to keep the mask boolean.
    mask = df[search_column].str.contains(entity, case=False, regex=False, na=False)
    if not mask.any():
        return text

    swap_entity = df.loc[mask, return_column].iloc[0]
    if not isinstance(swap_entity, str):
        # Blank replacement cell (NaN): nothing sensible to substitute.
        return text

    # Optional run of capitalised words / initials, then the escaped last token.
    pattern = r"([A-Z]([a-z]+|\.)\s*)*" + re.escape(words[-1])
    # A callable replacement inserts the swap verbatim; a plain string would be
    # parsed as a template and choke on backslashes.
    return re.sub(pattern, lambda _match: swap_entity, text)

In [7]:
# Collect every train/test split found anywhere below RAW_PATH, in the order
# os.walk visits them.
data_paths = [
    f"{root}{os.sep}{filename}"
    for root, _subdirs, filenames in os.walk(RAW_PATH)
    for filename in filenames
    if filename.endswith(("test.csv", "train.csv"))
]

In [8]:
def batches(sents, batch_size):
    """Lazily yield consecutive slices of ``sents`` of length ``batch_size``.

    The final slice holds whatever remains and may therefore be shorter.

    Args:
        sents: Any sliceable sequence.
        batch_size: Step between slice starts and maximum slice length.

    Yields:
        ``sents[start:start + batch_size]`` for each start offset.
    """
    offsets = range(0, len(sents), batch_size)
    yield from (sents[start:start + batch_size] for start in offsets)

In [None]:
def _truncate_text(text, max_length=128):
    """Tokenize ``text`` with truncation at ``max_length`` and decode it back."""
    token_ids = tokenizer(
        text,
        return_tensors="pt",
        max_length=max_length,
        truncation=True,
    )["input_ids"]
    return tokenizer.batch_decode(token_ids, skip_special_tokens=True)[0]


# For every dimension, perturb every raw train/test file that has not been
# processed yet and write the result under PROCESSED_PATH/<dimension>/.
for dimension in dimensions:
    target_entities = pd.read_csv(f"../../heterogeneity/lists_for_perturbations/{dimension}_swaps.csv")
    # First column: terms to look for; second column: their replacements.
    search_column = target_entities.columns[0]
    return_column = target_entities.columns[1]
    output_dir = f"{PROCESSED_PATH}{os.sep}{dimension}"

    for path in data_paths:
        # basename() honours the platform separator the paths were built with,
        # unlike splitting on a hard-coded "/".
        data_path = f"{output_dir}{os.sep}{os.path.basename(path)}"
        if os.path.exists(data_path):
            print(f"Already exists file: {data_path}. Skip.")
            continue

        data = pd.read_csv(path).dropna()

        print(f"Processing {path}")

        print("Truncating texts...")
        # Series.map keeps working on an empty frame, where
        # DataFrame.apply(axis=1) would hand back a DataFrame instead of a column.
        data["text"] = data["text"].map(_truncate_text)
        print("...texts truncated!")

        print("Etracting entities...")
        text_input = data["text"].to_list()
        entities = [preprocess_text(text, target_entities) for text in text_input]
        data["entity"] = entities
        print("...entities extracted!")

        print("Perturbating texts...")
        perturbed_texts = []
        # `original_text` rather than `input`, so the builtin is not shadowed
        # for the rest of the kernel session.
        for original_text, entity in zip(tqdm(text_input, total=len(text_input)), entities):
            if entity:
                perturbed_texts.append(
                    replace_entity(original_text, entity, target_entities, search_column, return_column)
                )
            else:
                # No matching entity: the text is carried over unchanged.
                perturbed_texts.append(original_text)
        data["perturbed_text"] = perturbed_texts
        data = data[["text", "perturbed_text", "entity", "labels"]]

        print("...texts perturbated!")

        # Also creates PROCESSED_PATH itself when it is missing.
        os.makedirs(output_dir, exist_ok=True)

        data.to_csv(data_path, index=False, header=True, encoding="utf-8")
        print(f"{data_path} created")
        print("\n")

Processing ../data/raw/fingerprints copy/fingerprints_train.csv
Truncating texts...
...texts truncated!
Etracting entities...
...entities extracted!
Perturbating texts...


100%|██████████| 13200/13200 [00:00<00:00, 16383.82it/s]


...texts perturbated!
../data/processed/nationality/fingerprints_train.csv created


Processing ../data/raw/fingerprints copy/fingerprints_test.csv
Truncating texts...
...texts truncated!
Etracting entities...
...entities extracted!
Perturbating texts...


100%|██████████| 3300/3300 [00:00<00:00, 16942.12it/s]

...texts perturbated!





../data/processed/nationality/fingerprints_test.csv created


Processing ../data/raw/clef22 copy/clef22_train.csv
Truncating texts...
...texts truncated!
Etracting entities...
...entities extracted!
Perturbating texts...


100%|██████████| 720/720 [00:00<00:00, 21874.77it/s]

...texts perturbated!
../data/processed/nationality/clef22_train.csv created


Processing ../data/raw/clef22 copy/clef22_test.csv
Truncating texts...





...texts truncated!
Etracting entities...
...entities extracted!
Perturbating texts...


100%|██████████| 180/180 [00:00<00:00, 13164.57it/s]

...texts perturbated!
../data/processed/nationality/clef22_test.csv created


Processing ../data/raw/unifiedm2/twittercovidq2_test.csv
Truncating texts...
...texts truncated!
Etracting entities...





...entities extracted!
Perturbating texts...


100%|██████████| 52/52 [00:00<00:00, 19326.88it/s]

...texts perturbated!
../data/processed/nationality/twittercovidq2_test.csv created


Processing ../data/raw/unifiedm2/basil_train.csv
Truncating texts...





...texts truncated!
Etracting entities...
...entities extracted!
Perturbating texts...


100%|██████████| 6367/6367 [00:00<00:00, 59890.28it/s]

...texts perturbated!
../data/processed/nationality/basil_train.csv created







Processing ../data/raw/unifiedm2/clickbait_test.csv
Truncating texts...
...texts truncated!
Etracting entities...
...entities extracted!
Perturbating texts...


100%|██████████| 3812/3812 [00:00<00:00, 45835.18it/s]

...texts perturbated!
../data/processed/nationality/clickbait_test.csv created


Processing ../data/raw/unifiedm2/basil_test.csv
Truncating texts...





...texts truncated!
Etracting entities...
...entities extracted!
Perturbating texts...


100%|██████████| 1592/1592 [00:00<00:00, 71726.00it/s]

...texts perturbated!
../data/processed/nationality/basil_test.csv created


Processing ../data/raw/unifiedm2/politifact_train.csv
Truncating texts...





...texts truncated!
Etracting entities...
...entities extracted!
Perturbating texts...


100%|██████████| 162/162 [00:00<00:00, 13220.69it/s]

...texts perturbated!
../data/processed/nationality/politifact_train.csv created







Processing ../data/raw/unifiedm2/webis_train.csv
Truncating texts...
...texts truncated!
Etracting entities...
...entities extracted!
Perturbating texts...


100%|██████████| 1283/1283 [00:00<00:00, 18164.88it/s]

...texts perturbated!
../data/processed/nationality/webis_train.csv created


Processing ../data/raw/unifiedm2/buzzfeed_train.csv
Truncating texts...





...texts truncated!
Etracting entities...
...entities extracted!
Perturbating texts...


100%|██████████| 136/136 [00:00<00:00, 74332.21it/s]

...texts perturbated!
../data/processed/nationality/buzzfeed_train.csv created


Processing ../data/raw/unifiedm2/twittercovidq2_train.csv
Truncating texts...





...texts truncated!
Etracting entities...
...entities extracted!
Perturbating texts...


100%|██████████| 208/208 [00:00<00:00, 25197.56it/s]

...texts perturbated!
../data/processed/nationality/twittercovidq2_train.csv created


Processing ../data/raw/unifiedm2/propaganda_train.csv
Truncating texts...





...texts truncated!
Etracting entities...
...entities extracted!
Perturbating texts...


100%|██████████| 1294/1294 [00:00<00:00, 35893.09it/s]

...texts perturbated!
../data/processed/nationality/propaganda_train.csv created


Processing ../data/raw/unifiedm2/buzzfeed_test.csv
Truncating texts...
...texts truncated!
Etracting entities...





...entities extracted!
Perturbating texts...


100%|██████████| 34/34 [00:00<00:00, 69394.81it/s]

...texts perturbated!
../data/processed/nationality/buzzfeed_test.csv created


Processing ../data/raw/unifiedm2/propaganda_test.csv
Truncating texts...





...texts truncated!
Etracting entities...
...entities extracted!
Perturbating texts...


100%|██████████| 324/324 [00:00<00:00, 28533.88it/s]

...texts perturbated!
../data/processed/nationality/propaganda_test.csv created


Processing ../data/raw/unifiedm2/pheme_train.csv
Truncating texts...





...texts truncated!
Etracting entities...
...entities extracted!
Perturbating texts...


100%|██████████| 1364/1364 [00:00<00:00, 32007.02it/s]

...texts perturbated!
../data/processed/nationality/pheme_train.csv created


Processing ../data/raw/unifiedm2/pheme_test.csv
Truncating texts...





...texts truncated!
Etracting entities...
...entities extracted!
Perturbating texts...


100%|██████████| 341/341 [00:00<00:00, 32322.21it/s]

...texts perturbated!
../data/processed/nationality/pheme_test.csv created


Processing ../data/raw/unifiedm2/webis_test.csv
Truncating texts...





...texts truncated!
Etracting entities...
...entities extracted!
Perturbating texts...


100%|██████████| 321/321 [00:00<00:00, 15401.89it/s]

...texts perturbated!
../data/processed/nationality/webis_test.csv created


Processing ../data/raw/unifiedm2/clickbait_train.csv
Truncating texts...





...texts truncated!
Etracting entities...
...entities extracted!
Perturbating texts...


100%|██████████| 15244/15244 [00:00<00:00, 46791.47it/s]


...texts perturbated!
../data/processed/nationality/clickbait_train.csv created


Processing ../data/raw/unifiedm2/politifact_test.csv
Truncating texts...
...texts truncated!
Etracting entities...
...entities extracted!
Perturbating texts...


100%|██████████| 40/40 [00:00<00:00, 11994.01it/s]

...texts perturbated!
../data/processed/nationality/politifact_test.csv created







Processing ../data/raw/shadesoftruth copy/shadesoftruth_test.csv
Truncating texts...
...texts truncated!
Etracting entities...
...entities extracted!
Perturbating texts...


100%|██████████| 1600/1600 [00:00<00:00, 16046.81it/s]

...texts perturbated!
../data/processed/nationality/shadesoftruth_test.csv created







Processing ../data/raw/shadesoftruth copy/shadesoftruth_train.csv
Truncating texts...
...texts truncated!
Etracting entities...
...entities extracted!
Perturbating texts...


100%|██████████| 6400/6400 [00:00<00:00, 16391.96it/s]


...texts perturbated!
../data/processed/nationality/shadesoftruth_train.csv created


Processing ../data/raw/fingerprints copy/fingerprints_train.csv
Truncating texts...
...texts truncated!
Etracting entities...
...entities extracted!
Perturbating texts...


100%|██████████| 13200/13200 [00:00<00:00, 23385.55it/s]


...texts perturbated!
../data/processed/country/fingerprints_train.csv created


Processing ../data/raw/fingerprints copy/fingerprints_test.csv
Truncating texts...
...texts truncated!
Etracting entities...
...entities extracted!
Perturbating texts...


100%|██████████| 3300/3300 [00:00<00:00, 24127.47it/s]

...texts perturbated!





../data/processed/country/fingerprints_test.csv created


Processing ../data/raw/clef22 copy/clef22_train.csv
Truncating texts...
...texts truncated!
Etracting entities...
...entities extracted!
Perturbating texts...


100%|██████████| 720/720 [00:00<00:00, 24913.37it/s]

...texts perturbated!
../data/processed/country/clef22_train.csv created


Processing ../data/raw/clef22 copy/clef22_test.csv
Truncating texts...





...texts truncated!
Etracting entities...
...entities extracted!
Perturbating texts...


100%|██████████| 180/180 [00:00<00:00, 19304.87it/s]

...texts perturbated!
../data/processed/country/clef22_test.csv created


Processing ../data/raw/unifiedm2/twittercovidq2_test.csv
Truncating texts...
...texts truncated!
Etracting entities...





...entities extracted!
Perturbating texts...


100%|██████████| 52/52 [00:00<00:00, 19207.73it/s]

...texts perturbated!
../data/processed/country/twittercovidq2_test.csv created


Processing ../data/raw/unifiedm2/basil_train.csv
Truncating texts...





...texts truncated!
Etracting entities...
...entities extracted!
Perturbating texts...


100%|██████████| 6367/6367 [00:00<00:00, 113050.52it/s]

...texts perturbated!
../data/processed/country/basil_train.csv created


Processing ../data/raw/unifiedm2/clickbait_test.csv
Truncating texts...





...texts truncated!
Etracting entities...
...entities extracted!
Perturbating texts...


100%|██████████| 3812/3812 [00:00<00:00, 62436.05it/s]

...texts perturbated!
../data/processed/country/clickbait_test.csv created


Processing ../data/raw/unifiedm2/basil_test.csv
Truncating texts...





...texts truncated!
Etracting entities...
...entities extracted!
Perturbating texts...


100%|██████████| 1592/1592 [00:00<00:00, 128563.52it/s]

...texts perturbated!
../data/processed/country/basil_test.csv created


Processing ../data/raw/unifiedm2/politifact_train.csv
Truncating texts...





...texts truncated!
Etracting entities...
...entities extracted!
Perturbating texts...


100%|██████████| 162/162 [00:00<00:00, 17403.75it/s]

...texts perturbated!
../data/processed/country/politifact_train.csv created


Processing ../data/raw/unifiedm2/webis_train.csv
Truncating texts...





...texts truncated!
Etracting entities...
...entities extracted!
Perturbating texts...


100%|██████████| 1283/1283 [00:00<00:00, 26621.08it/s]

...texts perturbated!
../data/processed/country/webis_train.csv created


Processing ../data/raw/unifiedm2/buzzfeed_train.csv
Truncating texts...
...texts truncated!
Etracting entities...





...entities extracted!
Perturbating texts...


100%|██████████| 136/136 [00:00<00:00, 98706.58it/s]

...texts perturbated!
../data/processed/country/buzzfeed_train.csv created


Processing ../data/raw/unifiedm2/twittercovidq2_train.csv
Truncating texts...





...texts truncated!
Etracting entities...
...entities extracted!
Perturbating texts...


100%|██████████| 208/208 [00:00<00:00, 24860.80it/s]

...texts perturbated!
../data/processed/country/twittercovidq2_train.csv created


Processing ../data/raw/unifiedm2/propaganda_train.csv
Truncating texts...





...texts truncated!
Etracting entities...
...entities extracted!
Perturbating texts...


100%|██████████| 1294/1294 [00:00<00:00, 52488.63it/s]

...texts perturbated!
../data/processed/country/propaganda_train.csv created


Processing ../data/raw/unifiedm2/buzzfeed_test.csv
Truncating texts...
...texts truncated!
Etracting entities...





...entities extracted!
Perturbating texts...


100%|██████████| 34/34 [00:00<00:00, 43960.03it/s]


...texts perturbated!
../data/processed/country/buzzfeed_test.csv created


Processing ../data/raw/unifiedm2/propaganda_test.csv
Truncating texts...
...texts truncated!
Etracting entities...
...entities extracted!
Perturbating texts...


100%|██████████| 324/324 [00:00<00:00, 44792.33it/s]

...texts perturbated!
../data/processed/country/propaganda_test.csv created


Processing ../data/raw/unifiedm2/pheme_train.csv
Truncating texts...





...texts truncated!
Etracting entities...
...entities extracted!
Perturbating texts...


100%|██████████| 1364/1364 [00:00<00:00, 75276.72it/s]

...texts perturbated!
../data/processed/country/pheme_train.csv created


Processing ../data/raw/unifiedm2/pheme_test.csv
Truncating texts...





...texts truncated!
Etracting entities...
...entities extracted!
Perturbating texts...


100%|██████████| 341/341 [00:00<00:00, 74345.44it/s]

...texts perturbated!
../data/processed/country/pheme_test.csv created


Processing ../data/raw/unifiedm2/webis_test.csv
Truncating texts...





...texts truncated!
Etracting entities...
...entities extracted!
Perturbating texts...


100%|██████████| 321/321 [00:00<00:00, 23561.86it/s]

...texts perturbated!
../data/processed/country/webis_test.csv created


Processing ../data/raw/unifiedm2/clickbait_train.csv
Truncating texts...





...texts truncated!
Etracting entities...
...entities extracted!
Perturbating texts...


100%|██████████| 15244/15244 [00:00<00:00, 67465.68it/s]


...texts perturbated!
../data/processed/country/clickbait_train.csv created


Processing ../data/raw/unifiedm2/politifact_test.csv
Truncating texts...
...texts truncated!
Etracting entities...
...entities extracted!
Perturbating texts...


100%|██████████| 40/40 [00:00<00:00, 23957.18it/s]

...texts perturbated!
../data/processed/country/politifact_test.csv created


Processing ../data/raw/shadesoftruth copy/shadesoftruth_test.csv
Truncating texts...





...texts truncated!
Etracting entities...
...entities extracted!
Perturbating texts...


100%|██████████| 1600/1600 [00:00<00:00, 28573.99it/s]

...texts perturbated!
../data/processed/country/shadesoftruth_test.csv created







Processing ../data/raw/shadesoftruth copy/shadesoftruth_train.csv
Truncating texts...
...texts truncated!
Etracting entities...
...entities extracted!
Perturbating texts...


100%|██████████| 6400/6400 [00:00<00:00, 27185.48it/s]


...texts perturbated!
../data/processed/country/shadesoftruth_train.csv created


Processing ../data/raw/fingerprints copy/fingerprints_train.csv
Truncating texts...
...texts truncated!
Etracting entities...
...entities extracted!
Perturbating texts...


100%|██████████| 13200/13200 [00:00<00:00, 51484.43it/s]


...texts perturbated!
../data/processed/religion/fingerprints_train.csv created


Processing ../data/raw/fingerprints copy/fingerprints_test.csv
Truncating texts...
...texts truncated!
Etracting entities...
...entities extracted!
Perturbating texts...


100%|██████████| 3300/3300 [00:00<00:00, 47087.41it/s]

...texts perturbated!
../data/processed/religion/fingerprints_test.csv created


Processing ../data/raw/clef22 copy/clef22_train.csv
Truncating texts...





...texts truncated!
Etracting entities...
...entities extracted!
Perturbating texts...


100%|██████████| 720/720 [00:00<00:00, 97428.66it/s]

...texts perturbated!
../data/processed/religion/clef22_train.csv created


Processing ../data/raw/clef22 copy/clef22_test.csv
Truncating texts...





...texts truncated!
Etracting entities...
...entities extracted!
Perturbating texts...


100%|██████████| 180/180 [00:00<00:00, 82727.89it/s]

...texts perturbated!
../data/processed/religion/clef22_test.csv created


Processing ../data/raw/unifiedm2/twittercovidq2_test.csv
Truncating texts...
...texts truncated!
Etracting entities...





...entities extracted!
Perturbating texts...


100%|██████████| 52/52 [00:00<00:00, 125780.74it/s]

...texts perturbated!
../data/processed/religion/twittercovidq2_test.csv created


Processing ../data/raw/unifiedm2/basil_train.csv
Truncating texts...





...texts truncated!
Etracting entities...
...entities extracted!
Perturbating texts...


100%|██████████| 6367/6367 [00:00<00:00, 413667.51it/s]

...texts perturbated!
../data/processed/religion/basil_train.csv created


Processing ../data/raw/unifiedm2/clickbait_test.csv
Truncating texts...





...texts truncated!
Etracting entities...
...entities extracted!
Perturbating texts...


100%|██████████| 3812/3812 [00:00<00:00, 307462.92it/s]

...texts perturbated!
../data/processed/religion/clickbait_test.csv created


Processing ../data/raw/unifiedm2/basil_test.csv
Truncating texts...





...texts truncated!
Etracting entities...
...entities extracted!
Perturbating texts...


100%|██████████| 1592/1592 [00:00<00:00, 756295.39it/s]

...texts perturbated!
../data/processed/religion/basil_test.csv created


Processing ../data/raw/unifiedm2/politifact_train.csv
Truncating texts...





...texts truncated!
Etracting entities...
...entities extracted!
Perturbating texts...


100%|██████████| 162/162 [00:00<00:00, 50220.05it/s]

...texts perturbated!
../data/processed/religion/politifact_train.csv created


Processing ../data/raw/unifiedm2/webis_train.csv
Truncating texts...





...texts truncated!
Etracting entities...
...entities extracted!
Perturbating texts...


100%|██████████| 1283/1283 [00:00<00:00, 73865.07it/s]

...texts perturbated!
../data/processed/religion/webis_train.csv created


Processing ../data/raw/unifiedm2/buzzfeed_train.csv
Truncating texts...
...texts truncated!
Etracting entities...





...entities extracted!
Perturbating texts...


100%|██████████| 136/136 [00:00<00:00, 103940.48it/s]

...texts perturbated!
../data/processed/religion/buzzfeed_train.csv created


Processing ../data/raw/unifiedm2/twittercovidq2_train.csv
Truncating texts...





...texts truncated!
Etracting entities...
...entities extracted!
Perturbating texts...


100%|██████████| 208/208 [00:00<00:00, 108576.88it/s]

...texts perturbated!
../data/processed/religion/twittercovidq2_train.csv created


Processing ../data/raw/unifiedm2/propaganda_train.csv
Truncating texts...





...texts truncated!
Etracting entities...
...entities extracted!
Perturbating texts...


100%|██████████| 1294/1294 [00:00<00:00, 74623.33it/s]

...texts perturbated!
../data/processed/religion/propaganda_train.csv created


Processing ../data/raw/unifiedm2/buzzfeed_test.csv
Truncating texts...
...texts truncated!
Etracting entities...





...entities extracted!
Perturbating texts...


100%|██████████| 34/34 [00:00<00:00, 65808.18it/s]


...texts perturbated!
../data/processed/religion/buzzfeed_test.csv created


Processing ../data/raw/unifiedm2/propaganda_test.csv
Truncating texts...
...texts truncated!
Etracting entities...
...entities extracted!
Perturbating texts...


100%|██████████| 324/324 [00:00<00:00, 83177.53it/s]

...texts perturbated!
../data/processed/religion/propaganda_test.csv created


Processing ../data/raw/unifiedm2/pheme_train.csv
Truncating texts...





...texts truncated!
Etracting entities...
...entities extracted!
Perturbating texts...


100%|██████████| 1364/1364 [00:00<00:00, 176940.92it/s]

...texts perturbated!
../data/processed/religion/pheme_train.csv created


Processing ../data/raw/unifiedm2/pheme_test.csv
Truncating texts...





...texts truncated!
Etracting entities...
...entities extracted!
Perturbating texts...


100%|██████████| 341/341 [00:00<00:00, 141989.24it/s]

...texts perturbated!
../data/processed/religion/pheme_test.csv created


Processing ../data/raw/unifiedm2/webis_test.csv
Truncating texts...





...texts truncated!
Etracting entities...
...entities extracted!
Perturbating texts...


100%|██████████| 321/321 [00:00<00:00, 95076.02it/s]

...texts perturbated!
../data/processed/religion/webis_test.csv created


Processing ../data/raw/unifiedm2/clickbait_train.csv
Truncating texts...





...texts truncated!
Etracting entities...
...entities extracted!
Perturbating texts...


100%|██████████| 15244/15244 [00:00<00:00, 270028.85it/s]

...texts perturbated!
../data/processed/religion/clickbait_train.csv created


Processing ../data/raw/unifiedm2/politifact_test.csv
Truncating texts...





...texts truncated!
Etracting entities...
...entities extracted!
Perturbating texts...


100%|██████████| 40/40 [00:00<00:00, 47087.33it/s]

...texts perturbated!
../data/processed/religion/politifact_test.csv created


Processing ../data/raw/shadesoftruth copy/shadesoftruth_test.csv
Truncating texts...





...texts truncated!
Etracting entities...
...entities extracted!
Perturbating texts...


100%|██████████| 1600/1600 [00:00<00:00, 37672.61it/s]

...texts perturbated!
../data/processed/religion/shadesoftruth_test.csv created







Processing ../data/raw/shadesoftruth copy/shadesoftruth_train.csv
Truncating texts...
...texts truncated!
Etracting entities...
...entities extracted!
Perturbating texts...


100%|██████████| 6400/6400 [00:00<00:00, 47323.33it/s]

...texts perturbated!





../data/processed/religion/shadesoftruth_train.csv created


