In [20]:
import pandas as pd
import re
import random

In [17]:
# Common prefix shared by the train/val/test JSON Lines files.
path = "../data/single_word_with_replacement_"
train_df, val_df, test_df = (
    pd.read_json(path + suffix, lines=True)
    for suffix in ("train.jsonl", "val.jsonl", "test.jsonl")
)

In [18]:
# Replace the acronym with a mask regardless of the capitalization
def replace_with_mask(s):
    """Return the row's abstract with every occurrence of its acronym masked.

    The acronym is matched literally (regex metacharacters are escaped) and
    case-insensitively; every match is replaced by ``<MASKED_ACRONYM>``.

    Args:
        s: Mapping-like row (for example the Series handed over by
            ``DataFrame.apply(..., axis=1)``) providing the ``"Acronym"`` and
            ``"Abstract"`` entries. The row itself is not modified.

    Returns:
        The masked abstract. The abstract is returned unchanged when the
        acronym is not a non-empty string.
    """
    acronym = s["Acronym"]
    abstract = s["Abstract"]

    # An empty pattern matches at every position and would insert the mask
    # between all characters; a non-string value (e.g. None/NaN) cannot be
    # escaped at all. In both cases there is nothing meaningful to mask.
    if not isinstance(acronym, str) or not acronym:
        return abstract

    # Build the result without assigning back into `s`: functions passed to
    # DataFrame.apply must not mutate the object they receive.
    return re.sub(re.escape(acronym), "<MASKED_ACRONYM>", abstract, flags=re.IGNORECASE)

# Mask the acronym inside the abstracts of every split.
for split_df in (train_df, val_df, test_df):
    split_df["Abstract"] = split_df.apply(replace_with_mask, axis=1)

In [19]:
# Spot-check one training abstract after masking.
train_df["Abstract"].iloc[3177]

"In criminal proceedings, sometimes it is not easy to evaluate the sincerity of oral testimonies. <MASKED_ACRONYM> - DEception in COURt corpus - has been built with the aim of training models suitable to discriminate, from a stylometric point of view, between sincere and deceptive statements. <MASKED_ACRONYM> is a collection of hearings held in four Italian Courts, in which the speakers lie in front of the judge. These hearings become the object of a specific criminal proceeding for calumny or false testimony, in which the deceptiveness of the statements of the defendant is ascertained. Thanks to the final Court judgment, that points out which lies are told, each utterance of the corpus has been annotated as true, uncertain or false, according to its degree of truthfulness. Since the judgment of deceptiveness follows a judicial inquiry, the annotation has been realized with a greater degree of confidence than ever before. Moreover, in Italy this is the first corpus of deceptive texts n

In [23]:
# Replace each <MASKED_ACRONYM> with a random word from the abstract (other than <MASKED_ACRONYM>)
def replace_mask_with_random_word(s, rng=None):
    """Return the row's abstract with each ``<MASKED_ACRONYM>`` replaced by a random word.

    Candidate words are the unique whitespace-separated tokens of the abstract
    once every ``<MASKED_ACRONYM>`` has been removed. Each mask occurrence gets
    its own independent draw.

    Args:
        s: Mapping-like row (for example the Series handed over by
            ``DataFrame.apply(..., axis=1)``) providing an ``"Abstract"``
            string. The row itself is not modified.
        rng: Optional object with a ``choice(seq)`` method, such as a seeded
            ``random.Random``. Defaults to the module-level ``random``
            generator.

    Returns:
        The abstract with every mask replaced. If the abstract contains no
        candidate word (nothing but masks and whitespace), it is returned
        unchanged because there is nothing to draw from.
    """
    mask = '<MASKED_ACRONYM>'
    abstract = s["Abstract"]

    # Sort the unique words: set iteration order depends on per-process string
    # hashing, so a sorted list is required for a seeded generator to pick the
    # same words on every run. str.split() never yields empty strings.
    candidates = sorted(set(abstract.replace(mask, '').split()))

    # random.choice raises IndexError on an empty sequence.
    if not candidates:
        return abstract

    chooser = random if rng is None else rng

    # Single left-to-right pass; the callable's return value is inserted
    # literally (no backslash/group processing), one draw per occurrence.
    return re.sub(re.escape(mask), lambda _match: chooser.choice(candidates), abstract)
    

In [24]:
# Seed the module-level RNG that replace_mask_with_random_word draws from, so
# the sequence of random draws is the same every time this cell is run.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

train_df["Abstract"] = train_df.apply(replace_mask_with_random_word, axis=1)
val_df["Abstract"] = val_df.apply(replace_mask_with_random_word, axis=1)
test_df["Abstract"] = test_df.apply(replace_mask_with_random_word, axis=1)

In [25]:
# Spot-check the same training abstract after the masks were replaced.
train_df["Abstract"].iloc[3177]

"In criminal proceedings, sometimes it is not easy to evaluate the sincerity of oral testimonies. uncertain - DEception in COURt corpus - has been built with the aim of training models suitable to discriminate, from a stylometric point of view, between sincere and deceptive statements. judicial is a collection of hearings held in four Italian Courts, in which the speakers lie in front of the judge. These hearings become the object of a specific criminal proceeding for calumny or false testimony, in which the deceptiveness of the statements of the defendant is ascertained. Thanks to the final Court judgment, that points out which lies are told, each utterance of the corpus has been annotated as true, uncertain or false, according to its degree of truthfulness. Since the judgment of deceptiveness follows a judicial inquiry, the annotation has been realized with a greater degree of confidence than ever before. Moreover, in Italy this is the first corpus of deceptive texts not relying on \

In [26]:
# Lowercase every word that is not at the start of a sentence; the first word of
# each sentence is capitalized (first letter upper case, remaining letters lower case).
def convert_to_lowercase(s):
    """Return the row's abstract with sentence-style casing applied.

    The abstract is cut into sentences at whitespace that follows a ``.`` or
    ``?`` (subject to the look-behind exclusions in the pattern below). In each
    sentence the first word is passed through ``str.capitalize`` and all other
    words through ``str.lower``. Words are re-joined with single spaces, and
    sentences that contain no words are dropped.

    Args:
        s: Mapping-like row providing an ``"Abstract"`` string.

    Returns:
        The re-cased abstract as a single string.
    """
    def _recase(tokens):
        # Capitalize the leading token, lowercase the remainder.
        head, *tail = tokens
        return " ".join([head.capitalize(), *map(str.lower, tail)])

    chunks = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', s["Abstract"])
    token_lists = (chunk.split() for chunk in chunks)

    # Skip chunks that hold no words (e.g. an empty abstract).
    return " ".join(_recase(tokens) for tokens in token_lists if tokens)

In [27]:
# Apply the sentence-casing normalization to every split.
for split_df in (train_df, val_df, test_df):
    split_df["Abstract"] = split_df.apply(convert_to_lowercase, axis=1)

In [28]:
# Spot-check the same training abstract after case normalization.
train_df["Abstract"].iloc[3177]

"In criminal proceedings, sometimes it is not easy to evaluate the sincerity of oral testimonies. Uncertain - deception in court corpus - has been built with the aim of training models suitable to discriminate, from a stylometric point of view, between sincere and deceptive statements. Judicial is a collection of hearings held in four italian courts, in which the speakers lie in front of the judge. These hearings become the object of a specific criminal proceeding for calumny or false testimony, in which the deceptiveness of the statements of the defendant is ascertained. Thanks to the final court judgment, that points out which lies are told, each utterance of the corpus has been annotated as true, uncertain or false, according to its degree of truthfulness. Since the judgment of deceptiveness follows a judicial inquiry, the annotation has been realized with a greater degree of confidence than ever before. Moreover, in italy this is the first corpus of deceptive texts not relying on \

In [None]:
# Write each processed split next to its source file, one JSON record per line.
for split_df, filename in (
    (train_df, "al_train.jsonl"),
    (val_df, "al_val.jsonl"),
    (test_df, "al_test.jsonl"),
):
    split_df.to_json(path + filename, orient='records', lines=True)