In [None]:
import numpy as np
import pandas as pd
import copy
import glob

import torch
from torch.utils.data import DataLoader, TensorDataset, random_split
from transformers import LongformerTokenizer, LongformerForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm
from sklearn.metrics import f1_score

import gc

In [None]:
# Total number of notes sampled per model; generate_samples draws n_sample // 2 from each class.
n_sample = 200

## Step 1: Pool only the data with ground truth ECOG PS

In [None]:
def get_all_data(model, original_text):
    """Load and stack a model's validation and test result files.

    Args:
        model: File-name prefix of the model whose results are read (the
            notebook passes "CNN" and "LongFormer").
        original_text: If truthy, read the "Original Text" result files;
            otherwise read the "PS Removed Text" ones.

    Returns:
        A DataFrame holding the validation rows followed by the test rows,
        re-indexed from 0.
    """
    if original_text:
        paths = (
            f"{model} validation result (Valid PS - Original Text).csv",
            f"{model} test result (Valid PS - Original Text).csv",
        )
    else:
        paths = (
            f"{model} validation result (Valid PS - PS Removed Text).csv",
            f"{model} test result (Valid PS - PS Removed Text).csv",
        )

    # Validation first, then test; generate_samples later looks rows up by
    # this fresh 0..n-1 index in both text variants.
    frames = [pd.read_csv(path) for path in paths]
    return pd.concat(frames, axis=0).reset_index(drop=True)

In [None]:
# Pool the CNN validation + test results for each text variant.
CNN_all_original = get_all_data("CNN", True)
CNN_all_ps_removed = get_all_data("CNN", False)

In [None]:
# Pool the LongFormer validation + test results for each text variant.
LongFormer_all_original = get_all_data("LongFormer", True)
LongFormer_all_ps_removed = get_all_data("LongFormer", False)

## Step 2: Randomly sample 100 ground truth positive and 100 ground truth negative texts from the data set

In [None]:
def generate_samples(model_all_original, model_all_ps_removed, n_total=None, random_state=0):
    """Draw a class-balanced sample and the matching PS-removed rows.

    Args:
        model_all_original: Results for the original text; must contain a
            ``high_ps`` column with 0/1 ground-truth labels.
        model_all_ps_removed: Results for the PS-removed text. Rows are looked
            up by the index labels sampled from ``model_all_original``, so the
            two frames must share the same index.
        n_total: Total number of rows to draw (``n_total // 2`` per class).
            Defaults to the notebook-level ``n_sample``.
        random_state: Seed passed to ``DataFrame.sample`` for both classes.

    Returns:
        ``(original_samples, ps_removed_samples)``: two row-aligned DataFrames,
        negatives first then positives, each re-indexed from 0.

    Raises:
        ValueError: From ``DataFrame.sample`` if a class has fewer than
            ``n_total // 2`` rows.
        KeyError: If a sampled index label is missing from
            ``model_all_ps_removed``.
    """
    if n_total is None:
        n_total = n_sample
    per_class = n_total // 2

    labels = model_all_original['high_ps']
    negatives = model_all_original[labels == 0].sample(n=per_class, random_state=random_state)
    positives = model_all_original[labels == 1].sample(n=per_class, random_state=random_state)
    original_samples = pd.concat([negatives, positives])

    # Pick the very same rows (by index label) from the PS-removed frame
    # before either index is reset.
    ps_removed_samples = model_all_ps_removed.loc[original_samples.index]

    return original_samples.reset_index(drop=True), ps_removed_samples.reset_index(drop=True)

In [None]:
# Balanced CNN sample plus the same rows taken from the PS-removed results.
CNN_original_samples, CNN_ps_removed_samples = generate_samples(CNN_all_original, CNN_all_ps_removed)

In [None]:
# Balanced LongFormer sample plus the same rows taken from the PS-removed results.
LongFormer_original_samples, LongFormer_ps_removed_samples = generate_samples(LongFormer_all_original, LongFormer_all_ps_removed)

## Step 3: Generate sentence removal files

In [None]:
# Load the tokenizer
tokenizer = LongformerTokenizer.from_pretrained("./best_Longformer_model")

# Initialize the model architecture
model = LongformerForSequenceClassification.from_pretrained("allenai/longformer-large-4096", num_labels=2)

# Load the saved weights into the model. map_location="cpu" lets a checkpoint
# that was saved from a GPU load on a CPU-only machine; the model is moved to
# `device` below either way.
model.load_state_dict(torch.load("./best_Longformer_model/pytorch_model.bin", map_location="cpu"))
model = torch.nn.DataParallel(model)

# If using GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

In [None]:
MAX_TOKENS = 4096
def filter_exceeding_texts(note, tokenizer):
    """Keep only the tail of a note that exceeds MAX_TOKENS tokens.

    Args:
        note: The text to check.
        tokenizer: Object providing ``tokenize(str)`` and
            ``convert_tokens_to_string(list)``.

    Returns:
        ``note`` unchanged when it tokenizes to at most MAX_TOKENS tokens;
        otherwise the string rebuilt from its last MAX_TOKENS tokens.
    """
    pieces = tokenizer.tokenize(note)
    if len(pieces) <= MAX_TOKENS:
        return note
    # Too long: drop tokens from the front and turn the tail back into text.
    return tokenizer.convert_tokens_to_string(pieces[-MAX_TOKENS:])

In [None]:
def generate_text(text):
    """Build every leave-one-sentence-out variant of a text.

    The text is coerced with ``str()`` and split on the literal "." character,
    so a text ending in "." yields a final empty "sentence".

    Args:
        text: The note to perturb (any object; converted with ``str``).

    Returns:
        A DataFrame with one row per sentence and the columns
        "Removed Sentence ID" (1-based position), "Remaining Text" (the other
        sentences re-joined with ".") and "Removed Sentence".
    """
    sentences = str(text).split(".")

    ids = []
    remaining = []
    removed = []
    for position, sentence in enumerate(sentences):
        ids.append(position + 1)
        # Everything before and after the sentence at `position`.
        remaining.append(".".join(sentences[:position] + sentences[position + 1:]))
        removed.append(sentence)

    return pd.DataFrame({
        "Removed Sentence ID": ids,
        "Remaining Text": remaining,
        "Removed Sentence": removed,
    })

In [None]:
def generate_sentence_removal_file(model, original_text):
    """Write one sentence-removal CSV per sampled note for the given model.

    For every sampled row the note is expanded with ``generate_text`` into its
    leave-one-sentence-out variants, annotated with the row's ground truth and
    the model's prediction on the unmodified note, and saved as
    "{model} (Sentence Removal) ({k}) (Valid PS - ...).csv" (k is 1-based).

    Args:
        model: "CNN" or "LongFormer". Selects both the sample frames that are
            read and the output file prefix.
        original_text: If truthy, use the original-text samples and their
            "text" column; otherwise the PS-removed samples and "text_no_ps".

    Raises:
        ValueError: If ``model`` is not one of the supported names.
    """
    # Each model must be paired with its OWN samples so that the "Original ..."
    # columns hold that model's prediction, logits and probabilities.
    # assumes the CNN result files carry the same columns as the LongFormer ones — TODO confirm
    samples_by_model = {
        "CNN": (CNN_original_samples, CNN_ps_removed_samples),
        "LongFormer": (LongFormer_original_samples, LongFormer_ps_removed_samples),
    }
    if model not in samples_by_model:
        raise ValueError(f"Unknown model {model!r}; expected one of {sorted(samples_by_model)}")

    original_rows, ps_removed_rows = samples_by_model[model]
    if original_text:
        target_rows, text_column = original_rows, "text"
    else:
        target_rows, text_column = ps_removed_rows, "text_no_ps"

    # (output column, source column) pairs copied onto every generated row.
    carried_columns = [
        ("Ground Truth", "high_ps"),
        ("Original Prediction", "Prediction"),
        ("Original Logits (Class 0)", "Logits (Class 0)"),
        ("Original Logits (Class 1)", "Logits (Class 1)"),
        ("Original Probability (Class 0)", "Probability (Class 0)"),
        ("Original Probability (Class 1)", "Probability (Class 1)"),
    ]

    for i in range(0, len(target_rows)):
        row = target_rows.iloc[i]
        text = row[text_column]
        if model == "LongFormer":
            text = filter_exceeding_texts(text, tokenizer) # The LongFormer model only sees the last 4096 tokens
        result_df = generate_text(text)
        n_rows = len(result_df)

        result_df.insert(0, "Original Text", [text] * n_rows)
        for output_column, source_column in carried_columns:
            result_df[output_column] = [row[source_column]] * n_rows

        if original_text:
            result_df.to_csv(f"{model} (Sentence Removal) ({i + 1}) (Valid PS - Original Text).csv", index = False)
        else:
            result_df.to_csv(f"{model} (Sentence Removal) ({i + 1}) (Valid PS - PS Removed Text).csv", index = False)

In [None]:
# Write the per-note sentence-removal CSVs under the "CNN" file prefix, for both text variants.
generate_sentence_removal_file("CNN", True)
generate_sentence_removal_file("CNN", False)

In [None]:
# Write the per-note sentence-removal CSVs under the "LongFormer" file prefix, for both text variants.
generate_sentence_removal_file("LongFormer", True)
generate_sentence_removal_file("LongFormer", False)

## Step 4: Run model inference on sentence removal files

#### CNN

In [None]:
# import libraries
import pandas as pd
import numpy as np
import pickle

from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Input, Dense, Dropout, Activation, concatenate, Average
from tensorflow.keras.layers import Embedding
from tensorflow.keras import initializers, regularizers, constraints, optimizers, layers

from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, LSTM, TimeDistributed, GRU, Bidirectional, Layer
from tensorflow.keras import backend as K

import tensorflow as tf
import os

import torch

In [None]:
# Expose only GPU 0 to libraries that read CUDA_VISIBLE_DEVICES from this point on.
# NOTE(review): an earlier cell already moved a torch model to `device`; if CUDA
# was initialised then, this setting may not change what torch sees — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# `device` is not referenced by the CNN cells in this section.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Load the previously fitted tokenizer for the CNN (texts_to_sequences is called
# on it in run_inference). This rebinds the notebook-level `tokenizer` name that
# an earlier cell bound to the Longformer tokenizer.
# NOTE(review): pickle.load can execute arbitrary code; only load a trusted file.
with open('notes_tokenizer_ps_find.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [None]:
def get_simple_model(target):
    """Build and compile the single-convolution CNN text classifier.

    Architecture: Embedding(10000, 256) -> Conv1D(500 filters, kernel 3) ->
    GlobalMaxPooling1D -> Dense(128) + ReLU -> Dense(1, linear).
    The single output is a logit; the loss is binary cross-entropy with
    ``from_logits=True``.

    Args:
        target: Unused; kept so existing callers (e.g.
            ``get_simple_model('ps_high')``) keep working.

    Returns:
        The compiled Keras ``Model`` taking sequences of length 2000.
    """
    vocab_size = 10000
    embedding_dims = 256
    # The convolution has always been built with 500 filters; keep that value
    # so previously saved weights still match the layer shapes.
    filters = 500
    kernel_size = 3
    hidden_dims = 128
    max_note_length = 2000

    # make model
    text_input = Input(shape=(max_note_length,), dtype='float32')

    text_embed = Embedding(vocab_size, embedding_dims, input_length=max_note_length, mask_zero=False)(text_input)

    cnn1 = Conv1D(filters=filters, kernel_size=kernel_size, strides=1, padding='valid')(text_embed)
    x = GlobalMaxPooling1D()(cnn1)

    hidden = Dense(hidden_dims)(x)
    hidden = Activation('relu')(hidden)

    output = Dense(1, activation='linear')(hidden)

    model = Model(inputs=text_input, outputs=output)

    model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

In [None]:
# Build the CNN and load its weights from "ps_high.h5". This rebinds the
# notebook-level `model` name that an earlier cell bound to the Longformer model.
model = get_simple_model('ps_high')
model.load_weights('ps_high'+'.h5')

In [None]:
def generate_logits_and_probability(logits_list):
    """Turn single-logit model outputs into per-class logits, probabilities and labels.

    Args:
        logits_list: Iterable of rows whose first element is the class-1 logit
            (the CNN's ``model.predict`` output is passed here).

    Returns:
        A 5-tuple of equally long lists, in this order: class-1 logits,
        class-0 logits (the negated class-1 logits), class-1 probabilities
        (sigmoid of the logit), class-0 probabilities (1 - class-1), and the
        0/1 prediction (1 when the class-1 probability is strictly larger).
    """
    logit_class_1 = [row[0] for row in logits_list]
    logit_class_0 = [-value for value in logit_class_1]

    # Evaluate the sigmoid once per logit and derive the complement from it.
    probability_class_1 = [tf.math.sigmoid(value).numpy() for value in logit_class_1]
    probability_class_0 = [1 - p_one for p_one in probability_class_1]

    prediction = []
    for p_one, p_zero in zip(probability_class_1, probability_class_0):
        prediction.append(1 if p_one > p_zero else 0)

    return logit_class_1, logit_class_0, probability_class_1, probability_class_0, prediction

In [None]:
def run_inference(n_sample, original_text):
    """Score every CNN sentence-removal file and write the results back in place.

    For files 1..n_sample, the "Remaining Text" column is tokenized with the
    notebook-level ``tokenizer``, padded to 2000 tokens, scored with the
    notebook-level ``model``, and the prediction, logits and probabilities are
    stored as new columns before the CSV is overwritten.

    Args:
        n_sample: Number of files to process (file numbers are 1-based).
        original_text: If truthy, process the "Original Text" files; otherwise
            the "PS Removed Text" files.
    """
    max_note_length = 2000

    for i in range(1, n_sample + 1):
        if original_text:
            file_name =  f"CNN (Sentence Removal) ({i}) (Valid PS - Original Text).csv"
        else:
            file_name =  f"CNN (Sentence Removal) ({i}) (Valid PS - PS Removed Text).csv"
        data = pd.read_csv(file_name)

        # Anything that is not a plain str (e.g. NaN from an empty cell) is scored as "".
        texts = [value if type(value) == str else "" for value in data["Remaining Text"].tolist()]

        token_ids = tokenizer.texts_to_sequences(texts)
        input_text = sequence.pad_sequences(token_ids, maxlen=max_note_length, padding='post')
        logits_list = model.predict(input_text)

        logit_class_1, logit_class_0, probability_class_1, probability_class_0, prediction = generate_logits_and_probability(logits_list)

        new_columns = (
            ("Prediction", prediction),
            ("Logits (Class 0)", logit_class_0),
            ("Logits (Class 1)", logit_class_1),
            ("Probability (Class 0)", probability_class_0),
            ("Probability (Class 1)", probability_class_1),
        )
        for column_name, values in new_columns:
            data[column_name] = values
        data.to_csv(file_name, index = False)

In [None]:
# Score the CNN sentence-removal files for both text variants. A later cell
# redefines `run_inference` for LongFormer, so this must run before that cell.
run_inference(n_sample, True)
run_inference(n_sample, False)

#### LongFormer

In [None]:
import os

import numpy as np
import pandas as pd

import torch
from torch.utils.data import DataLoader, TensorDataset, random_split
from transformers import LongformerTokenizer, LongformerForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm
from sklearn.metrics import f1_score

from sklearn.utils import resample
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc
from sklearn.metrics import roc_curve
from sklearn.metrics import average_precision_score

import itertools

import glob

In [None]:
# Request GPUs 0-3 for the Longformer run.
# NOTE(review): earlier cells in this notebook already use torch/TensorFlow; if
# CUDA was initialised by then, this setting may not take effect — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"

In [None]:
# Load the tokenizer (rebinds `tokenizer`, which the CNN section bound to its pickled tokenizer)
tokenizer = LongformerTokenizer.from_pretrained("./best_Longformer_model")

# Initialize the model architecture (rebinds `model`, which the CNN section bound to the Keras CNN)
model = LongformerForSequenceClassification.from_pretrained("allenai/longformer-large-4096", num_labels=2)

# Load the saved weights into the model. map_location="cpu" lets a checkpoint
# that was saved from a GPU load on a CPU-only machine; the model is moved to
# `device` below either way.
model.load_state_dict(torch.load("./best_Longformer_model/pytorch_model.bin", map_location="cpu"))
model = torch.nn.DataParallel(model)

# If using GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

In [None]:
def warn_if_truncated(texts, max_length):
    """Print a warning for each text whose token count exceeds ``max_length``.

    Uses the notebook-level ``tokenizer``; the count is that of
    ``tokenizer.tokenize(text)``.

    Args:
        texts: Iterable of strings to check.
        max_length: Token limit to compare against.
    """
    for text in texts:
        token_count = len(tokenizer.tokenize(text))
        if token_count <= max_length:
            continue
        print(f"Warning: Text with length {token_count} is truncated to {max_length} tokens.")

In [None]:
MAX_TOKENS = 4096
def filter_exceeding_texts(notes, tokenizer):
    """Trim each over-long note down to its last MAX_TOKENS tokens.

    Args:
        notes: Iterable of note strings.
        tokenizer: Object providing ``tokenize(str)`` and
            ``convert_tokens_to_string(list)``.

    Returns:
        A list with one entry per input note, in order: the note itself when
        it has at most MAX_TOKENS tokens, otherwise the string rebuilt from
        its last MAX_TOKENS tokens.
    """
    def keep_tail(note):
        pieces = tokenizer.tokenize(note)
        if len(pieces) > MAX_TOKENS:
            # Drop tokens from the front; only the tail is kept.
            return tokenizer.convert_tokens_to_string(pieces[-MAX_TOKENS:])
        return note

    return [keep_tail(note) for note in notes]

In [None]:
MAX_TOKENS = 4096
def encode_data(texts, max_length=MAX_TOKENS):
    """Tokenize texts into padded, truncated PyTorch tensors.

    Calls ``warn_if_truncated`` first, then encodes with the notebook-level
    ``tokenizer`` using ``truncation=True`` and ``padding=True``.

    Args:
        texts: The strings to encode.
        max_length: Maximum sequence length handed to the tokenizer
            (defaults to MAX_TOKENS).

    Returns:
        ``(input_ids, attention_masks)`` taken from the tokenizer output.
    """
    warn_if_truncated(texts, max_length)
    batch = tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors="pt",
    )
    return batch['input_ids'], batch['attention_mask']

In [None]:
def softmax(logits):
    """Row-wise softmax of a 2-D array of logits.

    The row maximum is subtracted before exponentiating so large logits do
    not overflow.

    Args:
        logits: 2-D array-like; axis 1 indexes the classes.

    Returns:
        Array of the same shape whose rows sum to 1.
    """
    row_max = np.max(logits, axis=1, keepdims=True)
    exponentials = np.exp(logits - row_max)
    row_total = exponentials.sum(axis=1, keepdims=True)
    return exponentials / row_total

In [None]:
def run_inference(n_sample, original_text):
    """Score every LongFormer sentence-removal file and write the results back in place.

    For files 1..n_sample, the "Remaining Text" column is trimmed with
    ``filter_exceeding_texts``, encoded with ``encode_data`` and scored in
    batches of 64 with the notebook-level ``model`` on ``device``. Prediction,
    logits and softmax probabilities are added as columns before the CSV is
    overwritten.

    Args:
        n_sample: Number of files to process (file numbers are 1-based).
        original_text: If truthy, process the "Original Text" files; otherwise
            the "PS Removed Text" files.
    """
    batch_size = 64

    for i in range(1, n_sample + 1):
        if original_text:
            file_name =  f"LongFormer (Sentence Removal) ({i}) (Valid PS - Original Text).csv"
        else:
            file_name =  f"LongFormer (Sentence Removal) ({i}) (Valid PS - PS Removed Text).csv"
        data = pd.read_csv(file_name)

        # Anything that is not a plain str (e.g. NaN from an empty cell) is scored as "".
        remaining = [value if type(value) == str else "" for value in data["Remaining Text"].tolist()]
        remaining = filter_exceeding_texts(remaining, tokenizer)

        input_ids, attention_masks = encode_data(remaining)
        loader = DataLoader(
            TensorDataset(input_ids, attention_masks),
            batch_size=batch_size,
            shuffle=False,  # keep outputs aligned with the rows of `data`
        )

        model.eval()

        collected_logits = []  # one [class-0, class-1] pair per row
        preds = []

        with torch.no_grad():
            for inputs, masks in tqdm(loader, desc="Test", position=0, leave=True):
                logits = model(inputs.to(device), attention_mask=masks.to(device)).logits
                preds.extend(torch.argmax(logits, dim=1).tolist())
                collected_logits.extend(logits.tolist())

        logits_array = np.array(collected_logits)
        probability = softmax(logits_array)

        data["Prediction"] = preds
        for class_id in (0, 1):
            data[f"Logits (Class {class_id})"] = logits_array[:, class_id]
        for class_id in (0, 1):
            data[f"Probability (Class {class_id})"] = probability[:, class_id]
        data.to_csv(file_name, index = False)

In [None]:
# Score the LongFormer sentence-removal files for both text variants, using
# the `run_inference` defined in this section (it replaces the CNN version).
run_inference(n_sample, True)
run_inference(n_sample, False)

## Step 5: Calculate the logits and probability difference

In [None]:
def add_logits_and_probability_difference(model, n_sample, original_text):
    """Add signed and absolute "after removal minus original" columns to each file.

    For files 1..n_sample of the given model, the difference between each
    score on the sentence-removed text and the corresponding "Original ..."
    score is appended (five signed columns, then five absolute columns) and
    the CSV is overwritten.

    Args:
        model: File-name prefix ("CNN" or "LongFormer" in this notebook).
        n_sample: Number of files to process (file numbers are 1-based).
        original_text: If truthy, process the "Original Text" files; otherwise
            the "PS Removed Text" files.
    """
    # (score after removal, original score, signed column, absolute column)
    column_plan = [
        ("Prediction", "Original Prediction",
         "Prediction Difference", "Absolute Prediction Difference"),
        ("Logits (Class 0)", "Original Logits (Class 0)",
         "Logits (Class 0) Difference", "Absolute Logits (Class 0) Difference"),
        ("Logits (Class 1)", "Original Logits (Class 1)",
         "Logits (Class 1) Difference", "Absolute Logits (Class 1) Difference"),
        ("Probability (Class 0)", "Original Probability (Class 0)",
         "Probability (Class 0) Difference", "Absolute Probability (Class 0) Difference"),
        ("Probability (Class 1)", "Original Probability (Class 1)",
         "Probability (Class 1) Difference", "Absolute Probability (Class 1) Difference"),
    ]

    for i in range(1, n_sample + 1):
        if original_text:
            file_name =  f"{model} (Sentence Removal) ({i}) (Valid PS - Original Text).csv"
        else:
            file_name =  f"{model} (Sentence Removal) ({i}) (Valid PS - PS Removed Text).csv"
        data = pd.read_csv(file_name)

        # Two passes keep the column order: all signed columns, then all absolute ones.
        for current, original, signed_name, _ in column_plan:
            data[signed_name] = data[current] - data[original]
        for _, _, signed_name, absolute_name in column_plan:
            data[absolute_name] = data[signed_name].abs()

        data.to_csv(file_name, index = False)

In [None]:
# Append the difference columns to every CNN sentence-removal file (both text variants).
add_logits_and_probability_difference("CNN", n_sample, True)
add_logits_and_probability_difference("CNN", n_sample, False)

In [None]:
# Append the difference columns to every LongFormer sentence-removal file (both text variants).
add_logits_and_probability_difference("LongFormer", n_sample, True)
add_logits_and_probability_difference("LongFormer", n_sample, False)

## Step 6: Generate the explainability dataframe

#### CNN

In [None]:
def generate_CNN_explainability_dataframe(n_sample, original_text):
    """Summarise the CNN sentence-removal files into one row per note.

    For files 1..n_sample this records the note's ground truth and original
    scores (taken from the first row), the sum of the class-1 probability
    differences, how many sentences moved that probability by more than 0.01
    in either direction, the sentence count, and the sentences with the
    largest and smallest difference. Each input file is rewritten sorted by
    the class-1 probability difference (descending), and the summary is saved
    as "Explainability Analysis (CNN) (Valid PS - ...).csv".

    Args:
        n_sample: Number of files to summarise (file numbers are 1-based).
        original_text: If truthy, use the "Original Text" files; otherwise
            the "PS Removed Text" files.

    Returns:
        The summary DataFrame (one row per file).
    """
    diff_column = "Probability (Class 1) Difference"

    # (summary column, per-file column read from the first row)
    first_row_fields = [
        ("Ground Truth", "Ground Truth"),
        ("Original prediction", "Original Prediction"),
        ("Original Logits (Class 0)", "Original Logits (Class 0)"),
        ("Original Logits (Class 1)", "Original Logits (Class 1)"),
        ("Original Probability (Class 0)", "Original Probability (Class 0)"),
        ("Original Probability (Class 1)", "Original Probability (Class 1)"),
    ]

    # Insertion order here is the column order of the summary.
    summary = {"Text file name": []}
    for summary_column, _ in first_row_fields:
        summary[summary_column] = []
    for summary_column in (
        "Diff sum",
        "Num positive significant sentences (>0.01)",
        "Num negative significant sentences (<-0.01)",
        "Total sentences",
        "Most positive sentence",
        "Most positive value",
        "Most negative sentence",
        "Most negative value",
    ):
        summary[summary_column] = []

    for i in range(1, n_sample + 1):
        if original_text:
            file_name =  f"CNN (Sentence Removal) ({i}) (Valid PS - Original Text).csv"
        else:
            file_name =  f"CNN (Sentence Removal) ({i}) (Valid PS - PS Removed Text).csv"
        data = pd.read_csv(file_name)
        differences = data[diff_column]

        summary["Text file name"].append(file_name.split("/")[-1])
        for summary_column, file_column in first_row_fields:
            summary[summary_column].append(data[file_column].iloc[0])
        summary["Diff sum"].append(differences.sum())
        summary["Num positive significant sentences (>0.01)"].append(int((differences > 0.01).sum()))
        summary["Num negative significant sentences (<-0.01)"].append(int((differences < -0.01).sum()))
        summary["Total sentences"].append(len(data))

        # Persist the file ranked from most positive to most negative effect.
        ranked = data.sort_values(by=diff_column, ascending=False)
        ranked.to_csv(file_name, index = False)
        summary["Most positive sentence"].append(ranked['Removed Sentence'].iloc[0])
        summary["Most positive value"].append(ranked[diff_column].iloc[0])
        summary["Most negative sentence"].append(ranked['Removed Sentence'].iloc[-1])
        summary["Most negative value"].append(ranked[diff_column].iloc[-1])

    CNN_explainability = pd.DataFrame(summary)
    if original_text:
        save_file_name = f"Explainability Analysis (CNN) (Valid PS - Original Text).csv"
    else:
        save_file_name = f"Explainability Analysis (CNN) (Valid PS - PS Removed Text).csv"
    CNN_explainability.to_csv(save_file_name, index = False)
    return CNN_explainability

In [None]:
# Build (and save) the per-note CNN summaries for both text variants.
CNN_explainability_original_text = generate_CNN_explainability_dataframe(n_sample, True)
CNN_explainability_ps_removed_text = generate_CNN_explainability_dataframe(n_sample, False)

In [None]:
# Display the CNN summary for the original-text files (one row per sampled note).
CNN_explainability_original_text

In [None]:
# Display the CNN summary for the PS-removed files (one row per sampled note).
CNN_explainability_ps_removed_text

#### LongFormer

In [None]:
def generate_LongFormer_explainability_dataframe(n_sample, original_text):
    """Summarise the LongFormer sentence-removal files into one row per note.

    For files 1..n_sample this records the note's ground truth and original
    scores (taken from the first row), the sum of the class-1 probability
    differences, how many sentences moved that probability by more than 0.01
    in either direction, the sentence count, and the sentences with the
    largest and smallest difference. Each input file is rewritten sorted by
    the class-1 probability difference (descending), and the summary is saved
    as "Explainability Analysis (LongFormer) (Valid PS - ...).csv".

    Args:
        n_sample: Number of files to summarise (file numbers are 1-based).
        original_text: If truthy, use the "Original Text" files; otherwise
            the "PS Removed Text" files.

    Returns:
        The summary DataFrame (one row per file).
    """
    diff_column = "Probability (Class 1) Difference"

    # (summary column, per-file column read from the first row)
    first_row_fields = [
        ("Ground Truth", "Ground Truth"),
        ("Original prediction", "Original Prediction"),
        ("Original Logits (Class 0)", "Original Logits (Class 0)"),
        ("Original Logits (Class 1)", "Original Logits (Class 1)"),
        ("Original Probability (Class 0)", "Original Probability (Class 0)"),
        ("Original Probability (Class 1)", "Original Probability (Class 1)"),
    ]

    # Insertion order here is the column order of the summary.
    summary = {"Text file name": []}
    for summary_column, _ in first_row_fields:
        summary[summary_column] = []
    for summary_column in (
        "Diff sum",
        "Num positive significant sentences (>0.01)",
        "Num negative significant sentences (<-0.01)",
        "Total sentences",
        "Most positive sentence",
        "Most positive value",
        "Most negative sentence",
        "Most negative value",
    ):
        summary[summary_column] = []

    for i in range(1, n_sample + 1):
        if original_text:
            file_name =  f"LongFormer (Sentence Removal) ({i}) (Valid PS - Original Text).csv"
        else:
            file_name =  f"LongFormer (Sentence Removal) ({i}) (Valid PS - PS Removed Text).csv"
        data = pd.read_csv(file_name)
        differences = data[diff_column]

        summary["Text file name"].append(file_name.split("/")[-1])
        for summary_column, file_column in first_row_fields:
            summary[summary_column].append(data[file_column].iloc[0])
        summary["Diff sum"].append(differences.sum())
        summary["Num positive significant sentences (>0.01)"].append(int((differences > 0.01).sum()))
        summary["Num negative significant sentences (<-0.01)"].append(int((differences < -0.01).sum()))
        summary["Total sentences"].append(len(data))

        # Persist the file ranked from most positive to most negative effect.
        ranked = data.sort_values(by=diff_column, ascending=False)
        ranked.to_csv(file_name, index = False)
        summary["Most positive sentence"].append(ranked['Removed Sentence'].iloc[0])
        summary["Most positive value"].append(ranked[diff_column].iloc[0])
        summary["Most negative sentence"].append(ranked['Removed Sentence'].iloc[-1])
        summary["Most negative value"].append(ranked[diff_column].iloc[-1])

    LongFormer_explainability = pd.DataFrame(summary)
    if original_text:
        save_file_name = f"Explainability Analysis (LongFormer) (Valid PS - Original Text).csv"
    else:
        save_file_name = f"Explainability Analysis (LongFormer) (Valid PS - PS Removed Text).csv"
    LongFormer_explainability.to_csv(save_file_name, index = False)
    return LongFormer_explainability

In [None]:
# Build (and save) the per-note LongFormer summaries for both text variants.
LongFormer_explainability_original_text = generate_LongFormer_explainability_dataframe(n_sample, True)
LongFormer_explainability_ps_removed_text = generate_LongFormer_explainability_dataframe(n_sample, False)

In [None]:
# Display the LongFormer summary for the original-text files (one row per sampled note).
LongFormer_explainability_original_text

In [None]:
# Display the LongFormer summary for the PS-removed files (one row per sampled note).
LongFormer_explainability_ps_removed_text