# Importing libraries

In [1]:
import sys
import subprocess
import os
import numpy as np
import pandas as pd
import re
import seaborn as sns
import torch
import matplotlib.pyplot as plt
from collections import Counter
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import shap
from lime.lime_text import LimeTextExplainer
from scipy.stats import pearsonr
from scipy.spatial.distance import jensenshannon
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix


  import pynvml  # type: ignore[import]


# Loading model and tokenizer

In [11]:
# Setting paths for model and results, as saved in "Replicating_Hearts_Model"
MODEL_DIR = "/home/ec2-user/HEARTS_Replication/model_output_albertv2/emgsd_trained"
RESULTS_DIR = "/home/ec2-user/HEARTS_Replication/results/explainability"
os.makedirs(RESULTS_DIR, exist_ok=True)

model_path = MODEL_DIR
print(f"\nModel directory: {MODEL_DIR}")
print(f"Results directory: {RESULTS_DIR}")


Model directory: /home/ec2-user/HEARTS_Replication/model_output_albertv2/emgsd_trained
Results directory: /home/ec2-user/HEARTS_Replication/results/explainability


In [4]:
# Loading model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)

device = 0 if torch.cuda.is_available() else -1

# Defining label mapping (stereotype=0, neutral=1, unrelated=2)
label_to_id = {"stereotype": 0, "neutral": 1, "unrelated": 2}
id_to_label = {v: k for k, v in label_to_id.items()}
class_names = [id_to_label[i] for i in range(len(id_to_label))]


def predict_proba(text_list):
    """
    Wrapper that takes a list of texts and returns a (N, 3) numpy array
    with probabilities for [stereotype, neutral, unrelated].
    """
    if isinstance(text_list, str):
        text_list = [text_list]

    enc = tokenizer(
        text_list,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )

    if torch.cuda.is_available():
        enc = {k: v.to("cuda") for k, v in enc.items()}
        model.to("cuda")
    else:
        model.to("cpu")

    model.eval()
    with torch.no_grad():
        outputs = model(**enc)
        probs = torch.softmax(outputs.logits, dim=-1).cpu().numpy()

    return probs

# Loading dataset

In [5]:
# Loading EMGSD again
dataset = load_dataset("holistic-ai/EMGSD")
test_ds = dataset["test"]

# Creating the Hearts Framework pipeline, as defined by King et al. (2024)
pipe = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    return_all_scores=False,  
    device=0
)

# Generating predicition meta data for explainability tasks
rows = []
for i, (text, cat) in enumerate(zip(test_ds["text"], test_ds["category"])):
    out = pipe(text)[0]         # e.g. {'label': 'LABEL_0', 'score': 0.9}
    pred_id = int(out["label"].split("_")[-1])  # 0 / 1 / 2
    
    rows.append({
        "sentence_id": i,
        "text": text,
        "dataset_name": "EMGSD",
        "categorisation": cat,
        "predicted_label": pred_id,
        "actual_label": label_to_id[cat]
    })

# Saving results for explainability tasks
full_results = pd.DataFrame(rows)
full_results.to_csv("full_results_albertv2.csv", index=False)
print(full_results.head())
print("✅ Saved full_results_albertv2.csv")


Device set to use cuda:0
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


   sentence_id                                               text  \
0            0            Central African rebels left towns dirty   
1            1  I have a feeling he got arrested. You never kn...   
2            2  I have a muslim neighbor who attends Mosque. H...   
3            3  The little girl went to the park. Eggs benedic...   
4            4  My coworker is from China The pencil on the co...   

  dataset_name categorisation  predicted_label  actual_label  
0        EMGSD     stereotype                0             0  
1        EMGSD     stereotype                0             0  
2        EMGSD        neutral                0             1  
3        EMGSD      unrelated                2             2  
4        EMGSD      unrelated                2             2  
✅ Saved full_results_albertv2.csv


# Selecting samples for explainability analysis

In [8]:
print("="*60)
print("SELECTING SAMPLES FOR ANALYSIS")
print("="*60)

# Using the saved results
file_path = "full_results_albertv2.csv"

# Definining a function for obtaining samples
def sample_observations(file_path, k=37, seed=42):
    data = pd.read_csv(file_path)
    
    combinations = data.groupby(['dataset_name', 'categorisation'])
    sampled_data = pd.DataFrame(columns=data.columns)
    
    for name, group in combinations:
        same_label = group[group['predicted_label'] == group['actual_label']]
        diff_label = group[group['predicted_label'] != group['actual_label']]
        
        same_sample = same_label.sample(n=min(k, len(same_label)), random_state=seed)
        diff_sample = diff_label.sample(n=min(k, len(diff_label)), random_state=seed)
        
        sampled_data = pd.concat([sampled_data, same_sample, diff_sample], axis=0)
    
    sampled_data.reset_index(drop=True, inplace=True)
    print("Sampled shape:", sampled_data.shape)
    return sampled_data

# Obtaining samples
sampled_data = sample_observations(file_path, k=20, seed=42)  # you can reduce k
sampled_data.to_csv("sampled_data.csv", index=False)
print(sampled_data.head())

SELECTING SAMPLES FOR ANALYSIS
Sampled shape: (120, 6)
  sentence_id                                               text dataset_name  \
0       10685                     The secretary is a young male.        EMGSD   
1        9316  The umpire knew the pitch was close but had to...        EMGSD   
2        8224  The man from Greece went into the store. He wa...        EMGSD   
3        3009  The Argentine soccer fans were known for their...        EMGSD   
4        1148  The judge was trying his first case. This woma...        EMGSD   

  categorisation predicted_label actual_label  
0        neutral               1            1  
1        neutral               1            1  
2        neutral               1            1  
3        neutral               1            1  
4        neutral               1            1  


# SHAP analysis

In [2]:
# Defining SHAP analysis, following the code written by King et al. (2024)
def shap_analysis(sampled_data, model_path):
    """
    Computes token-level SHAP values for each sentence in sampled_data.

    Steps:
    1. Loads fine–tuned classifier into a HuggingFace pipeline.
    2. Uses SHAP’s text masker to break text into tokens.
    3. Generates SHAP value vectors per token for the predicted class.
    4. Stores SHAP importance scores along with metadata.
    
    Params:
        sampled_data : DataFrame  
            Subset of dataset used for explainability.
        model_path : str  
            Path to the fine-tuned ALBERT/IndicBERT model.

    Returns:
        DataFrame containing (sentence_id, token, SHAP value, metadata).
    """
    # Loading classifier using HuggingFace pipeline
    pipe = pipeline(
        "text-classification",
        model=model_path,
        tokenizer=tokenizer,
        return_all_scores=True,
        device=0
    )
    
    # SHAP text masker: splitting text into tokens using regex
    masker = shap.maskers.Text(tokenizer=r'\b\w+\b')
    
    # Creating SHAP explainer for the model
    explainer = shap.Explainer(pipe, masker)
    
    results = []

    class_names = ['LABEL_0', 'LABEL_1', 'LABEL_2'] 
    
    # Iterating over each selected sentence in the sample
    for index, row in sampled_data.iterrows():
        text_input = [row['text']]
        # Computing SHAP values 
        shap_values = explainer(text_input)  # shape: (1, tokens, classes)
        
        # Logging progress for debugging
        print(
            f"Row {index} | Dataset: {row['dataset_name']} | Cat: {row['categorisation']} | "
            f"Pred: {row['predicted_label']} | Actual: {row['actual_label']}"
        )
        
        # Using model's predicted lable for explanation
        label_index = int(row['predicted_label'])  # Using the model's predicted class
        
        # Extracting the SHAP importance vector for given class
        specific_shap_values = shap_values.values[0, :, label_index]  # (tokens,)
        
        # Extracting tokens using same regex tokenizer used by masker
        tokens = re.findall(r'\w+', row['text'])
        
        # Pairing tokens with corresponding SHAP Score
        for token, value in zip(tokens, specific_shap_values):
            results.append({
                'sentence_id': index, 
                'token': token, 
                'value_shap': float(value),
                'sentence': row['text'],
                'dataset': row['dataset_name'],
                'categorisation': row['categorisation'],
                'predicted_label': int(row['predicted_label']),
                'actual_label': int(row['actual_label'])
            })
    

    return pd.DataFrame(results)

In [12]:

# Obtaining results after applying SHAP to obtained samples 
shap_results = shap_analysis(sampled_data, model_path)
print(shap_results.head())

# Saving results for further comparison
shap_results.to_csv("shap_results.csv", index=False)
print("✅ Saved shap_results.csv")

Device set to use cuda:0


Row 0 | Dataset: EMGSD | Cat: neutral | Pred: 1 | Actual: 1
Row 1 | Dataset: EMGSD | Cat: neutral | Pred: 1 | Actual: 1
Row 2 | Dataset: EMGSD | Cat: neutral | Pred: 1 | Actual: 1
Row 3 | Dataset: EMGSD | Cat: neutral | Pred: 1 | Actual: 1
Row 4 | Dataset: EMGSD | Cat: neutral | Pred: 1 | Actual: 1
Row 5 | Dataset: EMGSD | Cat: neutral | Pred: 1 | Actual: 1
Row 6 | Dataset: EMGSD | Cat: neutral | Pred: 1 | Actual: 1
Row 7 | Dataset: EMGSD | Cat: neutral | Pred: 1 | Actual: 1
Row 8 | Dataset: EMGSD | Cat: neutral | Pred: 1 | Actual: 1
Row 9 | Dataset: EMGSD | Cat: neutral | Pred: 1 | Actual: 1
Row 10 | Dataset: EMGSD | Cat: neutral | Pred: 1 | Actual: 1
Row 11 | Dataset: EMGSD | Cat: neutral | Pred: 1 | Actual: 1
Row 12 | Dataset: EMGSD | Cat: neutral | Pred: 1 | Actual: 1
Row 13 | Dataset: EMGSD | Cat: neutral | Pred: 1 | Actual: 1
Row 14 | Dataset: EMGSD | Cat: neutral | Pred: 1 | Actual: 1
Row 15 | Dataset: EMGSD | Cat: neutral | Pred: 1 | Actual: 1
Row 16 | Dataset: EMGSD | Cat: neu

# LIME Analysis

In [None]:
# Defining a custom tokenizer for LIME analysis
def custom_tokenizer(text):
    """
    Simple regex-based tokenizer for LIME.

    Splits on non-alphanumeric characters and removes empty tokens.
    """
    tokens = re.split(r'\W+', text)
    return [t for t in tokens if t]

In [1]:
# Defining LIME analysis, following the code written by King et al. (2024)
def lime_analysis(sampled_data, model_path):
    """
    Runs LIME text explanations on a fine-tuned classifier and return
    token-level importance scores.

    Params:
    
    sampled_data : pd.DataFrame
        DataFrame containing at least:
          - 'text'             : input sentence (str)
          - 'predicted_label'  : model-predicted class id (int)
          - 'actual_label'     : gold label id (int)
          - 'dataset_name'     : dataset identifier (str)
          - 'categorisation'   : category (e.g. stereotype/neutral/unrelated)
    model_path : str
        Path or Hugging Face hub id of the fine-tuned model to explain.

    Returns:
    
    pd.DataFrame
        Token-level explanations with the following columns:
          - 'sentence_id'      : row index from sampled_data
          - 'token'            : token string
          - 'value_lime'       : LIME importance score (float)
          - 'sentence'         : original sentence text
          - 'dataset'          : dataset_name
          - 'categorisation'   : categorisation label
          - 'predicted_label'  : model-predicted class id (int)
          - 'actual_label'     : gold label id (int)
    """
    # Build HF pipeline around the fine-tuned model
    pipe = pipeline(
        "text-classification",
        model=model_path,
        tokenizer=tokenizer,
        return_all_scores=True,
        device=0
    )
    
    # Defining internal prediction wrapper for LIME
    def predict_proba(texts):
        preds = pipe(texts, return_all_scores=True)
        # shape: [batch, num_classes]
        probs = np.array([[c['score'] for c in one] for one in preds])
        return probs
    
    #  # Dynamically inferring number of classes from model output
    test_probs = predict_proba([sampled_data.iloc[0]['text']])
    num_classes = test_probs.shape[1]
    class_names = [f"LABEL_{i}" for i in range(num_classes)]
    
    # Initializing LIME text explainer
    explainer = LimeTextExplainer(
        class_names=class_names,
        split_expression=lambda x: custom_tokenizer(x) # # split_expression tells LIME how to break sentences into tokens
    )
    
    # Extracting LIME explanations
    # For every sentence, this function:
    #  1. Tokenizes the text using a simple regex tokenizer.
    #  2. Calls LIME to compute feature importance for each token.
    #  3. Aligns token scores back to the sentence.
    #  4. Saves each token + importance + metadata into a dataframe.
    results = []
    
    for index, row in sampled_data.iterrows():
        text_input = row['text']
        tokens = custom_tokenizer(text_input)
        
        try:
            # Generating explanation for given sentence
            exp = explainer.explain_instance(
                text_input,
                predict_proba,
                num_features=len(tokens) if len(tokens) > 0 else 10,
                num_samples=100
            )
            
            # Retrieving model-predicted class for this sentence
            pred_label = int(row['predicted_label'])
            
            # SAFETY CHECK:
            # If LIME does not contain an explanation for this label,
            # fallback to whichever label exists
            if pred_label not in exp.local_exp:
                pred_label = max(exp.local_exp.keys(), key=int)
            
            # Converting LIME explanation into token-value list
            explanation_list = exp.as_list(label=pred_label)
            token_value_dict = {token: value for token, value in explanation_list}
            
            # Printing progress info for the current sentence
            print(
                f"Row {index} | Dataset: {row['dataset_name']} | Cat: {row['categorisation']} | "
                f"Pred: {row['predicted_label']} | Actual: {row['actual_label']}"
            )
            
            # Saving explanation for every token
            for token in tokens:
                value = token_value_dict.get(token, 0.0)
                results.append({
                    'sentence_id': index, 
                    'token': token, 
                    'value_lime': float(value),
                    'sentence': text_input,
                    'dataset': row['dataset_name'],
                    'categorisation': row['categorisation'],
                    'predicted_label': int(row['predicted_label']),
                    'actual_label': int(row['actual_label'])
                })
        
        except Exception as e:
            print(f"  ⚠️ Skipping row {index} due to error: {e}")
            continue
            
    return pd.DataFrame(results)

In [13]:
# Obtaining results after applying SHAP to obtained samples
lime_results = lime_analysis(sampled_data, model_path)
print(lime_results.head())

# Saving results for further comparison 
lime_results.to_csv("lime_results.csv", index=False)
print("✅ Saved lime_results.csv")

Device set to use cuda:0


Row 0 | Dataset: EMGSD | Cat: neutral | Pred: 1 | Actual: 1




Row 1 | Dataset: EMGSD | Cat: neutral | Pred: 1 | Actual: 1




Row 2 | Dataset: EMGSD | Cat: neutral | Pred: 1 | Actual: 1




Row 3 | Dataset: EMGSD | Cat: neutral | Pred: 1 | Actual: 1




Row 4 | Dataset: EMGSD | Cat: neutral | Pred: 1 | Actual: 1




Row 5 | Dataset: EMGSD | Cat: neutral | Pred: 1 | Actual: 1




Row 6 | Dataset: EMGSD | Cat: neutral | Pred: 1 | Actual: 1




Row 7 | Dataset: EMGSD | Cat: neutral | Pred: 1 | Actual: 1




Row 8 | Dataset: EMGSD | Cat: neutral | Pred: 1 | Actual: 1




Row 9 | Dataset: EMGSD | Cat: neutral | Pred: 1 | Actual: 1




Row 10 | Dataset: EMGSD | Cat: neutral | Pred: 1 | Actual: 1




Row 11 | Dataset: EMGSD | Cat: neutral | Pred: 1 | Actual: 1




Row 12 | Dataset: EMGSD | Cat: neutral | Pred: 1 | Actual: 1




Row 13 | Dataset: EMGSD | Cat: neutral | Pred: 1 | Actual: 1




Row 14 | Dataset: EMGSD | Cat: neutral | Pred: 1 | Actual: 1




Row 15 | Dataset: EMGSD | Cat: neutral | Pred: 1 | Actual: 1




Row 16 | Dataset: EMGSD | Cat: neutral | Pred: 1 | Actual: 1




Row 17 | Dataset: EMGSD | Cat: neutral | Pred: 1 | Actual: 1




Row 18 | Dataset: EMGSD | Cat: neutral | Pred: 1 | Actual: 1




Row 19 | Dataset: EMGSD | Cat: neutral | Pred: 1 | Actual: 1




Row 20 | Dataset: EMGSD | Cat: neutral | Pred: 0 | Actual: 1




Row 21 | Dataset: EMGSD | Cat: neutral | Pred: 0 | Actual: 1




Row 22 | Dataset: EMGSD | Cat: neutral | Pred: 0 | Actual: 1




Row 23 | Dataset: EMGSD | Cat: neutral | Pred: 0 | Actual: 1




Row 24 | Dataset: EMGSD | Cat: neutral | Pred: 0 | Actual: 1




Row 25 | Dataset: EMGSD | Cat: neutral | Pred: 0 | Actual: 1




Row 26 | Dataset: EMGSD | Cat: neutral | Pred: 2 | Actual: 1




Row 27 | Dataset: EMGSD | Cat: neutral | Pred: 0 | Actual: 1




Row 28 | Dataset: EMGSD | Cat: neutral | Pred: 0 | Actual: 1




Row 29 | Dataset: EMGSD | Cat: neutral | Pred: 2 | Actual: 1




Row 30 | Dataset: EMGSD | Cat: neutral | Pred: 0 | Actual: 1




Row 31 | Dataset: EMGSD | Cat: neutral | Pred: 0 | Actual: 1




Row 32 | Dataset: EMGSD | Cat: neutral | Pred: 0 | Actual: 1




Row 33 | Dataset: EMGSD | Cat: neutral | Pred: 0 | Actual: 1




Row 34 | Dataset: EMGSD | Cat: neutral | Pred: 0 | Actual: 1




Row 35 | Dataset: EMGSD | Cat: neutral | Pred: 0 | Actual: 1




Row 36 | Dataset: EMGSD | Cat: neutral | Pred: 0 | Actual: 1




Row 37 | Dataset: EMGSD | Cat: neutral | Pred: 0 | Actual: 1




Row 38 | Dataset: EMGSD | Cat: neutral | Pred: 0 | Actual: 1




Row 39 | Dataset: EMGSD | Cat: neutral | Pred: 2 | Actual: 1




Row 40 | Dataset: EMGSD | Cat: stereotype | Pred: 0 | Actual: 0




Row 41 | Dataset: EMGSD | Cat: stereotype | Pred: 0 | Actual: 0




Row 42 | Dataset: EMGSD | Cat: stereotype | Pred: 0 | Actual: 0




Row 43 | Dataset: EMGSD | Cat: stereotype | Pred: 0 | Actual: 0




Row 44 | Dataset: EMGSD | Cat: stereotype | Pred: 0 | Actual: 0




Row 45 | Dataset: EMGSD | Cat: stereotype | Pred: 0 | Actual: 0




Row 46 | Dataset: EMGSD | Cat: stereotype | Pred: 0 | Actual: 0




Row 47 | Dataset: EMGSD | Cat: stereotype | Pred: 0 | Actual: 0




Row 48 | Dataset: EMGSD | Cat: stereotype | Pred: 0 | Actual: 0




Row 49 | Dataset: EMGSD | Cat: stereotype | Pred: 0 | Actual: 0




Row 50 | Dataset: EMGSD | Cat: stereotype | Pred: 0 | Actual: 0




Row 51 | Dataset: EMGSD | Cat: stereotype | Pred: 0 | Actual: 0




Row 52 | Dataset: EMGSD | Cat: stereotype | Pred: 0 | Actual: 0




Row 53 | Dataset: EMGSD | Cat: stereotype | Pred: 0 | Actual: 0




Row 54 | Dataset: EMGSD | Cat: stereotype | Pred: 0 | Actual: 0




Row 55 | Dataset: EMGSD | Cat: stereotype | Pred: 0 | Actual: 0




Row 56 | Dataset: EMGSD | Cat: stereotype | Pred: 0 | Actual: 0




Row 57 | Dataset: EMGSD | Cat: stereotype | Pred: 0 | Actual: 0




Row 58 | Dataset: EMGSD | Cat: stereotype | Pred: 0 | Actual: 0




Row 59 | Dataset: EMGSD | Cat: stereotype | Pred: 0 | Actual: 0




Row 60 | Dataset: EMGSD | Cat: stereotype | Pred: 1 | Actual: 0




Row 61 | Dataset: EMGSD | Cat: stereotype | Pred: 1 | Actual: 0




Row 62 | Dataset: EMGSD | Cat: stereotype | Pred: 1 | Actual: 0




Row 63 | Dataset: EMGSD | Cat: stereotype | Pred: 1 | Actual: 0




Row 64 | Dataset: EMGSD | Cat: stereotype | Pred: 2 | Actual: 0




Row 65 | Dataset: EMGSD | Cat: stereotype | Pred: 1 | Actual: 0




Row 66 | Dataset: EMGSD | Cat: stereotype | Pred: 1 | Actual: 0




Row 67 | Dataset: EMGSD | Cat: stereotype | Pred: 1 | Actual: 0




Row 68 | Dataset: EMGSD | Cat: stereotype | Pred: 1 | Actual: 0




Row 69 | Dataset: EMGSD | Cat: stereotype | Pred: 1 | Actual: 0




Row 70 | Dataset: EMGSD | Cat: stereotype | Pred: 1 | Actual: 0




Row 71 | Dataset: EMGSD | Cat: stereotype | Pred: 1 | Actual: 0




Row 72 | Dataset: EMGSD | Cat: stereotype | Pred: 1 | Actual: 0




Row 73 | Dataset: EMGSD | Cat: stereotype | Pred: 1 | Actual: 0




Row 74 | Dataset: EMGSD | Cat: stereotype | Pred: 1 | Actual: 0




Row 75 | Dataset: EMGSD | Cat: stereotype | Pred: 1 | Actual: 0




Row 76 | Dataset: EMGSD | Cat: stereotype | Pred: 1 | Actual: 0




Row 77 | Dataset: EMGSD | Cat: stereotype | Pred: 1 | Actual: 0




Row 78 | Dataset: EMGSD | Cat: stereotype | Pred: 2 | Actual: 0




Row 79 | Dataset: EMGSD | Cat: stereotype | Pred: 1 | Actual: 0




Row 80 | Dataset: EMGSD | Cat: unrelated | Pred: 2 | Actual: 2




Row 81 | Dataset: EMGSD | Cat: unrelated | Pred: 2 | Actual: 2




Row 82 | Dataset: EMGSD | Cat: unrelated | Pred: 2 | Actual: 2




Row 83 | Dataset: EMGSD | Cat: unrelated | Pred: 2 | Actual: 2




Row 84 | Dataset: EMGSD | Cat: unrelated | Pred: 2 | Actual: 2




Row 85 | Dataset: EMGSD | Cat: unrelated | Pred: 2 | Actual: 2




Row 86 | Dataset: EMGSD | Cat: unrelated | Pred: 2 | Actual: 2




Row 87 | Dataset: EMGSD | Cat: unrelated | Pred: 2 | Actual: 2




Row 88 | Dataset: EMGSD | Cat: unrelated | Pred: 2 | Actual: 2




Row 89 | Dataset: EMGSD | Cat: unrelated | Pred: 2 | Actual: 2




Row 90 | Dataset: EMGSD | Cat: unrelated | Pred: 2 | Actual: 2




Row 91 | Dataset: EMGSD | Cat: unrelated | Pred: 2 | Actual: 2




Row 92 | Dataset: EMGSD | Cat: unrelated | Pred: 2 | Actual: 2




Row 93 | Dataset: EMGSD | Cat: unrelated | Pred: 2 | Actual: 2




Row 94 | Dataset: EMGSD | Cat: unrelated | Pred: 2 | Actual: 2




Row 95 | Dataset: EMGSD | Cat: unrelated | Pred: 2 | Actual: 2




Row 96 | Dataset: EMGSD | Cat: unrelated | Pred: 2 | Actual: 2




Row 97 | Dataset: EMGSD | Cat: unrelated | Pred: 2 | Actual: 2




Row 98 | Dataset: EMGSD | Cat: unrelated | Pred: 2 | Actual: 2




Row 99 | Dataset: EMGSD | Cat: unrelated | Pred: 2 | Actual: 2




Row 100 | Dataset: EMGSD | Cat: unrelated | Pred: 1 | Actual: 2




Row 101 | Dataset: EMGSD | Cat: unrelated | Pred: 1 | Actual: 2




Row 102 | Dataset: EMGSD | Cat: unrelated | Pred: 1 | Actual: 2




Row 103 | Dataset: EMGSD | Cat: unrelated | Pred: 1 | Actual: 2




Row 104 | Dataset: EMGSD | Cat: unrelated | Pred: 1 | Actual: 2




Row 105 | Dataset: EMGSD | Cat: unrelated | Pred: 0 | Actual: 2




Row 106 | Dataset: EMGSD | Cat: unrelated | Pred: 1 | Actual: 2




Row 107 | Dataset: EMGSD | Cat: unrelated | Pred: 0 | Actual: 2




Row 108 | Dataset: EMGSD | Cat: unrelated | Pred: 1 | Actual: 2




Row 109 | Dataset: EMGSD | Cat: unrelated | Pred: 0 | Actual: 2




Row 110 | Dataset: EMGSD | Cat: unrelated | Pred: 1 | Actual: 2




Row 111 | Dataset: EMGSD | Cat: unrelated | Pred: 0 | Actual: 2




Row 112 | Dataset: EMGSD | Cat: unrelated | Pred: 1 | Actual: 2




Row 113 | Dataset: EMGSD | Cat: unrelated | Pred: 0 | Actual: 2




Row 114 | Dataset: EMGSD | Cat: unrelated | Pred: 0 | Actual: 2




Row 115 | Dataset: EMGSD | Cat: unrelated | Pred: 1 | Actual: 2




Row 116 | Dataset: EMGSD | Cat: unrelated | Pred: 1 | Actual: 2




Row 117 | Dataset: EMGSD | Cat: unrelated | Pred: 1 | Actual: 2




Row 118 | Dataset: EMGSD | Cat: unrelated | Pred: 1 | Actual: 2




Row 119 | Dataset: EMGSD | Cat: unrelated | Pred: 1 | Actual: 2
   sentence_id      token  value_lime                        sentence dataset  \
0            0        The    0.028850  The secretary is a young male.   EMGSD   
1            0  secretary    0.402719  The secretary is a young male.   EMGSD   
2            0         is    0.010801  The secretary is a young male.   EMGSD   
3            0          a    0.107980  The secretary is a young male.   EMGSD   
4            0      young    0.118374  The secretary is a young male.   EMGSD   

  categorisation  predicted_label  actual_label  
0        neutral                1             1  
1        neutral                1             1  
2        neutral                1             1  
3        neutral                1             1  
4        neutral                1             1  
✅ Saved lime_results.csv


# SHAP and LIME token importance comparison

In [15]:
# Comparing SHAP and LIME explanations

print("="*60)
print("COMPARING SHAP AND LIME EXPLANATIONS")
print("="*60)


# Ensuring results were saved previously, for sanity check
if "RESULTS_DIR" not in globals():
    RESULTS_DIR = os.path.join(os.getcwd(), "results")
os.makedirs(RESULTS_DIR, exist_ok=True)

# Loading SHAP and LIME results, if needed
if "shap_df" not in globals():
    shap_path = os.path.join(RESULTS_DIR, "shap_results.csv")
    print(f"Loading SHAP results from: {shap_path}")
    shap_df = pd.read_csv(shap_path)

if "lime_df" not in globals():
    lime_path = os.path.join(RESULTS_DIR, "lime_results.csv")
    print(f"Loading LIME results from: {lime_path}")
    lime_df = pd.read_csv(lime_path)

print("\nSHAP columns:", list(shap_df.columns))
print("LIME columns:", list(lime_df.columns))

# Ensuring the correct flag
for df_name, df in [("SHAP", shap_df), ("LIME", lime_df)]:
    if "correct" not in df.columns:
        df["correct"] = df["predicted_label"] == df["actual_label"]
        print(f"Added 'correct' column to {df_name} dataframe.")

# Defining Helper functions, as prescribed by King et al. (2024)
def compute_cosine_similarity(vector1, vector2):
    v1 = np.array(vector1).reshape(1, -1)
    v2 = np.array(vector2).reshape(1, -1)
    return float(cosine_similarity(v1, v2)[0][0])

def compute_pearson_correlation(vector1, vector2):
    v1 = np.array(vector1)
    v2 = np.array(vector2)
    corr, _ = pearsonr(v1, v2)
    return float(corr)

def to_probability_distribution(values):
    values = np.array(values, dtype=float)
    min_val = np.min(values)
    if min_val < 0:
        values = values + abs(min_val)
    total = np.sum(values)
    if total > 0:
        values = values / total
    return values

def compute_js_divergence(vector1, vector2):
    p1 = to_probability_distribution(vector1)
    p2 = to_probability_distribution(vector2)
    return float(jensenshannon(p1, p2))

# Merging SHAP and LIME at token level
# For this, I take the intersection of columns, excluding the value columns
value_cols = {"value_shap", "value_lime"}
common_cols = sorted(
    list(
        set(shap_df.columns).intersection(set(lime_df.columns)) - value_cols
    )
)

print("\nCommon merge columns:", common_cols)

merged_df = pd.merge(
    shap_df[common_cols + ["value_shap"]],
    lime_df[common_cols + ["value_lime"]],
    on=common_cols,
    how="inner"
)

print(f"✅ Matched {len(merged_df):,} token-level rows between SHAP and LIME")

if len(merged_df) == 0:
    print("\n⚠️ No overlapping token-level rows to compare.")
    print("   Check that shap_results.csv and lime_results.csv refer to the SAME sampled sentences.")
else:
    # Computing sentence level similarities
    print("\nComputing sentence-level similarities...")

    sentence_similarities = []

    for sentence_id in merged_df["sentence_id"].unique():
        sentence_data = merged_df[merged_df["sentence_id"] == sentence_id]

        shap_values = sentence_data["value_shap"].values
        lime_values = sentence_data["value_lime"].values

        # Need at least 3 tokens to get stable correlation
        if len(shap_values) >= 3:
            try:
                cos_sim = compute_cosine_similarity(shap_values, lime_values)
                pearson = compute_pearson_correlation(shap_values, lime_values)
                js_div = compute_js_divergence(shap_values, lime_values)

                sentence_similarities.append(
                    {
                        "sentence_id": int(sentence_id),
                        "sentence": str(sentence_data["sentence"].iloc[0])[:200] + "...",
                        "num_tokens": int(len(shap_values)),
                        "cosine_similarity": cos_sim,
                        "pearson_correlation": pearson,
                        "js_divergence": js_div,
                        "predicted_label": int(sentence_data["predicted_label"].iloc[0]),
                        "actual_label": int(sentence_data["actual_label"].iloc[0]),
                        "correct": bool(sentence_data["correct"].iloc[0]),
                    }
                )
            except Exception as e:
                # Skiping problematic sentences
                continue

    similarity_df = pd.DataFrame(sentence_similarities)
    print(f"✅ Computed similarity for {len(similarity_df)} sentences")

    # Priting summary statistics
    print("\n" + "="*60)
    print("SHAP–LIME AGREEMENT METRICS (OUR REPLICATION)")
    print("="*60)

    if len(similarity_df) > 0:
        avg_cosine = similarity_df["cosine_similarity"].mean()
        avg_pearson = similarity_df["pearson_correlation"].mean()
        avg_js = similarity_df["js_divergence"].mean()

        print(f"\nAverage Cosine Similarity:    {avg_cosine:.3f}")
        print(f"Average Pearson Correlation:  {avg_pearson:.3f}")
        print(f"Average JS Divergence:        {avg_js:.3f}")

        # Simple interpretation, as prescribed by King et al.(2024)
        if avg_pearson > 0.5:
            print("\n✅ SHAP and LIME show STRONG agreement overall.")
        elif avg_pearson > 0.3:
            print("\n✓ SHAP and LIME show MODERATE agreement overall.")
        else:
            print("\n⚠️ SHAP and LIME show WEAK agreement overall.")

        # Saving the summary statistics
        similarity_csv_path = os.path.join(RESULTS_DIR, "shap_lime_similarity.csv")
        similarity_df.to_csv(similarity_csv_path, index=False)
        print(f"\n✅ Saved sentence-level similarity to: {similarity_csv_path}")
    else:
        print("\n⚠️ Not enough sentences with ≥3 shared tokens to compute similarity.")
        print("   (Possible solution: try increasing k in sampling or using longer sentences.)")

print("\n✅ SHAP–LIME comparison finished.")


COMPARING SHAP AND LIME EXPLANATIONS

SHAP columns: ['sentence_id', 'token', 'value_shap', 'sentence', 'predicted_label', 'actual_label', 'correct']
LIME columns: ['sentence_id', 'token', 'value_lime', 'sentence', 'predicted_label', 'actual_label', 'correct']

Common merge columns: ['actual_label', 'correct', 'predicted_label', 'sentence', 'sentence_id', 'token']
✅ Matched 81 token-level rows between SHAP and LIME

Computing sentence-level similarities...
✅ Computed similarity for 6 sentences

SHAP–LIME AGREEMENT METRICS (OUR REPLICATION)

Average Cosine Similarity:    0.723
Average Pearson Correlation:  0.728
Average JS Divergence:        0.171

✅ SHAP and LIME show STRONG agreement overall.

✅ Saved sentence-level similarity to: /home/ec2-user/HEARTS_Replication/results/explainability/shap_lime_similarity.csv

✅ SHAP–LIME comparison finished.


# Confidence scores

In [16]:
print("="*60)
print("CONFIDENCE SCORES")
print("="*60)

if len(similarity_df) > 0:
    # Confidence is equal to the average agreement across all three metrics
    similarity_df['confidence_score'] = (
        similarity_df['cosine_similarity'] +
        similarity_df['pearson_correlation'] +
        (1 - similarity_df['js_divergence'])
    ) / 3.0

    # Bucket into levels 
    similarity_df['confidence_level'] = pd.cut(
        similarity_df['confidence_score'],
        bins=[-np.inf, 0.3, 0.6, np.inf],
        labels=['Low', 'Medium', 'High']
    )

    print("\nConfidence Distribution:")
    print(similarity_df['confidence_level'].value_counts().sort_index())
    print(f"\nAverage confidence: {similarity_df['confidence_score'].mean():.3f}")

    # Saving with confidence columns
    confidence_csv_path = os.path.join(RESULTS_DIR, "confidence_scores.csv")
    similarity_df.to_csv(confidence_csv_path, index=False)
    print(f"✅ Saved confidence scores to: {confidence_csv_path}")
else:
    print("\n⚠️ No similarity data available to compute confidence scores.")

# Printing summary of replication and analysis 

print("\n" + "="*60)
print("EXPLAINABILITY ANALYSIS COMPLETE")
print("="*60)

# Safely formating averages
if len(similarity_df) > 0:
    avg_cosine_str  = f"{avg_cosine:.3f}"
    avg_pearson_str = f"{avg_pearson:.3f}"
    avg_js_str      = f"{avg_js:.3f}"
else:
    avg_cosine_str = avg_pearson_str = avg_js_str = "N/A"

summary = f"""
SHAP & LIME Analysis Results:
- SHAP token explanations: {len(shap_df):,}
- LIME token explanations: {len(lime_df):,}
- Matched token-level explanations: {len(merged_df):,}
- Sentences analyzed for agreement: {len(similarity_df)}

Agreement Metrics:
- Avg Cosine Similarity:    {avg_cosine_str}
- Avg Pearson Correlation:  {avg_pearson_str}
- Avg JS Divergence:        {avg_js_str}

All explainability outputs saved under:
- {RESULTS_DIR}
"""

print(summary)

print("\n HEARTS EXPLAINABILITY FRAMEWORK REPLICATED!")


CONFIDENCE SCORES

Confidence Distribution:
confidence_level
Low       0
Medium    2
High      4
Name: count, dtype: int64

Average confidence: 0.760
✅ Saved confidence scores to: /home/ec2-user/HEARTS_Replication/results/explainability/confidence_scores.csv

EXPLAINABILITY ANALYSIS COMPLETE

SHAP & LIME Analysis Results:
- SHAP token explanations: 224
- LIME token explanations: 73
- Matched token-level explanations: 81
- Sentences analyzed for agreement: 6

Agreement Metrics:
- Avg Cosine Similarity:    0.723
- Avg Pearson Correlation:  0.728
- Avg JS Divergence:        0.171

All explainability outputs saved under:
- /home/ec2-user/HEARTS_Replication/results/explainability


 HEARTS EXPLAINABILITY FRAMEWORK REPLICATED!
