In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np
import re
import multiprocessing
import pandas as pd
import shap
import scipy
from tqdm import tqdm
from datasets import load_dataset

# Load the fine-tuned RoBERTa sequence-classification checkpoint and its
# tokenizer from a local directory.
model_path = "./weighted_loss_roberta"  # Update this path if necessary
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)
# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model = model.to(device)
# Kept as the last expression of the cell, so the module repr is displayed.
model.eval()  # Set the model to evaluation mode

  from .autonotebook import tqdm as notebook_tqdm


cuda


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.2, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.2, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [None]:
# Human-readable class labels. In the cells shown here they are only printed.
# NOTE(review): presumably index 0 -> "REAL" and index 1 -> "FAKE" in the
# model's output; confirm against the training setup before relying on it.
class_names = ["REAL", "FAKE"]
print("Class names:", class_names)

# Load the LIAR2 dataset via `datasets.load_dataset`.
dataset = load_dataset("chengxuphd/liar2")

def preprocess_text(examples):
    """
    Build one word array per example from its metadata fields and statement.

    For every example the fields subject, speaker, speaker_description,
    state_info, context and statement are concatenated into a single string
    with fixed labels ("Subject: ", "; Speaker: ", ...). A field that is None
    contributes an empty string. The combined string is then split on runs of
    non-word characters, so punctuation is dropped and an empty string can
    appear where the text starts or ends with such a run.

    examples: mapping that yields one sequence per field name listed above.
    Returns a list holding one numpy array of word strings per example.
    """
    def _or_blank(field):
        # Missing fields are rendered as empty text.
        return field if field is not None else ""

    rows = zip(
        examples["subject"],
        examples["speaker"],
        examples["speaker_description"],
        examples["state_info"],
        examples["context"],
        examples["statement"],
    )

    combined_input = []
    for subject, speaker, speaker_description, state_info, context, statement in rows:
        combined_input.append(
            "Subject: " + _or_blank(subject)
            + "; Speaker: " + _or_blank(speaker)
            + "; Speaker Description: " + _or_blank(speaker_description)
            + "; State: " + _or_blank(state_info)
            + "; Context: " + _or_blank(context)
            + "; input_Statement: " + _or_blank(statement)
        )

    # "input_Statement" survives the split as a single word and is used later
    # to locate where the statement starts.
    return [np.array(re.split(r'\W+', text)) for text in tqdm(combined_input)]

# Word-split the chosen split, then keep only the first 9185 samples.
# NOTE(review): the whole split is processed before slicing; slicing the
# dataset first would presumably avoid the extra work - confirm before changing.
texts = preprocess_text(dataset["train"])[:9185] # Either validation, train or test

Class names: ['REAL', 'FAKE']


100%|██████████| 18369/18369 [01:49<00:00, 167.80it/s]

416





In [4]:
# Sanity check: number of retained samples, then the word array of the first one.
print(len(texts))
texts[0]

9185


array(['Subject', 'government', 'regulation', 'polls', 'and', 'public',
       'opinion', 'guns', 'Speaker', 'chris', 'abele', 'Speaker',
       'Description', 'Chris', 'Abele', 'is', 'Milwaukee', 'County',
       'Executive', 'a', 'position', 'he', 'won', 'in', 'an', 'April',
       '2011', 'special', 'election', 'to', 'finish', 'out', 'the',
       'final', 'year', 'of', 'the', 'term', 'of', 'Scott', 'Walker',
       'who', 'was', 'elected', 'governor', 'in', 'November', '2010',
       'The', 'election', 'was', 'the', 'first', 'attempt', 'at',
       'political', 'office', 'for', 'Abele', 'a', 'Milwaukee',
       'philanthropist', 'and', 'business', 'owner', 'The', 'office',
       'is', 'nonpartisan', 'but', 'Abele', 'has', 'indicated', 'he',
       'is', 'a', 'Democrat', 'State', 'wisconsin', 'Context', 'a',
       'tweet', 'input_Statement', '90', 'percent', 'of', 'Americans',
       'support', 'universal', 'background', 'checks', 'for', 'gun',
       'purchases', ''], dtype='<U15

In [5]:
def predict_proba(x):
    """
    Return class probabilities for a batch of word-level inputs.

    x: assumed to be array-like of shape (nsamples, number_of_words), each row
       being the words of one (possibly masked) sample.
    Returns a numpy array with one row of softmax probabilities per sample.
    """
    # Re-join every row of words into a single string for the tokenizer.
    processed_input = [" ".join(words) for words in x]
    inputs = tokenizer(processed_input, padding=True, truncation=True, return_tensors="pt")
    # Follow the model's own device instead of assuming CUDA, so the function
    # also works when the notebook fell back to the CPU.
    target_device = model.device
    inputs = {key: tensor.to(target_device) for key, tensor in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    return probs.detach().cpu().numpy()

def custom_masker(mask, x):
    """
    Word-level masker for the SHAP Explainer.

    mask: boolean array, True where the word is kept.
    x: array of words for one sample.

    Words whose mask entry is False are replaced with "<mask>", so
    perturbation happens per word rather than per model feature. The result
    is returned as a single-row 2-D array.
    https://shap.readthedocs.io/en/latest/example_notebooks/api_examples/maskers/custom.html
    """
    masked_words = np.where(mask, x, "<mask>")
    return np.reshape(masked_words, (1, len(masked_words)))

# Build the SHAP explainer (the run log reports a PermutationExplainer).
# masker=tokenizer is avoided on purpose: it would tie the SHAP values to
# sub-word tokens, whereas one attribution per whole word is wanted here.
# The trade-off is that the explained inputs are the word-split texts rather
# than the raw originals, so punctuation and similar characters are gone.
# NOTE(review): max_evals=900 is a hand-picked budget; an earlier note tied a
# value of 620 to a 306-word maximum in the test split, so presumably 900
# covers the longer train inputs - confirm.
explainer = shap.Explainer(model=predict_proba, masker=custom_masker, max_evals=900)

examples = texts
# One explanation per sample, each computed on a single-row view of its words.
# Results are appended as they arrive, so an interrupted run keeps what it has.
all_shap_values = []
for word_array in tqdm(examples):
    single_row = np.array(word_array).reshape(1, -1)
    all_shap_values.append(explainer(single_row))

PermutationExplainer explainer: 2it [00:12, 12.59s/it]               
100%|██████████| 9185/9185 [6:19:26<00:00,  2.48s/it]   


In [6]:
# Matches any single character that is not an ASCII letter or digit
pattern = '[^a-zA-Z0-9]'

def clean_string(s):
    """Return `s` with every character matched by `pattern` removed."""
    kept_pieces = re.split(pattern, s)
    return ''.join(kept_pieces)

def get_relevant_shap_values(s):
    """
    Reduce each explanation to the statement words and one value per word.

    s: indexable collection whose items expose `.data` and `.values`
       (assumed to be the per-sample SHAP explanations, with two output
       columns per word - TODO confirm).

    For every item the leading axis of data and values is flattened, and the
    output column with the larger total is kept (column 1 on a tie). Only the
    entries after the last "input_Statement" marker are retained; if no marker
    is found a message is printed and everything is kept. Tokens that are
    empty after `clean_string` get a value of -inf.

    Returns (list_of_tokens, list_of_values), one entry per item of `s`.
    """
    list_of_tokens = []
    list_of_values = []
    for sample_idx in tqdm(range(len(s))):
        explanation = s[sample_idx]
        tokens = explanation.data
        values = explanation.values
        if explanation.data.shape[0] >= 1:
            tokens = np.concatenate(tokens, axis=0)
            values = np.concatenate(values, axis=0)
            column_totals = np.sum(values, axis=0)
            chosen_column = 0 if column_totals[0] > column_totals[1] else 1
            values = values[:, chosen_column]

        # Position just past the last "input_Statement" marker (0 = not found).
        index = 0
        for position, word in enumerate(tokens):
            if word == "input_Statement":
                index = position + 1
        if index == 0:
            print("Error Occurred cannot parse string properly!")

        # Keep only the statement part.
        tokens = tokens[index:]
        values = values[index:]

        # Tokens with no letters or digits must never rank as keywords.
        for position, word in enumerate(tokens):
            if len(clean_string(word)) == 0:
                values[position] = -np.inf

        list_of_tokens.append(tokens)
        list_of_values.append(values)
    return list_of_tokens, list_of_values

# Per-sample statement tokens and their matching values, as parallel lists.
ls_tokens, ls_values = get_relevant_shap_values(s=all_shap_values)

100%|██████████| 9185/9185 [00:00<00:00, 16610.84it/s]


In [7]:
# Holds one space-joined keyword string per sample; reset on every run of this cell.
results = []

def get_top_keywords(tokens, values, k=5):
    """
    Return up to `k` tokens with the highest values, best first.

    tokens: sequence of words.
    values: sequence of scores aligned with `tokens`; a score of -inf marks a
            token that must never be reported.
    k: maximum number of keywords to return (default 5).

    Fewer than `k` tokens are returned when not enough eligible tokens exist,
    so excluded tokens can no longer leak into the result for short inputs.
    """
    values = np.asarray(values)
    ranked = np.argsort(values)[::-1]  # Sort indices by descending value
    eligible = ranked[~np.isneginf(values[ranked])]
    return [tokens[i] for i in eligible[:k]]

# Turn each sample's ranked keywords into one space-separated string
results.extend(
    " ".join(get_top_keywords(sample_tokens, sample_values))
    for sample_tokens, sample_values in zip(ls_tokens, ls_values)
)

# Single-column table, one row per sample
df_keywords = pd.DataFrame(results, columns=["top 5 keywords"])

In [8]:
# Display the keyword table.
df_keywords

Unnamed: 0,top 5 keywords
0,percent 90 gun of support
1,year years Last ever for
2,raise plan your Sanders taxes
3,majority NYers overwhelming state walks
4,Obamacare Says robbed billion Medicare
...,...
9180,Seventy percent four sanctuary against
9181,flames on car You burn
9182,country metropolitan area population only
9183,than Wisconsin history in time


In [9]:
# Number of explained samples; should match the row count of df_keywords.
len(texts)

9185

In [10]:
# Persist the keyword table together with its index column.
# NOTE(review): the file name hard-codes the split ("train") and the sample
# range (0-9184) separately from the slice used to build `texts`; keep in sync.
df_keywords.to_csv("shap-keywords-train-0-9184.csv", index=True)