In [1]:
from datasets import load_dataset
from token_shap import TokenSHAP
from nltk.corpus import words
from termcolor import colored
import random
import nltk

In [2]:
nltk.download('words')

[nltk_data] Downloading package words to /home/miriam/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

## Get Aplaca Sample and Inject Random Words

In [3]:
def inject_random_words(prompts, injection_rate=(0.2, 0.3)):
    word_list = words.words()
    injected_prompts = []
    dict_injected = {}
    for prompt in prompts:
        words_in_prompt = prompt.split()
        num_injections = int(len(words_in_prompt) * random.uniform(*injection_rate))
        injection_indices = random.sample(range(len(words_in_prompt) + 1), num_injections)
        random_words = []
        for index in sorted(injection_indices, reverse=True):
            random_word = random.choice(word_list)
            words_in_prompt.insert(index, random_word)
            random_words.append(random_word)
        injected_prompts.append(' '.join(words_in_prompt))
        dict_injected[prompt] = random_words
    return injected_prompts, dict_injected

def color_injected_words(original_prompts, injected_prompts, n):
    for _ in range(n):
        idx = random.randint(0, len(original_prompts) - 1)
        original_words = set(original_prompts[idx].split())
        injected_words = injected_prompts[idx].split()

        colored_prompt = []
        for word in injected_words:
            if word not in original_words:
                colored_prompt.append(colored(word, 'red'))
            else:
                colored_prompt.append(word)

        print(' '.join(colored_prompt))

In [4]:
ds = load_dataset("tatsu-lab/alpaca")

In [None]:
prompts = ds['train']['instruction'][55:75]

In [16]:
injected_prompts, dict_injected = inject_random_words(prompts)

In [17]:
color_injected_words(prompts, injected_prompts, 5)

[31mcantharidism[0m What is [31mwickawee[0m the [31mnonextenuatory[0m force on a 1 kg mass due to the gravitational force?
Construct [31munbragged[0m an argument to defend the following statement
Provide [31mstruvite[0m one example for a cultural practice.
What [31mkarstic[0m type of plant is a skunk cabbage? [31mkibitz[0m
[31meight[0m Transform the following sentence into the passive voice [31mmowrah[0m


In [18]:
injected_prompts

['cantharidism What is wickawee the nonextenuatory force on a 1 kg mass due to the gravitational force?',
 'Provide struvite one example for a cultural practice.',
 'Given a chromophile set of numbers, find the maximum tuillette value.',
 'Give two examples of a slanderousness liquid.',
 'What is the product Dardic of 6 and 2?',
 'What karstic type of plant is a skunk cabbage? kibitz',
 'Convert the given binary gladioli number to its extraschool decimal equivalent.',
 'Name unrubrical two types of desert biomes.',
 'Given a sentence, convert it into otomucormycosis passive voice.',
 'eight Transform the following sentence into the passive voice mowrah',
 'Create a dialog between two people afterhours who are disgarland discussing a scientific phenomenon',
 'Identify duodena the most suitable adverb Coos for the following sentence',
 'Find the main idea of the following filiform passage',
 'Analyze the tone reverberation of the following sentences',
 'Construct unbragged an argument to

## Token SHAP

In [None]:
import json
import os

# Initialize TokenSHAP with your model & tokenizer
model_name = "llama3"
tokenizer_path = "NousResearch/Hermes-2-Theta-Llama-3-8B"
ollama_api_url = "http://localhost:11434"  # Default Ollama API URL
tshap = TokenSHAP(model_name, tokenizer_path, ollama_api_url)

# Path to save SHAP values
save_path = "shap_values.json"

# Load existing SHAP values if the file exists
if os.path.exists(save_path):
    with open(save_path, 'r') as f:
        original_shap_values = json.load(f)
else:
    original_shap_values = {}

# Function to save SHAP values to disk
def save_shap_values(shap_values, save_path):
    with open(save_path, 'w') as f:
        json.dump(shap_values, f)

for prompt in prompts:
    results = tshap.analyze(prompt, sampling_ratio=0, splitter=' ')
    original_shap_values[prompt] = tshap.shapley_values
    save_shap_values(original_shap_values, save_path)

In [None]:
%%time
# Initialize TokenSHAP with your model & tokenizer
model_name = "llama3"
tokenizer_path = "NousResearch/Hermes-2-Theta-Llama-3-8B"
ollama_api_url = "http://localhost:11434"  # Default Ollama API URL
tshap = TokenSHAP(model_name, tokenizer_path, ollama_api_url)

# Path to save SHAP values
save_path = "injected_shap_values.json"

# Load existing SHAP values if the file exists
if os.path.exists(save_path):
    with open(save_path, 'r') as f:
        injected_shap_values = json.load(f)
else:
    injected_shap_values = {}

for prompt in injected_prompts:
    results = tshap.analyze(prompt, sampling_ratio = 0, splitter = ' ')
    injected_shap_values[prompt] = tshap.shapley_values
    save_shap_values(injected_shap_values, save_path)

In [None]:
from collections import defaultdict
import numpy as np

all_words = defaultdict(list)
for prompt_dict in original_shap_values.values():
    for word, value in prompt_dict.items():
        all_words[word].append(value)
for prompt_dict in injected_shap_values.values():
    for word, value in prompt_dict.items():
        all_words[word].append(value)

word_shap =  {word: np.mean(values) for word, values in all_words.items()}

In [31]:
import pandas as pd
word_freq = defaultdict(int)
for prompt in prompts + injected_prompts:
    for word in prompt.split():
        word_freq[word] += 1

injected_words = set([word for words in dict_injected.values() for word in words])
word_correlation = {}
for word in word_shap.keys():
    in_injected = sum(1 for prompt in injected_prompts if word in prompt.split())
    in_original = sum(1 for prompt in prompts if word in prompt.split())
    word_correlation[word] = (in_injected / len(injected_prompts)) - (in_original / len(prompts))

results = pd.DataFrame({
    'word': list(word_shap.keys()),
    'shap_value': list(word_shap.values()),
    'correlation': [word_correlation.get(word, 0) for word in word_shap.keys()],
    'frequency': [word_freq.get(word, 0) for word in word_shap.keys()],
    'is_injected': [word.split('_')[0] in injected_words for word in word_shap.keys()]
})

results

Unnamed: 0,word,shap_value,correlation,frequency,is_injected
0,What_1,0.071385,0.0,0,False
1,is_2,0.010272,0.0,0,False
2,the_3,0.010391,0.0,0,False
3,force_4,0.118616,0.0,0,False
4,on_5,0.075602,0.0,0,False
...,...,...,...,...,...
241,without_8,0.076004,0.0,0,False
242,changing_9,0.046533,0.0,0,False
243,its_10,0.081086,0.0,0,False
244,meaning_11,0.128880,0.0,0,False


In [58]:
import pandas as pd
word_freq = defaultdict(int)
for prompt in prompts + injected_prompts:
    for word in prompt.split():
        word_freq[word] += 1

injected_words = set([word for words in dict_injected.values() for word in words])
word_correlation = {}
for word in word_shap.keys():
    in_injected = sum(1 for prompt in injected_prompts if word in prompt.split())
    in_original = sum(1 for prompt in prompts if word in prompt.split())
    word_correlation[word] = (in_injected / len(injected_prompts)) - (in_original / len(prompts))

results = pd.DataFrame({
    'word': list(word_shap.keys()),
    'shap_value': list(word_shap.values()),
    'correlation': [word_correlation.get(word, 0) for word in word_shap.keys()],
    'frequency': [word_freq.get(word, 0) for word in word_shap.keys()],
    'is_injected': [word.split('_')[0] in injected_words for word in word_shap.keys()]
})



# Assuming your data is in a pandas DataFrame called 'df'
df = results
# 1. Box plot
plt.figure(figsize=(10, 6))
sns.boxplot(x='is_injected', y='shap_value', data=df)
plt.title('Distribution of SHAP Values for Injected and Non-Injected Words')
plt.xlabel('Is Injected')
plt.ylabel('SHAP Value')
plt.savefig('boxplot_shap_injection.png')
plt.close()


# Calculate and print average SHAP values
avg_shap_injected = df[df['is_injected'] == True]['shap_value'].mean()
avg_shap_non_injected = df[df['is_injected'] == False]['shap_value'].mean()
std_shap_injected = df[df['is_injected'] == True]['shap_value'].std()
std_shap_non_injected = df[df['is_injected'] == False]['shap_value'].std()
print(f"Average SHAP value for injected words: {avg_shap_injected:.4f}")
print(f"Average SHAP value for non-injected words: {avg_shap_non_injected:.4f}")
print(f"Average SHAP value diff for non-injected words compared to injected words: {avg_shap_non_injected - avg_shap_injected:.4f}")
print(f"Std SHAP value for injected words: {std_shap_injected:.4f}")
print(f"Std SHAP value for non-injected words: {std_shap_non_injected:.4f}")
print(f"Std SHAP value diff for non-injected words compared to injected words: {std_shap_non_injected - std_shap_injected:.4f}")

Average SHAP value for injected words: 0.0789
Average SHAP value for non-injected words: 0.1127
Average SHAP value diff for non-injected words compared to injected words: 0.0338
Std SHAP value for injected words: 0.0641
Std SHAP value for non-injected words: 0.0757
Std SHAP value diff for non-injected words compared to injected words: 0.0116


## Random Baseline

In [None]:
from baseline import *

baseline = NaiveBaseline("llama3", "NousResearch/Hermes-2-Theta-Llama-3-8B")

# Path to save SHAP values
save_path = "shap_values_baseline_random.json"

# Load existing SHAP values if the file exists
if os.path.exists(save_path):
    with open(save_path, 'r') as f:
        original_shap_values = json.load(f)
else:
    original_shap_values = {}


for prompt in prompts:
    print(prompt)
    results = baseline.analyze_and_plot(prompt, method='random')
    print(results)
    original_shap_values[prompt] = results
    save_shap_values(original_shap_values, save_path)

In [None]:
# Path to save SHAP values
save_path = "injected_shap_values_baseline_random.json"

# Load existing SHAP values if the file exists
if os.path.exists(save_path):
    with open(save_path, 'r') as f:
        injected_shap_values = json.load(f)
else:
    injected_shap_values = {}

for prompt in injected_prompts:
    print(prompt)
    results = baseline.analyze_and_plot(prompt, method='random')
    injected_shap_values[prompt] = results
    save_shap_values(injected_shap_values, save_path)

In [93]:
from collections import defaultdict
import numpy as np

all_words = defaultdict(list)
for prompt_dict in original_shap_values.values():
    for word, value in prompt_dict.items():
        all_words[word].append(value)
for prompt_dict in injected_shap_values.values():
    for word, value in prompt_dict.items():
        all_words[word].append(value)

word_shap =  {word: np.mean(values) for word, values in all_words.items()}


import pandas as pd
word_freq = defaultdict(int)
for prompt in prompts + injected_prompts:
    for word in prompt.split():
        word_freq[word] += 1

injected_words = set([word for words in dict_injected.values() for word in words])
word_correlation = {}
for word in word_shap.keys():
    in_injected = sum(1 for prompt in injected_prompts if word in prompt.split())
    in_original = sum(1 for prompt in prompts if word in prompt.split())
    word_correlation[word] = (in_injected / len(injected_prompts)) - (in_original / len(prompts))

results = pd.DataFrame({
    'word': list(word_shap.keys()),
    'shap_value': list(word_shap.values()),
    'correlation': [word_correlation.get(word, 0) for word in word_shap.keys()],
    'frequency': [word_freq.get(word, 0) for word in word_shap.keys()],
    'is_injected': [word.split('_')[0] in injected_words for word in word_shap.keys()]
})



# Assuming your data is in a pandas DataFrame called 'df'
df = results
# 1. Box plot
plt.figure(figsize=(10, 6))
sns.boxplot(x='is_injected', y='shap_value', data=df)
plt.title('Distribution of SHAP Values for Injected and Non-Injected Words')
plt.xlabel('Is Injected')
plt.ylabel('SHAP Value')
plt.savefig('boxplot_shap_injection_baseline_random.png')
plt.close()


# Calculate and print average SHAP values
avg_shap_injected = df[df['is_injected'] == True]['shap_value'].mean()
avg_shap_non_injected = df[df['is_injected'] == False]['shap_value'].mean()
std_shap_injected = df[df['is_injected'] == True]['shap_value'].std()
std_shap_non_injected = df[df['is_injected'] == False]['shap_value'].std()
print(f"Average SHAP value for injected words: {avg_shap_injected:.4f}")
print(f"Average SHAP value for non-injected words: {avg_shap_non_injected:.4f}")
print(f"Average SHAP value diff for non-injected words compared to injected words: {avg_shap_non_injected - avg_shap_injected:.4f}")
print(f"Std SHAP value for injected words: {std_shap_injected:.4f}")
print(f"Std SHAP value for non-injected words: {std_shap_non_injected:.4f}")
print(f"Std SHAP value diff for non-injected words compared to injected words: {std_shap_non_injected - std_shap_injected:.4f}")

Average SHAP value for injected words: 0.0974
Average SHAP value for non-injected words: 0.1144
Average SHAP value diff for non-injected words compared to injected words: 0.0171
Std SHAP value for injected words: 0.0706
Std SHAP value for non-injected words: 0.0537
Std SHAP value diff for non-injected words compared to injected words: -0.0169


## Prompt Engineering

In [None]:

import importlib
import prompt_engineering

# Reload the baseline module
importlib.reload(prompt_engineering)

# Now you can use the reloaded module
from prompt_engineering import *


from prompt_engineering import *
engineer = PromptEngineer("llama3")


# Path to save SHAP values
save_path = "shap_values_prompt_engineer.json"

# Load existing SHAP values if the file exists
if os.path.exists(save_path):
    with open(save_path, 'r') as f:
        original_shap_values = json.load(f)
else:
    original_shap_values = {}


for prompt in prompts:
    results = engineer.analyze_and_plot(prompt)
    original_shap_values[prompt] = results
    save_shap_values(original_shap_values, save_path)

In [None]:
# Path to save SHAP values
save_path = "injected_shap_values_prompt_engineer.json"

# Load existing SHAP values if the file exists
if os.path.exists(save_path):
    with open(save_path, 'r') as f:
        injected_shap_values = json.load(f)
else:
    injected_shap_values = {}

for prompt in injected_prompts:
    results = engineer.analyze_and_plot(prompt)
    injected_shap_values[prompt] = results
    save_shap_values(injected_shap_values, save_path)

In [128]:
from collections import defaultdict
import numpy as np

all_words = defaultdict(list)
for prompt_dict in original_shap_values.values():
    if prompt_dict is None:
        continue
    for word, value in prompt_dict.items():
        all_words[word].append(value)
for prompt_dict in injected_shap_values.values():
    if prompt_dict is None:
        continue
    for word, value in prompt_dict.items():
        all_words[word].append(value)

word_shap =  {word: np.mean(values) for word, values in all_words.items()}


import pandas as pd
word_freq = defaultdict(int)
for prompt in prompts + injected_prompts:
    for word in prompt.split():
        word_freq[word] += 1

injected_words = set([word for words in dict_injected.values() for word in words])
word_correlation = {}
for word in word_shap.keys():
    in_injected = sum(1 for prompt in injected_prompts if word in prompt.split())
    in_original = sum(1 for prompt in prompts if word in prompt.split())
    word_correlation[word] = (in_injected / len(injected_prompts)) - (in_original / len(prompts))

results = pd.DataFrame({
    'word': list(word_shap.keys()),
    'shap_value': list(word_shap.values()),
    'correlation': [word_correlation.get(word, 0) for word in word_shap.keys()],
    'frequency': [word_freq.get(word, 0) for word in word_shap.keys()],
    'is_injected': [word.split('_')[0] in injected_words for word in word_shap.keys()]
})



# Assuming your data is in a pandas DataFrame called 'df'
df = results
# 1. Box plot
plt.figure(figsize=(10, 6))
sns.boxplot(x='is_injected', y='shap_value', data=df)
plt.title('Distribution of SHAP Values for Injected and Non-Injected Words')
plt.xlabel('Is Injected')
plt.ylabel('SHAP Value')
plt.savefig('boxplot_shap_injection_propmpt_engineer.png')
plt.close()


# Calculate and print average SHAP values
avg_shap_injected = df[df['is_injected'] == True]['shap_value'].mean()
avg_shap_non_injected = df[df['is_injected'] == False]['shap_value'].mean()
std_shap_injected = df[df['is_injected'] == True]['shap_value'].std()
std_shap_non_injected = df[df['is_injected'] == False]['shap_value'].std()
print(f"Average SHAP value for injected words: {avg_shap_injected:.4f}")
print(f"Average SHAP value for non-injected words: {avg_shap_non_injected:.4f}")
print(f"Average SHAP value diff for non-injected words compared to injected words: {avg_shap_non_injected - avg_shap_injected:.4f}")
print(f"Std SHAP value for injected words: {std_shap_injected:.4f}")
print(f"Std SHAP value for non-injected words: {std_shap_non_injected:.4f}")
print(f"Std SHAP value diff for non-injected words compared to injected words: {std_shap_non_injected - std_shap_injected:.4f}")

Average SHAP value for injected words: 0.1069
Average SHAP value for non-injected words: 0.1254
Average SHAP value diff for non-injected words compared to injected words: 0.0185
Std SHAP value for injected words: 0.0819
Std SHAP value for non-injected words: 0.0723
Std SHAP value diff for non-injected words compared to injected words: -0.0096
