In [1]:
!pip install einops

Collecting einops
  Downloading einops-0.7.0-py3-none-any.whl (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.6/44.6 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: einops
Successfully installed einops-0.7.0


In [2]:
import random
import torch
import copy

from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig

In [3]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs
# (slowly) on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
torch.set_default_device(device)

# The experiments below draw random noise; seed the generators so that a
# fresh "Restart & Run All" draws the same noise every time.
SEED = 0
random.seed(SEED)
torch.manual_seed(SEED)

In [4]:
# All three artefacts come from the same checkpoint.
# NOTE: trust_remote_code=True runs Python code shipped with the checkpoint;
# the download log recommends pinning a revision to avoid silent updates.
checkpoint = "microsoft/phi-1_5"
model = AutoModelForCausalLM.from_pretrained(checkpoint, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
config = AutoConfig.from_pretrained(checkpoint, trust_remote_code=True)

# Inference only: switch to eval mode (eval() returns the module itself, so
# no rebinding is needed) and clear any gradients.
model.eval()
model.zero_grad()

Downloading config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

Downloading configuration_phi.py:   0%|          | 0.00/2.03k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/phi-1_5:
- configuration_phi.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading modeling_phi.py:   0%|          | 0.00/33.8k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/phi-1_5:
- modeling_phi.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading pytorch_model.bin:   0%|          | 0.00/2.84G [00:00<?, ?B/s]

Downloading generation_config.json:   0%|          | 0.00/69.0 [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/237 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [5]:
def test_model(tokenizer, model, queries, max_length=50, output=True):
    spacer = "##########################\n"
    text = ""
    for query in queries:
        text += spacer
        if output:
            print(spacer, end="")
        inputs = tokenizer(query, return_tensors="pt", return_attention_mask=False)
        out = model.generate(**inputs, max_length=max_length)
        out = tokenizer.batch_decode(out)[0] + "\n"
        text += out
        if output:
            print(out, end="")
    return text

In [6]:
def swap_emb_weights(emb_weights, a_indexes, b_indexes):
    """Return a copy of ``emb_weights`` with the rows at two index sets exchanged.

    Rows selected by ``a_indexes`` receive the original rows at ``b_indexes``
    and vice versa. The input tensor is not modified.

    Args:
        emb_weights: Tensor indexed along its first dimension.
        a_indexes: Index (tensor) selecting the first group of rows.
        b_indexes: Index (tensor) selecting the second group of rows.

    Returns:
        A cloned tensor with the two groups of rows swapped.
    """
    swapped = emb_weights.clone()
    # Both right-hand sides are read from the untouched original before any
    # assignment happens, so the two writes cannot interfere with each other.
    swapped[a_indexes], swapped[b_indexes] = emb_weights[b_indexes], emb_weights[a_indexes]
    return swapped

In [7]:
# Prompts used throughout the notebook. Each question ends with
# "\n\nAnswer:" so that the model continues with an answer.
test_query = ["How are you today?"]
animal_queries = [
    "What sound does the dog make?\n\nAnswer:",
    "What is a dog?\n\nAnswer:",
    "What sound does the cat make?\n\nAnswer:",
    "What is a cat?\n\nAnswer:",
    # NOTE(review): "a the" is a typo, kept verbatim because the generations
    # recorded in this notebook echo this exact prompt. Fix it when the
    # outputs are regenerated.
    "What is a the difference between a cat and a dog?\n\nAnswer:",
]
city_queries = [
    "Where is London?\n\nAnswer:",
    "Where is Paris?\n\nAnswer:",
    "What is the capital of the United Kingdom?\n\nAnswer:",
    "What is the capital of France?\n\nAnswer:",
]

In [8]:
# Keep a handle on the model's input-embedding matrix; later cells use it
# both as the baseline and as the source for every modified variant.
embedding_layer = model.get_input_embeddings()
emb_weights = embedding_layer.weight
print(emb_weights.shape)

torch.Size([51200, 2048])


## Add noise to embeddings

In [9]:
# All-zero matrix with the same shape/dtype as the embeddings.
zeroed_weights = torch.zeros_like(emb_weights)

# Original embeddings perturbed by Gaussian noise scaled by 0.06.
gaussian_noise = torch.randn_like(emb_weights)
random_weights = emb_weights + gaussian_noise * 0.06

In [10]:
# Baseline: install the unmodified embedding matrix and generate.
input_embeddings = model.get_input_embeddings()
input_embeddings.weight = torch.nn.Parameter(emb_weights)
output = test_model(tokenizer, model, queries=test_query)

##########################
How are you today?

Student: I'm doing well, thank you. How about you?

Teacher: I'm doing great, thank you for asking. So, I wanted to talk to you about something that I think is


In [11]:
# Same prompt, but with the noise-perturbed embedding matrix installed.
input_embeddings = model.get_input_embeddings()
input_embeddings.weight = torch.nn.Parameter(random_weights)
output = test_model(tokenizer, model, queries=test_query)

##########################
How are you today?
 you are a good man.?


are?

?
?
??
???
???????????????????


## Swap embedding word representations

In [12]:
# Tokenize three spellings of each word; row 0 holds the "dog" token ids and
# row 1 the "cat" token ids.
word_inputs = tokenizer(["Dog Dog dog", "Cat Cat cat"], return_tensors="pt", return_attention_mask=False)
token_id_rows = word_inputs["input_ids"]
dog_indexes, cat_indexes = token_id_rows[0], token_id_rows[1]
print(token_id_rows)

tensor([[32942,  8532,  3290],
        [21979,  5181,  3797]], device='cuda:0')


In [13]:
# Verify that tokens correspond to words
dog_token_ids = (32942, 8532, 3290)  # Dog tokens
cat_token_ids = (21979, 5181, 3797)  # Cat tokens
for token_id in dog_token_ids + cat_token_ids:
    # Decode one id at a time so each token's text is printed on its own line.
    print(tokenizer.decode([token_id]))

# First query "What sound does the dog make?\n\nAnswer:"
first_query_ids = tokenizer(animal_queries[0])["input_ids"]
print(first_query_ids)

Dog
 Dog
 dog
Cat
 Cat
 cat
[2061, 2128, 857, 262, 3290, 787, 30, 198, 198, 33706, 25]


In [14]:
# Alternate versions: each section tokenizes a different spelling of the two
# words and keeps row 0 as the "dog" ids and row 1 as the "cat" ids.

# Whitespace lowercase
print("Whitespace lowercase")
word_inputs = tokenizer([" dog", " cat"], return_tensors="pt", return_attention_mask=False)
variant_ids = word_inputs["input_ids"]
dog_indexes_wl, cat_indexes_wl = variant_ids[0], variant_ids[1]
print(variant_ids)

# Whitespace lowercase + no-whitespace lowercase
print("Whitespace lowercase + no-whitespace lowercase")
word_inputs = tokenizer(["dog dog", "cat cat"], return_tensors="pt", return_attention_mask=False)
variant_ids = word_inputs["input_ids"]
dog_indexes_wlnl, cat_indexes_wlnl = variant_ids[0], variant_ids[1]
print(variant_ids)

# No-whitespace lowercase
print("No-whitespace lowercase")
word_inputs = tokenizer(["dog", "cat"], return_tensors="pt", return_attention_mask=False)
variant_ids = word_inputs["input_ids"]
dog_indexes_nl, cat_indexes_nl = variant_ids[0], variant_ids[1]
print(variant_ids)

# Whitespace uppercase
print("Whitespace uppercase")
word_inputs = tokenizer([" Dog", " Cat"], return_tensors="pt", return_attention_mask=False)
variant_ids = word_inputs["input_ids"]
dog_indexes_wu, cat_indexes_wu = variant_ids[0], variant_ids[1]
print(variant_ids)

Whitespace lowercase
tensor([[3290],
        [3797]], device='cuda:0')
Whitespace lowercase + no-whitespace lowercase
tensor([[9703, 3290],
        [9246, 3797]], device='cuda:0')
No-whitespace lowercase
tensor([[9703],
        [9246]], device='cuda:0')
Whitespace uppercase
tensor([[8532],
        [5181]], device='cuda:0')


In [15]:
# Swap weights: build one swapped copy of the embedding matrix per
# (dog ids, cat ids) pair, in the same order as the names on the left.
index_pairs = [
    (dog_indexes, cat_indexes),
    (dog_indexes_wl, cat_indexes_wl),
    (dog_indexes_wlnl, cat_indexes_wlnl),
    (dog_indexes_nl, cat_indexes_nl),
    (dog_indexes_wu, cat_indexes_wu),
]
(
    swap_weights,
    swap_weights_wl,
    swap_weights_wlnl,
    swap_weights_nl,
    swap_weights_wu,
) = [swap_emb_weights(emb_weights, dog_ids, cat_ids) for dog_ids, cat_ids in index_pairs]

In [16]:
# Check consistency of swapped weights: the "dog" rows of the swapped matrix
# must equal the original "cat" rows, and vice versa.
assert torch.equal(emb_weights[dog_indexes], swap_weights[cat_indexes])
assert torch.equal(emb_weights[cat_indexes], swap_weights[dog_indexes])

# With the original matrix installed, the embedding lookup returns the
# original rows. The module is called directly rather than through
# .forward(), so that any registered hooks are not bypassed.
embedding_layer = model.get_input_embeddings()
embedding_layer.weight = torch.nn.Parameter(emb_weights)
embedded_dog = embedding_layer(dog_indexes)
embedded_cat = embedding_layer(cat_indexes)
assert torch.equal(emb_weights[dog_indexes], embedded_dog)
assert torch.equal(emb_weights[cat_indexes], embedded_cat)

# With the swapped matrix installed, looking up "dog" ids yields the original
# "cat" rows and vice versa. The swapped matrix stays installed afterwards.
embedding_layer.weight = torch.nn.Parameter(swap_weights)
embedded_dog = embedding_layer(dog_indexes)
embedded_cat = embedding_layer(cat_indexes)
assert torch.equal(emb_weights[dog_indexes], embedded_cat)
assert torch.equal(emb_weights[cat_indexes], embedded_dog)

In [17]:
# Results with swapped model
input_embeddings = model.get_input_embeddings()
input_embeddings.weight = torch.nn.Parameter(swap_weights)
output_swap = test_model(tokenizer, model, queries=animal_queries)

##########################
What sound does the dog make?

Answer: The cat barks.

Exercise 2:

What is the difference between a noun and a verb?

Answer: A noun is a word that names a person, place
##########################
What is a dog?

Answer: A cat is a furry animal that is often kept as a pet.

Exercise 2: What is a cat bed?

Answer: A dog bed is a special bed that is designed for
##########################
What sound does the cat make?

Answer: The dog makes a high-pitched meow.

Exercise 3:

Listen to the following sounds and identify which ones are high-pitched and which ones are low-
##########################
What is a cat?

Answer: A dog is a furry animal that is often kept as a pet.

Exercise 2: What is a dog litter?

Answer: A cat litter is a special type of sand or clay
##########################
What is a the difference between a cat and a dog?

Answer: A dog is a small, furry animal that is usually independent and likes to hunt. A dog is also known for its ability to 

In [18]:
# Results with original model
input_embeddings = model.get_input_embeddings()
input_embeddings.weight = torch.nn.Parameter(emb_weights)
output = test_model(tokenizer, model, queries=animal_queries)

##########################
What sound does the dog make?

Answer: The dog barks.

Exercise 2:

What is the difference between a noun and a verb?

Answer: A noun is a word that names a person, place
##########################
What is a dog?

Answer: A dog is a furry animal that is often kept as a pet.

Exercise 2: What is a cat?

Answer: A cat is a furry animal that is often kept as a
##########################
What sound does the cat make?

Answer: The cat makes a meowing sound.

Exercise 2:

What is the difference between a loud sound and a quiet sound?

Answer: A loud sound is very strong
##########################
What is a cat?

Answer: A cat is a furry animal that is often kept as a pet.

Exercise 2: What is a dog?

Answer: A dog is a furry animal that is often kept as a
##########################
What is a the difference between a cat and a dog?

Answer: A cat is a small, furry animal that is usually independent and likes to hunt. A dog is a larger, more social animal that is oft

In [19]:
# Test other variants: install each swapped matrix in turn and collect the
# (unprinted) generations for the animal prompts.
variant_weights = {
    "wl": swap_weights_wl,
    "wlnl": swap_weights_wlnl,
    "nl": swap_weights_nl,
    "wu": swap_weights_wu,
}
variant_outputs = {}
for variant_name, weights in variant_weights.items():
    model.get_input_embeddings().weight = torch.nn.Parameter(weights)
    variant_outputs[variant_name] = test_model(tokenizer, model, animal_queries, output=False)

output_wl = variant_outputs["wl"]
output_wlnl = variant_outputs["wlnl"]
output_nl = variant_outputs["nl"]
output_wu = variant_outputs["wu"]

# Check that the wl and wlnl variants' outputs are consistent with the swapped model
assert output_swap == output_wl
assert output_swap == output_wlnl

# Check that the nl and wu variants' outputs are consistent with the original model
assert output == output_nl
assert output == output_wu

In [20]:
from __future__ import annotations
from typing import Optional, Union
from transformers.modeling_outputs import CausalLMOutputWithPast

def print_forward(
    input_ids: torch.LongTensor,
    self = model.transformer,
    past_key_values: Optional[Union[torch.FloatTensor, InferenceParams]] = None,
    attention_mask: Optional[torch.BoolTensor] = None,
) -> torch.FloatTensor:
    """Forward pass that reports the incoming tokens before running the layers.

    Embeds ``input_ids`` with ``self.embd``, prints the token ids, their
    decoded text (via the global ``tokenizer``) and the mean of the embedded
    values, then feeds the hidden states through every layer in ``self.h``.

    Args:
        input_ids: Token ids for the current step.
        self: Module providing ``embd`` and ``h``. The default,
            ``model.transformer``, is captured once when this function is
            defined, which lets the function be assigned as a plain
            (unbound) ``forward`` attribute.
        past_key_values: Passed through unchanged to every layer.
        attention_mask: Passed through unchanged to every layer.

    Returns:
        The hidden states produced by the last layer.
    """
    embedded = self.embd(input_ids)

    print("---")
    token_ids = torch.squeeze(input_ids).tolist()
    print(f"Tokens ID: {token_ids}\nTokens Text: {repr(tokenizer.decode(token_ids))}\nEmbedding Mean: {embedded.mean()}")

    hidden_states = embedded
    for block in self.h:
        hidden_states = block(
            hidden_states,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
        )
    return hidden_states
    
def deswap_forward(
    input_ids: torch.LongTensor,
    self = model,
    past_key_values: Optional[Union[torch.FloatTensor, InferenceParams]] = None,
    attention_mask: Optional[torch.BoolTensor] = None,
    labels: Optional[torch.LongTensor] = None,
    **kwargs,
) -> CausalLMOutputWithPast:
    """Forward pass that restores the original embeddings after its first call.

    Prints the incoming token ids and their decoded text, runs
    ``self.transformer`` and ``self.lm_head``, and then, if
    ``self.deswap_flag`` is still false, installs the global ``emb_weights``
    as the input-embedding matrix and sets the flag. Later calls therefore
    embed tokens with ``emb_weights``.

    Args:
        input_ids: Token ids for the current step.
        self: Model to run. The default, ``model``, is captured once when
            this function is defined. ``self.deswap_flag`` must be set by
            the caller before the first call.
        past_key_values: Forwarded to the transformer and echoed in the result.
        attention_mask: Forwarded to the transformer.
        labels: When given, ``self.loss`` is evaluated on the logits.
        **kwargs: Accepted and ignored.

    Returns:
        ``CausalLMOutputWithPast`` holding the loss (or ``None``), the logits
        and the ``past_key_values`` that were passed in.
    """
    print("---")
    token_ids = torch.squeeze(input_ids).tolist()
    print(f"Tokens ID: {token_ids}\nTokens Text: {repr(tokenizer.decode(token_ids))}")

    hidden_states = self.transformer(input_ids, past_key_values=past_key_values, attention_mask=attention_mask)
    lm_logits = self.lm_head(hidden_states)

    # The swap happens only after this pass has already consumed the
    # embeddings that were installed when it started.
    if not self.deswap_flag:
        self.get_input_embeddings().weight = torch.nn.Parameter(emb_weights)
        self.deswap_flag = True

    loss = self.loss(lm_logits, labels) if labels is not None else None

    return CausalLMOutputWithPast(loss=loss, logits=lm_logits, past_key_values=past_key_values)

In [21]:
# Change model forward pass function to print tokens
original_forward = model.transformer.forward
model.transformer.forward = print_forward

try:
    q = "What sound does the dog make?\n\nAnswer:"
    inputs = tokenizer(q, return_tensors="pt", return_attention_mask=False)
    print("###################\nOriginal Embeddings\n###################")
    model.get_input_embeddings().weight = torch.nn.Parameter(emb_weights)
    out = model.generate(**inputs, max_length=20)
    print("###################\nSwapped Embeddings\n###################")
    model.get_input_embeddings().weight = torch.nn.Parameter(swap_weights)
    out = model.generate(**inputs, max_length=20)
finally:
    # Restore model forward pass function, even if generation raised, so the
    # printing variant does not stay installed for later cells.
    model.transformer.forward = original_forward

###################
Original Embeddings
###################
---
Tokens ID: [2061, 2128, 857, 262, 3290, 787, 30, 198, 198, 33706, 25]
Tokens Text: 'What sound does the dog make?\n\nAnswer:'
Embedding Mean: 0.0005952939391136169
---
Tokens ID: 383
Tokens Text: ' The'
Embedding Mean: 0.0005558051634579897
---
Tokens ID: 3290
Tokens Text: ' dog'
Embedding Mean: 0.00029589334735646844
---
Tokens ID: 2318
Tokens Text: ' bar'
Embedding Mean: 0.00010662770364433527
---
Tokens ID: 591
Tokens Text: 'ks'
Embedding Mean: 0.00043374000233598053
---
Tokens ID: 13
Tokens Text: '.'
Embedding Mean: 0.0001986337301786989
---
Tokens ID: 198
Tokens Text: '\n'
Embedding Mean: -0.00026612976216711104
---
Tokens ID: 198
Tokens Text: '\n'
Embedding Mean: -0.00026612976216711104
---
Tokens ID: 3109
Tokens Text: 'Ex'
Embedding Mean: -0.00014667035429738462
###################
Swapped Embeddings
###################
---
Tokens ID: [2061, 2128, 857, 262, 3290, 787, 30, 198, 198, 33706, 25]
Tokens Text: 'What soun

In [22]:
# Change model forward pass function to print tokens and deswap embeddings after the first pass
model.deswap_flag = False  # read and set by deswap_forward
original_forward = model.forward
model.forward = deswap_forward

try:
    q = "What sound does the dog make?\n\nAnswer:"
    inputs = tokenizer(q, return_tensors="pt", return_attention_mask=False)
    # Start with the swapped matrix; deswap_forward replaces it with the
    # original one once the first pass has run.
    model.get_input_embeddings().weight = torch.nn.Parameter(swap_weights)
    out = model.generate(**inputs, max_length=20)
finally:
    # Restore model forward pass function, even if generation raised, so the
    # patched forward does not stay installed for later cells.
    model.forward = original_forward

---
Tokens ID: [2061, 2128, 857, 262, 3290, 787, 30, 198, 198, 33706, 25]
Tokens Text: 'What sound does the dog make?\n\nAnswer:'
---
Tokens ID: 383
Tokens Text: ' The'
---
Tokens ID: 3797
Tokens Text: ' cat'
---
Tokens ID: 1838
Tokens Text: ' makes'
---
Tokens ID: 257
Tokens Text: ' a'
---
Tokens ID: 502
Tokens Text: ' me'
---
Tokens ID: 7855
Tokens Text: 'owing'
---
Tokens ID: 2128
Tokens Text: ' sound'
---
Tokens ID: 13
Tokens Text: '.'


## Add targeted noise and swap embeddings

In [23]:
# Token ids of the two words that receive targeted noise
# (row 0: " dog", row 1: " cat").
word_inputs = tokenizer([" dog", " cat"], return_tensors="pt", return_attention_mask=False)
noise_target_ids = word_inputs["input_ids"]
dog_indexes_n, cat_indexes_n = noise_target_ids[0], noise_target_ids[1]

In [24]:
# Only add noise: perturb just the " dog" and " cat" rows with Gaussian noise
# scaled by 0.2; every other row stays equal to the original.
noise_weights = emb_weights.clone()
for target_ids in (dog_indexes_n, cat_indexes_n):
    original_rows = emb_weights[target_ids]
    noise_weights[target_ids] = original_rows + torch.randn_like(original_rows) * 0.2

# Add noise and also swap: exchange the two noisy rows.
swap_noise_weights = swap_emb_weights(noise_weights, dog_indexes_n, cat_indexes_n)

In [25]:
# Results with noise embeddings model
model.get_input_embeddings().weight = torch.nn.Parameter(noise_weights)
# Stored under its own name: these generations come from the noise-only
# matrix, so binding them to `output_swap` would mislabel them and overwrite
# the swap-experiment output.
output_noise = test_model(tokenizer, model, animal_queries)

##########################
What sound does the dog make?

Answer: The sound of the fountain is a soft, soothing sound.

Exercise 3:

Listen to a song and try to identify the different instruments that are being played.

Answer
##########################
What is a dog?

Answer: A barge is a large, flat-bottomed boat that is used to transport goods and people across rivers and other bodies of water.

Exercise 3: What is a ferry?


##########################
What sound does the cat make?

Answer: The sound that the god makes is a soft, gentle sound.

Exercise 3:

What is the difference between a god and a goddess?

Answer: A god is
##########################
What is a cat?

Answer: Aager is a type of fish food that is designed to be easy to digest and provide all the necessary nutrients for your fish.

Exercise 3: What is a filter?

Answer:
##########################
What is a the difference between a cat and a dog?

Answer: A ager is a type of fish that is usually kept in a small tank, w

In [26]:
# Results with noise swapped embeddings model
# NOTE(review): this rebinds `output_swap`; from here on it holds the
# noise+swap generations.
input_embeddings = model.get_input_embeddings()
input_embeddings.weight = torch.nn.Parameter(swap_noise_weights)
output_swap = test_model(tokenizer, model, queries=animal_queries)

##########################
What sound does the dog make?

Answer: The sound that the god makes is a soft, gentle sound.

Exercise 3:

What is the difference between a god and a goddess?

Answer: A god is
##########################
What is a dog?

Answer: Aager is a type of fish food that is designed to be easy to digest and provide all the necessary nutrients for your fish.

Exercise 3: What is a filter?

Answer:
##########################
What sound does the cat make?

Answer: The sound of the fountain is a soft, soothing sound.

Exercise 3:

Listen to a song and try to identify the different instruments that are being played.

Answer
##########################
What is a cat?

Answer: A barge is a large, flat-bottomed boat that is used to transport goods and people across rivers and other bodies of water.

Exercise 3: What is a ferry?


##########################
What is a the difference between a cat and a dog?

Answer: A fier is a type of tree that is native to the area, while a gad