In [1]:
!pip install einops

Collecting einops
  Downloading einops-0.7.0-py3-none-any.whl (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.6/44.6 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: einops
Successfully installed einops-0.7.0


In [11]:
import random
import torch
import copy

from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig

In [3]:
# Use the GPU only when CUDA is actually available, so the notebook also runs
# (slowly) on CPU-only machines instead of failing on the first "cuda" call.
gpu = torch.cuda.is_available()

In [4]:
# Create new tensors on the GPU by default when GPU execution was requested.
if gpu:
    torch.set_default_device("cuda")

# Model, tokenizer and config are all loaded from the same checkpoint.
# NOTE(review): trust_remote_code=True runs Python code downloaded with the
# checkpoint and no revision is pinned -- consider pinning one.
checkpoint = "microsoft/phi-1_5"
model = AutoModelForCausalLM.from_pretrained(checkpoint, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
config = AutoConfig.from_pretrained(checkpoint, trust_remote_code=True)

Downloading (…)lve/main/config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

Downloading (…)configuration_phi.py:   0%|          | 0.00/2.03k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/phi-1_5:
- configuration_phi.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading (…)main/modeling_phi.py:   0%|          | 0.00/33.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/phi-1_5:
- modeling_phi.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading pytorch_model.bin:   0%|          | 0.00/2.84G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/69.0 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/237 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [5]:
def test_model(tokenizer, model, queries, max_length=50, gpu=True, output=True):
    spacer = "##########################\n"
    text = ""
    for query in queries:
        text += spacer
        if output:
            print(spacer, end="")
        inputs = tokenizer(query, return_tensors="pt", return_attention_mask=False)
        if gpu:
            inputs = inputs.to("cuda:0")
        out = model.generate(**inputs, max_length=max_length)
        out = tokenizer.batch_decode(out)[0] + "\n"
        text += out
        if output:
            print(out, end="")
    return text

In [6]:
def swap_emb_weights(emb_weights, a_indexes, b_indexes):
    """Return a copy of ``emb_weights`` with two sets of rows exchanged.

    The rows selected by ``a_indexes`` receive the original values of the rows
    selected by ``b_indexes`` and vice versa. ``emb_weights`` itself is not
    modified.

    Args:
        emb_weights: Tensor (or ``nn.Parameter``) indexed along its first
            dimension.
        a_indexes: Index (e.g. a tensor of row ids) of the first group.
        b_indexes: Index of the second group; must select the same number of
            rows as ``a_indexes``.

    Returns:
        A new plain tensor that does not require grad and carries no autograd
        history, even when ``emb_weights`` is a trainable parameter.
    """
    # Detach so that the clone and the in-place row writes below are not
    # recorded by autograd when the input is an nn.Parameter.
    source = emb_weights.detach()

    swapped = source.clone()
    # Always read from the untouched source so the two writes cannot
    # interfere with each other.
    swapped[a_indexes] = source[b_indexes]
    swapped[b_indexes] = source[a_indexes]
    return swapped

In [7]:
# Single free-form prompt used for quick sanity checks.
test_query = ["How are you today?"]

# Every question-style prompt below ends with this shared suffix.
_answer_suffix = "\n\nAnswer:"

# Prompts about the two animals whose embeddings are manipulated.
animal_queries = [
    question + _answer_suffix
    for question in (
        "What sound does the dog make?",
        "What is a dog?",
        "What sound does the cat make?",
        "What is a cat?",
        "What is a the difference between a cat and a dog?",
    )
]

# Prompts about two cities and their countries.
city_queries = [
    question + _answer_suffix
    for question in (
        "Where is London?",
        "Where is Paris?",
        "What is the captial of the United Kingdom?",
        "What is the capital of France?",
    )
]

In [8]:
# Put the model into evaluation mode (torch.nn.Module.eval) before generating.
model = model.eval()

In [13]:
# Keep an independent copy of the model's original input-embedding layer.
# `old_embeddings` was not defined anywhere in the notebook, so this cell
# failed on a fresh kernel; it is defined here explicitly.
old_embeddings = copy.deepcopy(model.get_input_embeddings())

# Reference embedding matrix used as the baseline for every experiment below.
emb_weights = old_embeddings.weight
print(emb_weights.shape)

torch.Size([51200, 2048])


## Add noise to embeddings

In [25]:
# Seed the RNG so the noisy embeddings (and the generation that uses them)
# are reproducible when the cell is re-run.
torch.manual_seed(0)

# Standard deviation of the Gaussian noise added to every embedding entry.
GLOBAL_NOISE_STD = 0.06

# Work on a detached view so the derived tensor carries no autograd history.
_clean_weights = emb_weights.detach()

zeroed_weights = torch.zeros_like(_clean_weights)
random_weights = _clean_weights + torch.randn_like(_clean_weights) * GLOBAL_NOISE_STD

In [19]:
# Install the unmodified embedding matrix and run the sanity-check prompt.
embedding_layer = model.get_input_embeddings()
embedding_layer.weight = torch.nn.Parameter(emb_weights)
output = test_model(tokenizer, model, test_query, gpu=gpu)

##########################
How are you today?

Student: I'm doing well, thank you. How about you?

Teacher: I'm doing great, thank you for asking. So, I wanted to talk to you about something that I think is


In [26]:
# Install the globally perturbed embedding matrix and run the same prompt.
embedding_layer = model.get_input_embeddings()
embedding_layer.weight = torch.nn.Parameter(random_weights)
output = test_model(tokenizer, model, test_query, gpu=gpu)

##########################
How are you today?”

?
?”
?”
?”
?”
?”
?”
??”
?”?”?”?


## Swap embedding word representations

In [27]:
# Tokenize three surface forms of each animal name in one batch:
# row 0 holds the "dog" variants, row 1 the "cat" variants.
word_inputs = tokenizer(
    ["Dog Dog dog", "Cat Cat cat"],
    return_tensors="pt",
    return_attention_mask=False,
)
if gpu:
    word_inputs = word_inputs.to("cuda:0")

# Unpack the two rows of token ids.
dog_indexes, cat_indexes = word_inputs["input_ids"]
print(word_inputs["input_ids"])

tensor([[32942,  8532,  3290],
        [21979,  5181,  3797]], device='cuda:0')


In [28]:
# Verify that tokens correspond to words.
# The first three ids are the dog tokens, the last three the cat tokens.
for token_id in (32942, 8532, 3290, 21979, 5181, 3797):
    print(tokenizer.decode([token_id]))

# Token ids of the first query, "What sound does the dog make?\n\nAnswer:"
first_query_ids = tokenizer(animal_queries[0])["input_ids"]
print(first_query_ids)

Dog
 Dog
 dog
Cat
 Cat
 cat
[2061, 2128, 857, 262, 3290, 787, 30, 198, 198, 33706, 25]


In [29]:
# Alternate versions

def _tokenize_variant(label, dog_text, cat_text):
    """Print ``label``, tokenize the two strings as one batch, print and return the result.

    The encoded batch is moved to "cuda:0" when the notebook-level ``gpu``
    flag is set. Row 0 of ``input_ids`` belongs to ``dog_text`` and row 1 to
    ``cat_text``.
    """
    print(label)
    encoded = tokenizer([dog_text, cat_text], return_tensors="pt", return_attention_mask=False)
    if gpu:
        encoded = encoded.to("cuda:0")
    print(encoded["input_ids"])
    return encoded


# Whitespace lowercase
word_inputs = _tokenize_variant("Whitespace lowercase", " dog", " cat")
dog_indexes_wl, cat_indexes_wl = word_inputs["input_ids"]

# Whitespace lowercase + no-whitespace lowercase
word_inputs = _tokenize_variant(
    "Whitespace lowercase + no-whitespace lowercase", "dog dog", "cat cat"
)
dog_indexes_wlnl, cat_indexes_wlnl = word_inputs["input_ids"]

# No-whitespace lowercase
word_inputs = _tokenize_variant("No-whitespace lowercase", "dog", "cat")
dog_indexes_nl, cat_indexes_nl = word_inputs["input_ids"]

# Whitespace uppercase
word_inputs = _tokenize_variant("Whitespace uppercase", " Dog", " Cat")
dog_indexes_wu, cat_indexes_wu = word_inputs["input_ids"]

Whitespace lowercase
tensor([[3290],
        [3797]], device='cuda:0')
Whitespace lowercase + no-whitespace lowercase
tensor([[9703, 3290],
        [9246, 3797]], device='cuda:0')
No-whitespace lowercase
tensor([[9703],
        [9246]], device='cuda:0')
Whitespace uppercase
tensor([[8532],
        [5181]], device='cuda:0')


In [30]:
# Swap weights: build one swapped embedding matrix per tokenization variant,
# always starting from the reference matrix.
(
    swap_weights,
    swap_weights_wl,
    swap_weights_wlnl,
    swap_weights_nl,
    swap_weights_wu,
) = (
    swap_emb_weights(emb_weights, dog_ids, cat_ids)
    for dog_ids, cat_ids in (
        (dog_indexes, cat_indexes),
        (dog_indexes_wl, cat_indexes_wl),
        (dog_indexes_wlnl, cat_indexes_wlnl),
        (dog_indexes_nl, cat_indexes_nl),
        (dog_indexes_wu, cat_indexes_wu),
    )
)

In [33]:
# Check consistency of swapped weights
original_dog_rows = emb_weights[dog_indexes]
original_cat_rows = emb_weights[cat_indexes]

# The swapped matrix must hold the dog rows at the cat ids and vice versa.
assert torch.all(original_dog_rows == swap_weights[cat_indexes])
assert torch.all(original_cat_rows == swap_weights[dog_indexes])

# With the reference weights installed, the layer returns the original rows.
model.get_input_embeddings().weight = torch.nn.Parameter(emb_weights)
embedded_dog = model.get_input_embeddings().forward(dog_indexes)
embedded_cat = model.get_input_embeddings().forward(cat_indexes)
assert torch.all(original_dog_rows == embedded_dog)
assert torch.all(original_cat_rows == embedded_cat)

# With the swapped weights installed, the layer returns the exchanged rows.
# Note that the swapped weights stay installed after this cell.
model.get_input_embeddings().weight = torch.nn.Parameter(swap_weights)
embedded_dog = model.get_input_embeddings().forward(dog_indexes)
embedded_cat = model.get_input_embeddings().forward(cat_indexes)
assert torch.all(original_dog_rows == embedded_cat)
assert torch.all(original_cat_rows == embedded_dog)

In [35]:
# Results with swapped model: install the dog/cat-swapped embeddings and
# generate answers for the animal prompts.
embedding_layer = model.get_input_embeddings()
embedding_layer.weight = torch.nn.Parameter(swap_weights)
output_swap = test_model(tokenizer, model, animal_queries, gpu=gpu)

##########################
What sound does the dog make?

Answer: The cat barks.

Exercise 2:

What is the difference between a noun and a verb?

Answer: A noun is a word that names a person, place
##########################
What is a dog?

Answer: A cat is a furry animal that is often kept as a pet.

Exercise 2: What is a cat bed?

Answer: A dog bed is a special bed that is designed for
##########################
What sound does the cat make?

Answer: The dog makes a high-pitched meow.

Exercise 3:

Listen to the following sounds and identify which ones are high-pitched and which ones are low-
##########################
What is a cat?

Answer: A dog is a furry animal that is often kept as a pet.

Exercise 2: What is a dog litter?

Answer: A cat litter is a special type of sand or clay
##########################
What is a the difference between a cat and a dog?

Answer: A dog is a small, furry animal that is usually independent and likes to hunt. A dog is also known for its ability to 

In [36]:
# Results with original model: restore the reference embeddings and generate
# answers for the same animal prompts as a baseline.
embedding_layer = model.get_input_embeddings()
embedding_layer.weight = torch.nn.Parameter(emb_weights)
output = test_model(tokenizer, model, animal_queries, gpu=gpu)

##########################
What sound does the dog make?

Answer: The dog barks.

Exercise 2:

What is the difference between a noun and a verb?

Answer: A noun is a word that names a person, place
##########################
What is a dog?

Answer: A dog is a furry animal that is often kept as a pet.

Exercise 2: What is a cat?

Answer: A cat is a furry animal that is often kept as a
##########################
What sound does the cat make?

Answer: The cat makes a meowing sound.

Exercise 2:

What is the difference between a loud sound and a quiet sound?

Answer: A loud sound is very strong
##########################
What is a cat?

Answer: A cat is a furry animal that is often kept as a pet.

Exercise 2: What is a dog?

Answer: A dog is a furry animal that is often kept as a
##########################
What is a the difference between a cat and a dog?

Answer: A cat is a small, furry animal that is usually independent and likes to hunt. A dog is a larger, more social animal that is oft

In [37]:
# Test other variants

def _generate_with_embeddings(weights):
    """Install ``weights`` as the input embeddings and return the silent transcript for the animal prompts."""
    model.get_input_embeddings().weight = torch.nn.Parameter(weights)
    return test_model(tokenizer, model, animal_queries, gpu=gpu, output=False)


output_wl = _generate_with_embeddings(swap_weights_wl)
output_wlnl = _generate_with_embeddings(swap_weights_wlnl)
output_nl = _generate_with_embeddings(swap_weights_nl)
output_wu = _generate_with_embeddings(swap_weights_wu)

# Check that wl and wlnl variants' outputs are consistent with swapped model
assert output_swap == output_wl
assert output_swap == output_wlnl

# Check that nl and wu variants' outputs are consistent with original model
assert output == output_nl
assert output == output_wu

## Add targeted noise and swap embeddings

In [38]:
# Token ids of the whitespace-prefixed lowercase words; these are the rows
# that receive targeted noise in the next cell.
word_inputs = tokenizer(
    [" dog", " cat"],
    return_tensors="pt",
    return_attention_mask=False,
)
if gpu:
    word_inputs = word_inputs.to("cuda:0")

# Row 0 belongs to " dog", row 1 to " cat".
dog_indexes_n, cat_indexes_n = word_inputs["input_ids"]

In [40]:
# Seed the RNG so the perturbed rows (and the generations that depend on
# them) are reproducible when the cell is re-run.
torch.manual_seed(0)

# Standard deviation of the Gaussian noise added to the targeted rows.
TARGETED_NOISE_STD = 0.2

# Only add noise
# Work on a detached view so the in-place row writes are not tracked by autograd.
_clean_weights = emb_weights.detach()
noise_weights = _clean_weights.clone()
for _token_ids in (dog_indexes_n, cat_indexes_n):
    _rows = _clean_weights[_token_ids]
    noise_weights[_token_ids] = _rows + torch.randn_like(_rows) * TARGETED_NOISE_STD

# Add noise and also swap
swap_noise_weights = swap_emb_weights(noise_weights, dog_indexes_n, cat_indexes_n)

In [41]:
# Results with noise embeddings model
model.get_input_embeddings().weight = torch.nn.Parameter(noise_weights)
# Stored under its own name: this run adds noise only (no swap), and reusing
# `output_swap` would overwrite the swapped-model transcript that the
# variant-consistency assertions compare against.
output_noise = test_model(tokenizer, model, animal_queries, gpu=gpu)

##########################
What sound does the dog make?

Answer: The sound that the zebra makes is a loud, braying sound.

Exercise 3:

What is the difference between a loud sound and a quiet sound?

Answer
##########################
What is a dog?

Answer: Aged-care is a type of care that is provided to elderly people who need assistance with daily activities such as bathing, dressing, and eating.

Exercise 2: What is the difference between
##########################
What sound does the cat make?

Answer: The flag makes a "fluttering" sound.

Exercise 3:

What sound does the flag make when it is waved?

Answer: The flag makes a "fl
##########################
What is a cat?

Answer: A flagpole is a tall pole with a flag attached to it.

Exercise 3: What is a flagpole?

Answer: A flagpole is a tall pole with a flag attached
##########################
What is a the difference between a cat and a dog?

Answer: A afected is a person who has been infected by a virus, while a virus is a tin

In [42]:
# Results with noise swapped embeddings model
model.get_input_embeddings().weight = torch.nn.Parameter(swap_noise_weights)
# Stored under its own name so the plain swapped-model transcript in
# `output_swap` (used by the variant-consistency assertions) is not overwritten.
output_swap_noise = test_model(tokenizer, model, animal_queries, gpu=gpu)

##########################
What sound does the dog make?

Answer: The flag makes a "fluttering" sound.

Exercise 3:

What sound does the flag make when it is waved?

Answer: The flag makes a "fl
##########################
What is a dog?

Answer: A flagpole is a tall pole with a flag attached to it.

Exercise 3: What is a flagpole?

Answer: A flagpole is a tall pole with a flag attached
##########################
What sound does the cat make?

Answer: The sound that the zebra makes is a loud, braying sound.

Exercise 3:

What is the difference between a loud sound and a quiet sound?

Answer
##########################
What is a cat?

Answer: Aged-care is a type of care that is provided to elderly people who need assistance with daily activities such as bathing, dressing, and eating.

Exercise 2: What is the difference between
##########################
What is a the difference between a cat and a dog?

Answer: A ajed is a type of fish that is commonly found in the Nile River, while a jaf