## Init

In [269]:
import torch
from torch import nn
from copy import deepcopy
from transformers import (AutoModelForMaskedLM, AutoModelForCausalLM, AutoTokenizer, 
                          AutoModelForSequenceClassification, TrainingArguments, Trainer)
from tqdm.auto import tqdm
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from tensorflow.keras.models import load_model
from datasets import load_dataset, load_metric
import os
from utils import top_tokens
from tabulate import tabulate

In [3]:
# def get_output_emb(model):
#     transform = model.cls.predictions.transform.dense.weight.T
#     orig_emb = model.get_output_embeddings().weight.T
#     return (transform @ orig_emb).detach().cpu()

In [9]:
# Model dimensions used by this notebook: (number of transformer blocks, hidden width).
# Presumably these describe the base 'gpt2' checkpoint — TODO confirm against the loaded config.
num_layers, hidden_dim = 12, 768

In [10]:
# Tokenizer shared by every cell below.
# Alternatives tried earlier: 'bert-base-uncased', get_multiberts_tokenizer()
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path='gpt2')

### Initialize Models

In [12]:
# Checkpoints to compare.
# Earlier variants: ["bert-base-uncased", "multiberts/models/seed_0"] or
# [f"multiberts/models/seed_{i}" for i in range(2)]
model_paths = ['gpt2', 'gpt2-medium']
print(model_paths)

# The language-model head of the first checkpoint is only needed for its output
# embedding matrix, so it is loaded under a temporary name.
model1_tmp = AutoModelForCausalLM.from_pretrained(model_paths[0])
model2 = AutoModelForCausalLM.from_pretrained(model_paths[1])

# Transposed output-embedding weights of each LM, moved to CPU and detached from autograd.
emb1, emb2 = (
    lm.get_output_embeddings().weight.T.cpu().detach()
    for lm in (model1_tmp, model2)
)
del model1_tmp  # the temporary LM is no longer referenced by name

# The first checkpoint is reloaded with a sequence-classification head; this is the
# model that gets fine-tuned below.
model1 = AutoModelForSequenceClassification.from_pretrained(model_paths[0])

['gpt2', 'gpt2-medium']


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
## remove pooler for simplicity - leave only classifier 
# model1.bert.pooler.dense = nn.Identity()
# model1.bert.pooler.activation = nn.Identity()

## Experiments

### Sentiment Analysis Finetuning

In [19]:
# Number of layers to freeze: transformer blocks with index >= freeze stay trainable.
freeze: int = 9

In [20]:
# `model` is an alias (not a copy) of model1; the cells below freeze, train and inspect it.
model = model1

In [23]:
# Freeze every parameter, then re-enable gradients for
#   * transformer blocks whose index is >= `freeze`, and
#   * any parameter whose name contains 'score' (the classification head).
# Each re-enabled parameter name is printed.
print("unfrozen parameters:")
for param_name, param in model.named_parameters():
    param.requires_grad = False

    # A name of the form '<prefix>transformer.h.<idx>.<rest>' splits into exactly two pieces.
    pieces = param_name.split('transformer.h.')
    if len(pieces) == 2 and int(pieces[1].split('.')[0]) >= freeze:
        param.requires_grad = True
        print(param_name)

    if 'score' in param_name:
        param.requires_grad = True
        print(param_name)

unfrozen parameters:
transformer.h.9.ln_1.weight
transformer.h.9.ln_1.bias
transformer.h.9.attn.c_attn.weight
transformer.h.9.attn.c_attn.bias
transformer.h.9.attn.c_proj.weight
transformer.h.9.attn.c_proj.bias
transformer.h.9.ln_2.weight
transformer.h.9.ln_2.bias
transformer.h.9.mlp.c_fc.weight
transformer.h.9.mlp.c_fc.bias
transformer.h.9.mlp.c_proj.weight
transformer.h.9.mlp.c_proj.bias
transformer.h.10.ln_1.weight
transformer.h.10.ln_1.bias
transformer.h.10.attn.c_attn.weight
transformer.h.10.attn.c_attn.bias
transformer.h.10.attn.c_proj.weight
transformer.h.10.attn.c_proj.bias
transformer.h.10.ln_2.weight
transformer.h.10.ln_2.bias
transformer.h.10.mlp.c_fc.weight
transformer.h.10.mlp.c_fc.bias
transformer.h.10.mlp.c_proj.weight
transformer.h.10.mlp.c_proj.bias
transformer.h.11.ln_1.weight
transformer.h.11.ln_1.bias
transformer.h.11.attn.c_attn.weight
transformer.h.11.attn.c_attn.bias
transformer.h.11.attn.c_proj.weight
transformer.h.11.attn.c_proj.bias
transformer.h.11.ln_2.weigh

### Preparing Data

In [24]:
def tokenize_imdb(examples):
    """Tokenize the "text" field of `examples` with the notebook-level tokenizer.

    Truncation is enabled; the tokenizer's output is returned unchanged.
    """
    review_text = examples["text"]
    return tokenizer(review_text, truncation=True)

In [25]:
# Load IMDB and tokenize every split.
# batched=True passes lists of reviews to tokenize_imdb instead of one example per
# call; the resulting columns are the same, but the tokenizer is invoked far fewer
# times over the full dataset (the progress bars below show 25k/25k/50k examples).
imdb = load_dataset('imdb')
imdb = imdb.map(tokenize_imdb, batched=True)

# Small fixed-seed subsamples keep fine-tuning and evaluation quick and repeatable.
imdb_train = imdb['train'].shuffle(seed=42).select(range(1000))
imdb_val = imdb['test'].shuffle(seed=42).select(range(500))

Reusing dataset imdb (/home/guydar/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a)


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/25000 [00:00<?, ?ex/s]

  0%|          | 0/25000 [00:00<?, ?ex/s]

  0%|          | 0/50000 [00:00<?, ?ex/s]

### Training

In [26]:
# Accuracy metric object used by compute_metrics below.
metric = load_metric('accuracy')


def compute_metrics(eval_pred):
    """Score an evaluation pass.

    `eval_pred` unpacks into (logits, labels). The predicted class is the argmax
    over the last axis of the logits, and the result of `metric.compute` for those
    predictions against the labels is returned.
    """
    scores, gold = eval_pred
    return metric.compute(predictions=np.argmax(scores, axis=-1), references=gold)

In [27]:
# Turn off Weights & Biases logging through its environment switch.
# NOTE(review): the warning printed when TrainingArguments is built reports this
# variable as deprecated in favour of `report_to`.
os.environ.update({"WANDB_DISABLED": "true"})

In [28]:
# Fine-tuning configuration.
#   * report_to='none' turns off all logging integrations explicitly. Passing None
#     selects the library default rather than "no reporting", which is why the
#     WANDB_DISABLED environment variable was needed as a workaround.
#   * save_strategy='no' states "never write checkpoints" directly instead of
#     relying on save_steps=False being read as the integer 0.
#   * Batch size 1 for train and eval — presumably because no padding token is
#     configured for GPT-2; confirm before raising it.
train_args = TrainingArguments(
    output_dir='trainer_output',
    learning_rate=1e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    evaluation_strategy='epoch',
    save_strategy='no',
    report_to='none',
)

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [29]:
# Overrides the private `_n_gpu` field on the TrainingArguments instance.
# NOTE(review): presumably this pins training to a single device (the training log
# reports a total batch size of 1) — confirm, since private attributes may change
# between transformers releases.
train_args._n_gpu = 1

In [30]:
# Snapshot of the model taken before training; later cells subtract these weights
# from the fine-tuned ones to obtain the parameter updates.
old_model = deepcopy(model)

In [31]:
# Fine-tune `model` on the IMDB subsample; accuracy on the validation subsample
# is computed by compute_metrics at each evaluation.
trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=imdb_train,
    eval_dataset=imdb_val,
    compute_metrics=compute_metrics,
)
trainer.train()

The following columns in the training set don't have a corresponding argument in `GPT2ForSequenceClassification.forward` and have been ignored: text. If text are not expected by `GPT2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1000
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 3000


Epoch,Training Loss,Validation Loss,Accuracy
1,0.8163,1.502365,0.654
2,0.8742,1.462001,0.748
3,0.7838,1.374542,0.77


The following columns in the evaluation set don't have a corresponding argument in `GPT2ForSequenceClassification.forward` and have been ignored: text. If text are not expected by `GPT2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 500
  Batch size = 1
The following columns in the evaluation set don't have a corresponding argument in `GPT2ForSequenceClassification.forward` and have been ignored: text. If text are not expected by `GPT2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 500
  Batch size = 1
The following columns in the evaluation set don't have a corresponding argument in `GPT2ForSequenceClassification.forward` and have been ignored: text. If text are not expected by `GPT2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 500
  Batch size = 1


Training complete

TrainOutput(global_step=3000, training_loss=0.8662202860514323, metrics={'train_runtime': 129.1166, 'train_samples_per_second': 23.235, 'train_steps_per_second': 23.235, 'total_flos': 436270138933248.0, 'train_loss': 0.8662202860514323, 'epoch': 3.0})

### Visualize Finetuning Vectors

In [33]:
# Update applied by fine-tuning to the classification head: trained weights minus
# the pre-training snapshot, both detached from autograd.
# (For a BERT-style model the head would be `classifier` instead of `score`.)
diff_classifier = torch.sub(
    model.score.weight.detach().cpu(),
    old_model.score.weight.detach(),
)

In [34]:
# Rows 0 and 1 of the classifier update. Going by the variable names, row 0 is the
# negative class and row 1 the positive class — confirm against the dataset labels.
neg_vector, pos_vector = diff_classifier[0], diff_classifier[1]

In [46]:
# Project the positive-class update through the output embeddings (emb1) and list
# the 100 top-scoring ASCII-only tokens.
# (Calling top_tokens with only `tokenizer=` uses the helper's defaults instead.)
top_tokens(torch.matmul(pos_vector, emb1), tokenizer=tokenizer, only_ascii=True, k=100)

['enjoy',
 'Highly',
 'love',
 'Love',
 'Thank',
 'Enjoy',
 'enjoyed',
 'Together',
 'Definitely',
 'loved',
 'Proud',
 'LOVE',
 'loving',
 'lovers',
 'Preview',
 '#Love',
 'Loving',
 'proud',
 '#love',
 'cherish',
 'Rated',
 'loves',
 'Thanks',
 '#<|endoftext|>',
 'Beaut',
 'admired',
 'Favorite',
 'timeless',
 ':)',
 'adore',
 'Watching',
 'immensely',
 'Both',
 '#Together',
 'It',
 'Born',
 'beautifully',
 'powerfully',
 'complementary',
 'admiration',
 'pleasure',
 'wonderfully',
 'xx',
 'Recommended',
 'Inspired',
 'great',
 'favorite',
 '#ilee',
 'wonderful',
 'enjoys',
 'Beautiful',
 'milestone',
 'celebrate',
 'Happy',
 'watch',
 'appreciated',
 'terrific',
 'favorites',
 'thank',
 'kindred',
 '#Favorite',
 'cheers',
 'unforgettable',
 '#beaut',
 'Also',
 'Celebr',
 'romance',
 'enormously',
 'Wonderful',
 '#ilyn',
 'premie',
 'Bless',
 '#Reviewer',
 'Great',
 'strong',
 '#Thank',
 'warm',
 'husband',
 'celebrated',
 'tribute',
 'enjoying',
 'Written',
 'nurture',
 'insepar',
 

In [47]:
# Display the first transformer block to see the submodule names (ln_1, attn.c_attn,
# attn.c_proj, ln_2, mlp.c_fc, mlp.c_proj) used when diffing weights below.
model.transformer.h[0]

GPT2Block(
  (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (attn): GPT2Attention(
    (c_attn): Conv1D()
    (c_proj): Conv1D()
    (attn_dropout): Dropout(p=0.1, inplace=False)
    (resid_dropout): Dropout(p=0.1, inplace=False)
  )
  (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (mlp): GPT2MLP(
    (c_fc): Conv1D()
    (c_proj): Conv1D()
    (act): NewGELUActivation()
    (dropout): Dropout(p=0.1, inplace=False)
  )
)

In [181]:
i1 = 11 # this is the layer we visualize


def _weight_delta(new_weight, old_weight):
    """Return `new_weight - old_weight` as a plain tensor with no autograd history.

    Both operands are detached first, so the result can be indexed, multiplied or
    converted without a further `.detach()`. This matches how `diff_classifier`
    is computed. The new weight is moved to CPU before subtracting; the old weight
    is used on whatever device it already lives on.
    """
    return new_weight.detach().cpu() - old_weight.detach()


# Fine-tuned and snapshot versions of the chosen transformer block.
new_block, old_block = model.transformer.h[i1], old_model.transformer.h[i1]

# MLP updates: the c_fc diff is transposed, the c_proj diff is kept as stored.
diff_K = _weight_delta(new_block.mlp.c_fc.weight, old_block.mlp.c_fc.weight).T
diff_V = _weight_delta(new_block.mlp.c_proj.weight, old_block.mlp.c_proj.weight)

# Attention updates: the fused c_attn diff is transposed and split into three equal
# chunks along dim 0 (query, key, value, in the order the variable names suggest).
diff_WQ, diff_WK, diff_WV = (
    _weight_delta(new_block.attn.c_attn.weight, old_block.attn.c_attn.weight).T.chunk(3)
)
diff_WO = _weight_delta(new_block.attn.c_proj.weight, old_block.attn.c_proj.weight)

In [294]:
# Choose which weight-update matrix the cells below inspect
# (any of diff_K, diff_V, diff_WQ, diff_WK, diff_WV, diff_WO).
diff_param = diff_WV

In [295]:
# Random row index into the selected update matrix.
# NOTE(review): drawn from NumPy's global RNG with no seed set in this notebook,
# so the inspected vector changes on every run.
i2 = np.random.randint(len(diff_param))

In [296]:
# Side-by-side table of the top tokens for the chosen update vector ("diff") and
# for its negation ("-diff"), both projected through the output embeddings emb1.
direction = diff_param[i2].detach()
token_opts = dict(k=100, only_ascii=True, tokenizer=tokenizer)
tokens_along = top_tokens(direction @ emb1, **token_opts)
tokens_against = top_tokens(-direction @ emb1, **token_opts)
print(tabulate(zip(tokens_along, tokens_against), headers=["diff", "-diff"]))

diff         -diff
-----------  -------------
Nope         Selected
:(           jointly
FUCK         pioneering
#Fuck        #emaker
#Seriously   Luxem
Fuck         #verning
#Anyway      #ezvous
goddamn      complementary
shitty       embod
crap         bilingual
lol          pioneer
#Damn        integ
Shit         Joint
fucking      anticip
#Honestly    underway
Worse        #ilateral
Godd         embodied
fuck         #iscover
Sorry        powerfully
Stupid       unmatched
Worst        #erning
anyways      Emerson
idiots       delighted
#Sorry       collabor
Seriously    lumin
stupid       distinguished
lame         embraces
#lol         embody
#Instead     inaug
godd         distinctive
shit         complement
Damn         liberated
dunno        insepar
crappy       congen
#shit        #ultane
bullshit     renowned
pathetic     #tains
Didn         possesses
Okay         #estead
Anyway       embodies
idiot        #ographed
OMG          harmon
damn         emerging
Honestly     #cipl