In [1]:
import torch
import transformers
import numpy as np
from datasets import load_dataset, ClassLabel
from transformers import AutoTokenizer, AutoModelForCausalLM
import copy

In [2]:
mnli = load_dataset("glue", "mnli")
hans = load_dataset("hans")

In [9]:
hans["train"][1]

{'premise': 'The athletes introduced the tourist .',
 'hypothesis': 'The tourist introduced the athletes .',
 'label': 1,
 'parse_premise': '(ROOT (S (NP (DT The) (NNS athletes)) (VP (VBD introduced) (NP (DT the) (NN tourist))) (. .)))',
 'parse_hypothesis': '(ROOT (S (NP (DT The) (NN tourist)) (VP (VBD introduced) (NP (DT the) (NNS athletes))) (. .)))',
 'binary_parse_premise': '( ( The athletes ) ( ( introduced ( the tourist ) ) . ) )',
 'binary_parse_hypothesis': '( ( The tourist ) ( ( introduced ( the athletes ) ) . ) )',
 'heuristic': 'lexical_overlap',
 'subcase': 'ln_subject/object_swap',
 'template': 'temp1'}

In [10]:
def binarize_mnli(dataset, remove_neutral=True):
    if remove_neutral:
        # neutral class has label 1
        dataset = dataset.filter(lambda example: example["label"] != 1)

    # change labels of contradiction examples from 2 to 1
    def change_label(example):
        # convert labels 2 into labels 1. this merges the neutral and contradiction class
        example["label"] = 1 if example["label"] == 2 else example["label"]
        return example
        
    # change labels
    dataset = dataset.map(change_label)

    # change features to reflect the new labels
    features = dataset["train"].features.copy()
    features["label"] = ClassLabel(num_classes=2, names=['entailment', 'contradiction'], id=None)
    dataset = dataset.cast(features)  # overwrite old features
        
    return dataset

In [11]:
mnli = binarize_mnli(mnli)

In [5]:
model_name = "facebook/opt-125m"
teacher_model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

seed = 100


In [6]:
yes_id = tokenizer.encode(" Yes", add_special_tokens=False)[0]
no_id = tokenizer.encode(" No", add_special_tokens=False)[0]
print(f"'{tokenizer.convert_ids_to_tokens(yes_id)}' token id: {yes_id}")
print(f"'{tokenizer.convert_ids_to_tokens(no_id)}' token id: {no_id}")

'ĠYes' token id: 3216
'ĠNo' token id: 440


In [21]:
## IN-CONTEXT LEARNING CODE ##
def create_few_shot_context(train_examples, context_indices):
    context = ""
    for idx in context_indices:
        example = train_examples[int(idx)]
        label_text = "True" if example["label"] == 0 else "False"
        context += f"{example['premise']}\nQuestion: {example['hypothesis']} True or False?\nAnswer: {label_text}\n\n"

    return context

In [8]:
train_size = 100
few_shot_size = 6
np.random.seed(seed)
selected_idx = np.random.choice(len(mnli["train"]), train_size + few_shot_size, replace=False)
mnli_few_shot = mnli["train"].select(selected_idx[train_size:])

In [9]:
task_context = "Given the premise and hypothesis, does the premise entail the hypothesis?"
task_suffix = "Yes or No? Answer:"

def generate_mnli_prompts(examples, demonstrations):
    # teacher model receives task context + premise + hypothesis
    examples["teacher_prompt"] = f"{task_context}\n{demonstrations}Premise: {examples['premise']}\nHypothesis: {examples['hypothesis']}\n{task_suffix}"
    # student model only receives premise + hypothesis
    examples["student_prompt"] = f"{examples['premise']}\n{examples['hypothesis']}\n{task_suffix}"
    return examples

demonstrations = create_few_shot_context(mnli_few_shot)
mnli = mnli.map(generate_mnli_prompts, fn_kwargs={"demonstrations": demonstrations})


Map:   0%|          | 0/261802 [00:00<?, ? examples/s]

Map:   0%|          | 0/6692 [00:00<?, ? examples/s]

Map:   0%|          | 0/6703 [00:00<?, ? examples/s]

Map:   0%|          | 0/9796 [00:00<?, ? examples/s]

Map:   0%|          | 0/9847 [00:00<?, ? examples/s]

In [10]:
print(mnli["train"][1]["teacher_prompt"])

Given the premise and hypothesis, does the premise entail the hypothesis?
Premise: yeah that'll be nice i mean that that i think tends to just keep i think stadiums have worked tend to keep people happy
Hypothesis: I think that stadiums want to keep people happy.
Yes or No? Answer: Yes

Premise: oh i bet it doesn't mix well does it
Hypothesis: I bet it doesn't mix well, does it?
Yes or No? Answer: Yes

Premise: Their applicability to case study evaluations outside of settings such as GAO is being explored.
Hypothesis: They are exploring the applicability to case study evaluations outside of settings, for example GAO.
Yes or No? Answer: Yes

Premise: However, there are numerous policy, technical, legal, and human resource issues that are not fully within the control of officials at individual agencies.
Hypothesis: Every issue can be controlled.
Yes or No? Answer: No

Premise: The gaudy red, gold, and white Sam Po Kong Temple stands at the foot of Bukit China, honoring Cheng Ho, the eunu

In [11]:
mnli_train = mnli["train"].select(selected_idx[:train_size])
labels_train = torch.tensor(mnli_train["label"])

In [12]:
def tokenize_teacher(data):
    tokens = tokenizer(data["teacher_prompt"], padding=True, truncation=True, return_tensors="pt")
    return tokens


tokenized_teacher_mnli_train = mnli_train.map(tokenize_teacher, batched=True)
tokenized_teacher_mnli_train.set_format(type="torch", columns=["input_ids", "attention_mask"])

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [18]:
device = "cpu"

In [19]:
input_ids = tokenized_teacher_mnli_train["input_ids"].to(device)
attention_mask = tokenized_teacher_mnli_train["attention_mask"].to(device)
teacher_model.eval()

teacher_model.to(device)
outputs = teacher_model(input_ids=input_ids, attention_mask=attention_mask)

logits = outputs.logits
    
teacher_logits = logits[:, -1, [yes_id, no_id]]
teacher_pred = logits[:, -1, [yes_id, no_id]].argmax(dim=-1)
teacher_acc = (teacher_pred == labels_train).float().mean().item()
teacher_acc

0.36000001430511475

In [20]:
torch.unique(labels_train, return_counts=True)

(tensor([0, 1]), tensor([36, 64]))

In [112]:
def context_distillation_loss(labels, teacher_logits, student_logits, alpha=0.5):
    with torch.no_grad():
        teacher_logprob = torch.nn.functional.softmax(teacher_logits, dim=-1)
    student_prob = torch.nn.functional.log_softmax(student_logits, dim=-1)
    kl_loss = torch.nn.functional.kl_div(student_prob, teacher_logprob, reduction="batchmean")
    ce_loss = torch.nn.functional.cross_entropy(student_logits, labels)
    beta = 1 - alpha
    cd_loss = alpha * kl_loss + beta * ce_loss
    return cd_loss

In [80]:
def tokenize_student(data):
    tokens = tokenizer(data["student_prompt"], padding=True, truncation=True, return_tensors="pt")
    return tokens


tokenized_student_mnli_train = mnli_train.map(tokenize_student, batched=True)
tokenized_student_mnli_train.set_format(type="torch", columns=["input_ids", "attention_mask"])

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [87]:
def evaluate_model(model, inputs, labels):

    
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]
    model.eval()
    
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    
    logits = outputs.logits

    target_logits = logits[:, -1, [yes_id, no_id]]
    pred = target_logits.argmax(dim=-1)


    ce_loss = torch.nn.functional.cross_entropy(target_logits, labels).item()

    acc = (pred == labels).float().mean().item()
    return ce_loss, acc
    



In [82]:
eval_size = 20
np.random.seed(seed + 1)
selected_idx_eval = np.random.choice(1000, eval_size)

mnli_eval = mnli["train"].select(selected_idx_eval)
labels_eval = torch.tensor(mnli_eval["label"])
tokenized_student_mnli_eval = mnli_eval.map(tokenize_student, batched=True)
tokenized_student_mnli_eval.set_format(type="torch", columns=["input_ids", "attention_mask"])

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [113]:
def train_student_model(student_model, lr, epochs, train_tokenized, train_labels, validation_tokenized, validation_labels, teacher_logits, target_tokens):
    optimizer = torch.optim.Adam(student_model.parameters(), lr=lr)

    for epoch in range(epochs):
        
        input_ids = train_tokenized["input_ids"]
        attention_mask = train_tokenized["attention_mask"]

        outputs = student_model(input_ids=input_ids, attention_mask=attention_mask)

        logits = outputs.logits
        student_target_logits = logits[:, -1, target_tokens]

        optimizer.zero_grad()
        cd_loss = context_distillation_loss(train_labels, teacher_logits, student_target_logits)
        cd_loss.backward()
        optimizer.step()

        pred = student_target_logits.argmax(dim=-1)
        train_acc = (pred == train_labels).float().mean().item()
        
        val_loss, val_acc = evaluate_model(student_model, validation_tokenized, validation_labels)


        print(f"Epoch [{epoch + 1}/{epochs}]")
        print(f"\tTraining Loss: {cd_loss.item():.4f}\t\tTraining Accuracy: {train_acc:.4f}")
        print(f"\tValidation Loss: {val_loss:.4f}\t\tValidation Accuracy: {val_acc:.4f}")

    train_loss, train_acc = evaluate_model(student_model, train_tokenized, train_labels)
    val_loss, val_acc = evaluate_model(student_model, validation_tokenized, validation_labels)
    print("Final model")
    print(f"\tTraining Loss: {train_loss:.4f}\t\tTraining Accuracy: {train_acc:.4f}")
    print(f"\tValidation Loss: {val_loss:.4f}\t\tValidation Accuracy: {val_acc:.4f}")

In [92]:
student_model = AutoModelForCausalLM.from_pretrained(model_name)
optimizer = torch.optim.Adam(student_model.parameters(), lr=1e-5)
epochs = 30

for epoch in range(epochs):
    
    input_ids = tokenized_student_mnli_train["input_ids"]
    attention_mask = tokenized_student_mnli_train["attention_mask"]

    outputs = student_model(input_ids=input_ids, attention_mask=attention_mask)

    logits = outputs.logits
    student_logits = logits[:, -1, [yes_id, no_id]]

    optimizer.zero_grad()
    cd_loss = context_distillation_loss(labels_train, teacher_logits, student_logits)
    cd_loss.backward()
    optimizer.step()

    pred = student_logits.argmax(dim=-1)
    train_acc = (pred == labels_train).float().mean().item()
    
    val_loss, val_acc = evaluate_model(student_model, tokenized_student_mnli_eval, labels_eval)




    print(f"Epoch [{epoch + 1}/{epochs}]")
    print(f"\tTraining Loss: {cd_loss.item():.4f}\t\tTraining Accuracy: {train_acc:.4f}")
    print(f"\tValidation Loss: {val_loss:.4f}\t\tValidation Accuracy: {val_acc:.4f}")

train_loss, train_acc = evaluate_model(student_model, tokenized_student_mnli_train, labels_train)
val_loss, val_acc = evaluate_model(student_model, tokenized_student_mnli_eval, labels_eval)
print("Final model")
print(f"\tTraining Loss: {train_loss:.4f}\t\tTraining Accuracy: {train_acc:.4f}")
print(f"\tValidation Loss: {val_loss:.4f}\t\tValidation Accuracy: {val_acc:.4f}")




Epoch [1/30]
	Training Loss: 0.3568		Training Accuracy: 0.5500
	Validation Loss: 0.4873		Validation Accuracy: 0.7500
Epoch [2/30]
	Training Loss: 0.3647		Training Accuracy: 0.5500
	Validation Loss: 1.0290		Validation Accuracy: 0.2500
Epoch [3/30]
	Training Loss: 0.5094		Training Accuracy: 0.7500
	Validation Loss: 0.5738		Validation Accuracy: 0.8500
Epoch [4/30]
	Training Loss: 0.2948		Training Accuracy: 0.9000
	Validation Loss: 0.4685		Validation Accuracy: 0.7500
Epoch [5/30]
	Training Loss: 0.3497		Training Accuracy: 0.5000
	Validation Loss: 0.4726		Validation Accuracy: 0.7500
Epoch [6/30]
	Training Loss: 0.3339		Training Accuracy: 0.5000
	Validation Loss: 0.5181		Validation Accuracy: 0.8500
Epoch [7/30]
	Training Loss: 0.2722		Training Accuracy: 0.7500
	Validation Loss: 0.6347		Validation Accuracy: 0.7500
Epoch [8/30]
	Training Loss: 0.2701		Training Accuracy: 1.0000
	Validation Loss: 0.7193		Validation Accuracy: 0.3500
Epoch [9/30]
	Training Loss: 0.2929		Training Accuracy: 1.0000
	

In [100]:
indomain_size = 100
np.random.seed(seed)
indomain_idx = np.random.choice(mnli["validation_mismatched"].num_rows, indomain_size)
indomain = mnli["validation_mismatched"].select(indomain_idx)
indomain_labels = torch.tensor(indomain["label"])

tokenized_indomain = indomain.map(tokenize_student, batched=True)
tokenized_indomain.set_format(type="torch", columns=["input_ids", "attention_mask"])
loss, acc = evaluate_model(student_model, tokenized_indomain, indomain_labels)
print(f"In-domain Accuracy: {acc:.4f}")

In-domain Accuracy: 0.5800


In [102]:
# preprocess HANS dataset 


# add student prompt
def generate_hans_prompts(examples):
    # student model only receives premise + hypothesis
    examples["student_prompt"] = f"Premise: {examples['premise']}\nHypothesis: {examples['hypothesis']}"
    return examples

hans = hans.map(generate_hans_prompts)


Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

In [111]:
outdomain_size = 100
np.random.seed(seed)
outdomain_idx = np.random.choice(hans["validation"].num_rows, outdomain_size)
outdomain = hans["validation"].select(outdomain_idx)
outdomain_labels = torch.tensor(outdomain["label"])

tokenized_outdomain = outdomain.map(tokenize_student, batched=True)
tokenized_outdomain.set_format(type="torch", columns=["input_ids", "attention_mask"])
loss, acc = evaluate_model(student_model, tokenized_outdomain, outdomain_labels)
print(f"Out-domain Accuracy: {acc:.4f}")

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Out-domain Accuracy: 0.4800


In [2]:
# tokenizer functions

def tokenize_teacher(data, tokenizer):
    tokens = tokenizer(data["teacher_prompt"], padding=True, truncation=True, return_tensors="pt")
    return tokens

def tokenize_student(data, tokenizer):
    tokens = tokenizer(data["student_prompt"], padding=True, truncation=True, return_tensors="pt")
    return tokens



In [3]:
# context distillation loss - KL divergence loss with teacher + cross entropy loss with labels
def context_distillation_loss(labels, teacher_logits, student_logits, alpha=0.5, beta=0.5):
    with torch.no_grad():
        teacher_logprob = torch.nn.functional.softmax(teacher_logits, dim=-1)
    student_prob = torch.nn.functional.log_softmax(student_logits, dim=-1)
    kl_loss = torch.nn.functional.kl_div(student_prob, teacher_logprob, reduction="batchmean")
    ce_loss = torch.nn.functional.cross_entropy(student_logits, labels)
    cd_loss = alpha * kl_loss + beta * ce_loss
    return cd_loss

In [4]:
# compute accuracy and cross entropy loss 
def evaluate_model(model, inputs, labels, target_token_ids):

    
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]
    model.eval()
    
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    
    logits = outputs.logits

    target_logits = logits[:, -1, target_token_ids]
    pred = target_logits.argmax(dim=-1)


    ce_loss = torch.nn.functional.cross_entropy(target_logits, labels).item()

    acc = (pred == labels).float().mean().item()
    return ce_loss, acc
    



In [5]:
def train_student_model(student_model, lr, epochs, train_tokenized, train_labels, validation_tokenized, validation_labels, 
                        teacher_logits, target_token_ids, alpha=0.5, beta=0.5):
    optimizer = torch.optim.Adam(student_model.parameters(), lr=lr)

    best_model = student_model
    min_val_loss = None
    for epoch in range(epochs):
        
        input_ids = train_tokenized["input_ids"]
        attention_mask = train_tokenized["attention_mask"]

        outputs = student_model(input_ids=input_ids, attention_mask=attention_mask)

        logits = outputs.logits
        student_target_logits = logits[:, -1, target_token_ids]

        optimizer.zero_grad()
        cd_loss = context_distillation_loss(train_labels, teacher_logits, student_target_logits, alpha, beta)
        cd_loss.backward()
        optimizer.step()

        pred = student_target_logits.argmax(dim=-1)
        train_acc = (pred == train_labels).float().mean().item()
        
        val_loss, val_acc = evaluate_model(student_model, validation_tokenized, validation_labels, target_token_ids)

        if min_val_loss is None or val_loss <= min_val_loss:
            min_val_loss = val_loss
            best_model = copy.deepcopy(student_model)


        print(f"Epoch [{epoch + 1}/{epochs}]\tTraining Loss: {cd_loss.item():.4f}\t\tTraining Accuracy: {train_acc:.4f}\t\tValidation Loss: {val_loss:.4f}\t\tValidation Accuracy: {val_acc:.4f}")

    train_loss, train_acc = evaluate_model(best_model, train_tokenized, train_labels, target_token_ids)
    val_loss, val_acc = evaluate_model(best_model, validation_tokenized, validation_labels, target_token_ids)
    print(f"Best model:\tTraining Loss: {train_loss:.4f}\t\tTraining Accuracy: {train_acc:.4f}\t\tValidation Loss: {val_loss:.4f}\t\tValidation Accuracy: {val_acc:.4f}")
    return best_model

In [6]:
# train model and compute in-domain and out-domain accuracies
def compute_model_performance(model_name, train_dataset, validation_dataset, indomain_dataset, outdomain_dataset, 
                              epochs, lr, alpha=0.5, beta=0.5):
    print(f"Model: {model_name}")
    teacher_model = AutoModelForCausalLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # compare yes/no token logit/probability for context distillation
    yes_id = tokenizer.convert_tokens_to_ids("yes")
    no_id = tokenizer.convert_tokens_to_ids("no")
    target_token_ids = [yes_id, no_id]
    
    # Pre-computing teacher logits to train student
    tokenized_teacher_train = train_dataset.map(tokenize_teacher, fn_kwargs={"tokenizer": tokenizer}, batched=True)
    tokenized_teacher_train.set_format(type="torch", columns=["input_ids", "attention_mask"])

    teacher_input_ids = tokenized_teacher_train["input_ids"]
    teacher_attention_mask = tokenized_teacher_train["attention_mask"]
    
    # disable gradients
    teacher_model.eval()
    teacher_logits = teacher_model(input_ids=teacher_input_ids, attention_mask=teacher_attention_mask).logits

    
    # extract yes/no logits
    teacher_target_logits = teacher_logits[:, -1, target_token_ids]
    teacher_pred = teacher_target_logits.argmax(dim=-1)
    train_labels = torch.tensor(train_dataset["label"])
    teacher_acc = (teacher_pred == train_labels).float().mean().item()
    print(f"Teacher Training Accuracy: {teacher_acc}")

    # train student model
    tokenized_student_train = train_dataset.map(tokenize_student, fn_kwargs={"tokenizer": tokenizer}, batched=True)
    tokenized_student_train.set_format(type="torch", columns=["input_ids", "attention_mask"])

    tokenized_student_val = validation_dataset.map(tokenize_student, fn_kwargs={"tokenizer": tokenizer}, batched=True)
    tokenized_student_val.set_format(type="torch", columns=["input_ids", "attention_mask"])

    print("Training student model:")
    student_model = AutoModelForCausalLM.from_pretrained(model_name)
    validation_labels = torch.tensor(validation_dataset["label"])
    student_model = train_student_model(student_model, lr, epochs, tokenized_student_train, train_labels, tokenized_student_val, validation_labels, 
                        teacher_target_logits, target_token_ids, alpha, beta)

    # compute in-domain accuracy
    tokenized_indomain = indomain_dataset.map(tokenize_student, fn_kwargs={"tokenizer": tokenizer}, batched=True)
    tokenized_indomain.set_format(type="torch", columns=["input_ids", "attention_mask"])
    indomain_labels = torch.tensor(indomain_dataset["label"])
    id_loss, id_acc = evaluate_model(student_model, tokenized_indomain, indomain_labels, target_token_ids)
    print(f"In-domain Accuracy: {id_acc:.4f}")

    # compute out-domain accuracy
    tokenized_outdomain = outdomain_dataset.map(tokenize_student, fn_kwargs={"tokenizer": tokenizer}, batched=True)
    tokenized_outdomain.set_format(type="torch", columns=["input_ids", "attention_mask"])
    outdomain_labels = torch.tensor(outdomain_dataset["label"])
    od_loss, od_acc = evaluate_model(student_model, tokenized_outdomain, outdomain_labels, target_token_ids)
    print(f"Out-domain Accuracy: {od_acc:.4f}")

    return {
        "In-domain Accuracy": id_acc,
        "Out-domain Accuracy": od_acc,
    }

In [7]:
mnli = load_dataset("glue", "mnli")
hans = load_dataset("hans")

In [8]:
# preprocessing MNLI dataset


# convert MNLI into binary classification
def binarize_mnli(dataset, remove_neutral=True):
    if remove_neutral:
        # neutral class has label 1
        dataset = dataset.filter(lambda example: example["label"] != 1)

    # change labels of contradiction examples from 2 to 1
    def change_label(example):
        # convert labels 2 into labels 1. this merges the neutral and contradiction class
        example["label"] = 1 if example["label"] == 2 else example["label"]
        return example
        
    # change labels
    dataset = dataset.map(change_label)

    # change features to reflect the new labels
    features = dataset["train"].features.copy()
    features["label"] = ClassLabel(num_classes=2, names=['entailment', 'contradiction'], id=None)
    dataset = dataset.cast(features)  # overwrite old features
        
    return dataset

# add teacher and student prompts
task_context = "Given the premise and hypothesis: reply 'yes' if the premise entails the hypothesis, or 'no' otherwise"

def generate_mnli_prompts(examples):
    # teacher model receives task context + premise + hypothesis
    examples["teacher_prompt"] = f"{task_context}\nPremise: '{examples['premise']}'\nHypothesis: '{examples['hypothesis']}'"
    # student model only receives premise + hypothesis
    examples["student_prompt"] = f"Premise: {examples['premise']}\nHypothesis: {examples['hypothesis']}"
    return examples


mnli = binarize_mnli(mnli)
mnli = mnli.map(generate_mnli_prompts)

In [9]:
# preprocess HANS dataset 


# add student prompt
def generate_hans_prompts(examples):
    # student model only receives premise + hypothesis
    examples["student_prompt"] = f"Premise: {examples['premise']}\nHypothesis: {examples['hypothesis']}"
    return examples

hans = hans.map(generate_hans_prompts)

In [12]:
# generate train, validation, indomain, outdomain datasets

seed = 100
np.random.seed(seed)


# train dataset (MNLI train)
train_size = 10     # few shot examples
train_idx = np.random.choice(mnli["train"].num_rows, train_size)
train_dataset = mnli["train"].select(train_idx)

# validation dataset (MNLI validation matched)
val_size = 100
val_idx = np.random.choice(mnli["validation_matched"].num_rows, val_size)
validation_dataset = mnli["validation_matched"].select(val_idx)

# indomain dataset (MNLI validation mismatched)
indomain_size = 100
indomain_idx = np.random.choice(mnli["validation_mismatched"].num_rows, indomain_size)
indomain_dataset = mnli["validation_mismatched"].select(indomain_idx)

# outdomain (HANS validation)
outdomain_size = 100
outdomain_idx = np.random.choice(hans["validation"].num_rows, outdomain_size)
outdomain_dataset = hans["validation"].select(outdomain_idx)


In [13]:
model_name = "facebook/opt-125m"
epochs = 30
lr = 1e-4

compute_model_performance(model_name=model_name, train_dataset=train_dataset, validation_dataset=validation_dataset, 
                          indomain_dataset=indomain_dataset, outdomain_dataset=outdomain_dataset, epochs=epochs, lr=lr)

Model: facebook/opt-125m


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Teacher Training Accuracy: 0.4000000059604645


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Training student model:
Epoch [1/30]	Training Loss: 0.5088		Training Accuracy: 0.2000		Validation Loss: 0.7420		Validation Accuracy: 0.5100
Epoch [2/30]	Training Loss: 0.6467		Training Accuracy: 0.5000		Validation Loss: 0.7058		Validation Accuracy: 0.5000
Epoch [3/30]	Training Loss: 0.3292		Training Accuracy: 1.0000		Validation Loss: 0.7978		Validation Accuracy: 0.5200
Epoch [4/30]	Training Loss: 0.5868		Training Accuracy: 0.5000		Validation Loss: 0.7044		Validation Accuracy: 0.5300
Epoch [5/30]	Training Loss: 0.2691		Training Accuracy: 0.9000		Validation Loss: 0.7090		Validation Accuracy: 0.5200
Epoch [6/30]	Training Loss: 0.4389		Training Accuracy: 0.9000		Validation Loss: 0.8289		Validation Accuracy: 0.5200
Epoch [7/30]	Training Loss: 0.3332		Training Accuracy: 0.6000		Validation Loss: 0.8547		Validation Accuracy: 0.5200
Epoch [8/30]	Training Loss: 0.3336		Training Accuracy: 0.5000		Validation Loss: 0.7445		Validation Accuracy: 0.4300
Epoch [9/30]	Training Loss: 0.3071		Training Acc

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In-domain Accuracy: 0.4700


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Out-domain Accuracy: 0.4700


{'In-domain Accuracy': 0.4699999988079071,
 'Out-domain Accuracy': 0.4699999988079071}

In [38]:
model_name = "facebook/opt-350m"
epochs = 30
lr = 1e-4

compute_model_performance(model_name=model_name, train_dataset=train_dataset, validation_dataset=validation_dataset, 
                          indomain_dataset=indomain_dataset, outdomain_dataset=outdomain_dataset, epochs=epochs, lr=lr, alpha=0.5, beta=3)

Model: facebook/opt-350m


Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Teacher Training Accuracy: 0.5


Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Training student model:


  print(np.unique(np.array(pred), return_counts=True))


(array([0]), array([100]))
Epoch [1/30]	Training Loss: 3.5211		Training Accuracy: 0.5000		Validation Loss: 13.8735		Validation Accuracy: 0.4700
(array([0]), array([100]))
Epoch [2/30]	Training Loss: 50.1323		Training Accuracy: 0.5000		Validation Loss: 2.9542		Validation Accuracy: 0.4700
(array([1]), array([100]))
Epoch [3/30]	Training Loss: 9.6378		Training Accuracy: 0.5000		Validation Loss: 2.4749		Validation Accuracy: 0.5300
(array([1]), array([100]))
Epoch [4/30]	Training Loss: 7.6976		Training Accuracy: 0.5000		Validation Loss: 1.0869		Validation Accuracy: 0.5300
(array([0]), array([100]))
Epoch [5/30]	Training Loss: 3.0672		Training Accuracy: 0.5000		Validation Loss: 2.0311		Validation Accuracy: 0.4700
(array([0]), array([100]))
Epoch [6/30]	Training Loss: 6.9482		Training Accuracy: 0.5000		Validation Loss: 0.7322		Validation Accuracy: 0.4700
(array([1]), array([100]))
Epoch [7/30]	Training Loss: 2.1125		Training Accuracy: 0.5500		Validation Loss: 0.8545		Validation Accuracy: 0.53

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

(array([0, 1]), array([76, 24]))
In-domain Accuracy: 0.6300
(array([0]), array([100]))
Out-domain Accuracy: 0.4500


{'In-domain Accuracy': 0.6299999952316284,
 'Out-domain Accuracy': 0.44999998807907104}

In [11]:
model_name = "facebook/opt-1.3b"
epochs = 30
lr = 1e-4

compute_model_performance(model_name=model_name, train_dataset=train_dataset, validation_dataset=validation_dataset, 
                          indomain_dataset=indomain_dataset, outdomain_dataset=outdomain_dataset, epochs=epochs, lr=lr, alpha=0.2, beta=0.8)

Model: facebook/opt-1.3b
Teacher Training Accuracy: 0.6000000238418579


Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Training student model:
Epoch [1/30]	Training Loss: 1.0035		Training Accuracy: 0.5500		Validation Loss: 15.2655		Validation Accuracy: 0.4700
Epoch [2/30]	Training Loss: 16.0440		Training Accuracy: 0.5000		Validation Loss: 5.0899		Validation Accuracy: 0.4700
Epoch [3/30]	Training Loss: 4.9379		Training Accuracy: 0.5000		Validation Loss: 2.4257		Validation Accuracy: 0.5300
Epoch [4/30]	Training Loss: 2.1476		Training Accuracy: 0.5000		Validation Loss: 2.0918		Validation Accuracy: 0.4700
Epoch [5/30]	Training Loss: 2.0574		Training Accuracy: 0.5000		Validation Loss: 1.3306		Validation Accuracy: 0.5300
Epoch [6/30]	Training Loss: 1.0255		Training Accuracy: 0.5000		Validation Loss: 0.9905		Validation Accuracy: 0.5300
Epoch [7/30]	Training Loss: 0.6886		Training Accuracy: 0.5000		Validation Loss: 0.8548		Validation Accuracy: 0.4700
Epoch [8/30]	Training Loss: 0.6918		Training Accuracy: 0.5000		Validation Loss: 0.7352		Validation Accuracy: 0.4700
Epoch [9/30]	Training Loss: 0.5061		Training A

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In-domain Accuracy: 0.4600


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Out-domain Accuracy: 0.4800


{'In-domain Accuracy': 0.46000000834465027,
 'Out-domain Accuracy': 0.47999998927116394}

In [39]:
alphas = [0.1, 0.2, 0.3, 0.4]
id_acc = []
od_acc = []
for alpha in alphas:
    model_name = "facebook/opt-350m"
    epochs = 30
    lr = 1e-4
    beta = 1 - alpha

    acc_dict = compute_model_performance(model_name=model_name, train_dataset=train_dataset, validation_dataset=validation_dataset, 
                            indomain_dataset=indomain_dataset, outdomain_dataset=outdomain_dataset, epochs=epochs, lr=lr, alpha=alpha, beta=beta)
    id_acc.append(acc_dict["In-domain Accuracy"])
    od_acc.append(acc_dict["Out-domain Accuracy"])

print(id_acc)
print(od_acc)


Model: facebook/opt-350m
Teacher Training Accuracy: 0.5


Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Training student model:


  print(np.unique(np.array(pred), return_counts=True))


(array([0]), array([100]))
Epoch [1/30]	Training Loss: 1.0550		Training Accuracy: 0.5000		Validation Loss: 13.8726		Validation Accuracy: 0.4700
(array([0]), array([100]))
Epoch [2/30]	Training Loss: 13.9399		Training Accuracy: 0.5000		Validation Loss: 3.0408		Validation Accuracy: 0.4700
(array([1]), array([100]))
Epoch [3/30]	Training Loss: 2.7493		Training Accuracy: 0.5000		Validation Loss: 2.3803		Validation Accuracy: 0.5300
(array([1]), array([100]))
Epoch [4/30]	Training Loss: 2.1754		Training Accuracy: 0.5000		Validation Loss: 0.9579		Validation Accuracy: 0.5300
(array([0]), array([100]))
Epoch [5/30]	Training Loss: 0.7758		Training Accuracy: 0.5000		Validation Loss: 2.2721		Validation Accuracy: 0.4700
(array([0]), array([100]))
Epoch [6/30]	Training Loss: 2.1644		Training Accuracy: 0.5000		Validation Loss: 0.7624		Validation Accuracy: 0.4700
(array([1]), array([100]))
Epoch [7/30]	Training Loss: 0.6092		Training Accuracy: 0.5500		Validation Loss: 0.8514		Validation Accuracy: 0.53

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Training student model:
(array([0]), array([100]))
Epoch [1/30]	Training Loss: 0.9407		Training Accuracy: 0.5000		Validation Loss: 13.8782		Validation Accuracy: 0.4700
(array([0]), array([100]))
Epoch [2/30]	Training Loss: 14.8383		Training Accuracy: 0.5000		Validation Loss: 2.8256		Validation Accuracy: 0.4700
(array([1]), array([100]))
Epoch [3/30]	Training Loss: 2.7323		Training Accuracy: 0.5000		Validation Loss: 2.6002		Validation Accuracy: 0.5300
(array([1]), array([100]))
Epoch [4/30]	Training Loss: 2.2161		Training Accuracy: 0.5000		Validation Loss: 1.2239		Validation Accuracy: 0.5300
(array([0]), array([100]))
Epoch [5/30]	Training Loss: 0.9615		Training Accuracy: 0.5000		Validation Loss: 2.0627		Validation Accuracy: 0.4700
(array([0, 1]), array([75, 25]))
Epoch [6/30]	Training Loss: 2.0901		Training Accuracy: 0.5000		Validation Loss: 0.6977		Validation Accuracy: 0.4000
(array([1]), array([100]))
Epoch [7/30]	Training Loss: 0.5677		Training Accuracy: 0.9000		Validation Loss: 0.9

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


(array([0, 1]), array([67, 33]))
In-domain Accuracy: 0.5400
(array([0]), array([100]))
Out-domain Accuracy: 0.4500
Model: facebook/opt-350m
Teacher Training Accuracy: 0.5
Training student model:
(array([0]), array([100]))
Epoch [1/30]	Training Loss: 0.7119		Training Accuracy: 0.5000		Validation Loss: 13.9004		Validation Accuracy: 0.4700
(array([0]), array([100]))
Epoch [2/30]	Training Loss: 16.6480		Training Accuracy: 0.5000		Validation Loss: 2.4665		Validation Accuracy: 0.4700
(array([1]), array([100]))
Epoch [3/30]	Training Loss: 2.6588		Training Accuracy: 0.5000		Validation Loss: 2.9531		Validation Accuracy: 0.5300
(array([1]), array([100]))
Epoch [4/30]	Training Loss: 2.0794		Training Accuracy: 0.5000		Validation Loss: 1.8406		Validation Accuracy: 0.5300
(array([0]), array([100]))
Epoch [5/30]	Training Loss: 1.2146		Training Accuracy: 0.5000		Validation Loss: 0.8420		Validation Accuracy: 0.4700
(array([1]), array([100]))
Epoch [6/30]	Training Loss: 0.7781		Training Accuracy: 0.5000

In [43]:
models = [
    "facebook/opt-125m",
    "facebook/opt-350m",
    "facebook/opt-1.3b",
    "facebook/opt-2.7b"
]

acc_dict = {}
epochs = 30
lr = 1e-4
alpha = 0.2
beta = 0.8
for model_name in models:
    
    acc = compute_model_performance(model_name=model_name, train_dataset=train_dataset, validation_dataset=validation_dataset, 
                            indomain_dataset=indomain_dataset, outdomain_dataset=outdomain_dataset, epochs=epochs, lr=lr, alpha=alpha, beta=beta)
    acc_dict[model_name] = acc


Model: facebook/opt-125m
Teacher Training Accuracy: 0.5


Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Training student model:
Epoch [1/30]	Training Loss: 0.6885		Training Accuracy: 0.4000		Validation Loss: 0.7521		Validation Accuracy: 0.5200
Epoch [2/30]	Training Loss: 0.5446		Training Accuracy: 0.5500		Validation Loss: 0.7902		Validation Accuracy: 0.4700
Epoch [3/30]	Training Loss: 0.4974		Training Accuracy: 0.6000		Validation Loss: 0.7293		Validation Accuracy: 0.5300
Epoch [4/30]	Training Loss: 0.3364		Training Accuracy: 0.9500		Validation Loss: 0.7927		Validation Accuracy: 0.5300
Epoch [5/30]	Training Loss: 0.2725		Training Accuracy: 1.0000		Validation Loss: 0.7032		Validation Accuracy: 0.4700
Epoch [6/30]	Training Loss: 0.2149		Training Accuracy: 1.0000		Validation Loss: 0.7310		Validation Accuracy: 0.5500
Epoch [7/30]	Training Loss: 0.2370		Training Accuracy: 1.0000		Validation Loss: 0.7505		Validation Accuracy: 0.4600
Epoch [8/30]	Training Loss: 0.2404		Training Accuracy: 1.0000		Validation Loss: 0.8585		Validation Accuracy: 0.5500
Epoch [9/30]	Training Loss: 0.2176		Training Acc

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Teacher Training Accuracy: 0.6000000238418579


Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Training student model:
Epoch [1/30]	Training Loss: 1.0035		Training Accuracy: 0.5500		Validation Loss: 15.2655		Validation Accuracy: 0.4700


: 

In [None]:
acc_dict