# 0. Mount drive, environment setting, and set directory

* Settings for colabe env

In [204]:
# from google.colab import drive
# drive.mount('/content/drive')
# !ls "/content/drive"
# BASE_DIR = "/content/drive/MyDrive/semantic plausibility/datasets/adept/train-dev-test-split"
# TRAIN_FN = 'train.json'
# DEV_FN = 'val.json'
# TEST_FN = 'test.json'

* Settings for local (mps)

In [205]:
BASE_DIR = "/Users/chihyi/Documents/CL_WS_24/semantic plausibility/project/semantic_plausibility/datasets/adept/train-dev-test-split"
TRAIN_FN = 'train.json'
DEV_FN = 'val.json'
TEST_FN = 'test.json'

# 1. RoBerta Prompt Tuning
This script uses the OpenPrompt API and some of the code is adapted from [OpenPrompt's tutorials](https://github.com/thunlp/OpenPrompt).

### Prerpocessing - ADEPT

In [206]:
# !pip install -q transformers datasets evaluate accelerate

In [207]:
# !pip install openprompt

Check device

In [208]:
import torch
# set the device
device = "mps" if torch.backends.mps.is_available() \
else "gpu" if torch.cuda.is_available() \
else "cpu"
print(device)

mps


In [209]:
# Set a seed for reproducibility
import random
seed = 42
random.seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x118c594f0>

In [210]:
from datasets import load_dataset
adept = load_dataset('json', data_files={
    'train':f'{BASE_DIR}/{TRAIN_FN}',
    'dev': f'{BASE_DIR}/{DEV_FN}',
    'test': f'{BASE_DIR}/{TEST_FN}'})

In [211]:
# print(adept)
# for i in adept['train']:
#     print(i['sentence1'])
#     print(i['sentence2'])
#     break

DatasetDict({
    train: Dataset({
        features: ['modifier', 'label', 'idx', 'sentence1', 'sentence2', 'noun'],
        num_rows: 12892
    })
    dev: Dataset({
        features: ['modifier', 'label', 'idx', 'sentence1', 'sentence2', 'noun'],
        num_rows: 1611
    })
    test: Dataset({
        features: ['modifier', 'label', 'idx', 'sentence1', 'sentence2', 'noun'],
        num_rows: 1612
    })
})
The effect of sleeping is rejuvenation.
The effect of additional sleeping is rejuvenation.


In [212]:
from openprompt.data_utils.utils import InputExample
# convert the data into the input format for prompting
def get_examples(split):
  examples = []
  sentences1 = split['sentence1']
  sentences2 = split['sentence2']
  labels = split['label']
  idxes = split['idx']
  for i in range(len(sentences1)):
    sentence1, sentence2, label, idx = sentences1[i], sentences2[i], int(labels[i]), int(idxes[i])
    example = InputExample(guid=idx, text_a=sentence1, text_b=sentence2, label=label)
    examples.append(example)
  return examples

# Prepare the dataset
# def prepare_dataset(data):
#   dataset = {}
#   for split in ['train', 'dev', 'test']:
#       dataset[split] = []
#       for i in data[split]:
#           input_example = InputExample(text_a = i['sentence1'], text_b = i['sentence2'], label=int(i['label']), guid=i['idx'])
#           dataset[split].append(input_example)
#   return dataset



1. Convert train, dev, test sets into the input format

In [213]:
# adept = prepare_dataset(adept)
# len(adept)

In [314]:
adept_train_set = get_examples(adept['train'])
adept_dev_set = get_examples(adept['dev'])
adept_test_set = get_examples(adept['test'])
adept_test_set[0]

{
  "guid": 2082,
  "label": 3,
  "meta": {},
  "text_a": "Urine is tested to check for disease.",
  "text_b": "Dark urine is tested to check for disease.",
  "tgt_text": null
}

2. Load pre-trained LM

In [289]:
from openprompt.plms import load_plm
plm, tokenizer, model_config, robertaTokenizerWrapper = load_plm("roberta", "roberta-base")

3. Define prompt template: manual vs. soft

In [290]:
from openprompt.prompts import ManualTemplate
from openprompt.prompts import MixedTemplate

# define manual prompt temeplate with a mask token for predictions
def manual_template(tokenizer=tokenizer):
  prompt_template = ManualTemplate(
      text = 'Compared with the statement {"placeholder":"text_a"}, does {"placeholder":"text_b"} become more plausible or less plausible? {"mask"}.',
      tokenizer = tokenizer,
  )
  return prompt_template

# soft template
def soft_template():
  mytemplate = MixedTemplate(model=plm, tokenizer=tokenizer, text='{"placeholder":"text_a"} {"soft"} {"soft"} {"soft"} {"placeholder":"text_b"} {"soft"} {"soft"} {"soft"} {"soft"} {"soft"} {"soft"} {"mask"}.')
  return mytemplate

4. Define verbalizer that maps the original class labels to the words that we consider are valid predictions: manual vs. soft

In [291]:
from openprompt.prompts import ManualVerbalizer
from openprompt.prompts import SoftVerbalizer

def manual_verbalizer(num_classes=5, tokenizer=tokenizer):
  if num_classes==5:
    classes = [0, 1, 2, 3, 4]
    prompt_verbalizer = ManualVerbalizer(
        classes = classes,
        label_words = {
            0: ['impossible', 'no', 'incorrect', 'invalid'],
            1: ['less likely', 'less correct'],
            2: ['same likely', 'equally likely', 'same', 'no change'],
            3: ['more likely', 'more possible'],
            4: ['yes', 'correct', 'true', 'valid']},
        tokenizer=tokenizer,
    )
  # TODO: elif 3 classes
  return prompt_verbalizer

def soft_verbalizer(num_classes=5, tokenizer=tokenizer, plm=plm):
  prompt_verbalizer = SoftVerbalizer(tokenizer, plm, num_classes=num_classes)
  return prompt_verbalizer

**Change experiment settings**:
uncomment one of the following setting.

In [292]:
# 1st settings: manual_template() + manual_verbalizer()
# prompt_template = manual_template()
# prompt_verbalizer = manual_verbalizer()

# 2nd settings: manual_template() + soft_verbalizer()
# prompt_template = manual_template()
# prompt_verbalizer = soft_verbalizer()

# 3rd settings: soft_template() + manual_verbalizer()
# prompt_template = soft_template()
# prompt_verbalizer = manual_verbalizer()

# 4th settings: soft_template() + soft_verbalizer()
prompt_template = soft_template()
prompt_verbalizer = soft_verbalizer()

look at one example wrapped with template, and how it is later tokenized by RoBerta tokenizer.

In [293]:
wrapped_example = prompt_template.wrap_one_example(adept_test_set[0])
print(wrapped_example)

[[{'text': 'Urine is tested to check for disease.', 'soft_token_ids': 0, 'loss_ids': 0, 'shortenable_ids': 1}, {'text': '', 'soft_token_ids': 1, 'loss_ids': 0, 'shortenable_ids': 0}, {'text': '', 'soft_token_ids': 2, 'loss_ids': 0, 'shortenable_ids': 0}, {'text': '', 'soft_token_ids': 3, 'loss_ids': 0, 'shortenable_ids': 0}, {'text': ' Dark urine is tested to check for disease.', 'soft_token_ids': 0, 'loss_ids': 0, 'shortenable_ids': 1}, {'text': '', 'soft_token_ids': 4, 'loss_ids': 0, 'shortenable_ids': 0}, {'text': '', 'soft_token_ids': 5, 'loss_ids': 0, 'shortenable_ids': 0}, {'text': '', 'soft_token_ids': 6, 'loss_ids': 0, 'shortenable_ids': 0}, {'text': '', 'soft_token_ids': 7, 'loss_ids': 0, 'shortenable_ids': 0}, {'text': '', 'soft_token_ids': 8, 'loss_ids': 0, 'shortenable_ids': 0}, {'text': '', 'soft_token_ids': 9, 'loss_ids': 0, 'shortenable_ids': 0}, {'text': '<mask>', 'soft_token_ids': 0, 'loss_ids': 1, 'shortenable_ids': 0}, {'text': '.', 'soft_token_ids': 0, 'loss_ids': 0

In [294]:
wrapped_tokenizer = robertaTokenizerWrapper(max_seq_length=256, tokenizer = tokenizer, decoder_max_length=3, truncate_method="head")
# tokenize example
tokenized_example = wrapped_tokenizer.tokenize_one_example(wrapped_example, teacher_forcing=False)
print(tokenized_example)
print(tokenizer.convert_ids_to_tokens(tokenized_example['input_ids']))

{'input_ids': [0, 791, 24339, 16, 4776, 7, 1649, 13, 2199, 4, 0, 0, 0, 10524, 20987, 16, 4776, 7, 1649, 13, 2199, 4, 0, 0, 0, 0, 0, 0, 50264, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'soft_token_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, 6, 7, 8, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

A RoBERTa tokenizer using Byte-Pair Encoding subword segmentation. Ġ is space.

5. Initialize dataloader

In [295]:
from openprompt import PromptDataLoader

train_loader = PromptDataLoader(
            dataset = adept_train_set,
            tokenizer = tokenizer,
            tokenizer_wrapper_class = robertaTokenizerWrapper,
            template = prompt_template,
            max_seq_length=256, decoder_max_length=3,
            batch_size=4, shuffle=True, teacher_forcing=False, predict_eos_token=False,
            truncate_method="head"
        )

tokenizing: 12892it [00:05, 2518.71it/s]


6. Initialize the prompt model

In [296]:
from openprompt import PromptForClassification

def init_prompt_model():
  prompt_model = PromptForClassification(
      template = prompt_template,
      plm = plm,
      verbalizer = prompt_verbalizer,
  )
  prompt_model = prompt_model.to(device)
  return prompt_model

In [297]:
prompt_model = init_prompt_model()

## Experiment 1: Zero-shot Inference

Initialize dev_dataloader and test_dataloader

In [298]:
dev_loader = PromptDataLoader(
            dataset = adept_dev_set,
            tokenizer = tokenizer,
            tokenizer_wrapper_class = robertaTokenizerWrapper,
            template = prompt_template,
            max_seq_length=256, decoder_max_length=3,
            batch_size=4, shuffle=True, teacher_forcing=False, predict_eos_token=False,
            truncate_method="head"
        )

test_loader = PromptDataLoader(
            dataset = adept_test_set,
            tokenizer = tokenizer,
            tokenizer_wrapper_class = robertaTokenizerWrapper,
            template = prompt_template,
            max_seq_length=256, decoder_max_length=3,
            batch_size=4, shuffle=True, teacher_forcing=False, predict_eos_token=False,
            truncate_method="head"
        )

tokenizing: 1611it [00:00, 2590.02it/s]
tokenizing: 1612it [00:00, 2678.57it/s]


In [225]:
# for step, i in enumerate(train_loader):
#     labels = i['label']
#     print(labels)

tensor([1, 3, 2, 2])
tensor([2, 0, 2, 2])
tensor([2, 2, 2, 2])
tensor([0, 2, 2, 2])
tensor([2, 2, 0, 0])
tensor([3, 2, 2, 2])
tensor([2, 2, 2, 2])
tensor([2, 1, 0, 2])
tensor([2, 0, 2, 2])
tensor([0, 2, 0, 0])
tensor([2, 0, 1, 2])
tensor([2, 1, 0, 2])
tensor([2, 2, 2, 2])
tensor([0, 2, 2, 2])
tensor([2, 0, 2, 0])
tensor([2, 1, 2, 2])
tensor([2, 2, 2, 2])
tensor([2, 2, 1, 2])
tensor([2, 2, 0, 2])
tensor([2, 2, 3, 2])
tensor([1, 2, 2, 2])
tensor([0, 2, 3, 2])
tensor([2, 2, 1, 2])
tensor([2, 2, 2, 2])
tensor([2, 0, 2, 2])
tensor([0, 2, 0, 0])
tensor([2, 2, 2, 1])
tensor([2, 2, 0, 2])
tensor([2, 2, 2, 0])
tensor([1, 3, 2, 2])
tensor([0, 2, 0, 2])
tensor([2, 0, 0, 2])
tensor([2, 2, 1, 4])
tensor([1, 2, 0, 2])
tensor([1, 0, 3, 2])
tensor([2, 2, 1, 0])
tensor([2, 2, 2, 3])
tensor([2, 2, 2, 2])
tensor([0, 0, 2, 2])
tensor([2, 1, 2, 2])
tensor([2, 2, 2, 0])
tensor([1, 2, 0, 2])
tensor([2, 0, 1, 2])
tensor([2, 1, 2, 1])
tensor([2, 1, 2, 3])
tensor([2, 2, 2, 0])
tensor([0, 2, 3, 2])
tensor([2, 1,

In [299]:
def evaluate(dataloader=dev_loader):
  allpreds = []
  alllabels = []
  with torch.no_grad():
      for batch in dataloader:
          batch = batch.to(device)
          logits = prompt_model(batch)
          labels = batch['label']
          outputs = torch.argmax(logits, dim = -1)
          alllabels.extend(labels.cpu().tolist())
          allpreds.extend(torch.argmax(logits, dim=-1).cpu().tolist())

  acc = sum([int(i==j) for i,j in zip(allpreds, alllabels)])/len(allpreds)
  return acc

In [300]:
# Set the model to eval mode
prompt_model.eval()
# Run the inference loop
acc = evaluate(test_loader)  # change to test_loader for zero-shot inference
print(acc)


0.12034739454094293


In [228]:
# Save the model
# torch.save(prompt_model.state_dict(),'./soft_seed_42')

In [229]:
# Load the model
# prompt_model.load_state_dict(torch.load('./soft_seed_42'))

### Zero-Shot Experiment Results on ADEPT Test Set (5 classes)
- Comparison of 4 settings on accuracy ( no seed / **seed=42** / seed=12 / seed=99 )
    - manual_template + manual_verbalizer: 0.0043 / **0.0633** / 0.0043 / 0.0043
    - manual_template + soft_verbalizer: 0.0050 / **0.0136** / 0.0601 / 0.0043
    - soft_template + manual_verbalizer: 0.0627 / **0.0639** / 0.0639 / 0.0639
    - soft_template + soft_verbalizer: 0.1421 / **0.1370** / 0.0986 / 0.0676
 
- Try with longer soft template (seed=42)
    '{"placeholder":"text_a"} {"soft"} {"soft"} {"soft"} {"placeholder":"text_b"} {"soft"} {"soft"} {"soft"} {"soft"} {"soft"} {"soft"}{"mask"}.'
    - soft_template + soft_verbalizer: 0.1203
    - Longer soft template which contains more soft tokens after text_b shows not apparent improvement.

1. Performance: 
    - All zero-shot configurations perform badly. 
    - Using **soft_template + soft_verbalizer** improve performance the most, following by soft_template + manual_verbalizer.
    - Set different seeds yield different performance.
    

2. Runtime: On average, it takes about 1m 2s to run each configuration on mps (M1 GPU).

For reproducibility, we choose a ralatively better seed (seed=42) for the rest of the experiment.

## Experiment 2: Few-shot Learning

1. Sample a few examples to form the few-shot training pool

In [315]:
from openprompt.data_utils.data_sampler import FewShotSampler
sampler = FewShotSampler(num_examples_per_label=16, num_examples_per_label_dev=16, also_sample_dev=True)
adept_train_set, _ = sampler(adept_train_set)
# len(train_set) = 80 (5 labels * 16 examples)
# len(dev_set) = 1611
# len(test_set) = 1612

1611


2. Select template and verbalizer

In [316]:
prompt_template = soft_template()
prompt_verbalizer = soft_verbalizer()

3. Initialize data loader

In [317]:
train_loader = PromptDataLoader(
    dataset = adept_train_set, 
    template=prompt_template, 
    tokenizer=tokenizer,
    tokenizer_wrapper_class=robertaTokenizerWrapper, 
    max_seq_length=256, 
    decoder_max_length=3,
    batch_size=4,
    shuffle=True, 
    teacher_forcing=False, 
    predict_eos_token=False,
    truncate_method="head")

dev_loader = PromptDataLoader(
    dataset = adept_dev_set, 
    template=prompt_template, 
    tokenizer=tokenizer,
    tokenizer_wrapper_class=robertaTokenizerWrapper, 
    max_seq_length=256, 
    decoder_max_length=3,
    batch_size=4,
    shuffle=True, 
    teacher_forcing=False, 
    predict_eos_token=False,
    truncate_method="head")

tokenizing: 80it [00:00, 2442.13it/s]
tokenizing: 1611it [00:00, 2697.35it/s]


In [322]:
# check label tensors
# for step, i in enumerate(train_loader):
#     labels = i['label']
#     print(labels)
# check len of data loader
print(len(train_loader))
print(len(dev_loader))
print(len(test_loader))

20
403
403


4. run the training loop

In [323]:
from transformers import AdamW

prompt_model = init_prompt_model()
loss_func = torch.nn.CrossEntropyLoss()
# set no decay to biases and LayerNorm parameters
no_decay = ['bias', 'LayerNorm.weight']
# Using different optimizer for prompt parameters (2) and model parameters (1)
optimizer_grouped_parameters1 = [
    {'params': [p for n, p in prompt_model.plm.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in prompt_model.plm.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer_grouped_parameters2 = [
    {'params': prompt_model.verbalizer.group_parameters_1, "lr":3e-5},
    {'params': prompt_model.verbalizer.group_parameters_2, "lr":3e-4},
    {'params': [p for name, p in prompt_model.template.named_parameters() if 'raw_embedding' not in name], "lr":1e-3}
]
optimizer1 = AdamW(optimizer_grouped_parameters1, lr=3e-5)
optimizer2 = AdamW(optimizer_grouped_parameters2)




In [336]:
def train(train_loader, training_epoch):
    for epoch in range(training_epoch):
        total_loss = 0
        for step, inputs in enumerate(train_loader):
            inputs = inputs.to(device)
            logits = prompt_model(inputs)
            labels = inputs['label']
            loss = loss_func(logits, labels)
            loss.backward()
            total_loss += loss.item()
            optimizer1.step()
            optimizer1.zero_grad()
            optimizer2.step()
            optimizer2.zero_grad()
            if step %100 ==1:
                print("Epoch {}, average loss: {}".format(epoch, total_loss/(step+1)), flush=True)

In [328]:
acc = evaluate(dev_loader)
print(acc)

0.2377405338299193


In [329]:
acc = evaluate(test_loader)
print(acc)

0.24441687344913152


### Experiment result: Few-shot prompting with soft templete + soft verbalizer
- template1: '{"placeholder":"text_a"} {"soft"} {"soft"} {"soft"} {"placeholder":"text_b"} {"soft"} {"mask"}. : dev: 0.175, test: 0.1011

- template2: {"placeholder":"text_a"} {"soft"} {"soft"} {"soft"} {"placeholder":"text_b"} {"soft"} {"soft"} {"soft"} {"soft"} {"soft"} {"soft"} {"mask"}.': 
    - **5epochs: dev:0.2297, test: 0.2506**
    - 10epochs: dev: 0.2377, test: 0.2444

- **soft_template (template2) with more soft tokens after text_b helps improve the accuracy for both dev and test set.**

## Experiment 3: LM Tuning Using Soft Prompts

In [333]:
plm, tokenizer, model_config, robertaTokenizerWrapper = load_plm("roberta", "roberta-base")

In [334]:
prompt_template = soft_template()
prompt_verbalizer = soft_verbalizer()

Reinitialize data loader

In [335]:
adept_train_set = get_examples(adept['train'])
adept_dev_set = get_examples(adept['dev'])
adept_test_set = get_examples(adept['test'])
train_loader = PromptDataLoader(
            dataset = adept_train_set,
            tokenizer = tokenizer,
            tokenizer_wrapper_class = robertaTokenizerWrapper,
            template = prompt_template,
            max_seq_length=256, decoder_max_length=3,
            batch_size=4, shuffle=True, teacher_forcing=False, predict_eos_token=False,
            truncate_method="head"
        )
dev_loader = PromptDataLoader(
            dataset = adept_dev_set,
            tokenizer = tokenizer,
            tokenizer_wrapper_class = robertaTokenizerWrapper,
            template = prompt_template,
            max_seq_length=256, decoder_max_length=3,
            batch_size=4, shuffle=True, teacher_forcing=False, predict_eos_token=False,
            truncate_method="head"
        )
test_loader = PromptDataLoader(
            dataset = adept_test_set,
            tokenizer = tokenizer,
            tokenizer_wrapper_class = robertaTokenizerWrapper,
            template = prompt_template,
            max_seq_length=256, decoder_max_length=3,
            batch_size=4, shuffle=True, teacher_forcing=False, predict_eos_token=False,
            truncate_method="head"
        )

tokenizing: 12892it [00:05, 2486.99it/s]
tokenizing: 1611it [00:00, 2686.39it/s]
tokenizing: 1612it [00:00, 2717.90it/s]


In [337]:
prompt_model = init_prompt_model()
loss_func = torch.nn.CrossEntropyLoss()
# set no decay to biases and LayerNorm parameters
no_decay = ['bias', 'LayerNorm.weight']
# Using different optimizer for prompt parameters (2) and model parameters (1)
optimizer_grouped_parameters1 = [
    {'params': [p for n, p in prompt_model.plm.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in prompt_model.plm.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer_grouped_parameters2 = [
    {'params': prompt_model.verbalizer.group_parameters_1, "lr":3e-5},
    {'params': prompt_model.verbalizer.group_parameters_2, "lr":3e-4},
    {'params': [p for name, p in prompt_model.template.named_parameters() if 'raw_embedding' not in name], "lr":1e-3}
]
optimizer1 = AdamW(optimizer_grouped_parameters1, lr=3e-5)
optimizer2 = AdamW(optimizer_grouped_parameters2)



In [340]:
train(train_loader, training_epoch=3)

Epoch 0, average loss: 1.7877539992332458
Epoch 0, average loss: 0.6948403280739691
Epoch 0, average loss: 0.6669779200623236
Epoch 0, average loss: 0.6585226474539531
Epoch 0, average loss: 0.6515637943382139
Epoch 0, average loss: 0.6537875801666205
Epoch 0, average loss: 0.6623721459069818
Epoch 0, average loss: 0.6541679122795662
Epoch 0, average loss: 0.646560301632618
Epoch 0, average loss: 0.6429978040833761
Epoch 0, average loss: 0.6460267020586484
Epoch 0, average loss: 0.6517743194059971
Epoch 0, average loss: 0.6536499037222934
Epoch 0, average loss: 0.6415457775467254
Epoch 0, average loss: 0.6363406970966793
Epoch 0, average loss: 0.6365750859591977
Epoch 0, average loss: 0.6411501892206356
Epoch 0, average loss: 0.641028431892071
Epoch 0, average loss: 0.6503082290570873
Epoch 0, average loss: 0.6476129570928892
Epoch 0, average loss: 0.6473293233575387
Epoch 0, average loss: 0.6488989087346264
Epoch 0, average loss: 0.656159330398812
Epoch 0, average loss: 0.653555229782

In [341]:
acc = evaluate(dev_loader)
print(acc)

0.6430788330229671


In [342]:
acc = evaluate(test_loader)
print(acc)

0.6495037220843672


In [343]:
torch.save(prompt_model.state_dict(),'./lm_tuning_soft_seed_42')

### Experiment result: Prompt-tuning with soft templete + soft verbalizer
- **3 epochs: dev: 0.6431, test: 0.6495** (68m 38.5s)
- Compare with few-shot learning, prompt-tuning drastically improve the performance on both devlopment set (+0.41) and test set (+0.40).