In [1]:
#@title Setup for Colab only
#!pip install transformers
#!pip install git+git://github.com/corolla-johnson/mkultra.git#egg=mkultra

# Tuning on Datasets
This sheet is adapted from the language modeling example at
https://github.com/huggingface/transformers/tree/master/examples/pytorch

In [2]:
from transformers.pipelines import pipeline
from mkultra.models.tuning import GPT2PromptTuningLM
from mkultra.tokenizers import GPT2TokenizerFast
from mkultra.soft_prompt import SoftPrompt
from transformers import Adafactor
import random

In [3]:
# Use an mkultra prompt tuning LM and a standard tokenizer.
model = GPT2PromptTuningLM.from_pretrained("distilgpt2").to("cuda")
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

n_tokens = 100

model.initialize_soft_prompt(n_tokens=n_tokens)

In [4]:
from datasets import load_dataset
path = "datasets/neuromancer_reformatted.txt"
datasets = load_dataset("text", data_files={"train": path, "validation": path})

Using custom data configuration default-812ce7cd67147d37
Reusing dataset text (C:\Users\STARSTRUCK\.cache\huggingface\datasets\text\default-812ce7cd67147d37\0.0.0\e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5)


In [5]:
def tokenize(x):
    return tokenizer(x['text'])

tokenized_datasets = datasets.map(tokenize, batched=True, num_proc=1, remove_columns=["text"])
tokenized_datasets["train"][1]

100%|██████████| 3/3 [00:00<00:00, 20.40ba/s]
100%|██████████| 3/3 [00:00<00:00, 29.41ba/s]


{'attention_mask': [1, 1, 1], 'input_ids': [1525, 3977, 20400]}

In [6]:
# Group texts into blocks
# WARNING: Be sure to subtract the size of the soft prompt!
block_size = 1024 - n_tokens

def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=1,
)

train_dataset = lm_datasets["train"]
eval_dataset = lm_datasets["validation"]

# Log a few random samples from the training set:
for index in random.sample(range(len(train_dataset)), 3):
    print(f"Sample {index} of the training set: {train_dataset[index]}.")

6, 607, 314, 1101, 287, 290, 4769, 553, 8913, 531, 13, 44, 5098, 2540, 284, 42102, 866, 262, 20749, 13, 1649, 673, 27846, 736, 11, 1752, 11, 8913, 2497, 262, 1067, 388, 10137, 5920, 286, 1115, 24956, 14, 7934, 2324, 10942, 13, 1881, 286, 606, 3947, 284, 423, 645, 2951, 13, 1, 45803, 20155, 290, 27466, 423, 15283, 262, 2323, 4314, 11, 5181, 10584, 13, 19434, 321, 31853, 2367, 13, 35068, 338, 1972, 35075, 526, 1, 35700, 35075, 866, 994, 553, 673, 531, 11, 25635, 5223, 832, 257, 5166, 286, 12768, 7771, 8215, 13, 366, 23379, 612, 11, 49160, 526, 20448, 26157, 656, 262, 17593, 290, 5954, 262, 4161, 8906, 422, 465, 22645, 13, 679, 373, 288, 23437, 351, 15488, 13, 679, 21122, 465, 22645, 351, 257, 24808, 11, 1718, 257, 2068, 31145, 286, 1660, 422, 262, 17026, 9294, 13970, 262, 30789, 8130, 11, 290, 10667, 262, 3975, 286, 262, 5888, 9066, 319, 262, 3159, 13, 317, 22271, 278, 2266, 23493, 47092, 832, 262, 19001, 286, 257, 32439, 13, 5514, 3939, 31551, 422, 262, 4077, 16605, 326, 8203, 262, 4067

In [7]:
from transformers import TrainingArguments
from mkultra.trainers import SoftPromptTrainer
from torch.utils.data.dataloader import DataLoader
import transformers
from transformers import Trainer

optimizer = Adafactor([model.get_soft_params()], scale_parameter=False, relative_step=False, warmup_init=False, lr=1e-3)
scheduler = transformers.get_constant_schedule_with_warmup(optimizer=optimizer,num_warmup_steps=1000)

training_args = TrainingArguments(
    "test-clm",
    evaluation_strategy = "epoch",
    learning_rate=1e-3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["validation"],
    optimizers = (optimizer, scheduler)
)

In [8]:
trainer.train()

  0%|          | 0/174 [00:00<?, ?it/s]

ValueError: You have to specify either input_ids or inputs_embeds