In [1]:
#@title Setup for Colab only
#!pip install transformers
#!pip install git+git://github.com/corolla-johnson/mkultra.git#egg=mkultra

# Tuning on Datasets
This sheet is adapted from the language modeling example at
https://github.com/huggingface/transformers/tree/master/examples/pytorch

In [2]:
from transformers.pipelines import pipeline
from mkultra.models.tuning import GPT2PromptTuningLM
from mkultra.tokenizers import GPT2TokenizerFast
from mkultra.soft_prompt import SoftPrompt
from transformers import Adafactor
import random

In [3]:
# Use an mkultra prompt tuning LM and a standard tokenizer.
model = GPT2PromptTuningLM.from_pretrained("gpt2").to("cuda")
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

n_tokens = 100

model.initialize_soft_prompt(n_tokens=n_tokens)

In [4]:
from datasets import load_dataset
path = "datasets/neuromancer_reformatted.txt"
datasets = load_dataset("text", data_files={"train": path, "validation": path})

Using custom data configuration default-812ce7cd67147d37
Reusing dataset text (C:\Users\STARSTRUCK\.cache\huggingface\datasets\text\default-812ce7cd67147d37\0.0.0\e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5)


In [5]:
def tokenize(x):
    return tokenizer(x['text'])

tokenized_datasets = datasets.map(tokenize, batched=True, num_proc=1, remove_columns=["text"])
tokenized_datasets["train"][1]

100%|██████████| 3/3 [00:00<00:00, 17.44ba/s]
100%|██████████| 3/3 [00:00<00:00, 28.57ba/s]


{'attention_mask': [1, 1, 1], 'input_ids': [1525, 3977, 20400]}

In [6]:
# Group texts into blocks
# WARNING: Be sure to subtract the size of the soft prompt!
block_size = 1024 - n_tokens

def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=1,
)

train_dataset = lm_datasets["train"]
eval_dataset = lm_datasets["validation"]

# Log a few random samples from the training set:
for index in random.sample(range(len(train_dataset)), 3):
    print(f"Sample {index} of the training set: {train_dataset[index]}.")

, 13, 1400, 2128, 475, 262, 38952, 1308, 1806, 286, 262, 698, 3618, 290, 262, 37479, 12704, 286, 262, 8486, 13, 8134, 12609, 7577, 35456, 1973, 30236, 338, 18405, 355, 262, 1450, 48149, 13, 383, 31912, 9474, 547, 3478, 12, 6477, 7842, 6637, 26, 379, 3478, 11, 262, 23110, 484, 2714, 547, 655, 739, 257, 16430, 890, 13, 383, 638, 361, 891, 4799, 338, 11762, 318, 262, 277, 12137, 338, 11762, 11, 8913, 12086, 11, 262, 9353, 41108, 11, 15683, 19874, 351, 11865, 13, 383, 23110, 3947, 284, 1445, 286, 511, 220, 898, 8178, 11, 1278, 2530, 351, 257, 12146, 3092, 286, 25615, 832, 262, 44606, 290, 8318, 286, 511, 9280, 11, 966, 6427, 966, 11, 355, 262, 1450, 13488, 329, 281, 4756, 13, 30236, 338, 18529, 44866, 1986, 373, 7209, 290, 991, 11, 4964, 13, 1, 40, 1183, 467, 1064, 514, 617, 2057, 553, 8913, 531, 13, 1375, 14464, 11, 2626, 287, 50152, 286, 262, 9280, 13, 1544, 1422, 470, 588, 428, 1295, 13, 1544, 2900, 290, 6807, 736, 656, 262, 16187, 13, 14190, 3223, 13, 14190, 5897, 13, 464, 4315, 11, 33

In [7]:
from transformers import TrainingArguments
from mkultra.trainers import SoftPromptTrainer
from torch.utils.data.dataloader import DataLoader
import transformers

optimizer = Adafactor([model.get_soft_params()], scale_parameter=False, relative_step=False, warmup_init=False, lr=1e-3)
scheduler = transformers.get_constant_schedule_with_warmup(optimizer=optimizer,num_warmup_steps=1000)

training_args = TrainingArguments(
    "test-clm",
    evaluation_strategy = "epoch",
    learning_rate=1e-3
)

trainer = SoftPromptTrainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["validation"],
    optimizers = (optimizer, scheduler)
)

In [8]:
trainer.train()

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  ..\torch\csrc\utils\python_arg_parser.cpp:1005.)
  exp_avg_sq_row.mul_(beta2t).add_(1.0 - beta2t, update.mean(dim=-1))
 33%|███▎      | 116/348 [00:30<01:00,  3.81it/s]

RuntimeError: CUDA out of memory. Tried to allocate 1.53 GiB (GPU 0; 8.00 GiB total capacity; 4.43 GiB already allocated; 1.14 GiB free; 5.03 GiB reserved in total by PyTorch)