In [1]:
#@title Setup for Colab only
#!pip install transformers
#!pip install git+https://github.com/corolla-johnson/mkultra.git#egg=mkultra

# Tuning on Datasets
This sheet is adapted from the language modeling example at
https://github.com/huggingface/transformers/tree/master/examples/pytorch

The process is similar to fine-tuning, but only a set of input embeddings (given by `model.get_soft_params()`) is used as the optimizer's parameters.

In [2]:
from transformers.pipelines import pipeline
from mkultra.models.tuning import GPT2PromptTuningLM
from mkultra.tokenizers import GPT2TokenizerFast
from mkultra.soft_prompt import SoftPrompt
from transformers import Adafactor
import random
import torch

In [3]:
# Use an mkultra prompt tuning LM and a standard tokenizer.
# NOTE: the model is moved to "cuda" here, so this cell needs a CUDA-capable GPU.
model = GPT2PromptTuningLM.from_pretrained("gpt2").to("cuda")
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

# Decide the length of your soft prompt in tokens.
# n_tokens is read again when block_size is computed, so change it here only.
n_tokens = 100
model.initialize_soft_prompt(n_tokens=n_tokens)

In [4]:
# Optionally load an existing soft prompt
#sp = SoftPrompt.from_file("existing_soft_prompt.json")
#model.set_soft_prompt(sp)

In [5]:
from datasets import load_dataset
# Plain-text corpus, read through the "text" dataset loader.
path = "datasets/neuromancer_reformatted.txt"
# NOTE(review): "train" and "validation" both point at the same file, so the
# validation split is not held-out data.
datasets = load_dataset("text", data_files={"train": path, "validation": path})

Using custom data configuration default-812ce7cd67147d37
Reusing dataset text (C:\Users\STARSTRUCK\.cache\huggingface\datasets\text\default-812ce7cd67147d37\0.0.0\e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5)


In [6]:
def tokenize(x):
    """Run the notebook's tokenizer over the ``text`` entry of ``x``.

    Args:
        x: Mapping with a ``'text'`` entry, as passed in by ``datasets.map``.

    Returns:
        The tokenizer's output for that text.
    """
    texts = x["text"]
    encoded = tokenizer(texts)
    return encoded

# Tokenize every split in batches; the raw "text" column is removed so only the
# tokenizer's output columns remain.
tokenized_datasets = datasets.map(tokenize, batched=True, num_proc=1, remove_columns=["text"])
# Peek at one tokenized training example.
tokenized_datasets["train"][1]

100%|██████████| 3/3 [00:00<00:00, 14.92ba/s]
100%|██████████| 3/3 [00:00<00:00, 28.57ba/s]


{'attention_mask': [1, 1, 1], 'input_ids': [1525, 3977, 20400]}

In [7]:
# Group texts into blocks
# WARNING: Be sure to subtract the size of the soft prompt!
# NOTE(review): 1024 is presumably GPT-2's maximum context length — confirm
# against the model config before switching to a different model.
block_size = 1024 - n_tokens

def group_texts(examples, chunk_size=None):
    """Concatenate a batch of tokenized examples and cut it into fixed-size blocks.

    Args:
        examples: Mapping from column name (e.g. ``"input_ids"``) to a list of
            per-example token lists. Must contain ``"input_ids"``.
        chunk_size: Length of each output block. When omitted (which is how
            ``datasets.map`` calls this function) the notebook-level
            ``block_size`` is used.

    Returns:
        A dict with the same columns, each a list of ``chunk_size``-long lists,
        plus a ``"labels"`` column that is a copy of ``"input_ids"``. Tokens
        left over after the last full block are dropped.
    """
    from itertools import chain

    if chunk_size is None:
        chunk_size = block_size

    # Concatenate all texts. chain.from_iterable flattens in linear time, whereas
    # sum(lists, []) re-copies the accumulated list on every addition.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder; we could add padding instead if the model
    # supported it. Customize this part to your needs.
    total_length = (total_length // chunk_size) * chunk_size
    # Split into chunks of chunk_size tokens.
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Labels start as a copy of the inputs.
    result["labels"] = result["input_ids"].copy()
    return result

# Re-chunk every split with group_texts, which is applied to batches of up to
# 1000 tokenized examples and returns fixed-size blocks.
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=1,
)

# Both splits come from the same grouped dataset dict.
train_dataset = lm_datasets["train"]
eval_dataset = lm_datasets["validation"]

# Log a few random samples from the training set.
# The sample count is capped at the dataset size: random.sample raises
# ValueError when asked for more items than the population holds, which would
# happen for a corpus that yields fewer than 3 blocks.
for index in random.sample(range(len(train_dataset)), min(3, len(train_dataset))):
    print(f"Sample {index} of the training set: {train_dataset[index]}.")

62, 37368, 4025, 11, 1165, 3223, 13, 34686, 41976, 290, 262, 25035, 1203, 28668, 1991, 704, 1978, 319, 262, 3996, 11, 45668, 625, 416, 262, 2832, 351, 511, 6016, 23361, 13, 383, 3996, 373, 6546, 783, 351, 38744, 286, 7872, 276, 11, 44494, 35938, 326, 1067, 11137, 379, 257, 3638, 13, 337, 6421, 286, 8977, 29939, 1088, 34686, 41976, 290, 262, 665, 19811, 21755, 11, 262, 629, 333, 14992, 11, 6757, 10813, 11, 1275, 11697, 2832, 13, 20448, 27846, 379, 30236, 13, 2332, 1986, 373, 9178, 26, 262, 7577, 286, 34686, 41976, 338, 20128, 6002, 276, 290, 2900, 287, 607, 22353, 13, 943, 2781, 496, 373, 21804, 2651, 11, 465, 2832, 2835, 262, 10717, 286, 257, 8237, 20721, 11, 465, 14005, 2951, 5969, 319, 262, 3800, 11, 262, 21377, 2119, 13, 3844, 21755, 290, 28668, 550, 23791, 11, 290, 34686, 41976, 427, 4185, 1068, 13, 383, 1182, 373, 612, 11, 262, 2939, 1844, 13, 30236, 338, 1986, 11, 351, 7209, 627, 3378, 46978, 32249, 262, 2951, 13, 34686, 41976, 290, 262, 30236, 12, 9060, 2540, 284, 3155, 351, 257

In [8]:
from torch.utils.data.dataloader import DataLoader
from transformers import default_data_collator
# Serve the grouped training blocks one per batch, in shuffled order.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=1,
    shuffle=True,
    collate_fn=default_data_collator,
)
# Number of batches in one pass over the training data.
steps = len(train_dataloader)

In [9]:
# Make sure to set the optimizer to tune the soft prompt (given by model.get_soft_params()).
# Only that single entry is handed to Adafactor, so nothing else is given to the optimizer.
# No learning rate or other options are passed; Adafactor's defaults apply.
optimizer = Adafactor([model.get_soft_params()])

In [10]:
num_train_epochs = 1
total_step = 0

# Follow the model's placement instead of hard-coding a device name, so the
# loop keeps working if the model is moved somewhere else.
device = next(model.parameters()).device

for epoch in range(num_train_epochs):
    model.train()
    for batch in train_dataloader:
        # Cast every collated tensor to int64 and move it next to the model.
        batch = {k: v.type(torch.long).to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        # Clear gradients so they do not accumulate into the next step.
        optimizer.zero_grad()
        # total_step counts optimizer steps across all epochs.
        total_step += 1
        print(f"{total_step}: Loss: {loss}")

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  ..\torch\csrc\utils\python_arg_parser.cpp:1005.)
  exp_avg_sq_row.mul_(beta2t).add_(1.0 - beta2t, update.mean(dim=-1))
1: Loss: 3.9363033771514893
2: Loss: 4.437915802001953
3: Loss: 4.698455810546875
4: Loss: 4.394512176513672
5: Loss: 4.525238037109375
6: Loss: 4.355782508850098
7: Loss: 4.301104545593262
8: Loss: 4.29131555557251
9: Loss: 4.497940540313721
10: Loss: 4.546276092529297
11: Loss: 4.205498695373535
12: Loss: 4.343509674072266
13: Loss: 4.519213676452637
14: Loss: 4.268179893493652
15: Loss: 3.900041341781616
16: Loss: 3.981795310974121
17: Loss: 4.636607646942139
18: Loss: 4.0246453285217285
19: Loss: 3.9811670780181885
20: Loss: 4.103039264678955
21: Loss: 4.189911842346191
22: Loss: 3.935837507247925
23: Loss: 4.241316795349121
24: Loss: 4.428104400634766
25: Loss: 4.221592903137207
26: Loss: 3.9282944202423096
27: Lo

In [12]:
# Save your soft prompt
# The metadata values below are placeholders; replace them with real details
# about this run before sharing the file.
metadata = { "name" : "My soft prompt",
             "description" : "What I trained it on and for how long, final loss, model, etcetera" }

# Build a SoftPrompt from the tuned model plus the metadata, then write it to
# soft_prompt.json (a path relative to the working directory).
sp = SoftPrompt.from_tuning_model(model, metadata)
sp.to_file("soft_prompt.json")

In [13]:
# Try generating with your model
model.eval()

# Encode the prompt text and move the token ids to the GPU.
call = tokenizer("Armitage", return_tensors="pt").input_ids.cuda()

# min_length and max_length are both set to prompt length + 100, i.e. the same value.
# NOTE(review): whether the soft prompt's tokens count toward these lengths
# depends on mkultra's generate() — confirm.
basic_output = model.generate(
    call,
    do_sample=True,
    min_length=call.shape[-1] + 100,
    max_length=call.shape[-1] + 100,
    temperature=0.1,
    top_p = 0.9,
    repetition_penalty = 1.7,
    # The tokenizer's EOS token id is reused as the padding id.
    pad_token_id=tokenizer.eos_token_id
)
# Decode the first returned sequence back to text.
print(tokenizer.decode(basic_output[0]))

Armitage, the man who'd been waiting for him to come out of his room. He was wearing a black suit and tie with white trim on it; he had an old-fashioned hat that looked like something from The Wizard Of Oz—a sorter version than what you might find in any other place: grayish brown hair tied back into ponytail over long sleeves (his eyes were wide open), blue jeans tucked under tight pants or dark boots underneath them as if they weren't there at all but
