In [1]:
#@title Setup for Colab only
# Uncomment the two install lines below when running on Colab.
# NOTE: GitHub no longer serves the unauthenticated git:// protocol, so mkultra
# is installed over https instead.
#!pip install transformers
#!pip install git+https://github.com/corolla-johnson/mkultra.git#egg=mkultra

# Tuning on Datasets
This notebook is adapted from the language modeling example in the Hugging Face Transformers repository:
https://github.com/huggingface/transformers/tree/master/examples/pytorch

In [1]:
from transformers.pipelines import pipeline
from mkultra.models.tuning import GPT2PromptTuningLM
from mkultra.tokenizers import GPT2TokenizerFast
from mkultra.soft_prompt import SoftPrompt
from transformers import Adafactor
import random

In [2]:
# Length of the soft prompt, in tokens. Later cells read this value to size
# the training blocks.
n_tokens = 100

# Load the GPT-2 tokenizer and the mkultra prompt-tuning wrapper around GPT-2.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
model = GPT2PromptTuningLM.from_pretrained("gpt2")
model = model.to("cuda")

# Create the soft prompt on the model with the configured length.
model.initialize_soft_prompt(n_tokens=n_tokens)

In [4]:
from datasets import load_dataset
# The same text file backs both the "train" and the "validation" split.
path = "datasets/neuromancer_reformatted.txt"
datasets = load_dataset(
    "text",
    data_files=dict(train=path, validation=path),
)

Using custom data configuration default-812ce7cd67147d37
Reusing dataset text (C:\Users\STARSTRUCK\.cache\huggingface\datasets\text\default-812ce7cd67147d37\0.0.0\e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5)


In [5]:
def tokenize(x):
    """Run the notebook-level ``tokenizer`` over the ``text`` field of ``x``.

    ``x`` is the example (or batch of examples) handed over by ``Dataset.map``;
    the tokenizer's output is returned unchanged.
    """
    texts = x["text"]
    return tokenizer(texts)

# Tokenize every split in batches and drop the raw "text" column afterwards.
tokenized_datasets = datasets.map(
    tokenize,
    batched=True,
    num_proc=1,
    remove_columns=["text"],
)

# Display one tokenized training row as a quick sanity check.
tokenized_datasets["train"][1]

100%|██████████| 3/3 [00:00<00:00, 26.31ba/s]
100%|██████████| 3/3 [00:00<00:00, 28.84ba/s]


{'attention_mask': [1, 1, 1], 'input_ids': [1525, 3977, 20400]}

In [6]:
# Group texts into blocks
# WARNING: Be sure to subtract the size of the soft prompt!
# n_tokens (the soft prompt length) is subtracted so that each block plus the
# soft prompt adds up to 1024 tokens.
# NOTE(review): 1024 is presumably GPT-2's maximum context length - confirm
# against the model config.
block_size = 1024 - n_tokens

def group_texts(examples, chunk_size=None):
    """Concatenate a batch of tokenized examples and re-split it into fixed-size blocks.

    Args:
        examples: Mapping from column name (e.g. ``input_ids``) to a list of
            per-example token lists. Must contain an ``input_ids`` column.
        chunk_size: Length of each output block. When omitted, the
            notebook-level ``block_size`` is used, so existing
            ``Dataset.map(group_texts, ...)`` calls keep working unchanged.

    Returns:
        A dict with the same columns as ``examples``, each holding a list of
        ``chunk_size``-long blocks, plus a ``labels`` column that is a shallow
        copy of the ``input_ids`` blocks. Trailing tokens that do not fill a
        complete block are dropped.

    Raises:
        ValueError: If the effective block size is not positive.
    """
    from itertools import chain

    if chunk_size is None:
        chunk_size = block_size
    if chunk_size <= 0:
        raise ValueError(f"block size must be positive, got {chunk_size}")

    # Concatenate all texts. chain.from_iterable flattens in linear time,
    # whereas sum(list_of_lists, []) re-copies the accumulator on every step.
    concatenated = {
        key: list(chain.from_iterable(examples[key])) for key in examples.keys()
    }
    total_length = len(concatenated[list(examples.keys())[0]])

    # Drop the small remainder so every block is exactly chunk_size long.
    # Padding could be added instead if the model supported it.
    usable_length = (total_length // chunk_size) * chunk_size

    # Split every column into consecutive chunk_size-long slices.
    result = {
        key: [
            tokens[start : start + chunk_size]
            for start in range(0, usable_length, chunk_size)
        ]
        for key, tokens in concatenated.items()
    }
    # The labels column is a copy of the input blocks.
    result["labels"] = result["input_ids"].copy()
    return result

# Regroup the tokenized rows of every split into fixed-length blocks,
# handing group_texts up to 1000 rows at a time.
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=1,
)

train_dataset = lm_datasets["train"]
eval_dataset = lm_datasets["validation"]

# Log a few random samples from the training set.
# random.sample raises ValueError when asked for more items than the
# population holds, so cap the sample size for very small datasets.
for index in random.sample(range(len(train_dataset)), min(3, len(train_dataset))):
    print(f"Sample {index} of the training set: {train_dataset[index]}.")

 1561, 259, 6, 45967, 6, 16899, 11, 45812, 384, 71, 2005, 281, 6, 1057, 526, 679, 4966, 262, 736, 286, 257, 1588, 7586, 1021, 1973, 465, 5422, 13, 1, 3163, 2781, 496, 1701, 8913, 1592, 771, 355, 262, 731, 499, 831, 33077, 2543, 500, 8181, 2502, 2277, 683, 351, 663, 1336, 12245, 11, 555, 9612, 276, 416, 262, 17593, 393, 985, 42003, 13, 14842, 338, 1392, 645, 25377, 287, 340, 11, 339, 1297, 2241, 11, 340, 460, 470, 1107, 1254, 428, 2089, 13, 366, 2061, 466, 345, 1612, 11, 582, 30, 679, 338, 3501, 345, 6266, 30, 1867, 1701, 1, 9069, 11, 943, 2781, 496, 11, 339, 1560, 259, 6, 502, 900, 1781, 329, 17837, 11, 21349, 760, 30, 679, 1560, 259, 6, 502, 612, 307, 2911, 11, 21349, 760, 30, 7911, 319, 616, 3159, 45967, 6, 465, 10147, 477, 2910, 11, 937, 11, 281, 6, 307, 7165, 355, 617, 3290, 11, 1561, 259, 6, 8196, 259, 6, 33053, 281, 6, 3394, 281, 6, 294, 6, 2910, 286, 294, 6, 13935, 364, 2236, 307, 319, 674, 2832, 526, 679, 14682, 465, 1182, 757, 11, 262, 15157, 11128, 20009, 278, 290, 29202, 462

In [7]:
from torch.utils.data.dataloader import DataLoader
from transformers import default_data_collator
# One block per batch for both loaders; only the training loader shuffles.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=1,
    shuffle=True,
    collate_fn=default_data_collator,
)
eval_dataloader = DataLoader(
    eval_dataset,
    batch_size=1,
    collate_fn=default_data_collator,
)


In [8]:
from transformers import TrainingArguments
from mkultra.trainers import SoftPromptTrainer
from torch.utils.data.dataloader import DataLoader
import transformers

# Only what model.get_soft_params() returns is handed to the optimizer;
# Adafactor is otherwise left at its default settings.
optimizer = Adafactor(params=[model.get_soft_params()])

In [9]:
# Number of batches the training dataloader yields per epoch.
# NOTE(review): `steps` is not referenced by any other cell in the visible notebook.
steps = len(train_dataloader)

In [10]:
import torch
num_train_epochs = 100

# Running count of optimizer updates across all epochs.
total_step = 0

for epoch in range(num_train_epochs):
    model.train()
    for step, batch in enumerate(train_dataloader):
        # Cast every tensor in the batch to int64 and move it to the GPU.
        batch = {
            name: tensor.type(torch.long).to("cuda")
            for name, tensor in batch.items()
        }

        # Forward pass, backward pass, then one optimizer update.
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_step = total_step + 1
        print(f"{total_step}: Loss: {loss}")

s: 4.416869640350342
2248: Loss: 4.338390350341797
2249: Loss: 3.7304539680480957
2250: Loss: 4.077001571655273
2251: Loss: 3.929363250732422
2252: Loss: 3.9027459621429443
2253: Loss: 4.196310520172119
2254: Loss: 3.983847141265869
2255: Loss: 4.212437152862549
2256: Loss: 3.8088133335113525
2257: Loss: 3.9873321056365967
2258: Loss: 3.8761932849884033
2259: Loss: 4.325216293334961
2260: Loss: 4.3976850509643555
2261: Loss: 4.4717888832092285
2262: Loss: 3.9001622200012207
2263: Loss: 4.1136579513549805
2264: Loss: 4.366222858428955
2265: Loss: 4.522253036499023
2266: Loss: 4.0323028564453125
2267: Loss: 4.241368770599365
2268: Loss: 4.2641143798828125
2269: Loss: 3.9137303829193115
2270: Loss: 4.089511394500732
2271: Loss: 3.880185127258301
2272: Loss: 4.275704383850098
2273: Loss: 4.123953819274902
2274: Loss: 4.099294185638428
2275: Loss: 3.8179166316986084
2276: Loss: 3.8860814571380615
2277: Loss: 4.3091840744018555
2278: Loss: 3.926036834716797
2279: Loss: 3.870868682861328
2280

KeyboardInterrupt: 

In [12]:
# Uncomment to build a SoftPrompt from the tuning model and write it to
# "neuromancer.json", the file the next cell loads.
#sp = SoftPrompt.from_tuning_model(model)
#sp.to_file("neuromancer.json")

In [3]:
# Load a soft prompt from "neuromancer.json" and attach it to the model.
# NOTE(review): the cell that writes this file is commented out, so the file
# must already exist on disk for a fresh Restart-and-Run-All to succeed.
sp = SoftPrompt.from_file("neuromancer.json")
model.set_soft_prompt(sp)

In [12]:
# Put the model in evaluation mode before generating.
model.eval()

# Tokenize the prompt text and move its token ids to the GPU.
call = tokenizer("Armitage", return_tensors="pt").input_ids.cuda()

# min_length and max_length are both set to the prompt length plus 100.
basic_output = model.generate(
    call,
    pad_token_id=tokenizer.eos_token_id,
    do_sample=True,
    temperature=0.1,
    top_p=0.9,
    repetition_penalty=1.7,
    min_length=call.shape[-1] + 100,
    max_length=call.shape[-1] + 100,
)

# Decode and print the first (only) generated sequence.
print(tokenizer.decode(basic_output[0]))

Armitage, the man who had been his best friend since he was a boy. He'd always wanted to be with him.""I'm sorry," she said softly. "You're not going anywhere now?"He looked at her in surprise. She smiled and nodded slowly toward Case's shoulder. The girl sat down next door on an empty floor of one-story apartment building that housed two large offices for Maelcum; it wasn't long before they were all gone except their windows shattered by firecr
