In [11]:
#@title Setup for Colab only
#!pip install transformers
#!pip install git+https://github.com/corolla-johnson/mkultra.git#egg=mkultra

# Tuning on Datasets
This notebook is adapted from the language modeling example at
https://github.com/huggingface/transformers/tree/master/examples/pytorch

In [1]:
from transformers.pipelines import pipeline
from mkultra.models.tuning import GPT2PromptTuningLM
from mkultra.tokenizers import GPT2TokenizerFast
from mkultra.soft_prompt import SoftPrompt
from transformers import Adafactor
import random

In [2]:
# Load the "distilgpt2" weights into mkultra's prompt-tuning LM class and move
# the model to the GPU; the tokenizer is loaded from the "gpt2" checkpoint.
model = GPT2PromptTuningLM.from_pretrained("distilgpt2")
model = model.to("cuda")
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

# Length of the soft prompt, in tokens.
n_tokens = 100
model.initialize_soft_prompt(n_tokens=n_tokens)

In [3]:
from datasets import load_dataset
# Plain-text corpus. The same file backs both splits, so the "validation"
# split is not held-out data.
path = "datasets/neuromancer_reformatted.txt"
data_files = {split: path for split in ("train", "validation")}
datasets = load_dataset("text", data_files=data_files)

Using custom data configuration default-812ce7cd67147d37
Reusing dataset text (C:\Users\STARSTRUCK\.cache\huggingface\datasets\text\default-812ce7cd67147d37\0.0.0\e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5)


In [5]:
def tokenize(x):
    """Run the notebook-level ``tokenizer`` over the ``'text'`` field of ``x``.

    Args:
        x: A mapping with a ``'text'`` entry (a batch of strings when used
            with ``datasets.map(..., batched=True)``).

    Returns:
        Whatever ``tokenizer`` returns for that text.
    """
    texts = x['text']
    return tokenizer(texts)

# Tokenize every split in batches and drop the raw "text" column.
tokenized_datasets = datasets.map(
    tokenize,
    batched=True,
    num_proc=1,
    remove_columns=["text"],
)
# Show one tokenized training example as the cell output.
tokenized_datasets["train"][1]

100%|██████████| 3/3 [00:00<00:00, 31.24ba/s]
100%|██████████| 3/3 [00:00<00:00, 30.30ba/s]


{'attention_mask': [1, 1, 1], 'input_ids': [1525, 3977, 20400]}

In [6]:
# Group texts into blocks.
# WARNING: the size of the soft prompt must be subtracted from the block size!
# NOTE(review): 1024 is presumably the model's maximum sequence length — confirm
# against the model config.
max_positions = 1024
block_size = max_positions - n_tokens

def group_texts(examples):
    """Concatenate a batch of tokenized examples and re-split it into blocks.

    Every column in ``examples`` is flattened into one long list and cut into
    consecutive chunks of ``block_size`` items (``block_size`` is read from the
    notebook's global scope). The trailing remainder that does not fill a whole
    block is dropped.

    Args:
        examples: Mapping of column name -> list of per-example sequences.
            Must contain an ``"input_ids"`` column.

    Returns:
        dict with the same columns, each a list of ``block_size``-long chunks,
        plus a ``"labels"`` column that is a shallow copy of ``"input_ids"``.
    """
    # Flatten each column in linear time. The previous `sum(lists, [])` built a
    # new list on every addition, which is quadratic in the batch's token count.
    concatenated_examples = {
        k: [item for sequence in examples[k] for item in sequence]
        for k in examples.keys()
    }
    first_key = list(examples.keys())[0]
    total_length = len(concatenated_examples[first_key])
    # We drop the small remainder; padding could be added instead if the model
    # supported it. Customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split into chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

# Re-chunk every split into fixed-length blocks, feeding group_texts
# 1000 tokenized rows per call.
lm_datasets = tokenized_datasets.map(group_texts, batched=True, batch_size=1000, num_proc=1)

train_dataset, eval_dataset = lm_datasets["train"], lm_datasets["validation"]

# Log a few random samples from the training set.
# random.sample raises ValueError when asked for more items than the population
# holds, so cap the preview at the number of blocks actually available.
n_preview = min(3, len(train_dataset))
for index in random.sample(range(len(train_dataset)), n_preview):
    print(f"Sample {index} of the training set: {train_dataset[index]}.")

 503, 13, 8913, 20821, 262, 2266, 311, 1092, 78, 6050, 284, 257, 6143, 25755, 735, 290, 5954, 2241, 866, 284, 262, 308, 12, 12384, 13, 1, 6214, 644, 294, 6, 10905, 910, 11, 937, 553, 337, 3010, 36340, 531, 13, 366, 34556, 7622, 1265, 259, 6, 329, 345, 526, 1, 2396, 508, 338, 510, 612, 287, 326, 1517, 1701, 1, 30556, 2869, 12, 7081, 1625, 878, 13, 1052, 6, 783, 339, 5399, 416, 345, 43438, 943, 2781, 496, 11, 1282, 503, 4848, 274, 485, 35713, 20448, 1234, 262, 4161, 8906, 319, 290, 474, 6021, 287, 13, 1, 35, 39291, 1701, 464, 17593, 3751, 683, 262, 11398, 34126, 286, 262, 7771, 12082, 287, 32615, 74, 320, 13, 1, 2061, 345, 651, 43701, 6, 510, 284, 11, 2933, 30, 314, 587, 3285, 259, 6, 20605, 312, 3923, 13, 30789, 8130, 338, 39378, 656, 257, 15203, 3331, 319, 534, 6478, 338, 8848, 783, 13, 16123, 8169, 381, 259, 4458, 921, 2834, 617, 39141, 4894, 1701, 1, 10995, 11, 475, 10633, 76, 1133, 2923, 705, 368, 526, 1, 5779, 11, 326, 1839, 470, 1745, 705, 368, 890, 13, 43257, 517, 810, 883, 1625,

In [7]:
from torch.utils.data.dataloader import DataLoader
from transformers import default_data_collator
# Both loaders share the default collator and a batch size of 2;
# only the training loader shuffles.
loader_options = {"collate_fn": default_data_collator, "batch_size": 2}
train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_options)
eval_dataloader = DataLoader(eval_dataset, **loader_options)


In [8]:
from transformers import TrainingArguments
from mkultra.trainers import SoftPromptTrainer
from torch.utils.data.dataloader import DataLoader
import transformers

# Optimize only what model.get_soft_params() returns
# (presumably the soft-prompt embedding — confirm in mkultra).
soft_params = model.get_soft_params()
optimizer = Adafactor([soft_params])

In [9]:
# Number of batches the training DataLoader yields per epoch.
# NOTE(review): `steps` is not referenced by any other cell visible here.
steps = len(train_dataloader)

In [11]:
import torch
# Plain training loop: one optimizer step per batch, loss printed every step.
num_train_epochs = 2
total_step = 0

for epoch in range(num_train_epochs):
    model.train()
    for step, batch in enumerate(train_dataloader):
        # Cast every tensor in the batch to int64 and move it to the GPU.
        batch = {
            name: tensor.type(torch.long).to("cuda")
            for name, tensor in batch.items()
        }
        outputs = model(**batch)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_step += 1
        print(f"{total_step}: Loss: {loss}")

attention_mask.shape torch.Size([2, 924])
1: Loss: 4.683137893676758
attention_mask.shape torch.Size([2, 924])
2: Loss: 4.347650527954102
attention_mask.shape torch.Size([2, 924])
3: Loss: 4.402397632598877
attention_mask.shape torch.Size([2, 924])
4: Loss: 4.403602123260498
attention_mask.shape torch.Size([2, 924])
5: Loss: 4.4982991218566895
attention_mask.shape torch.Size([2, 924])
6: Loss: 4.507497310638428
attention_mask.shape torch.Size([2, 924])
7: Loss: 4.427779674530029
attention_mask.shape torch.Size([2, 924])
8: Loss: 4.495551586151123
attention_mask.shape torch.Size([2, 924])
9: Loss: 4.666794776916504
attention_mask.shape torch.Size([2, 924])
10: Loss: 4.386064529418945
attention_mask.shape torch.Size([2, 924])
11: Loss: 4.577767848968506
attention_mask.shape torch.Size([2, 924])
12: Loss: 4.400213241577148
attention_mask.shape torch.Size([2, 924])
13: Loss: 4.582705974578857
attention_mask.shape torch.Size([2, 924])
14: Loss: 4.4947357177734375
attention_mask.shape torch.

In [None]:
# Uncomment to export the tuned soft prompt from the model to a JSON file.
#sp = SoftPrompt.from_tuning_model(model)
#sp.to_file("neuromancer.json")

In [None]:
# Load a previously saved soft prompt from disk and install it on the model.
soft_prompt_path = "sample_sps/finetune/neuromancer.json"
sp = SoftPrompt.from_file(soft_prompt_path)
model.set_soft_prompt(sp)

In [12]:
# Sample a continuation of the prompt "Armitage" from the model.
# NOTE(review): the recorded run of this cell ended with
# "AttributeError: 'NoneType' object has no attribute 'shape'"; the cause is
# not determinable from this notebook — investigate before relying on it.
model.eval()

call = tokenizer("Armitage", return_tensors="pt").input_ids.cuda()

# min_length == max_length, so generation always produces exactly 100 new tokens.
target_length = call.shape[-1] + 100
generation_options = dict(
    do_sample=True,
    min_length=target_length,
    max_length=target_length,
    temperature=0.1,
    top_p=0.9,
    repetition_penalty=1.7,
    pad_token_id=tokenizer.eos_token_id,
)

basic_output = model.generate(call, **generation_options)
print(tokenizer.decode(basic_output[0]))

AttributeError: 'NoneType' object has no attribute 'shape'