# GPT Instruct

## Load libraries

In [None]:
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments
)

from datasets import load_dataset

## Make a function to build the prompt for each example

In [None]:
def preprocess(example):
    """Attach a single training string to a dataset row.

    The row's ``instruction``, ``input`` and ``output`` fields are joined
    with single spaces and stored under the ``prompt`` key. The row is
    modified in place and also returned.
    """
    fields = (example['instruction'], example['input'], example['output'])
    example["prompt"] = "{} {} {}".format(*fields)
    return example

## Make a function to tokenize the dataset

In [None]:
def tokenize_dataset(dataset):
    """Tokenize the ``prompt`` column of *dataset* and drop the raw text.

    Relies on the module-level ``tokenizer``. Prompts are processed in
    batches and truncated to at most 128 tokens; the ``prompt`` column is
    removed from the returned dataset.
    """
    def encode_batch(batch):
        return tokenizer(batch['prompt'], truncation=True, max_length=128)

    return dataset.map(encode_batch, batched=True, remove_columns=['prompt'])

# Prompts are truncated to 128 tokens to reduce processing resources

## Load the dataset

In [None]:
from datasets import load_dataset
# Load only the "train" split of the dataset.
dataset = load_dataset("hakurei/open-instruct-v1", split="train")
# (Japanese: "The wonderful shrine maiden of paradise" — presumably a nod to the dataset author's handle "hakurei")
# Print 20 sampled rows to get a feel for the data.
print(dataset.to_pandas().sample(20))
# a dataset of instructions and responses

## Preprocess the dataset

In [None]:
# Keep a reproducible random subset of 10k rows *before* building the prompts,
# so the map below only touches rows that are actually used (the limit exists
# to save processing time).
dataset = dataset.shuffle(seed=42).select(range(10000))

# Merge instruction/input/output into a single "prompt" column.
dataset = dataset.map(preprocess, remove_columns=["instruction", "input", "output"])

# Hold out 10% of the subset for evaluation; seeded like the shuffle so the
# split is the same on every run.
dataset = dataset.train_test_split(test_size=0.1, seed=42)

## Create the training/test sets

In [None]:
train_dataset = dataset["train"]
test_dataset = dataset["test"]

MODEL_NAME = "microsoft/DialoGPT-medium"
# DialoGPT is a GPT-2 variant trained for dialogue rather than for following
# instructions. The goal is to fine-tune it on instruction data and improve it.

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# GPT-2 style tokenizers define no padding token, but the data collator needs
# one to pad the sequences of a batch to equal length, so reuse the EOS token.
tokenizer.pad_token = tokenizer.eos_token

# tokenize_dataset reads the module-level `tokenizer`, so it must be set up first.
train_dataset = tokenize_dataset(train_dataset)
test_dataset = tokenize_dataset(test_dataset)

## Create the model

In [None]:
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

# Builds the training batches; mlm=False selects causal (next-token) language
# modeling instead of masked language modeling.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# One epoch and modest batch sizes to reduce processing time
# (library defaults: 3 epochs, batch size 8 for both training and evaluation).
# NOTE: the misspelled name `traing_args` is kept because a later cell uses it.
traing_args = TrainingArguments(
    output_dir="models/dialo_gpt",
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
)

trainer = Trainer(
    model=model,
    args=traing_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
)

## Define training pipeline

In [None]:
# The causal-LM collator must be passed explicitly: without it the Trainer
# falls back to its default collator, which neither pads the variable-length
# tokenized prompts nor creates the labels needed to compute a loss.
trainer = Trainer(
    model=model,
    args=traing_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
)

## Run the trainer

In [None]:
# Start fine-tuning with the Trainer built in the previous cell.
trainer.train()