# GPT Instruct

## Load libraries

In [1]:
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments
)

from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


## Make a function to create examples

In [2]:
def preprocess(example):
    """Build the single training string for one instruction-tuning record.

    Joins the ``instruction``, ``input`` and ``output`` fields with single
    spaces and stores the result under ``example["prompt"]``. Fields that are
    ``None`` or empty are skipped, so a record without an ``input`` no longer
    gets a doubled space (or the literal text ``"None"``) inside its prompt.

    Args:
        example: Mapping with ``instruction``, ``input`` and ``output`` keys.

    Returns:
        The same ``example`` mapping, with the ``prompt`` key added.
    """
    fields = (example["instruction"], example["input"], example["output"])
    parts = [str(field) for field in fields if field is not None and str(field) != ""]
    example["prompt"] = " ".join(parts)
    return example

## Make a function to tokenize the dataset

In [3]:
def tokenize_dataset(dataset, max_length=128):
    """Tokenize the ``prompt`` column of ``dataset`` and drop the raw text.

    Relies on the notebook-level ``tokenizer``, which must be defined before
    this function is called.

    Args:
        dataset: Object exposing a ``datasets``-style ``map`` method and a
            ``prompt`` column.
        max_length: Maximum number of tokens kept per example; longer prompts
            are truncated. Defaults to 128 (the previously hard-coded value).

    Returns:
        Whatever ``dataset.map`` returns: the tokenizer's output columns with
        the ``prompt`` column removed.
    """
    def _tokenize_batch(batch):
        # Called with batched=True, so the whole batch of prompts is passed
        # to the tokenizer in one call.
        return tokenizer(batch["prompt"], truncation=True, max_length=max_length)

    return dataset.map(_tokenize_batch, batched=True, remove_columns=["prompt"])

# Limited to 128 tokens to reduce processing resources

## Load the dataset

In [4]:
# "hakurei/open-instruct-v1" is a dataset of instructions and responses.
# `load_dataset` is already imported in the first cell of the notebook.
dataset = load_dataset("hakurei/open-instruct-v1", split="train")
# (Original author's note, translated from Japanese:
#  "The wonderful shrine maiden of paradise".)

# Preview 20 random rows. The sample is seeded so the preview is the same on
# every run, and it is left as the cell's last expression so the notebook
# renders it as a table instead of plain printed text.
dataset.to_pandas().sample(20, random_state=42)

                                              instruction  \
200130  What is the value of the math expression 100(l...   
490147  What should I do to get ready for a job interv...   
194683  Create a function that returns the `n`th numbe...   
238015  What is the class of the word "training" in th...   
71162   Identify the emotion expressed by the writer i...   
319977  Analyze the given dataset and propose a suitab...   
245136  Which of the following words are considered a ...   
76590   What would be the title of a book about my lif...   
150443         Explain the rules of Sudoku, step by step.   
450799                how to Compile Python to Javascript   
89365   Classify whether the claim is supported by evi...   
184183                 What are today's top news stories.   
277659  Edit the following sentence: "I took my dog fo...   
25681     How would you measure the success of a website?   
440892                         What is FASTQ file format?   
465182  why we imagine g

## Preprocess the dataset

In [5]:
# Limit the data to 10k randomly chosen entries to save processing time.
# Sub-sampling happens *before* the map so that `preprocess` runs on 10,000
# rows instead of the whole dataset; the seeded shuffle picks the same rows
# either way.
dataset = dataset.shuffle(seed=42).select(range(10000))

# Collapse instruction / input / output into a single "prompt" column.
dataset = dataset.map(preprocess, remove_columns=["instruction", "input", "output"])

# Hold out 10% for evaluation. The split is seeded as well, so train/test
# membership is reproducible across runs.
dataset = dataset.train_test_split(test_size=0.1, seed=42)

## Create the training/test sets

In [6]:
MODEL_NAME = "microsoft/DialoGPT-medium"
# DialoGPT is a GPT-2 variant tuned for dialogue. It follows instructions only
# weakly out of the box; the goal here is to fine-tune and improve it.

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# GPT-2 style tokenizers ship without a padding token, but batching examples
# of different lengths needs one. Reusing the end-of-sequence token is the
# usual workaround.
tokenizer.pad_token = tokenizer.eos_token

# Tokenize straight from the split instead of overwriting an already-assigned
# variable: the cell can then be re-run without failing on a missing "prompt"
# column.
train_dataset = tokenize_dataset(dataset["train"])
test_dataset = tokenize_dataset(dataset["test"])

Map: 100%|██████████| 9000/9000 [00:02<00:00, 3728.36 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 4362.34 examples/s]


## Create the model

In [8]:
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

# Builds the training batches. mlm=False selects causal (next-token)
# language modelling rather than masked language modelling.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# A single epoch keeps processing time down. For reference, the library
# defaults are num_train_epochs=3, per_device_train_batch_size=8 and
# per_device_eval_batch_size=8.
# NOTE: the misspelled name `traing_args` is kept because later cells use it.
traing_args = TrainingArguments(
    output_dir="models/dialo_gpt",
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
)

trainer = Trainer(
    model=model,
    args=traing_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
)

: 

## Define training pipeline

In [None]:
# The collator must be passed explicitly: it pads the variable-length token
# sequences and supplies the labels the causal-LM loss is computed from.
# Without it this Trainer cannot batch the tokenized examples.
trainer = Trainer(
    model=model,
    args=traing_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
)

## Run the trainer

In [None]:
trainer.train()

# Write the fine-tuned weights to the Trainer's output directory.
trainer.save_model()
# The Trainer was not given the tokenizer, so save it explicitly next to the
# model; the output directory can then be reloaded on its own.
tokenizer.save_pretrained(trainer.args.output_dir)

prompt = ""

## A function to return outputs

In [None]:
def generate_text(prompt):
    """Generate a continuation of ``prompt`` with the notebook-level model.

    Relies on the notebook-level ``tokenizer`` and ``model``.

    Args:
        prompt: Text to feed to the model.

    Returns:
        The decoded text (prompt included), cut after its last ``.`` so that a
        trailing sentence chopped off by the ``max_length`` limit is dropped.
        If the text contains no period at all it is returned unchanged rather
        than being reduced to an empty string.
    """
    # Put the inputs on whatever device the model lives on (GPU when one is
    # in use, CPU otherwise) instead of assuming CUDA is available.
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Passing the full encoding also hands the attention mask to generate().
    outputs = model.generate(**encoded, max_length=64, pad_token_id=tokenizer.eos_token_id)
    generated = tokenizer.decode(outputs[0], skip_special_tokens=True)

    last_period = generated.rfind(".")
    if last_period == -1:
        # No complete sentence to trim back to; keep everything.
        return generated
    return generated[:last_period + 1]

## Prompts

In [None]:
# Smoke-test the fine-tuned model with a simple how-to question.
generate_text("What's the best way to cook chicken breast?")