# LLM with OpenAI ChatGPT

## for question answering

See the Colab notebook: https://colab.research.google.com/drive/1OUHnyQevDJA1p_tDDUqfWdxKfwRCz1Xt?usp=sharing


In [1]:
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from datasets import load_dataset


In [2]:
# Load the 'train' split of the instruction dataset from the Hub,
# then display 20 randomly sampled rows as a pandas DataFrame.
dataset = load_dataset(
    "hakurei/open-instruct-v1",
    split='train',
)

dataset.to_pandas().sample(20)



Unnamed: 0,instruction,input,output
177101,What are the international guidelines for avoi...,,The international guidelines for avoiding iden...
427175,Explain quantum computing in simple terms,,Quantum computing is a type of computing that ...
427912,Please learn from https://openapi.iotera.io/#/...,,To create a Python script that fetches device ...
23327,Name two classic films from the 1950s?,,"Two classic films from the 1950s are ""Rebel Wi..."
369439,Find out whether the given number is even or o...,,Odd
228129,Create a method which receives a list of names...,,"""""""\nCreate a method which receives a list of ..."
408752,I have a string which consists of lowercase le...,,True
197764,Suggest a function to convert a given tuple of...,,def tuple_to_dict(tup):\n return {tup[i] : ...
50190,"Given a list of names, sort the list alphabeti...","Names: Arthur, Bob, Louis","Arthur, Bob, Louis"
359294,"Given a year, find out what season it was. out...",,Winter


In [3]:
def preprocess(example):
    """Attach a single training string to one dataset row.

    The row's 'instruction', 'input' and 'output' values are formatted as
    text and joined with single spaces; the result is stored under the
    'prompt' key. The row is modified in place and the same object is
    returned.
    """
    parts = (example['instruction'], example['input'], example['output'])
    example['prompt'] = " ".join(f"{part}" for part in parts)
    return example

def tokenize_datasets(dataset):
    """Tokenize the 'prompt' column of ``dataset`` and return the mapped result.

    Relies on the module-level ``tokenizer`` being defined before this
    function is called. The tokenizer is invoked on batches of prompts with
    ``truncation=True`` and ``max_length=128``; the 'prompt' column is
    removed from the returned dataset.
    """
    def _encode_batch(batch):
        # `batch['prompt']` is whatever `map(..., batched=True)` passes in.
        return tokenizer(batch['prompt'], truncation=True, max_length=128)

    return dataset.map(_encode_batch, batched=True, remove_columns=['prompt'])

In [4]:
# Build the train/test split used for fine-tuning.
#
# Subsample first, then build prompts: `preprocess` works row by row, so
# applying it only to the 100,000 selected rows yields the same examples as
# mapping the whole dataset and selecting afterwards, with far less work.
dataset = dataset.shuffle(seed=42).select(range(100000))

# Replace the three raw text columns with the single 'prompt' column.
dataset = dataset.map(preprocess, remove_columns=['instruction', 'input', 'output'])

# Hold out 10% of the subsample for evaluation (seeded for reproducibility).
dataset = dataset.train_test_split(test_size=0.1, seed=42)


Map:   0%|          | 0/498813 [00:00<?, ? examples/s]

In [5]:
# Pull the two partitions out of the split
train_dataset, test_dataset = dataset['train'], dataset['test']


In [6]:
# Set model (DialoGPT)

MODEL_NAME = "microsoft/DialoGPT-medium"

# Tokenizer and base model are both loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

# Use the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Tokenize both partitions (train first, then test).
train_dataset, test_dataset = (
    tokenize_datasets(train_dataset),
    tokenize_datasets(test_dataset),
)

Map:   0%|          | 0/90000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [7]:
# Set up training: hyperparameters, batch collator and Trainer.
# Note: this cell only constructs the Trainer; it does not call trainer.train().
training_args = TrainingArguments(
    output_dir="models/diablo_gpt",
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
)

# mlm=False: collate batches without the masked-language-modeling objective.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
)

2023-10-24 14:57:24.497 python[6371:146804] apply_selection_policy_once: avoid use of removable GPUs (via (null):GPUSelectionPolicy->avoidRemovable)


In [8]:
# Load an already fine-tuned checkpoint from the Hub.
# This rebinds `model`, so later cells use these downloaded weights.
finetuned_checkpoint = "TheFuzzyScientist/diabloGPT_open-instruct"
model = AutoModelForCausalLM.from_pretrained(finetuned_checkpoint)

(…)T_open-instruct/resolve/main/config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.44G [00:00<?, ?B/s]

(…)ruct/resolve/main/generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

In [11]:
def generate_text(prompt, max_length=64):
    """Generate a continuation for ``prompt`` and trim it to whole sentences.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        prompt: Text to feed to the model.
        max_length: Value forwarded to ``model.generate`` as ``max_length``
            (defaults to 64, the value previously hard-coded).

    Returns:
        The decoded output of ``model.generate`` (first returned sequence),
        cut just after its last '.' so a trailing unfinished sentence is
        dropped. If the decoded text contains no '.', it is returned in full
        instead of being reduced to an empty string.
    """
    inputs = tokenizer.encode(prompt, return_tensors='pt')
    outputs = model.generate(
        inputs,
        max_length=max_length,
        pad_token_id=tokenizer.eos_token_id,
    )
    generated = tokenizer.decode(outputs[0], skip_special_tokens=True)

    last_period = generated.rfind('.')
    if last_period == -1:
        # No sentence boundary found: slicing with rfind's -1 would yield
        # generated[:0] == "", so keep the whole text.
        return generated
    return generated[:last_period + 1]

In [12]:
# Try the model on a sample question; the returned text is the cell output.
sample_question = "What's the best way to cook chiken breast?"
generate_text(sample_question)

"What's the best way to cook chiken breast?  The best way to cook chiken breast is to season it with salt and pepper, then heat a pan over medium heat. Add a tablespoon of olive oil and cook for about 5 minutes, stirring occasionally."