In [1]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q -U datasets scipy ipywidgets matplotlib einops



In [2]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import datasets
from transformers import TrainingArguments
import random

In [None]:
#train_dataset = load_dataset('json', data_files='reddit_jokes.jsonl', split='train')
#eval_dataset = load_dataset('json', data_files='reddit_jokes_validation.jsonl', split='train')

In [3]:
def formatting_func(example):
    text = f"### The following is the joke title: {example['title']} \n ### joke: {example['body']}"
    return text

In [4]:
base_model_id = "EleutherAI/pythia-14m"

In [5]:
max_length = 1024

In [6]:
def get_tokenize_function(tokenizer, _max_length):

  def tokenize_function(examples):
    max_length = _max_length

    # Set pad token
    tokenizer.pad_token = tokenizer.eos_token
    text = f"The following is the joke title: {examples['title']} \n joke: {examples['body']}"

    # Run tokenizer on all the text (the input and the output)
    tokenized_inputs = tokenizer(
        text,
        # Return tensors in a numpy array (other options are pytorch or tf objects)
        return_tensors="np",
        # Padding type is to pad to the longest sequence in the batch (other option is to a certain max length, or no padding)
        max_length=max_length,
        padding="max_length",
        truncation=True
    )

    tokenizer.truncation_side = "left"

    #tokenized_inputs = tokenizer(
    #    text,
    #    return_tensors="np",
    #    #truncation=True,
    #)

    tokenized_inputs["labels"] = tokenized_inputs["input_ids"]

    return tokenized_inputs
  return tokenize_function

In [7]:
def load_dataset(dataset_path, tokenizer, max_length):
    random.seed(42)
    finetuning_dataset_loaded = datasets.load_dataset("json", data_files=dataset_path, split="train")
    tokenizer.pad_token = tokenizer.eos_token
    tokenized_dataset = finetuning_dataset_loaded.map(
        get_tokenize_function(tokenizer, max_length), # returns tokenize_function
        batched=True,
        batch_size=1,
        drop_last_batch=True
    )
    tokenized_dataset = tokenized_dataset.with_format("torch")
    split_dataset = tokenized_dataset.train_test_split(test_size=0.1, shuffle=True, seed=123)
    return split_dataset

In [8]:
def tokenize_and_split_data(path, tokenizer):
    dataset = load_dataset(path, tokenizer, max_length)
    train_dataset = dataset["train"]
    test_dataset = dataset["test"]
    return train_dataset, test_dataset

In [9]:
tokenizer = AutoTokenizer.from_pretrained(base_model_id)
tokenizer.pad_token = tokenizer.eos_token
train_dataset, test_dataset = tokenize_and_split_data('./data/reddit_jokes.jsonl', tokenizer)

print(train_dataset)
print(test_dataset)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Dataset({
    features: ['title', 'body', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 95753
})
Dataset({
    features: ['title', 'body', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 10640
})


In [10]:
train_dataset[0]["input_ids"]

tensor([ 510, 1563,  310,  ...,    0,    0,    0])

In [21]:
base_model = AutoModelForCausalLM.from_pretrained(base_model_id, torch_dtype=torch.float16)
base_model.to('cuda')

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 128)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=128, out_features=384, bias=True)
          (dense): Linear(in_features=128, out_features=128, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=128, out_features=512, bias=True)
          (dense_4h_to_h): Linear(in_features=512, out_features=128, bias=True)
          (act)

In [22]:
def inference(text, model, tokenizer, max_input_tokens=1000, max_output_tokens=100):
  # Tokenize
  input_ids = tokenizer.encode(
          text,
          return_tensors="pt",
          truncation=True,
          max_length=max_input_tokens
  )

  # Generate
  device = model.device
  generated_tokens_with_prompt = model.generate(
    input_ids=input_ids.to(device),
    max_length=max_output_tokens
  )

  # Decode
  generated_text_with_prompt = tokenizer.batch_decode(generated_tokens_with_prompt, skip_special_tokens=True)

  # Strip the prompt
  generated_text_answer = generated_text_with_prompt[0][len(text):]

  return generated_text_answer

In [23]:
eval_prompt = "### The following is the joke title: The apple \n ### joke:"
print("Question input (test):", eval_prompt)
print("Model's answer: ")
print(inference(eval_prompt, base_model, tokenizer))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Question input (test): ### The following is the joke title: The apple 
 ### joke:
Model's answer: 
 The apple is a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little bit of a little


In [24]:
trained_model_name = f"joke_llm"
output_dir = trained_model_name

In [25]:
import transformers
from datetime import datetime

trainer = transformers.Trainer(
    model=base_model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    args=transformers.TrainingArguments(
        output_dir=output_dir,
        warmup_steps=1,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=1,
        num_train_epochs=1,
        optim="adafactor",
        #max_steps = 2000,
        learning_rate=1e-5, # Want a small lr for finetuning
        logging_steps=5000,              # When to start reporting loss
        logging_dir="./logs",        # Directory for storing logs
        save_strategy="steps",       # Save the model checkpoint every logging step
        save_steps=5000,                # Save checkpoints every 50 steps
        evaluation_strategy="steps", # Evaluate the model every logging step
        eval_steps=5000,               # Evaluate and save checkpoints every 50 steps
        do_eval=True,                # Perform evaluation at the end of training
        #report_to="wandb",           # Comment this out if you don't want to use weights & baises
        run_name=f"{trained_model_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}"          # Name of the W&B run (optional)
    ),
)

base_model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


  0%|          | 0/95753 [00:00<?, ?it/s]

{'loss': 0.3554, 'learning_rate': 9.477922132174787e-06, 'epoch': 0.05}


  0%|          | 0/1330 [00:00<?, ?it/s]

Checkpoint destination directory joke_llm\checkpoint-5000 already exists and is non-empty. Saving will proceed but saved results may be invalid.


{'eval_loss': 0.3876953125, 'eval_runtime': 110.7593, 'eval_samples_per_second': 96.064, 'eval_steps_per_second': 12.008, 'epoch': 0.05}
{'loss': 0.4301, 'learning_rate': 8.955739827888713e-06, 'epoch': 0.1}


  0%|          | 0/1330 [00:00<?, ?it/s]

{'eval_loss': 0.45361328125, 'eval_runtime': 108.189, 'eval_samples_per_second': 98.346, 'eval_steps_per_second': 12.293, 'epoch': 0.1}
{'loss': 0.5058, 'learning_rate': 8.433557523602641e-06, 'epoch': 0.16}


  0%|          | 0/1330 [00:00<?, ?it/s]

{'eval_loss': 0.482421875, 'eval_runtime': 108.2088, 'eval_samples_per_second': 98.328, 'eval_steps_per_second': 12.291, 'epoch': 0.16}
{'loss': 0.5, 'learning_rate': 7.911375219316568e-06, 'epoch': 0.21}


  0%|          | 0/1330 [00:00<?, ?it/s]

{'eval_loss': 0.493408203125, 'eval_runtime': 108.651, 'eval_samples_per_second': 97.928, 'eval_steps_per_second': 12.241, 'epoch': 0.21}
{'loss': 0.4853, 'learning_rate': 7.389192915030497e-06, 'epoch': 0.26}


  0%|          | 0/1330 [00:00<?, ?it/s]

{'eval_loss': 0.492919921875, 'eval_runtime': 108.2222, 'eval_samples_per_second': 98.316, 'eval_steps_per_second': 12.29, 'epoch': 0.26}
{'loss': 0.4997, 'learning_rate': 6.867010610744423e-06, 'epoch': 0.31}


  0%|          | 0/1330 [00:00<?, ?it/s]

{'eval_loss': 0.478759765625, 'eval_runtime': 108.6312, 'eval_samples_per_second': 97.946, 'eval_steps_per_second': 12.243, 'epoch': 0.31}
{'loss': 0.4725, 'learning_rate': 6.344828306458351e-06, 'epoch': 0.37}


  0%|          | 0/1330 [00:00<?, ?it/s]

{'eval_loss': 0.466552734375, 'eval_runtime': 108.5047, 'eval_samples_per_second': 98.06, 'eval_steps_per_second': 12.258, 'epoch': 0.37}
{'loss': 0.4603, 'learning_rate': 5.822646002172279e-06, 'epoch': 0.42}


  0%|          | 0/1330 [00:00<?, ?it/s]

{'eval_loss': 0.447998046875, 'eval_runtime': 108.6434, 'eval_samples_per_second': 97.935, 'eval_steps_per_second': 12.242, 'epoch': 0.42}
{'loss': 0.4468, 'learning_rate': 5.300463697886207e-06, 'epoch': 0.47}


  0%|          | 0/1330 [00:00<?, ?it/s]

{'eval_loss': 0.440185546875, 'eval_runtime': 108.4604, 'eval_samples_per_second': 98.1, 'eval_steps_per_second': 12.263, 'epoch': 0.47}
{'loss': 0.4367, 'learning_rate': 4.778281393600134e-06, 'epoch': 0.52}


  0%|          | 0/1330 [00:00<?, ?it/s]

{'eval_loss': 0.429931640625, 'eval_runtime': 108.6509, 'eval_samples_per_second': 97.928, 'eval_steps_per_second': 12.241, 'epoch': 0.52}
{'loss': 0.4172, 'learning_rate': 4.256099089314062e-06, 'epoch': 0.57}


  0%|          | 0/1330 [00:00<?, ?it/s]

{'eval_loss': 0.42138671875, 'eval_runtime': 108.4896, 'eval_samples_per_second': 98.074, 'eval_steps_per_second': 12.259, 'epoch': 0.57}
{'loss': 0.4306, 'learning_rate': 3.7339167850279895e-06, 'epoch': 0.63}


  0%|          | 0/1330 [00:00<?, ?it/s]

{'eval_loss': 0.41796875, 'eval_runtime': 108.5649, 'eval_samples_per_second': 98.006, 'eval_steps_per_second': 12.251, 'epoch': 0.63}
{'loss': 0.4259, 'learning_rate': 3.2117344807419172e-06, 'epoch': 0.68}


  0%|          | 0/1330 [00:00<?, ?it/s]

{'eval_loss': 0.41259765625, 'eval_runtime': 108.2049, 'eval_samples_per_second': 98.332, 'eval_steps_per_second': 12.291, 'epoch': 0.68}
{'loss': 0.4149, 'learning_rate': 2.6895521764558445e-06, 'epoch': 0.73}


  0%|          | 0/1330 [00:00<?, ?it/s]

{'eval_loss': 0.4111328125, 'eval_runtime': 108.2311, 'eval_samples_per_second': 98.308, 'eval_steps_per_second': 12.289, 'epoch': 0.73}
{'loss': 0.4118, 'learning_rate': 2.167369872169772e-06, 'epoch': 0.78}


  0%|          | 0/1330 [00:00<?, ?it/s]

{'eval_loss': 0.41162109375, 'eval_runtime': 108.2048, 'eval_samples_per_second': 98.332, 'eval_steps_per_second': 12.292, 'epoch': 0.78}
{'loss': 0.4109, 'learning_rate': 1.6451875678836998e-06, 'epoch': 0.84}


  0%|          | 0/1330 [00:00<?, ?it/s]

{'eval_loss': 0.408935546875, 'eval_runtime': 108.2473, 'eval_samples_per_second': 98.293, 'eval_steps_per_second': 12.287, 'epoch': 0.84}
{'loss': 0.4138, 'learning_rate': 1.1230052635976273e-06, 'epoch': 0.89}


  0%|          | 0/1330 [00:00<?, ?it/s]

{'eval_loss': 0.40869140625, 'eval_runtime': 108.508, 'eval_samples_per_second': 98.057, 'eval_steps_per_second': 12.257, 'epoch': 0.89}
{'loss': 0.4083, 'learning_rate': 6.008229593115549e-07, 'epoch': 0.94}


  0%|          | 0/1330 [00:00<?, ?it/s]

{'eval_loss': 0.408447265625, 'eval_runtime': 108.3945, 'eval_samples_per_second': 98.16, 'eval_steps_per_second': 12.27, 'epoch': 0.94}
{'loss': 0.4017, 'learning_rate': 7.86406550254825e-08, 'epoch': 0.99}


  0%|          | 0/1330 [00:00<?, ?it/s]

{'eval_loss': 0.407958984375, 'eval_runtime': 108.1962, 'eval_samples_per_second': 98.34, 'eval_steps_per_second': 12.292, 'epoch': 0.99}
{'train_runtime': 10520.8712, 'train_samples_per_second': 9.101, 'train_steps_per_second': 9.101, 'train_loss': 0.43827720338437504, 'epoch': 1.0}


TrainOutput(global_step=95753, training_loss=0.43827720338437504, metrics={'train_runtime': 10520.8712, 'train_samples_per_second': 9.101, 'train_steps_per_second': 9.101, 'train_loss': 0.43827720338437504, 'epoch': 1.0})

In [26]:
save_dir = f'{output_dir}/final'

trainer.save_model(save_dir)
print("Saved model to:", save_dir)

Saved model to: joke_llm/final


In [27]:
finetuned_model = AutoModelForCausalLM.from_pretrained('./joke_llm/final', local_files_only=True)
finetuned_model.to('cuda')

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 128)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=128, out_features=384, bias=True)
          (dense): Linear(in_features=128, out_features=128, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=128, out_features=512, bias=True)
          (dense_4h_to_h): Linear(in_features=512, out_features=128, bias=True)
          (act)

In [29]:
model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")

text = f"The following is the joke title: "

print("Question input (test):", text)
print("Model's answer: ")
print(inference(eval_prompt, finetuned_model, tokenizer))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Question input (test): The following is the joke title: 
Model's answer: 
 ['A man walks into a bar and says "I\'m a child."'] 
 joke: ['A man walks into a bar and says "I\'m a child, I\'m a child, I\'m a child, I\'m a child, I\'m a child, I\'m a child, I\'m a child, I\'m a child, I\'m a child, I\'m a
