In [104]:
import datasets
import tempfile
import logging
import random
import os
import yaml
import time
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
from transformers import Trainer
from datasets import load_dataset
#from utilities import *

logger = logging.getLogger(__name__)
global_config = None

In [105]:
dataset_name = "lamini_docs_processed.jsonl"
dataset_path = f"/{dataset_name}"
use_hf = False
print(dataset_path)

/lamini_docs_processed.jsonl


In [106]:
train_dataset = load_dataset("kotzeje/lamini_docs.jsonl", split = "train", trust_remote_code=True)
print(train_dataset)

Dataset({
    features: ['question', 'answer'],
    num_rows: 1400
})


In [107]:
model_name = "EleutherAI/pythia-70m"

In [108]:
training_config = {
    "model":{
        "pretrained_name": model_name,
        "max_length" : 2048
    },
    "datasets": {
        "use_hf" : use_hf,
        "path" : dataset_path
    },
    "verbose": True
}
print(training_config["model"]["max_length"])

2048


In [110]:
# Tokenize and split data
def load_dataset1(dataset_path, tokenizer):
    random.seed(42)
    print(dataset_path)
    finetuning_dataset_loaded = datasets.load_dataset("json", data_files=dataset_path, split="train")
    tokenizer.pad_token = tokenizer.eos_token
    max_length = training_config["model"]["max_length"]
    tokenized_dataset = finetuning_dataset_loaded.map(
        get_tokenize_function(tokenizer, max_length), # returns tokenize_function
        batched=True,
        batch_size=1,
        drop_last_batch=True
    )
    tokenized_dataset = tokenized_dataset.with_format("torch")
    split_dataset = tokenized_dataset.train_test_split(test_size=0.1, shuffle=True, seed=123)
    return split_dataset
# Wrapper for data load, split, tokenize for training
def tokenize_and_split_data(training_config, tokenizer):
    dataset_path = training_config["datasets"]["path"]
    use_hf = training_config["datasets"]["use_hf"]
    print("tokenize", use_hf, dataset_path)
    if use_hf:
        dataset = datasets.load_dataset(dataset_path)
    else:
        dataset = load_dataset1(dataset_path, tokenizer)
        train_dataset = dataset["train"]
        test_dataset = dataset["test"]
    return train_dataset, test_dataset

In [140]:
from pprint import pprint
    
tokenizer =  AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
train_dataset, test_dataset = tokenize_and_split_data(training_config,tokenizer)
pprint(train_dataset[0])

tokenize False /lamini_docs_processed.jsonl
/lamini_docs_processed.jsonl
{'answer': "To leverage Lamini's features for improving model performance or "
           'generalization, you can use the pre-trained models and embeddings '
           'provided by Lamini, or fine-tune them on your specific task. '
           "Finally, you can use Lamini's model selection and hyperparameter "
           'tuning tools to find the best model architecture and '
           'hyperparameters for your task.',
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1]),
 'input_ids': tensor([ 4118, 19782,    27,   187,  2347,   476,   309, 25057,   418,  4988,
           74,   434,  3386,   281,  3157,   253,  3045

In [113]:
base_model = AutoModelForCausalLM.from_pretrained(model_name)

In [114]:
device_conunt = torch.cuda.device_count()
if device_conunt >0:
    logger.debug("Select GPU device")
    device = torch.device("cuda")
else:
    logger.debug("Select CPU device")
    device = torch.device("cpu")

In [115]:
base_model.to(device)

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
          (act): GELUActivation()
        )
      )
    )
    (final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise

In [116]:
def inference(text, model, tokenizer, max_input_tokens=1000, max_out_tokens=100):
    # Tokenize
    input_ids = tokenizer.encode(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_tokens
    )

    # generate
    device = model.device
    generated_tokens_with_prompt = model.generate(
        input_ids=input_ids.to(device),
        max_length=max_out_tokens
    )

    # decode
    generated_tokens_with_prompt = tokenizer.batch_decode(generated_tokens_with_prompt)

    # Strip the prompt
    generated_text_answer = generated_tokens_with_prompt[0][len(text):]

    return generated_text_answer

In [117]:
train_text = train_dataset[0]["question"]
print("Question input:", train_text)
print(f"Correct answer from Lamini doc: {train_dataset[0]['answer']}" )
print(inference(train_text, base_model, tokenizer))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Question input: ### Question:
How can I leverage Lamini's features to improve the performance or generalization of a customized model?
### Answer:
Correct answer from Lamini doc: To leverage Lamini's features for improving model performance or generalization, you can use the pre-trained models and embeddings provided by Lamini, or fine-tune them on your specific task. Finally, you can use Lamini's model selection and hyperparameter tuning tools to find the best model architecture and hyperparameters for your task.


A:

I would like to use the Lamini feature to improve the performance of a custom model.

A:

I would like to use the Lamini feature to improve the performance of a custom model.

A:

I would like to use the Lamini feature to improve the performance of a


In [130]:
max_steps = 10
trained_model_name = f"lamini_doc_{max_steps}_steps"
output_dir = trained_model_name

In [131]:
training_args = TrainingArguments(
    learning_rate= 1.0e-5,
    num_train_epochs = 10,
    max_steps=max_steps,
    per_device_train_batch_size = 1,
    output_dir = output_dir,
    remove_unused_columns=False,

    overwrite_output_dir=False,
    disable_tqdm=False, #Enable/disable progress bars
    eval_steps=120, # number of update steps between two evaluations
    save_steps =120, # After # steps model is saved
    warmup_steps = 1, # no of warmup steps for learning rate scheduler
    per_device_eval_batch_size=1, # batch size for evaluation
    eval_strategy="steps",
    logging_strategy ="steps",
    logging_steps=1,
    optim="adafactor",
    gradient_accumulation_steps = 4,
    gradient_checkpointing = False,

    # Parameters for early stopping
    load_best_model_at_end=True,
    save_total_limit=1,
    metric_for_best_model="eval_loss",
    greater_is_better=False
    
)

In [120]:
model_flops = (
    base_model.floating_point_ops(
        {
            "input_ids": torch.zeros(
                (1, training_config["model"]["max_length"])
            )
        }
    )
    * training_args.gradient_accumulation_steps
)

print(base_model)
print("Memory footprint", base_model.get_memory_footprint() / 1e9, "GB")
print("Flops", model_flops / 1e9, "GFLOPs")

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
          (act): GELUActivation()
        )
      )
    )
    (final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise

In [121]:
pprint(train_dataset[0])
print(test_dataset)

{'answer': "To leverage Lamini's features for improving model performance or "
           'generalization, you can use the pre-trained models and embeddings '
           'provided by Lamini, or fine-tune them on your specific task. '
           "Finally, you can use Lamini's model selection and hyperparameter "
           'tuning tools to find the best model architecture and '
           'hyperparameters for your task.',
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1]),
 'input_ids': tensor([ 4118, 19782,    27,   187,  2347,   476,   309, 25057,   418,  4988,
           74,   434,  3386,   281,  3157,   253,  3045,   390, 26647,   273,
          247, 32176,  1566,    32,   187,  4118, 

In [132]:
trainer = Trainer(
    model = base_model,
    #model_flops = model_flops,
    #total_steps = max_steps,
    args = training_args,
    train_dataset = test_dataset,
    eval_dataset = test_dataset
)

In [133]:
training_output = trainer.train()

Step,Training Loss,Validation Loss


In [135]:
save_dir = f'{output_dir}/final'

trainer.save_model(save_dir)
print("saved model to:", save_dir)

saved model to: lamini_doc_10_steps/final


In [137]:
finetuned_slightly_model = AutoModelForCausalLM.from_pretrained(save_dir, local_files_only=True)

In [138]:
finetuned_slightly_model.to(device)

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
          (act): GELUActivation()
        )
      )
    )
    (final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise

In [142]:
test_question = test_dataset[0]["question"]
print("question input(test):", test_question)
print("finetuned slightly model 's answer:'")
pprint(inference(test_question, finetuned_slightly_model, tokenizer))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


question input(test): ### Question:
Is it possible to fine-tune Lamini on a specific dataset for text generation in legal documents?
### Answer:
finetuned slightly model 's answer:'
('Yes, it is possible to fine-tune Lamini on a specific dataset for text '
 'generation in legal documents.\n'
 '### Answer:Yes, it is possible to fine-tune Lamini on a specific dataset for '
 'text generation in legal documents.\n'
 '### Answer:Yes, it is possible to fine-tune Lamini on')


In [143]:
test_answer = test_dataset[0]['answer']
print("Target answer output (test):", test_answer)

Target answer output (test): Laminiâ€™s LLM Engine can help you fine-tune any model on huggingface or any OpenAI model.
