# Training

In [None]:
import warnings

warnings.filterwarnings("ignore", category=UserWarning)

In [None]:
import os
import lamini
from dotenv import load_dotenv

# Run python-dotenv's loader, then read the Lamini API key from the environment.
# os.getenv returns None when LAMINI_API_KEY is unset, so a missing key is not
# reported at this point.
load_dotenv()

lamini.api_key = os.getenv("LAMINI_API_KEY")

# Load the Lamini docs dataset

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Dataset settings for this run.
use_hf = True                        # stored under training_config["datasets"]["use_hf"]
dataset_path = "lamini/lamini_docs"  # stored under training_config["datasets"]["path"]
dataset_name = "lamini_docs.jsonl"   # JSON-lines file name of the same dataset

# Set up the model, training config, and tokenizer

In [None]:
# Base checkpoint that the rest of the notebook tokenizes for and fine-tunes.
model_name = "EleutherAI/pythia-70m"

# Settings bundle passed to the data-preparation helper: which model and
# sequence length to use, and where the dataset lives.
training_config = dict(
    model=dict(pretrained_name=model_name, max_length=2048),
    datasets=dict(use_hf=use_hf, path=dataset_path),
    verbose=True,
)

In [None]:
from transformers import AutoTokenizer
from utilities import *

# Tokenizer for the base checkpoint named by model_name.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token; this is set before the
# tokenizer is handed to the data helper below.
tokenizer.pad_token = tokenizer.eos_token
# NOTE(review): tokenize_and_split_data is not defined in this notebook; presumably
# it comes from `from utilities import *` — confirm. It yields the train/test pair.
train_dataset, test_dataset = tokenize_and_split_data(training_config, tokenizer)

# Show both splits.
print(train_dataset)
print(test_dataset)

# Load the base model

In [None]:
from transformers import AutoModelForCausalLM

# Instantiate the causal language model for the checkpoint named by model_name.
base_model = AutoModelForCausalLM.from_pretrained(model_name)

In [None]:
import logging

import torch

# Define the logger explicitly instead of relying on a wildcard import to
# supply it, so this cell also runs on a fresh kernel.
logger = logging.getLogger(__name__)

# Use a CUDA GPU when PyTorch can see at least one; otherwise fall back to CPU.
device_count = torch.cuda.device_count()

if device_count > 0:
    logger.debug("Select GPU device")
    device = torch.device("cuda")
else:
    logger.debug("Select CPU device")
    device = torch.device("cpu")

In [None]:
# Move the base model onto the selected device. As the last expression in the
# cell, the call's return value is what the cell displays.
base_model.to(device)

# Define function to carry out inference

In [None]:
def inference(text, model, tokenizer, max_input_tokens=1000, max_output_tokens=100):
    """Generate a continuation of ``text`` and return only the newly generated text.

    Args:
        text: Prompt string.
        model: Causal language model exposing ``.device`` and ``.generate(...)``.
        tokenizer: Tokenizer exposing ``encode``, ``batch_decode`` and
            ``eos_token_id``.
        max_input_tokens: The prompt is truncated to at most this many tokens.
        max_output_tokens: Maximum number of tokens to generate *in addition to*
            the prompt.

    Returns:
        The decoded text of the first generated sequence, without the prompt.
    """
    # Tokenize
    input_ids = tokenizer.encode(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_tokens
    )

    # Generate
    # max_new_tokens budgets only the generated tokens. max_length would also
    # count the prompt, so a prompt longer than max_output_tokens would leave
    # no room for an answer.
    device = model.device
    generated_tokens_with_prompt = model.generate(
        input_ids=input_ids.to(device),
        max_new_tokens=max_output_tokens,
        pad_token_id=tokenizer.eos_token_id
    )

    # Strip the prompt at token level: the output starts with the (possibly
    # truncated) prompt tokens, so slicing by token count stays correct even
    # when the decoded prompt would not match ``text`` character for character.
    prompt_length = input_ids.shape[-1]
    generated_answer_tokens = [
        sequence[prompt_length:] for sequence in generated_tokens_with_prompt
    ]

    # Decode
    generated_text_answer = tokenizer.batch_decode(
        generated_answer_tokens, skip_special_tokens=True
    )[0]

    return generated_text_answer

# Try the base model

In [None]:
# Baseline check: take the first test example, show its question and its
# reference answer, then show what the base model generates for that question.
test_text = test_dataset[0]['question']
print("Question input (test):", test_text)
print(f"Correct answer from Lamini docs: {test_dataset[0]['answer']}")
print("Model's answer: ")
print(inference(test_text, base_model, tokenizer))

# Set up training

In [None]:
from transformers import TrainingArguments

# The step budget also names the run and its checkpoint directory.
max_steps = 3
trained_model_name = f"lamini_docs_{max_steps}_steps"
output_dir = trained_model_name

training_args = TrainingArguments(
    # --- Checkpoints ---
    output_dir=output_dir,          # directory to save model checkpoints
    overwrite_output_dir=False,     # do not overwrite existing output directory content
    save_steps=120,                 # save the model every this many steps
    save_total_limit=1,

    # --- Optimisation ---
    optim="adafactor",
    learning_rate=1.0e-5,
    warmup_steps=1,                 # warmup steps for the learning-rate scheduler
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    gradient_checkpointing=False,

    # --- Run length ---
    num_train_epochs=1,
    max_steps=max_steps,            # each step is one batch; overrides num_train_epochs unless -1

    # --- Evaluation ---
    evaluation_strategy="steps",
    eval_steps=120,                 # update steps between two evaluations
    per_device_eval_batch_size=1,

    # --- Logging ---
    logging_strategy="steps",
    logging_steps=1,
    disable_tqdm=False,             # keep progress bars enabled

    # --- Best-model selection (lower eval_loss is better) ---
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

In [None]:
# FLOPs estimate: floating-point operations reported by the model for one
# input of shape (1, max_length), multiplied by the gradient-accumulation factor.
model_flops = training_args.gradient_accumulation_steps * base_model.floating_point_ops(
    {"input_ids": torch.zeros((1, training_config["model"]["max_length"]))}
)

# Architecture summary, then memory and compute figures scaled by 1e9.
print(base_model)
print("Memory footprint", base_model.get_memory_footprint() / 1e9, "GB")
print("Flops", model_flops / 1e9, "GFLOPs")

In [None]:
# Wire the model, the training arguments and both dataset splits into the trainer.
# NOTE(review): `Trainer` is not imported by name in this notebook; presumably it
# is supplied by `from utilities import *` — confirm. `model_flops` and
# `total_steps` look like extensions of that class rather than stock
# transformers.Trainer arguments — verify against utilities.
trainer = Trainer(
    model=base_model,
    model_flops=model_flops,
    total_steps=max_steps,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [None]:
# Run training; the value returned by train() is kept in training_output.
training_output = trainer.train()

## Save model locally

In [None]:
# Save the trained model into a "final" subfolder of the run's output directory.
save_dir = f'{output_dir}/final'

trainer.save_model(save_dir)
print("Saved model to:", save_dir)

In [None]:
# Reload the model that was just saved to save_dir; local_files_only=True is
# passed so the load is restricted to files already on disk.
finetuned_slightly_model = AutoModelForCausalLM.from_pretrained(save_dir, local_files_only=True)

In [None]:
# Move the reloaded model onto the selected device (the cell displays the result).
finetuned_slightly_model.to(device)

In [None]:
# Ask the briefly finetuned model the first test question, the same example
# that was used for the base model.
test_question = test_dataset[0]['question']
print("Question input (test):", test_question)

print("Finetuned slightly model's answer: ")
print(inference(test_question, finetuned_slightly_model, tokenizer))

# Run slightly trained model

In [None]:
# Reference answer for the first test example, for comparison with the
# generated answers.
test_answer = test_dataset[0]['answer']
print("Target answer output (test):", test_answer)

# Run same model trained for two epochs

In [None]:
# Load the "lamini/lamini_docs_finetuned" model together with its own tokenizer.
# Note: this rebinds the notebook-wide `tokenizer` name, so every later cell that
# uses `tokenizer` gets this one rather than the base-checkpoint tokenizer.
finetuned_longer_model = AutoModelForCausalLM.from_pretrained("lamini/lamini_docs_finetuned")
tokenizer = AutoTokenizer.from_pretrained("lamini/lamini_docs_finetuned")

# Move to the selected device and answer the same test question.
finetuned_longer_model.to(device)
print("Finetuned longer model's answer: ")
print(inference(test_question, finetuned_longer_model, tokenizer))

In [None]:
# Run much larger trained model and explore moderation
# 
# bigger_finetuned_model = BasicModelRunner(model_name_to_id["bigger_model_name"])
# bigger_finetuned_output = bigger_finetuned_model(test_question)
# print("Bigger (2.8B) finetuned model (test): ", bigger_finetuned_output)

In [None]:
# Print every training example whose answer contains the moderation phrase,
# then print how many such examples there are.
moderation_phrase = "keep the discussion relevant to Lamini"
count = 0
for i in range(len(train_dataset)):
    example = train_dataset[i]
    if moderation_phrase not in example["answer"]:
        continue
    print(i, example["question"], example["answer"])
    count += 1
print(count)

In [None]:
# Explore moderation using small model
base_tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m")
base_model = AutoModelForCausalLM.from_pretrained("EleutherAI/pythia-70m")
print(inference("What do you think of Mars?", base_model, base_tokenizer))

In [None]:
# Try moderation with the finetuned small model: same off-topic prompt as for
# the base model. `tokenizer` here is the one loaded together with
# finetuned_longer_model.
print(inference("What do you think of Mars?", finetuned_longer_model, tokenizer))

# Finetune a model in 3 lines of code using Lamini

In [None]:
from llama import BasicModelRunner

# Three steps: pick a base model, load question/answer pairs from the JSON-lines
# file, start training.
# NOTE(review): BasicModelRunner is imported from `llama`, while the API key at
# the top of the notebook is set on the `lamini` module — confirm the key is
# picked up by this client.
model = BasicModelRunner("EleutherAI/pythia-410m") 
model.load_data_from_jsonlines("lamini_docs.jsonl", input_key="question", output_key="answer")
# NOTE(review): is_public=True presumably makes the trained model publicly
# accessible — confirm before running with private data.
model.train(is_public=True)