<a href="https://colab.research.google.com/github/dhnanjay/SEC/blob/main/finetuning_pythia_70m.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the notebook's dependencies (IPython shell escapes; run inside Jupyter/Colab).
!pip install lamini --quiet
!pip install datasets --quiet
!pip uninstall accelerate -y --quiet # Remove any preinstalled accelerate first
!pip install accelerate --quiet # Reinstall accelerate; NOTE: no version is pinned here, so the latest release is used
!pip install transformers[torch] --quiet

### 0. Utilities

In [None]:
import datasets
import tempfile
import logging
import random
import config
import os
import yaml
import logging
import time

import transformers

logger = logging.getLogger(__name__)
global_config = None

#############################
########## Permissions ##########
#############################
model_name_to_id = {
  "bigger_model_name" : "06ad41e68cd839fb475a0c1a4ee7a3ad398228df01c9396a97788295d5a0f8bb"
}

#############################
########## LOGGING ##########
#############################
def initialize_config_and_logging(existing_config=None):
    """Build the merged configuration, publish it globally and set up logging.

    Args:
        existing_config: Optional configuration forwarded unchanged to
            ``build_config``.

    Returns:
        The configuration object produced by ``build_config``; the same
        object is stored in the module-level ``global_config``.
    """
    global global_config

    merged = build_config(existing_config)
    global_config = merged

    # Logging is configured from the merged config before anything is logged.
    setup_logging(merged)

    config_dump = yaml.dump(merged.as_dict())
    logger.debug("Config: " + str(config_dump))
    return merged

def get_config():
    """Return the module-level configuration.

    Returns:
        The object currently stored in ``global_config``.

    Raises:
        AssertionError: If ``global_config`` is still ``None`` (i.e. nothing
            has stored a configuration yet).
    """
    current = global_config
    assert current is not None
    return current

def build_config(existing_config=None):
    """Assemble a ``ConfigurationSet`` from env vars, caller config and YAML files.

    Sources are added in this order: ``LLAMA_``-prefixed environment
    variables, then ``existing_config`` (if truthy), then every YAML file
    reported by ``get_config_paths()`` in reverse order.

    Args:
        existing_config: Optional plain ``dict`` (wrapped with
            ``config.config_from_dict``) or an already-built configuration
            object (used as-is).

    Returns:
        A ``config.ConfigurationSet`` built from all collected sources.
    """
    # Environment variables always come first in the set.
    sources = [
        config.config_from_env(prefix="LLAMA", separator="_", lowercase_keys=True),
    ]

    if existing_config:
        if not isinstance(existing_config, dict):
            sources.append(existing_config)
        else:
            sources.append(config.config_from_dict(existing_config))

    for yaml_path in reversed(get_config_paths()):
        print("Loading builtin config from " + yaml_path)
        sources.append(config.config_from_yaml(yaml_path, read_from_file=True))

    return config.ConfigurationSet(*sources)

def get_config_paths():
    """Return the YAML config files that exist, in lookup order.

    Three candidate locations are checked, in this order:

    1. ``configs/llama_config.yaml`` (relative to the working directory)
    2. ``configs/llama_config_local.yaml``
    3. ``~/.llama_config.yaml``

    Returns:
        A list containing only the candidates that exist on disk, in the
        order above. Empty when none exist.
    """
    config_name = "llama_config"
    config_base = "configs"

    candidates = [
        os.path.join(config_base, config_name + ".yaml"),
        os.path.join(config_base, config_name + "_local.yaml"),
        os.path.join(os.path.expanduser("~"), "." + config_name + ".yaml"),
    ]
    return [path for path in candidates if os.path.exists(path)]

def setup_logging(arguments):
    """Configure the root logger from the ``verbose`` / ``verbose_info`` flags.

    The level is DEBUG when ``arguments["verbose"]`` is truthy, INFO when
    ``arguments["verbose_info"]`` is truthy, and WARNING otherwise. A flag
    that is absent from ``arguments`` is treated as false instead of raising
    ``KeyError``.

    Args:
        arguments: Mapping-like configuration supporting ``arguments[key]``.
    """
    logging_format = "%(asctime)s - %(levelname)s - %(name)s - %(message)s"

    def _flag(name):
        # A missing flag simply means "not enabled".
        try:
            return bool(arguments[name])
        except KeyError:
            return False

    if _flag("verbose"):
        level = logging.DEBUG
    elif _flag("verbose_info"):
        level = logging.INFO
    else:
        level = logging.WARNING

    logging.basicConfig(level=level, format=logging_format)

    # basicConfig does nothing when the root logger already has handlers,
    # so the level is also applied to the root logger directly.
    logging.getLogger().setLevel(level)

    # Keep chatty third-party loggers at WARNING regardless of verbosity.
    for noisy_logger in ("urllib3", "filelock", "smart_open", "botocore"):
        logging.getLogger(noisy_logger).setLevel(logging.WARNING)


##########################
########## DATA ##########
##########################
# Wrapper for data load, split, tokenize for training
def tokenize_and_split_data(training_config, tokenizer):
    """Load the configured dataset and return its train and test splits.

    Initialises config and logging from ``training_config``, then reads
    ``datasets.path`` and ``datasets.use_hf`` from the resulting config.
    When ``use_hf`` is truthy the dataset is loaded directly with
    ``datasets.load_dataset``; otherwise the local ``load_dataset`` helper
    is used.

    Args:
        training_config: Configuration passed to
            ``initialize_config_and_logging``.
        tokenizer: Tokenizer forwarded to ``load_dataset`` on the non-HF path.

    Returns:
        A ``(train_dataset, test_dataset)`` tuple taken from the ``"train"``
        and ``"test"`` entries of the loaded dataset.
    """
    cfg = initialize_config_and_logging(training_config)
    dataset_path = cfg["datasets"]["path"]
    use_hf = cfg["datasets"]["use_hf"]
    print("tokenize", use_hf, dataset_path)

    if not use_hf:
        dataset = load_dataset(dataset_path, tokenizer)
    else:
        dataset = datasets.load_dataset(dataset_path)

    return dataset["train"], dataset["test"]

# Tokenize and split data
def load_dataset(dataset_path, tokenizer, max_length=None):
    """Load a JSON dataset, tokenize it and split it into train/test.

    Args:
        dataset_path: Path (or paths) passed as ``data_files`` to the
            ``"json"`` loader of ``datasets.load_dataset``.
        tokenizer: Tokenizer used by ``get_tokenize_function``; its pad token
            is set to its EOS token as a side effect.
        max_length: Optional maximum token length. When omitted, it is read
            from ``["model"]["max_length"]`` of the initialised global config
            if there is one, otherwise from the notebook-level
            ``training_config`` (the previous implicit behavior).

    Returns:
        The result of ``train_test_split(test_size=0.1, shuffle=True,
        seed=123)`` on the tokenized dataset, formatted for torch.
    """
    random.seed(42)
    finetuning_dataset_loaded = datasets.load_dataset("json", data_files=dataset_path, split="train")
    tokenizer.pad_token = tokenizer.eos_token

    if max_length is None:
        # Prefer the config that was actually initialised over whatever
        # happens to be bound to the notebook-level `training_config` name.
        active_config = global_config if global_config is not None else training_config
        max_length = active_config["model"]["max_length"]

    tokenized_dataset = finetuning_dataset_loaded.map(
        get_tokenize_function(tokenizer, max_length),  # returns tokenize_function
        batched=True,
        # tokenize_function only reads element [0] of each batch, so batches
        # must contain exactly one example.
        batch_size=1,
        drop_last_batch=True,
    )
    tokenized_dataset = tokenized_dataset.with_format("torch")
    split_dataset = tokenized_dataset.train_test_split(test_size=0.1, shuffle=True, seed=123)
    return split_dataset

# Get function for tokenization, based on config parameters
def get_tokenize_function(tokenizer, _max_length):
    """Build a tokenization callback bound to ``tokenizer`` and ``_max_length``.

    Args:
        tokenizer: Callable tokenizer exposing ``eos_token``, ``pad_token``
            and ``truncation_side`` attributes.
        _max_length: Maximum number of tokens kept per example.

    Returns:
        ``tokenize_function(examples)``, which tokenizes the FIRST example of
        a batch (it reads index ``[0]`` of each column) and returns the
        tokenizer output with an added ``"labels"`` entry equal to
        ``"input_ids"``.
    """

    def tokenize_function(examples):
        # Set pad token
        tokenizer.pad_token = tokenizer.eos_token

        # Pick the text columns by whichever schema the batch provides.
        if "question" in examples and "answer" in examples:
            text = examples["question"][0] + examples["answer"][0]
        elif "input" in examples and "output" in examples:
            text = examples["input"][0] + examples["output"][0]
        else:
            text = examples["text"][0]

        # First pass (no truncation): only used to measure the token count so
        # that truncation can be reported.
        untruncated = tokenizer(
            text,
            return_tensors="np",
            padding=True,
        )
        token_count = untruncated["input_ids"].shape[1]

        if token_count > _max_length:
            logger.warning(
                f"Truncating input from {token_count} to {_max_length}"
            )

        # Drop tokens from the start so the end of the text is preserved.
        tokenizer.truncation_side = "left"

        # Second pass: the limit must be passed explicitly, otherwise
        # truncation falls back to the tokenizer's own default maximum and
        # the configured limit is silently ignored.
        tokenized_inputs = tokenizer(
            text,
            return_tensors="np",
            truncation=True,
            max_length=_max_length,
        )

        tokenized_inputs["labels"] = tokenized_inputs["input_ids"]

        return tokenized_inputs

    return tokenize_function


###########################
########## MODEL ##########
###########################

# Load model onto the right device (GPU if available), and load tokenizer
def load_model(training_config, load_base_model=False):
    """Load the pretrained causal LM and tokenizer and move the model to a device.

    Args:
        training_config: Mapping providing ``["model"]["pretrained_name"]``
            and, optionally, a top-level ``"model_name"``.
        load_base_model: Accepted for interface compatibility; not read by
            this function.

    Returns:
        ``(model, tokenizer, device, model_name)`` where ``device`` is CUDA
        when at least one CUDA device is visible and CPU otherwise, and
        ``model_name`` is ``training_config["model_name"]`` if present, else
        the pretrained name.
    """
    pretrained_name = training_config["model"]["pretrained_name"]
    logger.debug(f"Loading default model: {pretrained_name}")
    model = AutoModelForCausalLM.from_pretrained(pretrained_name)
    tokenizer = AutoTokenizer.from_pretrained(pretrained_name)

    logger.debug("Copying model to device")

    if torch.cuda.device_count() <= 0:
        logger.debug("Select CPU device")
        device = torch.device("cpu")
    else:
        logger.debug("Select GPU device")
        device = torch.device("cuda")

    model.to(device)

    logger.debug("Copying finished...")

    # An explicit model name in the config wins over the pretrained name.
    if "model_name" in training_config:
        model_name = training_config["model_name"]
    else:
        model_name = pretrained_name

    return model, tokenizer, device, model_name

# Trainer class to include logging and history
class Trainer(transformers.Trainer):
    """``transformers.Trainer`` subclass that adds timing/FLOPs logging and a
    smoothed loss history.

    Extra constructor arguments:
        model_flops: FLOPs figure used to derive the ``flops`` log entry
            (divided by the measured seconds per step).
        total_steps: Total number of training steps, used to estimate
            ``remaining_time``.
    """

    def __init__(
        self,
        model,
        model_flops,
        total_steps,
        args=None,
        data_collator=None,
        train_dataset=None,
        eval_dataset=None,
        tokenizer=None,
        model_init=None,
        compute_metrics=None,
        callbacks=None,
        optimizers=(None, None),
    ):
        # Forward by keyword: positional forwarding silently misroutes
        # arguments if the parent's parameter order differs between
        # transformers releases.
        super().__init__(
            model=model,
            args=args,
            data_collator=data_collator,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            tokenizer=tokenizer,
            model_init=model_init,
            compute_metrics=compute_metrics,
            callbacks=callbacks,
            optimizers=optimizers,
        )

        self.total_steps = total_steps
        self.model_flops = model_flops
        self.start_step = 0

    def training_step(self, model, inputs):
        """Run one forward/backward pass and return the detached loss.

        Batches whose ``input_ids`` tensor is empty are printed for
        diagnosis and skipped with a zero loss. Otherwise the loss is
        back-propagated through ``self.accelerator`` and returned divided by
        ``gradient_accumulation_steps``.
        """
        if inputs["input_ids"].numel() == 0:
            print("Inputs: ", inputs)
            print("Inputs - input_ids", inputs["input_ids"])
            print("numel", inputs["input_ids"].numel())

            return torch.tensor(0)

        model.train()
        inputs = self._prepare_inputs(inputs)

        with self.compute_loss_context_manager():
            loss = self.compute_loss(model, inputs)

        if self.args.n_gpu > 1:
            loss = loss.mean()  # mean() to average on multi-gpu parallel training

        self.accelerator.backward(loss)

        return loss.detach() / self.args.gradient_accumulation_steps

    def log(self, logs):
        """
        Log `logs` on the various objects watching training.

        Adds the rounded epoch and the timing fields from
        ``update_log_timing``, records the entry through ``update_history``
        and then notifies the callback handler.

        Args:
            logs (`Dict[str, float]`):
                The values to log. Mutated in place.
        """
        if self.state.epoch is not None:
            logs["epoch"] = round(self.state.epoch, 2)

        self.update_log_timing(logs)

        output = {**logs, **{"step": self.state.global_step}}
        self.update_history(output)

        logger.debug("Step (" + str(self.state.global_step) + ") Logs: " + str(logs))
        self.control = self.callback_handler.on_log(
            self.args, self.state, self.control, logs
        )

    def update_log_timing(self, logs):
        """Add ``iter_time``, ``flops`` and ``remaining_time`` to ``logs``.

        On the first call (empty log history) the start time and start step
        are recorded and all three values are zero. On later calls the
        values are derived from the mean seconds per step since the start;
        nothing is added if the global step has not advanced.
        """
        if len(self.state.log_history) == 0:
            self.start_time = time.time()
            logs["iter_time"] = 0.0
            logs["flops"] = 0.0
            logs["remaining_time"] = 0.0
            self.start_step = self.state.global_step
        elif self.state.global_step > self.start_step:
            steps_done = self.state.global_step - self.start_step
            iter_time = (time.time() - self.start_time) / steps_done
            logs["iter_time"] = iter_time
            # Guard against a zero elapsed time (coarse clocks) instead of
            # raising ZeroDivisionError from inside the logging path.
            logs["flops"] = self.model_flops / iter_time if iter_time > 0 else 0.0
            logs["remaining_time"] = (
                self.total_steps - self.state.global_step
            ) * iter_time

    def update_history(self, output):
        """Append ``output`` to ``state.log_history`` with a smoothed loss.

        Entries containing ``eval_loss`` are not recorded. When both the new
        entry and the previous entry carry a ``loss``, the stored loss is
        ``new * p + previous * (1 - p)`` with ``p = 1 / 100``.
        """
        if "eval_loss" in output:
            return

        history = self.state.log_history
        if history and "loss" in output:
            previous = history[-1]
            # The previous entry may lack "loss" (e.g. a summary entry);
            # in that case the raw loss is stored instead of raising KeyError.
            if "loss" in previous:
                smoothing_window = 100
                p = 1.0 / smoothing_window
                output["loss"] = output["loss"] * p + previous["loss"] * (1.0 - p)

        history.append(output)


def sample_history(history):
    """Downsample ``history`` to at most 100 evenly strided entries.

    Args:
        history: A sliceable sequence, or a falsy value.

    Returns:
        ``history`` itself when it is falsy; otherwise every ``stride``-th
        element starting at index 0, where ``stride`` is ``len(history) / 100``
        rounded up.
    """
    if history:
        # Ceiling division without floats.
        stride = -(-len(history) // 100)
        return history[::stride]
    return history

# Copy file
def smart_copy(remote_path, local_path):
    """Copy the bytes of ``local_path`` into ``remote_path``.

    Args:
        remote_path: Destination file path; created or overwritten.
        local_path: Source file path; read in binary mode.

    Raises:
        OSError: If the source cannot be opened for reading or the
            destination cannot be opened for writing.
    """
    import shutil

    # Open the source first: if it is missing or unreadable, the existing
    # destination must not be truncated.
    with open(local_path, "rb") as local_file:
        with open(remote_path, "wb") as remote_file:
            # Stream in chunks rather than loading the whole file in memory.
            shutil.copyfileobj(local_file, remote_file)

### 1. Imports

In [None]:
import os
import lamini

In [None]:
import datasets
import tempfile
import logging
import random
import config
import os
import yaml
import time
import torch
import transformers
import pandas as pd
import jsonlines

from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
from transformers import TrainingArguments
from transformers import AutoModelForCausalLM
from llama import BasicModelRunner


logger = logging.getLogger(__name__)
global_config = None

### 2. Specify dataset

In [None]:
dataset_path = "virattt/llama-3-8b-financialQA"
# dataset_path = "lamini/lamini_docs"
use_hf = True

### 3. Set up model, training config, and tokenizer

In [None]:
model_name = "EleutherAI/pythia-70m"

In [None]:
training_config = {
    "model": {
        "pretrained_name": model_name,
        "max_length" : 2048
    },
    "datasets": {
        "use_hf": use_hf,
        "path": dataset_path
    },
    "verbose": True
}

In [None]:
# Load the tokenizer for the chosen base model; the pad token is set to the
# EOS token.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the dataset
random.seed(42)
finetuning_dataset_loaded = datasets.load_dataset(dataset_path, split="train")
tokenizer.pad_token = tokenizer.eos_token  # repeated assignment; same value as above
max_length = training_config["model"]["max_length"]
# batch_size=1 is required: tokenize_function only reads element [0] of each
# column in the batch it receives.
tokenized_dataset = finetuning_dataset_loaded.map(
    get_tokenize_function(tokenizer, max_length), # returns tokenize_function
    batched=True,
    batch_size=1,
    drop_last_batch=True
)
tokenized_dataset = tokenized_dataset.with_format("torch")

# Split the dataset into train / test (10% held out, fixed seed)
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, shuffle=True, seed=123)

train_dataset = split_dataset['train']
test_dataset = split_dataset['test']

print(train_dataset)
print(test_dataset)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
DEBUG:fsspec.local:open file: /root/.cache/huggingface/datasets/virattt___llama-3-8b-financial_qa/default/0.0.0/8105ab3194296d5260c01871a811c7124896ba66/dataset_info.json
DEBUG:fsspec.local:open file: /root/.cache/huggingface/datasets/virattt___llama-3-8b-financial_qa/default/0.0.0/8105ab3194296d5260c01871a811c7124896ba66/dataset_info.json


Dataset({
    features: ['question', 'answer', 'context', 'ticker', 'filing', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 900
})
Dataset({
    features: ['question', 'answer', 'context', 'ticker', 'filing', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 100
})


### 4. Load base model

In [None]:
base_model = AutoModelForCausalLM.from_pretrained(model_name)

In [None]:
# Use CUDA whenever torch reports at least one device; otherwise stay on CPU.
device_count = torch.cuda.device_count()
if device_count <= 0:
    logger.debug("Select CPU device")
    device = torch.device("cpu")
else:
    logger.debug("Select GPU device")
    device = torch.device("cuda")

DEBUG:__main__:Select GPU device


In [None]:
base_model.to(device)

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
          (a

### 5. Define inference function

In [None]:
def inference(text, model, tokenizer, max_input_tokens=1000, max_output_tokens=100):
    """Generate a continuation of ``text`` and return it without the prompt.

    Args:
        text: Prompt string.
        model: Model exposing ``device`` and ``generate``.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        max_input_tokens: Prompt is truncated to this many tokens.
        max_output_tokens: Maximum number of NEWLY generated tokens.

    Returns:
        The decoded text of the first generated sequence with the first
        ``len(text)`` characters removed.
    """
    # Tokenize; calling the tokenizer (rather than `encode`) also yields the
    # attention mask that `generate` needs for reliable results.
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_tokens,
    )

    device = model.device

    # Fall back to EOS when the tokenizer defines no pad token, so that
    # `generate` does not have to guess.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    # Generate. `max_new_tokens` bounds only the continuation; `max_length`
    # would also count the prompt tokens and leave no room for output on
    # long prompts.
    generated_tokens_with_prompt = model.generate(
        input_ids=encoded["input_ids"].to(device),
        attention_mask=encoded["attention_mask"].to(device),
        max_new_tokens=max_output_tokens,
        pad_token_id=pad_token_id,
    )

    # Decode
    generated_text_with_prompt = tokenizer.batch_decode(
        generated_tokens_with_prompt, skip_special_tokens=True
    )

    # Strip the prompt (by character count of the original prompt string)
    generated_text_answer = generated_text_with_prompt[0][len(text):]

    return generated_text_answer

### 6. Try base model

In [None]:
# Ask the untuned base model one held-out question and show the reference
# answer next to it. The label names the dataset actually in use instead of
# a hard-coded (and here incorrect) "Lamini docs".
test_text = test_dataset[0]['question']
print("Question input (test):", test_text)
print(f"Correct answer from {dataset_path}: {test_dataset[0]['answer']}")
print("Model's answer: ")
print(inference(test_text, base_model, tokenizer))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Question input (test): How are non-GAAP financial measures justified for aiding investors according to the document?
Correct answer from Lamini docs: Non-GAAP financial measures are justified as they provide additional insight into operational performance and help clarify trends affecting the business, aiding investors.
Model's answer: 


A:

The document is a document that is not a document that is not a document that is not a document that is not a document that is not a document that is not a document that is not a document that is not a document that is not a document that is not a document that is not a document that is not a document that is not a document that is not a document that is


### 7. Set up hyperparameters for training

In [None]:
# Play around with this value
max_steps = 1000

In [None]:
import time

# Get current timestamp with date, hour, minute, and seconds
# NOTE(review): the ':' separators end up in the directory name below;
# presumably fine on Colab/Linux, but not a valid path on Windows - confirm
# if portability matters.
timestamp = time.strftime("%Y-%m-%d_%H:%M:%S")

# Name the run after the step budget and start time; the same string is used
# as the checkpoint output directory.
trained_model_name = f"financialQA_{max_steps}_steps_{timestamp}"
output_dir = trained_model_name

In [None]:
training_args = TrainingArguments(

  # Learning rate
  learning_rate=1.0e-5,

  # Number of training epochs (ignored here because max_steps is set below)
  num_train_epochs=2,

  # Max steps to train for (each step is a batch of data)
  # Overrides num_train_epochs, if not -1
  max_steps=max_steps,

  # Batch size for training
  per_device_train_batch_size=1,

  # Directory to save model checkpoints
  output_dir=output_dir,

  # Other arguments
  overwrite_output_dir=False, # Do not overwrite the content of an existing output directory
  disable_tqdm=False, # Keep progress bars enabled
  eval_steps=120, # Number of update steps between two evaluations
  save_steps=120, # Number of update steps between two checkpoint saves
  warmup_steps=1, # Number of warmup steps for learning rate scheduler
  per_device_eval_batch_size=1, # Batch size for evaluation
  evaluation_strategy="steps",
  logging_strategy="steps",
  logging_steps=1,
  optim="adafactor",
  gradient_accumulation_steps = 4,
  gradient_checkpointing=False,

  # Best-checkpoint selection: reload the checkpoint with the lowest eval_loss
  # when training ends. NOTE(review): no early-stopping callback is passed in
  # the cells visible here, so this alone does not stop training early.
  load_best_model_at_end=True,
  save_total_limit=1,
  metric_for_best_model="eval_loss",
  greater_is_better=False
)



In [None]:
# Estimate the FLOPs of one optimizer step: the model's own estimate for a
# single (1, max_length) input, multiplied by the number of accumulated
# micro-batches per step.
model_flops = (
  base_model.floating_point_ops(
    {
       # Dummy tensor: only its size is meant to matter here
       # (presumably floating_point_ops counts input tokens - TODO confirm).
       "input_ids": torch.zeros(
           (1, training_config["model"]["max_length"])
      )
    }
  )
  * training_args.gradient_accumulation_steps
)

print(base_model)
# Both figures are divided by 1e9 for display.
print("Memory footprint", base_model.get_memory_footprint() / 1e9, "GB")
print("Flops", model_flops / 1e9, "GFLOPs")

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
          (a

In [None]:
trainer = Trainer(
    model=base_model,
    model_flops=model_flops,
    total_steps=max_steps,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

max_steps is given, it will override any value given in num_train_epochs


### 8. Begin finetuning

In [None]:
training_output = trainer.train()

Step,Training Loss,Validation Loss,Time,Unnamed: 4
120,2.68584,3.008493,104.781966,18440078492689.254
240,2.824582,2.91853,92.14376,18109826906666.527
360,2.541118,2.883304,78.210332,17967285978829.293
480,2.349249,2.898516,63.853095,17880844548122.492
600,2.155623,2.892917,49.187715,17855416239454.81
720,2.057369,2.917614,34.671841,17731593222444.09
840,1.950076,2.920505,19.83321,17713060384197.523
960,1.855202,2.930685,4.963152,17695751761688.77


DEBUG:__main__:Step (1) Logs: {'loss': 2.2709, 'grad_norm': 46.51436233520508, 'learning_rate': 1e-05, 'epoch': 0.0, 'iter_time': 0.0, 'flops': 0.0, 'remaining_time': 0.0}
DEBUG:__main__:Step (2) Logs: {'loss': 2.5014, 'grad_norm': 49.03346252441406, 'learning_rate': 9.989989989989992e-06, 'epoch': 0.01, 'iter_time': 0.11273312568664551, 'flops': 19476687078251.582, 'remaining_time': 112.50765943527222}
DEBUG:__main__:Step (3) Logs: {'loss': 1.9007, 'grad_norm': 62.61591339111328, 'learning_rate': 9.979979979979981e-06, 'epoch': 0.01, 'iter_time': 0.11202871799468994, 'flops': 19599151464502.812, 'remaining_time': 111.69263184070587}
DEBUG:__main__:Step (4) Logs: {'loss': 2.494, 'grad_norm': 53.349647521972656, 'learning_rate': 9.96996996996997e-06, 'epoch': 0.02, 'iter_time': 0.11738801002502441, 'flops': 18704361815861.215, 'remaining_time': 116.91845798492432}
DEBUG:__main__:Step (5) Logs: {'loss': 2.1122, 'grad_norm': 75.27790832519531, 'learning_rate': 9.95995995995996e-06, 'epoch

### 9. Save finetuned model locally

In [None]:
save_dir = f'{output_dir}/final'

trainer.save_model(save_dir)
print("Saved model to:", save_dir)

Saved model to: financialQA_1000_steps_2024-05-24_13:51:30/final


In [None]:
finetuned_slightly_model = AutoModelForCausalLM.from_pretrained(save_dir, local_files_only=True)

In [None]:
finetuned_slightly_model.to(device)

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 512)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-5): 6 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=512, out_features=1536, bias=True)
          (dense): Linear(in_features=512, out_features=512, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=512, out_features=2048, bias=True)
          (dense_4h_to_h): Linear(in_features=2048, out_features=512, bias=True)
          (a

### 10. Run the slightly finetuned model!

In [None]:
test_question = test_dataset[0]['question']
print()
print()
print("Question input (test):")
print(test_question)
print()
print()

# Predicted answer
print("Finetuned slightly model's answer: ")
print(inference(test_question, finetuned_slightly_model, tokenizer))



Question input (test):
How are non-GAAP financial measures justified for aiding investors according to the document?


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.




Finetuned slightly model's answer: 
Non-GAAP financial measures are generally based on the document's text and text. They are not based on the document's text and text and text and text and text are not included in the document's text. Non-GAAP financial measures are generally based on the document's text and text and text and text and text and text are not included in the document's text. Non-GAAP financial measures are


In [None]:
# Actual answer
test_answer = test_dataset[0]['answer']
print("Target answer output (test):", test_answer)

Target answer output (test): Non-GAAP financial measures are justified as they provide additional insight into operational performance and help clarify trends affecting the business, aiding investors.
