<a href="https://colab.research.google.com/github/dhnanjay/SEC/blob/main/finetuning_pythia_410m.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### 0. Installation

In [None]:
!pip install lamini --quiet
!pip install datasets --quiet
!pip uninstall accelerate -y --quiet # Remove any preinstalled accelerate first
!pip install accelerate --quiet # Reinstall the latest accelerate (no version is pinned)
!pip install transformers[torch] --quiet
Reference: <https://twitter.com/virattt/status/1795588298330509783>

### 1. Define imports

In [None]:
import datasets
import tempfile
import logging
import random
import config
import os
import yaml
import time
import torch
import transformers
import pandas as pd
import jsonlines
import json

from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
from transformers import TrainingArguments
from transformers import AutoModelForCausalLM
from llama import BasicModelRunner

from datasets import load_dataset, concatenate_datasets

import os
import lamini

### 2. Define logging helpers

In [None]:
# Module-level logger shared by the helper functions and the Trainer subclass below.
logger = logging.getLogger(__name__)
# Process-wide configuration; stays None until initialize_config_and_logging() assigns it.
global_config = None

def initialize_config_and_logging(existing_config=None):
    """Build the global configuration, set up logging from it, and return it.

    Args:
        existing_config: Optional dict or config object that is forwarded
            unchanged to ``build_config``.

    Returns:
        The configuration object that is now stored in ``global_config``.
    """
    global global_config

    # The config is published before logging is configured, so it stays set
    # even if setup_logging() raises.
    global_config = build_config(existing_config)
    setup_logging(global_config)

    config_dump = yaml.dump(global_config.as_dict())
    logger.debug("Config: " + str(config_dump))

    return global_config

def get_config():
    """Return the configuration stored by ``initialize_config_and_logging``.

    Raises:
        AssertionError: If ``global_config`` is still ``None``.
    """
    current_config = global_config
    assert current_config is not None
    return current_config

def build_config(existing_config=None):
    """Assemble a ``config.ConfigurationSet`` from every available source.

    Sources are added in this order:
      1. environment variables with the ``LLAMA`` prefix,
      2. ``existing_config`` when it is truthy (a dict is wrapped with
         ``config.config_from_dict``; anything else is used as-is),
      3. the YAML files reported by ``get_config_paths()``, in reverse order.

    Args:
        existing_config: Optional dict or config object to include.

    Returns:
        The combined ``config.ConfigurationSet``.
    """
    env_layer = config.config_from_env(
        prefix="LLAMA", separator="_", lowercase_keys=True
    )
    layers = [env_layer]

    if existing_config:
        override_layer = (
            config.config_from_dict(existing_config)
            if isinstance(existing_config, dict)
            else existing_config
        )
        layers.append(override_layer)

    for yaml_path in get_config_paths()[::-1]:
        print("Loading builtin config from " + yaml_path)
        layers.append(config.config_from_yaml(yaml_path, read_from_file=True))

    return config.ConfigurationSet(*layers)

def get_config_paths():
    """Return the llama config YAML files that exist, in lookup order.

    The candidates are checked in this order and kept only when the file
    exists:
      1. ``configs/llama_config.yaml`` (relative to the working directory),
      2. ``configs/llama_config_local.yaml`` (relative to the working directory),
      3. ``~/.llama_config.yaml``.

    Returns:
        list[str]: The existing config paths; empty when none is found.
    """
    config_name = "llama_config"
    config_base = "configs"

    candidates = [
        os.path.join(config_base, config_name + ".yaml"),
        os.path.join(config_base, config_name + "_local.yaml"),
        os.path.join(os.path.expanduser("~"), "." + config_name + ".yaml"),
    ]

    return [path for path in candidates if os.path.exists(path)]

def setup_logging(arguments):
    """Configure root logging verbosity and quiet a few chatty libraries.

    Args:
        arguments: Mapping indexed as ``arguments["verbose"]`` (DEBUG when
            truthy) and, only when that is falsy,
            ``arguments["verbose_info"]`` (INFO when truthy). Otherwise the
            level is WARNING.
    """
    if arguments["verbose"]:
        level = logging.DEBUG
    elif arguments["verbose_info"]:
        level = logging.INFO
    else:
        level = logging.WARNING

    logging.basicConfig(
        level=level,
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    )
    # basicConfig() does nothing when the root logger already has handlers,
    # so the root level is also set explicitly.
    logging.getLogger().setLevel(level)

    # Keep third-party loggers at WARNING regardless of the chosen verbosity.
    for noisy_name in ("urllib3", "filelock", "smart_open", "botocore"):
        logging.getLogger(noisy_name).setLevel(logging.WARNING)

### 3. Implement tokenizer

In [None]:
# Get function for tokenization, based on config parameters
def get_tokenize_function(tokenizer, _max_length):
    """Build a mapping function that tokenizes one question/answer pair.

    The returned function is meant for ``Dataset.map(..., batched=True,
    batch_size=1)``: it reads only index 0 of ``examples["question"]`` and
    ``examples["answer"]``, so any further rows in a batch are ignored.

    Args:
        tokenizer: Callable tokenizer; invoked with ``return_tensors="np"``.
        _max_length: Upper bound on the number of tokens kept per example.

    Returns:
        A function ``tokenize_function(examples)`` that returns the tokenizer
        output with an added ``"labels"`` entry referring to the same array
        as ``"input_ids"``.
    """
    def tokenize_function(examples):
        # Serialize the pair as a one-element JSON list; this string is the
        # full training text.
        json_string = json.dumps([{
            "question": examples["question"][0],
            "answer": examples["answer"][0],
            # "context": examples["context"][0]
        }])

        # First pass without truncation, only to measure the token count.
        tokenized_inputs = tokenizer(
            json_string,
            return_tensors="np",
            padding=True,
        )
        full_length = tokenized_inputs["input_ids"].shape[1]

        # Never pad beyond the real length, never exceed the configured cap.
        max_length = min(full_length, _max_length)

        if full_length > max_length:
            # logger.warn is a deprecated alias; use logger.warning.
            logger.warning(
                "Truncating input from %d to %d", full_length, max_length
            )

        # Second pass produces the (possibly truncated) arrays that are kept.
        tokenized_inputs = tokenizer(
            json_string,
            return_tensors="np",
            truncation=True,
            padding=True,
            max_length=max_length,
        )

        # Causal-LM finetuning: the labels are the input ids themselves.
        tokenized_inputs["labels"] = tokenized_inputs["input_ids"]

        return tokenized_inputs

    return tokenize_function

### 4. Implement Trainer class

In [None]:
# Trainer class to include logging and history
class Trainer(transformers.Trainer):
    """``transformers.Trainer`` with timing fields and a smoothed loss history.

    Compared with the base class, every log record gets ``iter_time``,
    ``flops`` and ``remaining_time`` entries, and the ``loss`` values stored
    in ``state.log_history`` are exponentially smoothed. Records containing
    ``eval_loss`` are not appended to the history.

    Extra constructor arguments:
        model_flops: FLOPs attributed to one step; divided by the measured
            time per step to report ``logs["flops"]``.
        total_steps: Planned number of steps, used to estimate
            ``logs["remaining_time"]``.

    All other arguments are forwarded to ``transformers.Trainer``.
    """

    def __init__(
        self,
        model,
        model_flops,
        total_steps,
        args=None,
        data_collator=None,
        train_dataset=None,
        eval_dataset=None,
        tokenizer=None,
        model_init=None,
        compute_metrics=None,
        callbacks=None,
        optimizers=(None, None),
    ):
        # Forward by keyword instead of by position so that a changed
        # parameter order in the installed transformers release cannot
        # silently bind values to the wrong parameter.
        base_kwargs = {
            "model": model,
            "args": args,
            "data_collator": data_collator,
            "train_dataset": train_dataset,
            "eval_dataset": eval_dataset,
            "model_init": model_init,
            "compute_metrics": compute_metrics,
            "callbacks": callbacks,
            "optimizers": optimizers,
        }
        # Only forward a tokenizer when one was supplied; otherwise the base
        # class default applies.
        if tokenizer is not None:
            base_kwargs["tokenizer"] = tokenizer
        super().__init__(**base_kwargs)

        self.total_steps = total_steps
        self.model_flops = model_flops
        self.start_step = 0
        # Fallback start of the timing window, so update_log_timing() works
        # even if the first log call sees a non-empty history.
        self.start_time = time.time()

    def training_step(self, model, inputs, num_items_in_batch=None):
        """Run forward and backward on one batch and return the scaled loss.

        Batches whose ``input_ids`` tensor has no elements are printed and
        skipped with a zero loss.

        Args:
            model: The model to train.
            inputs: Batch dict; must contain ``"input_ids"``.
            num_items_in_batch: Accepted so that callers passing this extra
                argument do not fail; it is not used here.

        Returns:
            The detached loss divided by ``args.gradient_accumulation_steps``.
        """
        if inputs["input_ids"].numel() == 0:
            print("Inputs: ", inputs)
            print("Inputs - input_ids", inputs["input_ids"])
            print("numel", inputs["input_ids"].numel())

            # Float zero on the training device, so it can be accumulated
            # together with real loss tensors.
            return torch.tensor(0.0, device=self.args.device)

        model.train()
        inputs = self._prepare_inputs(inputs)

        with self.compute_loss_context_manager():
            loss = self.compute_loss(model, inputs)

        if self.args.n_gpu > 1:
            loss = loss.mean()  # mean() to average on multi-gpu parallel training

        self.accelerator.backward(loss)

        return loss.detach() / self.args.gradient_accumulation_steps

    def log(self, logs, start_time=None):
        """
        Log `logs` on the various objects watching training.
        Subclass and override this method to inject custom behavior.
        Args:
            logs (`Dict[str, float]`):
                The values to log. Mutated in place: epoch and timing
                fields are added.
            start_time:
                Accepted so that callers passing this extra argument do not
                fail; it is not used here.
        """
        if self.state.epoch is not None:
            logs["epoch"] = round(self.state.epoch, 2)

        self.update_log_timing(logs)

        output = {**logs, "step": self.state.global_step}
        self.update_history(output)

        logger.debug("Step (%s) Logs: %s", self.state.global_step, logs)
        self.control = self.callback_handler.on_log(
            self.args, self.state, self.control, logs
        )

    def update_log_timing(self, logs):
        """Add ``iter_time``, ``flops`` and ``remaining_time`` to ``logs``.

        The first call with an empty history starts the timing window and
        reports zeros. Later calls report averages over all steps since the
        window started.
        """
        if not self.state.log_history:
            self.start_time = time.time()
            logs["iter_time"] = 0.0
            logs["flops"] = 0.0
            logs["remaining_time"] = 0.0
            self.start_step = self.state.global_step
        elif self.state.global_step > self.start_step:
            steps_done = self.state.global_step - self.start_step
            iter_time = (time.time() - self.start_time) / steps_done
            logs["iter_time"] = iter_time
            # Guard against a zero elapsed time (coarse clock resolution).
            logs["flops"] = self.model_flops / iter_time if iter_time > 0 else 0.0
            logs["remaining_time"] = (
                self.total_steps - self.state.global_step
            ) * iter_time

    def update_history(self, output):
        """Append ``output`` to ``state.log_history`` with a smoothed loss.

        Records containing ``eval_loss`` are skipped. When both ``output``
        and the previous record have a ``loss``, the new value is blended as
        ``loss * p + previous * (1 - p)`` with ``p = 1 / 100``.
        """
        if "eval_loss" in output:
            return

        history = self.state.log_history
        if history and "loss" in output:
            # The previous record may lack "loss" (e.g. a summary record);
            # in that case the raw loss is stored unsmoothed.
            previous_loss = history[-1].get("loss")
            if previous_loss is not None:
                smoothing_window = 100
                p = 1.0 / smoothing_window
                output["loss"] = output["loss"] * p + previous_loss * (1.0 - p)

        history.append(output)


def sample_history(history):
    """Thin ``history`` down to at most 100 evenly spaced entries.

    The stride is ``ceil(len(history) / 100)`` and sampling starts at the
    first element. Falsy input (``None``, empty sequence) is returned as-is.
    """
    if history:
        full_chunks, remainder = divmod(len(history), 100)
        stride = full_chunks + (1 if remainder else 0)
        return history[::stride]
    return history

# Copy file
def smart_copy(remote_path, local_path):
    """Copy the file at ``local_path`` to ``remote_path``.

    Note the argument order: the destination comes first.

    Args:
        remote_path: Destination path; created or overwritten.
        local_path: Source path; must exist.

    Raises:
        OSError: If the source cannot be opened for reading (for example
            ``FileNotFoundError``) or the destination cannot be written.
    """
    import shutil  # local import so this helper needs no file-level change

    # Open the source first: opening the destination with "wb" truncates it,
    # so a missing source must fail before the destination is touched.
    with open(local_path, "rb") as local_file, open(remote_path, "wb") as remote_file:
        # Stream in chunks instead of loading the whole file into memory.
        shutil.copyfileobj(local_file, remote_file)

### 5. Set up training config

In [None]:
# Identifier of the base model; passed to AutoTokenizer / AutoModelForCausalLM.from_pretrained below.
model_name = "EleutherAI/pythia-410m-deduped"

# Identifier of the question/answer dataset; passed to datasets.load_dataset below.
dataset_path = "virattt/llama-3-8b-financialQA"

In [None]:
# Settings for this run. Only ["model"]["max_length"] is read by the cells
# below (tokenization cap and FLOP estimate).
# NOTE(review): "datasets" and "verbose" are not read anywhere in the visible
# cells; "verbose" matches the key setup_logging() looks up — confirm whether
# this dict is meant to be passed to initialize_config_and_logging().
training_config = {
    "model": {
        "pretrained_name": model_name,
        "max_length" : 2048
    },
    "datasets": {
        "use_hf": True,
        "path": dataset_path
    },
    "verbose": True
}

### 6. Load and tokenize dataset

In [None]:
# Load the tokenizer that belongs to the base model.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse EOS as the padding token; tokenize_function calls the tokenizer with padding=True.
tokenizer.pad_token = tokenizer.eos_token

# Load the dataset
# NOTE(review): this seeds Python's `random` module, but nothing in the visible
# cells draws from it — confirm it is still needed.
random.seed(42)
finetuning_dataset_loaded = datasets.load_dataset(dataset_path, split="train")

# Tokenize the dataset
max_length = training_config["model"]["max_length"]
# batch_size must stay 1: tokenize_function only reads index 0 of each batch.
tokenized_dataset = finetuning_dataset_loaded.map(
    get_tokenize_function(tokenizer, max_length), # returns tokenize_function
    batched=True,
    batch_size=1,
    drop_last_batch=True
)
# Switch the dataset's output format to "torch" for the Trainer and inference cells.
tokenized_dataset = tokenized_dataset.with_format("torch")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading data:   0%|          | 0.00/506k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

### 7. Split tokenized dataset into train + test

In [None]:
# Split the dataset into train / test (80% / 20%), shuffled with a fixed seed
# so the split is reproducible.
split_dataset = tokenized_dataset.train_test_split(test_size=0.2, shuffle=True, seed=42)

train_dataset = split_dataset['train']
test_dataset = split_dataset['test']

# Print both splits to check their columns and row counts.
print(train_dataset)
print(test_dataset)

Dataset({
    features: ['question', 'answer', 'context', 'ticker', 'filing', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1600
})
Dataset({
    features: ['question', 'answer', 'context', 'ticker', 'filing', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 400
})


### 8. Load base model

In [None]:
# Load the pretrained base model that will be finetuned.
model = AutoModelForCausalLM.from_pretrained(model_name)

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/911M [00:00<?, ?B/s]

In [None]:
# Use a CUDA device when at least one is visible; otherwise fall back to the CPU.
device_count = torch.cuda.device_count()
if device_count <= 0:
    logger.debug("Select CPU device")
    device = torch.device("cpu")
else:
    logger.debug("Select GPU device")
    device = torch.device("cuda")

In [None]:
# Move the model to the device selected above.
model.to(device)

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 1024)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=1024, out_features=3072, bias=True)
          (dense): Linear(in_features=1024, out_features=1024, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=1024, out_features=4096, bias=True)
          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
  

### 9. Define inference function

In [None]:
def inference(text, model, tokenizer, max_input_tokens=1024, max_output_tokens=100):
    """Generate a continuation of ``text`` and return only the new text.

    Args:
        text: Prompt string.
        model: Causal language model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``.
        max_input_tokens: The prompt is truncated to this many tokens.
        max_output_tokens: Maximum number of newly generated tokens (the
            prompt does not count against this budget).

    Returns:
        str: The decoded continuation without the prompt.
    """
    # Tokenize through __call__ so the attention mask is returned with the ids.
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_tokens,
    )

    device = model.device
    input_ids = encoded["input_ids"].to(device)
    attention_mask = encoded["attention_mask"].to(device)

    # generate() needs a pad token id; fall back to EOS when none is set.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    # Generate
    generated_tokens_with_prompt = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_output_tokens,
        pad_token_id=pad_token_id,
    )

    # Strip the prompt by token count rather than by character count: the
    # decoded prompt is not guaranteed to match ``text`` character for
    # character (for example when the prompt was truncated).
    # Assumes the output starts with the prompt tokens, as for causal LMs.
    prompt_length = input_ids.shape[1]
    answer_tokens = generated_tokens_with_prompt[:, prompt_length:]

    # Decode
    generated_text_answer = tokenizer.batch_decode(
        answer_tokens, skip_special_tokens=True
    )[0]

    return generated_text_answer

### 10. Run the base model (before finetuning)

In [None]:
# Baseline: ask the not-yet-finetuned model the first test question and show
# the reference answer next to the model output.
test_text = test_dataset[0]['question']
print("Question input (test):", test_text)
print(f"Correct answer: {test_dataset[0]['answer']}")
print("Model's answer: ")
print(inference(test_text, model, tokenizer))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Question input (test): What accounting principles are followed for the assessment and recording of critical accounting estimates?
Correct answer: The preparation and recording of critical accounting estimates in financial statements are conducted in accordance with accounting principles generally accepted in the United States of America (GAAP), using management's estimates and assumptions based on historical experience and other factors deemed reasonable under the circumstances.
Model's answer: 


The following are the principles that are followed for the assessment and recording of critical accounting estimates:

The critical accounting estimate is the sum of all estimates of the critical accounting period.

The critical accounting estimate is the sum of all estimates of the critical accounting period.

The critical accounting estimate is the sum of all estimates of the critical accounting period.

The critical accounting estimate is the sum of all


### 11. Set up hyperparameters for training

In [None]:
# Number of training steps; used for TrainingArguments(max_steps=...), the
# Trainer's remaining-time estimate, and the output directory name.
# Play around with this value
max_steps = 1000

In [None]:
import time

# Get current timestamp with date, hour, minute, and seconds.
# "-" separates the time fields because ":" is not allowed in file names on
# Windows and is awkward in shell paths.
timestamp = time.strftime("%Y-%m-%d_%H-%M-%S")

# Name of the finetuned model; also used as the checkpoint output directory.
trained_model_name = f"financialQA_{max_steps}_steps_{timestamp}"
output_dir = trained_model_name

In [None]:
training_args = TrainingArguments(

  # Learning rate
  learning_rate=1.0e-5,

  # Number of training epochs (not effective here: max_steps below is set)
  num_train_epochs=2,

  # Max steps to train for (each step is a batch of data)
  # Overrides num_train_epochs, if not -1
  max_steps=max_steps,

  # Batch size for training
  per_device_train_batch_size=1,

  # Directory to save model checkpoints
  output_dir=output_dir,

  # Other arguments
  overwrite_output_dir=False, # Do not overwrite the content of the output directory
  disable_tqdm=False, # Keep progress bars enabled
  eval_steps=120, # Number of update steps between two evaluations
  save_steps=120, # Number of steps between two checkpoint saves (same interval as eval_steps)
  warmup_steps=1, # Number of warmup steps for learning rate scheduler
  per_device_eval_batch_size=1, # Batch size for evaluation
  evaluation_strategy="steps", # Evaluate every eval_steps steps
  logging_strategy="steps", # Log every logging_steps steps
  logging_steps=1,
  optim="adafactor", # Optimizer name
  gradient_accumulation_steps = 4, # Batches accumulated per optimizer update
  gradient_checkpointing=False,

  # Best-checkpoint selection: reload the checkpoint with the lowest eval_loss
  # when training ends.
  # NOTE(review): no early-stopping callback is passed to the Trainer in the
  # visible cells, so training always runs for max_steps.
  load_best_model_at_end=True,
  save_total_limit=1,
  metric_for_best_model="eval_loss",
  greater_is_better=False
)



In [None]:
# FLOP estimate handed to the custom Trainer: the model's floating_point_ops()
# for one sequence of max_length tokens, multiplied by the number of
# gradient-accumulation steps.
model_flops = (
  model.floating_point_ops(
    {
       "input_ids": torch.zeros(
           (1, training_config["model"]["max_length"])
      )
    }
  )
  * training_args.gradient_accumulation_steps
)

# Print the architecture plus memory and compute footprints (scaled by 1e9).
print(model)
print("Memory footprint", model.get_memory_footprint() / 1e9, "GB")
print("Flops", model_flops / 1e9, "GFLOPs")

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 1024)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=1024, out_features=3072, bias=True)
          (dense): Linear(in_features=1024, out_features=1024, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=1024, out_features=4096, bias=True)
          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
  

In [None]:
# Instantiate the custom Trainer subclass defined above (not transformers.Trainer
# directly); model_flops and total_steps feed its timing fields.
trainer = Trainer(
    model=model,
    model_flops=model_flops,
    total_steps=max_steps,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

max_steps is given, it will override any value given in num_train_epochs


### 12. Finetune the model

In [None]:
import time

# Wall-clock start of the training run.
start_time = time.time()

# Run the finetuning loop; the returned object is kept in training_output.
training_output = trainer.train()

elapsed_time = time.time() - start_time
print(f"Elapsed time: {elapsed_time:.2f} seconds")

Step,Training Loss,Validation Loss,Time,Unnamed: 4
120,2.182023,1.82707,418.154693,36599285556689.74
240,1.92742,1.752769,374.638856,35279927507673.836
360,1.794343,1.701699,319.157861,34873965978446.598
480,1.485202,1.704965,260.490987,34716629418237.17
600,1.301752,1.699369,201.552839,34514213597254.785
720,1.265264,1.682566,141.81043,34338140131239.96
840,1.154213,1.729428,81.053261,34330205216052.785
960,0.963753,1.733701,20.272419,34314788263190.535


Elapsed time: 506.98 seconds


### 13. Save finetuned model locally

In [None]:
# Save the model held by the trainer into a "final" subdirectory of output_dir.
save_dir = f'{output_dir}/final'

trainer.save_model(save_dir)
print("Saved model to:", save_dir)

Saved model to: financialQA_1000_steps_2024-05-28_22:34:02/final


### 14. Run the finetuned model (after finetuning)

In [None]:
# Load the model from the directory saved above; local_files_only=True keeps
# the lookup on the local file system.
finetuned_model = AutoModelForCausalLM.from_pretrained(save_dir, local_files_only=True)
# Move it to the same device as the base model.
finetuned_model.to(device)

In [None]:
# Ask the finetuned model the same test question used for the baseline run.
test_question = test_dataset[0]['question']
print()
print()
print("Question input (test):")
print(test_question)
print()
print()

print("Finetuned model's answer: ")
print(inference(test_question, finetuned_model, tokenizer))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.




Question input (test):
What accounting principles are followed for the assessment and recording of critical accounting estimates?


Finetuned model's answer: 

{"question": "What are the principles followed for the assessment and recording of critical accounting estimates?", "answer": "The principles are to record and record and record and record and record and record and record and record and record and record and record and record and record and record and record and record and record and record and record and record and record and record and record and record and record and record and record and record and record


In [None]:
# Show the reference answer for the same test example. Only the "answer"
# field is printed, not the whole dataset record with its token tensors.
test_answer = test_dataset[0]['answer']
print("Target answer output (test):", test_answer)