## LLM Finetuning for Actuarial Standards of Practice Knowledge
Uses the Mistral AI 7-billion-parameter model as a foundation model and fine-tunes it on a custom dataset created from the Actuarial Standards of Practice. Uses QLoRA for memory-efficient fine-tuning and stores the resulting model on Hugging Face.

A significant amount of hyperparameter tuning still needs to be explored. This is an initial test of the process.

In [None]:
# Full notebook should run on Google colab. Install these backages
%%capture
!pip install datasets peft accelerate transformers bitsandbytes trl evaluate

In [None]:
# Set up the Hugging Face token
import os
from google.colab import userdata

# Read the token from the Colab secret named 'HFWRITE_API_KEY'.
HUGGINGFACE_KEY = userdata.get('HFWRITE_API_KEY')
if not HUGGINGFACE_KEY:
    raise RuntimeError(
        "Colab secret 'HFWRITE_API_KEY' is empty; add a Hugging Face token "
        "to the notebook's secrets before running this cell."
    )

# Keep the original environment variable for anything that already reads it.
os.environ['HUGGINGFACE_API_KEY'] = HUGGINGFACE_KEY
# Also export it under HF_TOKEN, the variable huggingface_hub looks up by
# itself. Without it, Hub calls that are made without an explicit `token=`
# argument run unauthenticated and gated repos cannot be reached.
os.environ['HF_TOKEN'] = HUGGINGFACE_KEY

# Aliases kept so every name used elsewhere in the notebook still resolves.
HUGGINGFACE_API_KEY = HUGGINGFACE_KEY
HF_TOKEN = HUGGINGFACE_KEY
hf_token = HUGGINGFACE_KEY

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
import torch
from peft import LoraConfig, get_peft_model
from transformers import Trainer, TrainingArguments
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer, GenerationConfig


In [None]:
# Configure PyTorch's CUDA allocator through its environment variable
# (max_split_size_mb is set to 512).
import os

os.environ.update({"PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:512"})


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Base checkpoint on the Hugging Face Hub that will be fine-tuned
model_name = "mistralai/Mistral-7B-Instruct-v0.3"

# bitsandbytes settings: 4-bit NF4 weights, double quantization,
# float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# Load the causal-LM weights with the quantization config above; the token
# authenticates the download and device placement is left to "auto".
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=hf_token,
    trust_remote_code=True,
    device_map="auto",
    quantization_config=bnb_config,
)

# Turn the generation cache off (the author flags this as important for
# training with PEFT).
model.config.use_cache = False

config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [None]:
import json

# Load the dataset.
# The encoding is spelled out so the file is decoded the same way on every
# platform instead of depending on the locale default.
# Later cells read the list stored under the 'examples' key.
with open("combined_rag_dataset.json", "r", encoding="utf-8") as file:
    dataset = json.load(file)


In [None]:
# Reshape every raw example into an instruction / context / response record
def _to_qa_record(entry):
    """Map one raw example onto the three fields used for fine-tuning."""
    return {
        "instruction": entry["query"],                      # the question or instruction
        "context": " ".join(entry["reference_contexts"]),   # contexts merged into one string
        "response": entry["reference_answer"],              # the target answer
    }


data = list(map(_to_qa_record, dataset['examples']))


In [None]:
from transformers import AutoTokenizer

# Tokenizer that belongs to the same checkpoint as the model
model_name = "mistralai/Mistral-7B-Instruct-v0.3"
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    token=hf_token,  # authenticate the download
)


tokenizer_config.json:   0%|          | 0.00/141k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [None]:
def preprocess_function(examples, max_length=512):
    """Turn a batch of records into causal-LM training features.

    For a causal language model the target text has to be part of the input
    sequence, and ``labels`` has to line up with ``input_ids`` position by
    position. Each example is therefore encoded as one sequence
    ``prompt + response + EOS``. Labels are ``-100`` (ignored by the loss) on
    the prompt and on padding, and equal to ``input_ids`` on the response.

    Masking is done by position, not by token id, so a real EOS token stays
    supervised even when the pad token is the EOS token.

    Args:
        examples: Batch mapping with the keys "instruction", "context" and
            "response", each holding a list of strings of equal length.
        max_length: Fixed length of every returned sequence (default 512).

    Returns:
        dict with the keys "input_ids", "attention_mask" and "labels"; each is
        a list with one ``max_length``-long list of ints per example.

    Note:
        When prompt and response do not fit together, the prompt is cut from
        its end first (down to at most half of ``max_length``), which can drop
        the trailing "Response:" marker; only then is the response cut.
    """
    # Prompt = instruction + context, closed by an explicit answer marker
    prompts = [
        f"Instruction: {instruction}\nContext: {context}\nResponse:"
        for instruction, context in zip(examples["instruction"], examples["context"])
    ]

    # The prompt keeps the tokenizer's special tokens; the answer is encoded
    # bare so that no second set of special tokens lands mid-sequence.
    prompt_batch = tokenizer(prompts, add_special_tokens=True)["input_ids"]
    answer_batch = tokenizer(
        list(examples["response"]), add_special_tokens=False
    )["input_ids"]

    eos_id = tokenizer.eos_token_id
    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else eos_id
    pad_left = getattr(tokenizer, "padding_side", "right") == "left"

    model_inputs = {"input_ids": [], "attention_mask": [], "labels": []}
    for prompt_ids, answer_ids in zip(prompt_batch, answer_batch):
        prompt_ids = list(prompt_ids)
        answer_ids = list(answer_ids)
        if eos_id is not None:
            # Teach the model where an answer ends
            answer_ids.append(eos_id)

        # Fit both parts into max_length, trimming the prompt before the answer
        overflow = len(prompt_ids) + len(answer_ids) - max_length
        if overflow > 0:
            keep_prompt = max(
                len(prompt_ids) - overflow,
                min(len(prompt_ids), max_length // 2),
            )
            prompt_ids = prompt_ids[:keep_prompt]
            answer_ids = answer_ids[: max_length - keep_prompt]

        input_ids = prompt_ids + answer_ids
        labels = [-100] * len(prompt_ids) + answer_ids
        attention_mask = [1] * len(input_ids)

        # Pad to the fixed length on the side the tokenizer is configured for
        n_pad = max_length - len(input_ids)
        pad_ids, pad_mask, pad_labels = [pad_id] * n_pad, [0] * n_pad, [-100] * n_pad
        if pad_left:
            input_ids = pad_ids + input_ids
            attention_mask = pad_mask + attention_mask
            labels = pad_labels + labels
        else:
            input_ids = input_ids + pad_ids
            attention_mask = attention_mask + pad_mask
            labels = labels + pad_labels

        model_inputs["input_ids"].append(input_ids)
        model_inputs["attention_mask"].append(attention_mask)
        model_inputs["labels"].append(labels)

    return model_inputs


from datasets import Dataset

# Fall back to the EOS token when the tokenizer ships without a pad token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Wrap the list of record dicts in a Hugging Face Dataset
dataset = Dataset.from_list(data)

# Run preprocess_function over the dataset in batches and drop the raw text
# columns, leaving only the features the function returns.
tokenized_dataset = dataset.map(
    preprocess_function,
    remove_columns=["instruction", "context", "response"],
    batched=True,
)

print("Tokenization complete!")


Map:   0%|          | 0/10760 [00:00<?, ? examples/s]

Tokenization complete!


In [None]:
# Hold out 10% of the examples for validation.
# The seed is fixed so the same examples land in each split on every run;
# otherwise validation losses from different runs are not comparable.
train_test_split = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = train_test_split["train"]
val_dataset = train_test_split["test"]


In [None]:
from transformers import DataCollatorForSeq2Seq

# Collator that batches the tokenized features; padding is enabled and no
# model is attached to it.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    padding=True,
    model=None,
)


In [None]:
from peft import (
    LoraConfig,
    PeftModel,
    TaskType,
    get_peft_model,
    prepare_model_for_kbit_training,
)

peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,   # For causal language modeling
    inference_mode=False,           # Set to False for training
    r=8,                            # LoRA rank
    lora_alpha=16,                  # Scaling parameter
    lora_dropout=0.05,              # Dropout for LoRA layers
    target_modules=["q_proj", "v_proj"]  # Target modules to apply LoRA
)

# Guard against re-running the cell: wrapping an already wrapped model would
# stack a second adapter wrapper on top of the first.
if not isinstance(model, PeftModel):
    # PEFT's documented preparation step for training a k-bit quantized base
    # model; it has to run before the adapters are attached.
    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, peft_config)


In [None]:
# All training settings live in this cell

from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    # Output location and checkpointing
    output_dir='./results',
    overwrite_output_dir=True,
    save_steps=500,
    save_total_limit=2,
    # Optimisation
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    fp16=True,
    # Evaluation and logging cadence
    eval_strategy='steps',
    eval_steps=500,
    logging_steps=50,
    report_to='none',
    # Data loading
    dataloader_num_workers=4,
)

trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

# Kick off fine-tuning; the returned TrainOutput is displayed as the cell result
trainer.train()




Step,Training Loss,Validation Loss
500,5.9205,5.993151


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).

Cannot access gated repo for url https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3/resolve/main/config.json.
Access to model mistralai/Mistral-7B-Instruct-v0.3 is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in mistralai/Mistral-7B-Instruct-v0.3.


Step,Training Loss,Validation Loss
500,5.9205,5.993151



Cannot access gated repo for url https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3/resolve/main/config.json.
Access to model mistralai/Mistral-7B-Instruct-v0.3 is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in mistralai/Mistral-7B-Instruct-v0.3.


TrainOutput(global_step=906, training_loss=6.434939201304455, metrics={'train_runtime': 13363.1035, 'train_samples_per_second': 2.174, 'train_steps_per_second': 0.068, 'total_flos': 6.338834211054551e+17, 'train_loss': 6.434939201304455, 'epoch': 2.994630318050392})

In [None]:
!huggingface-cli login



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineGrained).
The token `hf_token_2` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `hf_toke

In [None]:
# Save the model to Hugging Face Hub.
# The first positional argument of Trainer.push_to_hub is the commit message,
# not the repository name. When no hub_model_id is set the repository is named
# after output_dir ("results"). Set the intended repository name explicitly
# and pass a real commit message.
trainer.args.hub_model_id = "asop_QLoRA_finetuned"
trainer.push_to_hub(commit_message="Upload QLoRA adapter fine-tuned on the ASOP dataset")


adapter_model.safetensors:   0%|          | 0.00/13.6M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.24k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Robert-Richardson/results/commit/f0f5e045c82e956b520d93ed444f0d2f0e85834b', commit_message='asop_QLoRA_finetuned', commit_description='', oid='f0f5e045c82e956b520d93ed444f0d2f0e85834b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Robert-Richardson/results', endpoint='https://huggingface.co', repo_type='model', repo_id='Robert-Richardson/results'), pr_revision=None, pr_num=None)