# Fine-Tuning an LLM for Responsible AI for Evaluation
This code accompanies a report published by the IBM Center for the Business of Government. (Link to be added once the report is published.)

In [None]:
# Some components derived from this article: https://mlops.community/basics-of-instruction-tuning-with-olmo-1b/

from transformers import AutoModelForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling, set_seed, TrainingArguments, Trainer
import transformers
!pip install datasets
!pip install mlflow
from datasets import load_dataset, Features, Value, Sequence
import mlflow
import torch
import os
!pip install pynvml


# Base checkpoint to fine-tune; the tokenizer and the model are loaded from the same id.
model_ckpt = "allenai/OLMo-1B-hf"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# trust_remote_code is deliberately left at its default (False): the "-hf"
# checkpoint uses the OLMo architecture built into transformers, so there is no
# need to allow code from the Hub repository to run locally. The tokenizer
# above is loaded the same way.
model = AutoModelForCausalLM.from_pretrained(
    model_ckpt,
    torch_dtype=torch.bfloat16,  # load the weights in bfloat16
    device_map="auto",
)

## Loading Data

In [None]:
# Directory where `datasets` keeps its processed cache for this project.
cache_dir = 'agency/'

# JSON file of conversations; every row is loaded into a single "train" split.
dataURL = 'Cleaned_ChatML_data.json'
agency = load_dataset(
    'json',
    data_files={'train': dataURL},
    cache_dir=cache_dir,  # previously defined but never passed on
)

## Tokenizing the Entire Dataset

In [None]:
# ChatML-style chat template (see https://huggingface.co/docs/transformers/chat_templating).
# Each message renders as "<|im_start|>{role}\n{content}<|im_end|>\n"; when
# add_generation_prompt is true, an opening "<|im_start|>assistant\n" header is
# appended. The adjacent literals below concatenate into one template string.
tokenizer.chat_template = (
    "{% if not add_generation_prompt is defined %}"
    "{% set add_generation_prompt = false %}"
    "{% endif %}"
    "{% for message in messages %}"
    "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}"
    "{% endfor %}"
    "{% if add_generation_prompt %}"
    "{{ '<|im_start|>assistant\n' }}"
    "{% endif %}"
)

def format_agency(ex, max_length=2048):
    """Render one conversation with the chat template and tokenize it.

    Uses the module-level ``tokenizer`` and its ``chat_template``.

    Args:
        ex: A dataset row whose ``conversations`` field is a list of messages,
            each with ``role`` and ``content`` keys.
        max_length: Maximum number of tokens kept per example; longer
            conversations are truncated to this length.

    Returns:
        The tokenizer output for the formatted conversation. Examples are NOT
        padded here, so their lengths vary.
    """
    chat = [
        {"role": message["role"], "content": message["content"]}
        for message in ex["conversations"]
    ]

    # Append EOS after the rendered conversation so each training example
    # ends with an explicit end-of-sequence token.
    formatted_chat = (
        tokenizer.apply_chat_template(
            chat,
            tokenize=False,
            add_generation_prompt=False,
        )
        + tokenizer.eos_token
    )

    # No padding="max_length": padding every example to the full context
    # length makes each batch pay for max_length tokens regardless of the real
    # text length. The language-modeling data collator pads each batch to its
    # own longest example instead.
    # NOTE(review): truncation can cut off the trailing EOS (and part of the
    # reply) for conversations longer than max_length.
    return tokenizer(
        formatted_chat,
        add_special_tokens=False,
        truncation=True,
        max_length=max_length,
    )

# Tokenize every row with 16 worker processes, then drop the raw
# "conversations" column from the result.
agency_tokenized = agency.map(
    function=format_agency,
    num_proc=16,
).remove_columns(column_names="conversations")


In [None]:
# Seed the random number generators before the split below.
set_seed(318125)

# mlm=False: collate for causal language modeling rather than masked-LM.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Hold out 10% of the tokenized examples for evaluation.
agency_tokenized_split = agency_tokenized["train"].train_test_split(
    train_size=0.9,
    test_size=0.1,
)

In [None]:
# Root directory for MLflow tracking data for this experiment.
mlflow_tracking_path = 'mlflow_results/olmo_agency_instruct'

# Checkpoint and log directories live under the tracking root. They are
# derived from it so the three paths cannot drift apart (OUTPUT_DIR was
# previously spelled "olmo_agencyinstruct", which did not match the directory
# that was actually created).
OUTPUT_DIR = f"{mlflow_tracking_path}/output"
LOG_DIR = f"{mlflow_tracking_path}/logs"

# Create the directories up front; exist_ok makes the cell safe to re-run.
for results_dir in (mlflow_tracking_path, OUTPUT_DIR, LOG_DIR):
    os.makedirs(results_dir, exist_ok=True)

In [None]:
training_args = TrainingArguments(
    # Output locations.
    output_dir=OUTPUT_DIR,
    logging_dir=LOG_DIR,
    # Batching and precision.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,
    auto_find_batch_size=True,
    gradient_checkpointing=False,
    bf16=True,
    # Optimization schedule.
    num_train_epochs=3,
    learning_rate=8.5e-6,
    lr_scheduler_type="linear",
    weight_decay=0.01,
    # Logging, evaluation and checkpoint cadence.
    logging_steps=5,
    eval_strategy="epoch",
    save_steps=10000,
)

In [None]:
# Wire the model, hyperparameters, train/test splits and collator into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    eval_dataset=agency_tokenized_split["test"],
    train_dataset=agency_tokenized_split["train"],
)

In [None]:
# Train the model, tracking the run with MLflow.

# Disable the WandB integration.
# NOTE(review): the Trainer is already constructed when these two lines run;
# confirm they still take effect at this point, or set `report_to` explicitly
# in TrainingArguments instead.
os.environ["WANDB_DISABLED"] = "true"
transformers.integrations.is_wandb_available = lambda: False

# Store tracking data under the local results directory.
mlflow.set_tracking_uri(mlflow_tracking_path)
mlflow.set_experiment("agency_dataset_training")

with mlflow.start_run(log_system_metrics=True):
    # Record the full set of training arguments with the run.
    mlflow.log_params(training_args.to_dict())
    # Evaluate once before any training step, then fine-tune.
    trainer.evaluate()
    trainer.train()

In [None]:
# Save the fine-tuned model, and the tokenizer next to it. The Trainer was
# built without a tokenizer, so save_model() alone would leave the directory
# without the tokenizer files and the custom chat template set earlier.
trainer.save_model('models/olmo_instruct_agency/')
tokenizer.save_pretrained('models/olmo_instruct_agency/')

## Generating Text Using the Trained Model

In [None]:
# Reload the fine-tuned weights from the directory written by trainer.save_model().
# trust_remote_code is not needed for a checkpoint this notebook saved locally,
# so it is left at its default (False).
fine_tuned_model = AutoModelForCausalLM.from_pretrained(
    'models/olmo_instruct_agency/',
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

In [None]:
def generate(prompt, max_new_tokens=1000, chat=True):
    """Generate a grant-review response from the fine-tuned model.

    The prompt is sent as the user message of a two-message conversation
    (fixed system prompt + user), rendered with the tokenizer's chat template.
    Uses the module-level ``tokenizer`` and ``fine_tuned_model``.

    Args:
        prompt: Application text to be scored.
        max_new_tokens: Upper bound on the number of newly generated tokens.
        chat: Currently unused; accepted so that existing calls such as
            ``generate(text, chat=True)`` keep working.

    Returns:
        The first generated sequence decoded to text, with special tokens kept.
    """
    messages = [
        {
            "role": "system",
            "content": """You are a grant review panelist. Score the following application on a scale of 0-35. 0 being the lowest and 35 being the highest. Provide a brief explanation of your score based on the following criteria: 
                            1) The work deepens its impact by welcoming people of all abilities and backgrounds.
                            2) Strong ideas expressed with clarity advance artistic goals [truncated]

                        """
        },
        {"role": "user", "content": prompt},
    ]

    # add_generation_prompt=True ends the prompt with the assistant header
    # ("<|im_start|>assistant\n"), so generation starts directly inside the
    # assistant turn instead of relying on the model to open it itself.
    formatted_prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    # add_special_tokens=False mirrors how the training examples were tokenized.
    model_inputs = tokenizer(
        formatted_prompt,
        return_tensors="pt",
        add_special_tokens=False,
    ).to(fine_tuned_model.device)

    # Pass the attention mask explicitly rather than letting generate() infer it.
    gen_tokens = fine_tuned_model.generate(
        input_ids=model_inputs["input_ids"],
        attention_mask=model_inputs["attention_mask"],
        max_new_tokens=max_new_tokens,
        eos_token_id=tokenizer.eos_token_id,
        repetition_penalty=1.1,
    )
    return tokenizer.batch_decode(gen_tokens, skip_special_tokens=False)[0]

In [None]:
# Example application text, sent to the model as the user message.
promptText = """Example Arts Organization’s mission is to engage with, entertain, and serve the community by examining current social issues through the lens of classic text. Our vision is [truncated]  """

# print() (rather than a bare expression) so newlines in the output render as line breaks.
generated_review = generate(promptText, chat=True)
print(generated_review)

### Output from Fine-Tuned LLM

<|im_start|>assistant
Avg_Reviewer_Score: 28; Reviewer_Notes: This is a worthy project that has been going on for many years [truncated] <|im_end|>

#### Actual reviewer score and notes

Score 29: This project has been around since 2010 and it's still going strong! - I like the idea [truncated]