# Environment setup - Vertex AI Colab

https://console.cloud.google.com/vertex-ai/colab/notebooks?project=capstone-engie4800

connect to an existing runtime



Uncomment to install libraries

In [1]:
#!python -m pip install transformers accelerate bitsandbytes
#!python -m pip install peft
#!python -m pip install datasets
#!python -m pip install sentencepiece scipy

## Check environment

In [2]:
import torch
print(torch.__version__)

2.1.0+cu118


In [3]:
!nvidia-smi --query-gpu=timestamp,memory.total,memory.used,memory.free --format=csv

timestamp, memory.total [MiB], memory.used [MiB], memory.free [MiB]
2023/12/04 02:46:26.593, 23034 MiB, 4 MiB, 22486 MiB


## Import necessary packages

In [4]:
import json
import os
import gc

import sys
from datasets import load_dataset
from transformers import LlamaForCausalLM, LlamaTokenizer, LlamaConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

In [5]:
import locale
def getpreferredencoding(do_setlocale = True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding

## Login into Google cloud storage to get models

In [6]:
# Cloud project id.
PROJECT_ID = "capstone-engie4800"  # @param {type:"string"}
REGION = "us-central1"  # @param {type:"string"}
# Cloud Storage bucket for storing experiments output.
BUCKET_URI = "gs://vertex-xt72os9"  # @param {type:"string"}


## Copy models from Cloud Storage
The base model used was llama2-7b-chat-hf

uncomment login to access storage

In [None]:
#!gcloud auth login
#!gcloud config set project $PROJECT_ID

In [7]:
base_model_name = "llama2-7b-chat-hf"  # @param ["llama2-7b-hf", "llama2-7b-chat-hf", "llama2-13b-hf", "llama2-13b-chat-hf", "llama2-70b-hf", "llama2-70b-chat-hf"]

BPO_MODEL_PATH = os.path.join(BUCKET_URI,
                              "peft",
                              "BPO_model",
                              "BPO_models")

In [8]:
local_model_folder = "/content/"

In [9]:
#!gsutil -m cp -R $BPO_MODEL_PATH/llama2-7b-chat-hf $local_model_folder

# LOADING MODEL
In Colab Enterprise using L4 GPU with 24G of RAM

In Colab Free using T4 with 14G of RAM

In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
precision_loading_mode = "float16"

## Loading Base model

In [11]:
base_model_name = "llama2-7b-chat-hf"
model_path = os.path.join(local_model_folder,
                          base_model_name)
base_model = LlamaForCausalLM.from_pretrained(model_path,
                                              load_in_8bit=True,
                                              device_map="auto",
                                              torch_dtype=torch.float16,
                                              use_cache=None)
tokenizer = LlamaTokenizer.from_pretrained(model_path,
                                           device_map="auto",
                                           torch_dtype=torch.float16)
tokenizer.pad_token_id = tokenizer.eos_token_id

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [12]:
prompt_template = "[INST] You are an expert prompt engineer. Please help me improve this prompt to get a more helpful response:\n{} [/INST]"
text = 'What is the best company stock to invest my savings?'
prompt = prompt_template.format(text)
model_inputs = tokenizer(prompt, return_tensors="pt").to(device)

In [13]:
output = base_model.generate(**model_inputs, max_new_tokens=100, do_sample=True, top_p=0.9, temperature=0.05, num_beams=1)
resp = tokenizer.decode(output[0], skip_special_tokens=True).split('[/INST]')[1].strip()

print(resp)

Great, I'd be happy to help you improve your prompt! Here are some suggestions to make it more specific and helpful:

1. Provide more context: Can you tell me a bit more about your investment goals and preferences? For example, are you looking for long-term growth or short-term income? Are you comfortable with a higher level of risk, or do you want to play it safe?
2. Define "best": What criteria do you use


# Get training dataset

In [14]:
!git clone https://github.com/thu-coai/BPO/

Cloning into 'BPO'...
remote: Enumerating objects: 123, done.[K
remote: Counting objects: 100% (25/25), done.[K
remote: Compressing objects: 100% (21/21), done.[K
remote: Total 123 (delta 7), reused 12 (delta 4), pack-reused 98[K
Receiving objects: 100% (123/123), 29.32 MiB | 21.88 MiB/s, done.
Resolving deltas: 100% (39/39), done.


In [15]:
dataset = load_dataset('json', data_files='/content/BPO/data/alpaca_reproduced/data_52k.json')
def tokenize_add_label(sample):
  prompt_template = "[INST] You are an expert prompt engineer. Please help me improve this prompt to get a more helpful and harmless response:\n{} [/INST]"
  prompt = [tokenizer.bos_token + prompt_template.format(examplex) for examplex in sample['instruction']]
  prompt = tokenizer.encode(prompt, add_special_tokens=False)
  summary = tokenizer.encode(sample["optimized_prompt"] +  tokenizer.eos_token, add_special_tokens=False)
  sample = {
      "input_ids": prompt + summary,
      "attention_mask" : [1] * (len(prompt) + len(summary)),
      "labels": [-100] * len(prompt) + summary,
      }
  return sample

tokenized_dataset = dataset.map(tokenize_add_label, remove_columns=list(dataset['train'].features))


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/51968 [00:00<?, ? examples/s]

# FROM EXAMPLE NOTEBOOK
https://github.com/facebookresearch/llama-recipes/blob/main/examples/quickstart.ipynb

In [16]:
base_model.train()

def create_peft_config(model):
    from peft import (
        get_peft_model,
        LoraConfig,
        TaskType,
        prepare_model_for_int8_training,
    )

    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=8,
        lora_alpha=32,
        lora_dropout=0.05,
        target_modules = ["q_proj", "v_proj"]
    )

    # prepare int-8 model for training
    model = prepare_model_for_int8_training(model)
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()
    return model, peft_config

# create peft config overwrite the old model to save memory
base_model, lora_config = create_peft_config(base_model)

trainable params: 4,194,304 || all params: 6,742,609,920 || trainable%: 0.06220594176090199




In [18]:
from transformers import Trainer, TrainingArguments
from transformers.data import DataCollatorForSeq2Seq

In [19]:
output_dir = "/content/peft_output"

config = {
    'lora_config': lora_config,
    'learning_rate': 1e-4,
    'num_train_epochs': 1,
    'gradient_accumulation_steps': 2,
    'per_device_train_batch_size': 2,
    'gradient_checkpointing': False,
}

# Define training args
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    bf16=True,  # Use BF16 if available
    # logging strategies
    logging_dir=f"{output_dir}/logs",
    logging_strategy="steps",
    logging_steps=10,
    save_strategy="no",
    optim="adamw_torch_fused",
    max_steps=-1,
    **{k:v for k,v in config.items() if k != 'lora_config'}
)

In [None]:
# Create Trainer instance
trainer = Trainer(
    model=base_model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    data_collator=DataCollatorForSeq2Seq(tokenizer)#default_data_collator
)

# Start training
trainer.train()



Step,Training Loss
10,3.1205
20,2.5035
30,2.2915
40,2.2127
50,2.1388
60,1.9869
70,2.0104
80,1.8779
90,1.9563
100,1.9973


Buffered data was truncated after reaching the output size limit.

### Saving models after training

In [None]:
base_model.save_pretrained("/content/BPO_model/peft_model")

## Stop execution

In [None]:
assert False