# Environment setup - Vertex AI Colab

In [None]:
!python -m pip install transformers accelerate bitsandbytes
!python -m pip install peft
!python -m pip install datasets
!python -m pip install sentencepiece scipy

Collecting accelerate
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes
  Downloading bitsandbytes-0.41.2.post2-py3-none-any.whl (92.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: bitsandbytes, accelerate
Successfully installed accelerate-0.25.0 bitsandbytes-0.41.2.post2
Collecting peft
  Downloading peft-0.6.2-py3-none-any.whl (174 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.7/174.7 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: peft
Successfully installed peft-0.6.2
Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
C

## Check environment

In [None]:
import torch

# Show which PyTorch build is installed (the recorded run printed 2.1.0+cu118).
torch_build = torch.__version__
print(torch_build)

2.1.0+cu118


In [None]:
# Report the GPU's total, used and free memory as CSV (with a timestamp) before any model is loaded.
!nvidia-smi --query-gpu=timestamp,memory.total,memory.used,memory.free --format=csv

timestamp, memory.total [MiB], memory.used [MiB], memory.free [MiB]
2023/12/04 00:46:56.370, 15360 MiB, 0 MiB, 15101 MiB


## Import necessary packages

In [None]:
import json
import os
import gc

import sys
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import LlamaForCausalLM, LlamaTokenizer, LlamaConfig

In [None]:
import locale


def getpreferredencoding(do_setlocale=True):
    """Return ``"UTF-8"`` unconditionally; ``do_setlocale`` is accepted but ignored.

    Installed below in place of ``locale.getpreferredencoding`` so that any code
    asking for the preferred encoding in this session is told UTF-8.
    """
    return "UTF-8"


# Monkeypatch the standard-library lookup for the rest of the session.
setattr(locale, "getpreferredencoding", getpreferredencoding)

## Log in to Google Cloud to get the models from Cloud Storage

In [None]:
# Cloud project id.
PROJECT_ID = "capstone-engie4800"  # @param {type:"string"}
# Cloud region; not referenced by the other cells visible in this notebook.
REGION = "us-central1"  # @param {type:"string"}
# Cloud Storage bucket for storing experiments output.
BUCKET_URI = "gs://vertex-xt72os9"  # @param {type:"string"}
# Interactive login: asks for confirmation and prints a link to open in the browser (see the cell output).
!gcloud auth login
# Set the project above as the active gcloud project.
!gcloud config set project $PROJECT_ID


You are running on a Google Compute Engine virtual machine.
It is recommended that you use service accounts for authentication.

You can run:

  $ gcloud config set account `ACCOUNT`

to switch accounts if necessary.

Your credentials may be visible to others with access to this
virtual machine. Are you sure you want to authenticate with
your personal account?

Do you want to continue (Y/n)?  Y

Go to the following link in your browser:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=32555940559.apps.googleusercontent.com&redirect_uri=https%3A%2F%2Fsdk.cloud.google.com%2Fauthcode.html&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fappengine.admin+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fsqlservice.login+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcompute+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Faccounts.reauth&state=D9LJHm8AvdQRRR1VNsVzP

## Copy models from Cloud Storage
The base model used was llama2-7b-chat-hf

In [None]:
base_model_name = "llama2-7b-chat-hf"  # @param ["llama2-7b-hf", "llama2-7b-chat-hf", "llama2-13b-hf", "llama2-13b-chat-hf", "llama2-70b-hf", "llama2-70b-chat-hf"]

# Cloud Storage folder that holds the model artefacts copied in the cells below.
# The URI is joined with "/" explicitly: a gs:// URI always uses forward slashes,
# whereas os.path.join uses the separator of the host operating system.
BPO_MODEL_PATH = "/".join([BUCKET_URI.rstrip("/"), "peft", "BPO_model", "BPO_models"])

Run the copy command below (uncomment it first if it is commented out) to copy the model from the bucket to the local `BPO_model` folder.

In [None]:
# Local destination for everything copied out of the bucket.
local_model_folder = "/content/"
# Recursive (-R), parallel (-m) copy of the BPO_model folder from the bucket into
# local_model_folder (7 files, 12.6 GiB in the recorded run).
!gsutil -m cp -R $BPO_MODEL_PATH/BPO_model $local_model_folder

Copying gs://vertex-xt72os9/peft/BPO_model/BPO_models/BPO_model/pytorch_model-00001-of-00002.bin...
/ [0/7 files][    0.0 B/ 12.6 GiB]   0% Done                                    Copying gs://vertex-xt72os9/peft/BPO_model/BPO_models/BPO_model/config.json...
/ [0/7 files][    0.0 B/ 12.6 GiB]   0% Done                                    ==> NOTE: You are downloading one or more large file(s), which would
run significantly faster if you enabled sliced object downloads. This
feature is enabled by default but requires that compiled crcmod be
installed (see "gsutil help crcmod").

Copying gs://vertex-xt72os9/peft/BPO_model/BPO_models/BPO_model/tokenizer.model...
/ [0/7 files][    0.0 B/ 12.6 GiB]   0% Done                                    Copying gs://vertex-xt72os9/peft/BPO_model/BPO_models/BPO_model/pytorch_model-00002-of-00002.bin...
/ [0/7 files][    0.0 B/ 12.6 GiB]   0% Done                                    Copying gs://vertex-xt72os9/peft/BPO_model/BPO_models/BPO_model/pytor

# LOADING MODEL
On Colab Enterprise: L4 GPU with 24 GB of RAM.

On Colab Free: T4 GPU with 14 GB of RAM.

In [None]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Label for the precision used when loading models.
precision_loading_mode = "float16"

## Load BPO paper model

In [None]:
# Load the BPO model from the folder copied out of the bucket.
bpo_model_id = 'BPO_model'
bpo_model_path = os.path.join(local_model_folder, bpo_model_id)

# 8-bit weights with automatic device placement; all model arguments are kept as they were.
bpo_model = AutoModelForCausalLM.from_pretrained(
    bpo_model_path,
    load_in_8bit=True,
    device_map="auto",
    torch_dtype=torch.float16,
    use_cache=None,
)

# device_map / torch_dtype are model-loading options and do not apply to a tokenizer,
# so they are no longer forwarded here.
tokenizer = AutoTokenizer.from_pretrained(bpo_model_path)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Instruction wrapper: the text to be improved is substituted into the {} slot.
prompt_template = "[INST] You are an expert prompt engineer. Please help me improve this prompt to get a more helpful response:\n{} [/INST]"
text = 'What is the best company stock to invest my savings?'
prompt = prompt_template.format(text)

# Tokenize into PyTorch tensors, then move them to the selected device.
model_inputs = tokenizer(prompt, return_tensors="pt")
model_inputs = model_inputs.to(device)

In [None]:
# Sampling settings for the BPO model.
generation_kwargs = dict(
    max_new_tokens=1024,
    do_sample=True,
    top_p=0.9,
    temperature=0.05,
    num_beams=1,
)
output = bpo_model.generate(**model_inputs, **generation_kwargs)

# Decode the first sequence and keep the text that follows the first [/INST] marker.
decoded = tokenizer.decode(output[0], skip_special_tokens=True)
resp = decoded.split('[/INST]')[1].strip()

print(resp)

What factors should I consider when choosing a company stock to invest my savings in?


In [None]:
# Deliberate failure: raises AssertionError so that "Run all" stops here, before the
# base-model section (same pattern as the "Stop execution" cell at the end of the notebook).
assert False

AssertionError: ignored

## Loading Base model

To load the base model in Colab Free, we first have to clear the BPO model from memory.
In Colab Enterprise, both models can be loaded at the same time.

In [None]:
# Uncomment on Colab Free to release the BPO model before loading the base model.
# NOTE(review): the original snippet deleted `base_model`, which is not defined yet at this
# point in the notebook; `bpo_model` is the object the section text says to clear.
#del bpo_model
#del tokenizer
#gc.collect()
#torch.cuda.empty_cache()

In [None]:
# Commented-out downloads of the base model and the capstone adapter from the bucket;
# uncomment to copy them into local_model_folder before running the cells below.
#!gsutil -m cp -R $BPO_MODEL_PATH/llama2-7b-chat-hf $local_model_folder
#!gsutil -m cp -R $BPO_MODEL_PATH/capstone_peft_adapter $local_model_folder

In [None]:
# Load the plain LLaMA-2 chat model from the local model folder.
base_model_name = "llama2-7b-chat-hf"
model_path = os.path.join(local_model_folder, base_model_name)

# 8-bit weights with automatic device placement; all model arguments are kept as they were.
base_model = LlamaForCausalLM.from_pretrained(
    model_path,
    load_in_8bit=True,
    device_map="auto",
    torch_dtype=torch.float16,
    use_cache=None,
)

# device_map / torch_dtype are model-loading options and do not apply to a tokenizer,
# so they are no longer forwarded here. This rebinds `tokenizer` to the base model's tokenizer.
tokenizer = LlamaTokenizer.from_pretrained(model_path)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token_id = tokenizer.eos_token_id

NameError: ignored

In [None]:
# Sampling settings for the base model (shorter generation budget than the BPO run).
generation_kwargs = dict(
    max_new_tokens=100,
    do_sample=True,
    top_p=0.9,
    temperature=0.05,
    num_beams=1,
)
output = base_model.generate(**model_inputs, **generation_kwargs)

# Decode the first sequence and keep the text that follows the first [/INST] marker.
decoded = tokenizer.decode(output[0], skip_special_tokens=True)
resp = decoded.split('[/INST]')[1].strip()

print(resp)

## Load the adapter trained for capstone

The adapter partially works. Its output is more concise than that of plain LLaMA 2, but less concise than the output of the paper's model.

In [None]:
# Adapter location, relative to the local model folder.
peft_model_id = 'BPO_models/capstone_peft_adapter'
capstone_adapter_path = os.path.join(local_model_folder, peft_model_id)

# Load the adapter from the resolved path. Previously this path was computed but unused and
# the bare relative id was passed, which only resolves when the working directory happens to
# be local_model_folder.
# NOTE(review): the commented-out gsutil copy earlier in the notebook would place the adapter
# directly under local_model_folder (without the BPO_models/ level) — confirm the on-disk layout.
base_model.load_adapter(capstone_adapter_path)

In [None]:
# Sampling settings for the base model with the capstone adapter loaded.
generation_kwargs = dict(
    max_new_tokens=1024,
    do_sample=True,
    top_p=0.9,
    temperature=0.05,
    num_beams=1,
)
output = base_model.generate(**model_inputs, **generation_kwargs)

# Decode the first sequence and keep the text that follows the first [/INST] marker.
decoded = tokenizer.decode(output[0], skip_special_tokens=True)
resp = decoded.split('[/INST]')[1].strip()

print(resp)

## Stop execution

In [None]:
# Deliberate failure: raises AssertionError so that "Run all" does not continue past this point.
assert False