In [1]:
# Load llama
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers, torch, os

model_slug = "meta-llama/Llama-2-13b-chat-hf"

# Access token for the checkpoint, taken from the environment (None when unset).
HF_HUB_KEY = os.environ.get('HF_HUB_KEY')

# Options that are passed unchanged to both the model loader and the pipeline.
_shared_load_kwargs = dict(
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    # load_in_8bit=True,
    device_map="auto",
)

# Left padding is requested on the tokenizer.
tokenizer = AutoTokenizer.from_pretrained(
    model_slug,
    use_auth_token=HF_HUB_KEY,
    padding_side='left',
)

model = AutoModelForCausalLM.from_pretrained(
    model_slug,
    use_auth_token=HF_HUB_KEY,
    **_shared_load_kwargs,
)

# Text-generation pipeline wrapping the already-loaded model and tokenizer.
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    batch_size=4,
    **_shared_load_kwargs,
)





Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [2]:
def format_prompt_llama(instruction, preamble=None):
    """Build a prompt using the ``[INST]`` / ``<<SYS>>`` chat markup.

    Args:
        instruction: The user message, placed between the system block and
            the closing ``[/INST]`` marker.
        preamble: Optional extra system text. When given, it is appended to
            the default system prompt after a blank line. When ``None`` the
            system prompt is left unchanged.

    Returns:
        The formatted prompt string.
    """
    # The template needs a {preamble} slot; without one, str.format silently
    # discards the preamble argument and every call produces the same system text.
    template = """<s>[INST] <<SYS>>
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content.{preamble}
<</SYS>>

{instruction} [/INST]"""
    extra_system_text = '' if preamble is None else '\n\n' + preamble
    return template.format(instruction=instruction, preamble=extra_system_text)

# Generate one sampled reply for a single example prompt. The call is kept as
# the cell's last expression so the notebook displays its return value.
risotto_prompt = format_prompt_llama('Can you tell me how to make a risotto?')
sampling_options = {
    'max_new_tokens': 512,
    'do_sample': True,
    'temperature': 0.7,
    'num_return_sequences': 1,
    'eos_token_id': tokenizer.eos_token_id,
    'pad_token_id': tokenizer.eos_token_id,
    'return_full_text': False,
}
pipeline(risotto_prompt, **sampling_options)

[{'generated_text': "  Certainly! Risotto is a classic Italian dish made with Arborio rice, flavorful broth, and a variety of ingredients such as vegetables, meat, or seafood. Here's a basic recipe for making a delicious risotto at home:\n\nIngredients:\n\n* 1 cup Arborio rice\n* 4 cups vegetable or chicken broth, warmed\n* 2 tablespoons olive oil\n* 1 small onion, finely chopped\n* 2 cloves garlic, minced\n* 1 cup mixed mushrooms (such as cremini, shiitake, and button), sliced\n* 1 cup white wine (optional)\n* 1/4 cup grated Parmesan cheese\n* Salt and pepper, to taste\n* Fresh parsley, chopped (optional)\n\nInstructions:\n\n1. Heat the olive oil in a large saucepan over medium heat. Add the chopped onion and cook until softened, about 3-4 minutes.\n2. Add the minced garlic and cook for another minute, stirring constantly to prevent burning.\n3. Add the Arborio rice and stir to coat the rice with the oil and mix with the onion and garlic. Cook for 1-2 minutes.\n4. Add the warmed broth

In [3]:
# Convert the model with to_bettertransformer() and rebind `model` to the result.
# NOTE(review): `pipeline` was created from the earlier `model` object; confirm
# it actually uses the converted model.
model = model.to_bettertransformer()

The BetterTransformer implementation does not support padding during training, as the fused kernels do not support attention masks. Beware that passing padded batched data during training may result in unexpected outputs. Please refer to https://huggingface.co/docs/optimum/bettertransformer/overview for more details.


In [18]:
import jsonlines
from tqdm import tqdm
from torch.utils.data import Dataset

# Read every record of the JSON Lines prompt file into memory.
with jsonlines.open('./prompts_augmented_v2.jsonl') as prompt_file:
    prompts = list(prompt_file)
    
def format_prompt_llama(instruction, preamble=None):
    """Build a prompt using the ``[INST]`` / ``<<SYS>>`` chat markup.

    Other definitions of this function in the notebook accept an optional
    ``preamble``. This one accepts it too, so that running this cell does not
    replace the function with a version that rejects two-argument calls.

    Args:
        instruction: The user message, placed between the system block and
            the closing ``[/INST]`` marker.
        preamble: Optional extra system text, appended to the default system
            prompt after a blank line. ``None`` (the default) leaves the
            system prompt unchanged.

    Returns:
        The formatted prompt string.
    """
    template = """<s>[INST] <<SYS>>
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content.{preamble}
<</SYS>>

{instruction} [/INST]"""
    extra_system_text = '' if preamble is None else '\n\n' + preamble
    return template.format(instruction=instruction, preamble=extra_system_text)
    
# Output rows (input row plus its generated response) are appended here after generation.
continuations = []
# Use the model config's EOS token id as the tokenizer's padding token id.
pipeline.tokenizer.pad_token_id = model.config.eos_token_id

class ListDataset(Dataset):
    """Expose an in-memory sequence through the torch ``Dataset`` interface."""

    def __init__(self, original_list: list) -> None:
        # Keep a reference to the caller's sequence; no copy is made.
        self.original_list = original_list

    def __len__(self) -> int:
        # Number of items in the wrapped sequence.
        return len(self.original_list)

    def __getitem__(self, i: int):
        # Indexing is delegated directly to the wrapped sequence.
        return self.original_list[i]

# Format every prompt row up front and wrap the list so it can be passed to the pipeline.
ds = ListDataset([format_prompt_llama(row['prompt']) for row in prompts])
torch.cuda.empty_cache()
sequences = []
# Iterate the pipeline over the dataset; each yielded item is an iterable of
# generation dicts, which are flattened into `sequences`.
for seq in tqdm(pipeline(
    ds,
    max_new_tokens=512,
    do_sample=True,
    temperature=0.7,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
    return_full_text=False,
    # batch_size=4,
), total=len(prompts)):
    sequences.extend(seq)
    torch.cuda.empty_cache()

# zip() stops at the shorter input, so a count mismatch would silently drop or
# misalign rows in the output file. Fail before anything is written instead.
if len(sequences) != len(prompts):
    raise RuntimeError(
        f"Expected {len(prompts)} generations but collected {len(sequences)}; "
        "refusing to write misaligned output."
    )

# Pair each generation with its source row and keep all original fields.
continuations.extend(
    {**row, 'response': generation['generated_text']}
    for generation, row in zip(sequences, prompts)
)

with jsonlines.open('./output_augmented_v2_llama13chat.jsonl','w') as writer:
    writer.write_all(continuations)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2700/2700 [5:24:00<00:00,  7.20s/it]


In [23]:
# Display the first collected pipeline output (raises IndexError if `sequences` is empty).
sequences[0]

IndexError: list index out of range

In [22]:
import gcsfs

# Create a GCS filesystem client; no arguments are passed.
gs = gcsfs.GCSFileSystem(
)

# Open the remote object in text-write mode and dump the collected rows as JSON Lines.
with gs.open('gs://cohere-dev-central-2/tom/granular-eval/batch_controlled/output_augmented_v2_llama13chat.jsonl','w') as f:
    writer = jsonlines.Writer(f)
    writer.write_all(continuations)

OSError: Forbidden: https://storage.googleapis.com/upload/storage/v1/b/cohere-dev-central-2/o
Access denied.

In [None]:
# Stop the GPU VM. The zone is passed explicitly, matching the other stop
# command in this notebook, so gcloud does not need to resolve or prompt for it.
!gcloud compute instances stop tom-gpu-bigdisk --zone us-central1-a

In [3]:
import jsonlines
from tqdm import tqdm

# Read every record of the JSON Lines prompt file into memory.
with jsonlines.open('./prompts_augmented.jsonl') as prompt_file:
    prompts = list(prompt_file)

# preambles_complexity = [
#     ('complexity_high', 'Respond using jargon, long words and technical language appropriate for an expert.'),
#     ('normal', ''),
#     ('complexity_low', 'Respond using only short words and simple language appropriate for a child.')
    
# ]
# (label, instruction text) pairs for the language-complexity conditions; the
# 'normal' entry has an empty instruction.
# NOTE(review): this list is not iterated by the loops later in this cell, which use `preambles`.
preambles_complexity = [
    ('complexity_high', 'Respond using additional jargon, long words and technical terms, as if you are an expert addressing another expert.'),
    ('normal', ''),
    ('complexity_low', 'Explain to me like I\'m five, and respond using only short words and simple language, as if you were talking to a child.')
    
]


# preambles_confidence = [
#     ('confidence_high','Respond in an persuasive, assertive and authoritative way.'),
#     ('normal',''),
#     ('confidence_low','Respond in an defensive and uncertain way, including both sides of the argument.'),
# ]
# (label, instruction text) pairs for the confidence conditions; the 'normal'
# entry has an empty instruction.
# NOTE(review): this list is not iterated by the loops later in this cell, which use `preambles`.
preambles_confidence = [
    ('confidence_high','Respond authoritatively, assertively and confidently, as if you were a genius talking to a stupid person.'),
    ('normal',''),
    ('confidence_low','Respond in a very cautious and uncertain way, as if you were an idiot talking to a genius.'),
]

# preambles = set([x[1] for x in preambles_complexity+preambles_confidence])

# Style instructions that the generation loops below iterate over.
# NOTE(review): the second entry spells "unfamilar"; it is left as-is because
# the string is sent to the model and changing it would change the prompt.
preambles = ['Respond authoritatively, assertively and persuasively, as if you were an expert.',
'Respond in a very cautious and uncertain way, as if you are unfamilar with the topic.',
 'Respond using complex language, long words and technical terms, as if you are an expert.',
'Explain to me like I\'m five, and respond using only short words and simple language, as if you were talking to a child.'
]

# The single user question combined with each preamble in the loops below.
prompt = "Can you tell me how to make a risotto?"
# prompt = """The following is a product description written by our marketing expert using the following product specifications and product title.\r\n\r\nProduct title:\r\nGarden Mount Weathervane, Model 9340 - Flag Design\r\n\r\nProduct specifications:\r\nMade of 14 gauge Steel\r\nComes with 60 in. Mounting rod\r\nAntique copper Finish\r\nSealed ball bearings in the wind-cup ensure easy and fluid movement\r\nGarden Mount Only"""

def format_prompt_llama(instruction, preamble=None):
    """Build a prompt using the ``[INST]`` / ``<<SYS>>`` chat markup.

    Args:
        instruction: The user message placed before the closing ``[/INST]``.
        preamble: Optional extra system text. If not ``None`` it is appended
            to the default system prompt, separated by a blank line.

    Returns:
        The formatted prompt string.
    """
    default_system_text = (
        "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  "
        "Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content."
    )
    # Only None means "no preamble"; an empty string still adds the blank-line separator.
    if preamble is None:
        system_suffix = ''
    else:
        system_suffix = '\n\n' + preamble
    return f"<s>[INST] <<SYS>>\n{default_system_text}{system_suffix}\n<</SYS>>\n\n{instruction} [/INST]"

from collections import defaultdict

# Generated texts keyed by preamble, one dict per experimental condition.
results_preamble = defaultdict(list)
results_instr = defaultdict(list)


def _print_and_store_generation(formatted_prompt, preamble, store):
    """Sample one continuation, print it under its preamble, and record it in `store`."""
    outputs = pipeline(
        formatted_prompt,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
        return_full_text=False,
    )
    text = outputs[0]['generated_text']
    print(preamble)
    print(text)
    print('---')
    store[preamble].append(text)


# Condition 1: the style instruction is passed as the system-prompt preamble.
for preamble in preambles:
    for attempt in range(1):
        _print_and_store_generation(
            format_prompt_llama(prompt, preamble),
            preamble,
            results_preamble,
        )

print('#########')

# Condition 2: the style instruction is appended to the user message instead.
for preamble in preambles:
    for attempt in range(1):
        _print_and_store_generation(
            format_prompt_llama(prompt+' '+preamble, None),
            preamble,
            results_instr,
        )

Respond authoritatively, assertively and persuasively, as if you were an expert.
  Certainly! Risotto is a classic Italian dish that's both creamy and flavorful. Here's a step-by-step guide on how to make a delicious risotto at home:

Ingredients:

* 1 cup Arborio rice
* 4 cups vegetable or chicken broth, warmed
* 2 tablespoons olive oil
* 1 small onion, finely chopped
* 2 cloves garlic, minced
* 1 cup white wine (optional)
* 2 cups grated Parmesan cheese
* Salt and pepper to taste
* Fresh parsley, chopped (optional)

Instructions:

1. Heat the olive oil in a large, deep skillet over medium heat. Add the chopped onion and sauté until softened, about 3-4 minutes.
2. Add the minced garlic and sauté for an additional 1-2 minutes, until fragrant.
3. Add the Arborio rice and stir to coat the rice with the oil and mix with the onion and garlic.
4. If using white wine, add it to the skillet and stir until the wine is fully absorbed by the rice.
5. Begin adding the warmed broth to the skillet,

In [19]:
# Release the loaded model. Deleting the `model` name alone is not enough:
# the `pipeline` object was constructed with model=model and keeps its own
# reference, so it is dropped as well (if present). Garbage is then collected
# and PyTorch's cached CUDA blocks are released.
import gc

del model
globals().pop('pipeline', None)
gc.collect()
torch.cuda.empty_cache()

In [1]:
# Load falcon
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers, torch

# model_slug = 'tiiuae/falcon-7b-instruct'
model_slug = 'tiiuae/falcon-40b-instruct'

# Options that are passed unchanged to both the model loader and the pipeline.
_shared_load_kwargs = dict(
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    # load_in_8bit=True,
    device_map="auto",
)

# Left padding is requested on the tokenizer.
tokenizer = AutoTokenizer.from_pretrained(
    model_slug,
    padding_side='left',
)

model = AutoModelForCausalLM.from_pretrained(
    model_slug,
    **_shared_load_kwargs,
)

# Text-generation pipeline wrapping the already-loaded model and tokenizer.
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    batch_size=1,
    **_shared_load_kwargs,
)



Loading checkpoint shards:   0%|          | 0/9 [00:00<?, ?it/s]

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: ee706239-8b63-48ed-a9f3-edcdf9d5992f)')' thrown while requesting HEAD https://huggingface.co/tiiuae/falcon-40b-instruct/resolve/main/generation_config.json
Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [2]:
import jsonlines
from tqdm import tqdm
from torch.utils.data import Dataset

# Read every record of the JSON Lines prompt file into memory.
with jsonlines.open('./prompts_augmented_v2.jsonl') as prompt_file:
    prompts = list(prompt_file)
    
def format_prompt_llama(instruction):
    """Wrap `instruction` in the ``[INST]`` / ``<<SYS>>`` chat markup with the default system prompt.

    NOTE(review): this cell builds its dataset from the raw ``row['prompt']``
    values and does not call this function.

    Args:
        instruction: The user message placed before the closing ``[/INST]``.

    Returns:
        The formatted prompt string.
    """
    default_system_text = (
        "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  "
        "Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content."
    )
    return f"<s>[INST] <<SYS>>\n{default_system_text}\n<</SYS>>\n\n{instruction} [/INST]"
    
# Output rows (input row plus its generated response) are appended here after generation.
continuations = []
# Use the model config's EOS token id as the tokenizer's padding token id.
pipeline.tokenizer.pad_token_id = model.config.eos_token_id

class ListDataset(Dataset):
    """Expose an in-memory sequence through the torch ``Dataset`` interface."""

    def __init__(self, original_list: list) -> None:
        # Keep a reference to the caller's sequence; no copy is made.
        self.original_list = original_list

    def __len__(self) -> int:
        # Number of items in the wrapped sequence.
        return len(self.original_list)

    def __getitem__(self, i: int):
        # Indexing is delegated directly to the wrapped sequence.
        return self.original_list[i]

# The raw prompt text is passed to the model without any chat template.
ds = ListDataset([row['prompt'] for row in prompts])
torch.cuda.empty_cache()
sequences = []
# Iterate the pipeline over the dataset; each yielded item is an iterable of
# generation dicts, which are flattened into `sequences`.
for seq in tqdm(pipeline(
    ds,
    max_new_tokens=512,
    do_sample=True,
    temperature=0.7,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
    return_full_text=False,
    # batch_size=4,
), total=len(prompts)):
    sequences.extend(seq)
    torch.cuda.empty_cache()

# zip() stops at the shorter input, so a count mismatch would silently drop or
# misalign rows in the output file. Fail before anything is written instead.
if len(sequences) != len(prompts):
    raise RuntimeError(
        f"Expected {len(prompts)} generations but collected {len(sequences)}; "
        "refusing to write misaligned output."
    )

# Pair each generation with its source row and keep all original fields.
continuations.extend(
    {**row, 'response': generation['generated_text']}
    for generation, row in zip(sequences, prompts)
)

# NOTE(review): the file name says "8bit" — confirm it matches how the model was actually loaded.
with jsonlines.open('./output_augmented_v2_falcon40_8bit.jsonl','w') as writer:
    writer.write_all(continuations)

  0%|                                                                                                                                                                        | 0/2700 [01:07<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 14.00 MiB (GPU 1; 39.41 GiB total capacity; 38.14 GiB already allocated; 12.50 MiB free; 38.22 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# Display the first output row (raises IndexError if `continuations` is empty).
continuations[0]

In [None]:
import gcsfs

# Create a GCS filesystem client; no arguments are passed.
gs = gcsfs.GCSFileSystem(
)

# Open the remote object in text-write mode and dump the collected rows as JSON Lines.
with gs.open('gs://cohere-dev-central-2/tom/granular-eval/batch_controlled/output_augmented_v2_falcon40.jsonl','w') as f:
    writer = jsonlines.Writer(f)
    writer.write_all(continuations)

In [None]:
# Release the loaded model. Deleting the `model` name alone is not enough:
# the `pipeline` object was constructed with model=model and keeps its own
# reference, so it is dropped as well (if present). Garbage is then collected
# and PyTorch's cached CUDA blocks are released.
import gc

del model
globals().pop('pipeline', None)
gc.collect()
torch.cuda.empty_cache()

In [1]:
# Load MPT 30
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
import torch

model_slug = 'mosaicml/mpt-30b-instruct'
# model_slug = "eachadea/vicuna-13b-1.1"

# Options that are passed unchanged to both the model loader and the pipeline.
_shared_load_kwargs = dict(
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    # load_in_8bit=True,
    device_map="auto",
)

# Left padding is requested on the tokenizer.
tokenizer = AutoTokenizer.from_pretrained(
    model_slug,
    padding_side='left',
)

model = AutoModelForCausalLM.from_pretrained(
    model_slug,
    **_shared_load_kwargs,
)

# Pad with the model config's EOS token id.
tokenizer.pad_token_id = model.config.eos_token_id

# Text-generation pipeline wrapping the already-loaded model and tokenizer.
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    batch_size=2,
    **_shared_load_kwargs,
)



Instantiating an MPTForCausalLM model from /home/tomhosking_cohere_com/.cache/huggingface/modules/transformers_modules/mosaicml/mpt-30b-instruct/2abf1163dd8c9b11f07d805c06e6ec90a1f2037e/modeling_mpt.py
You are using config.init_device='cpu', but you can also use config.init_device="meta" with Composer + FSDP for fast initialization.


The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.


Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [2]:

import jsonlines
from tqdm import tqdm
from torch.utils.data import Dataset

# Read every record of the JSON Lines prompt file into memory.
with jsonlines.open('./prompts_augmented_v2.jsonl') as prompt_file:
    prompts = list(prompt_file)
    
def format_prompt_mpt(instruction):
    """Wrap `instruction` in the instruction/response prompt layout used for the MPT model.

    Args:
        instruction: The task text inserted under the instruction heading.

    Returns:
        The formatted prompt, ending right after the response heading.
    """
    task_header = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
    return f"{task_header}\n\n###Instruction\n{instruction}\n\n### Response\n"
    
# Output rows (input row plus its generated response) are appended here after generation.
continuations = []
# Use the tokenizer's own EOS token id as its padding token id.
pipeline.tokenizer.pad_token_id = pipeline.tokenizer.eos_token_id

class ListDataset(Dataset):
    """Expose an in-memory sequence through the torch ``Dataset`` interface."""

    def __init__(self, original_list: list) -> None:
        # Keep a reference to the caller's sequence; no copy is made.
        self.original_list = original_list

    def __len__(self) -> int:
        # Number of items in the wrapped sequence.
        return len(self.original_list)

    def __getitem__(self, i: int):
        # Indexing is delegated directly to the wrapped sequence.
        return self.original_list[i]

# Format every prompt row up front and wrap the list so it can be passed to the pipeline.
ds = ListDataset([format_prompt_mpt(row['prompt']) for row in prompts])
torch.cuda.empty_cache()
sequences = []
# Iterate the pipeline over the dataset; each yielded item is an iterable of
# generation dicts, which are flattened into `sequences`.
for seq in tqdm(pipeline(
    ds,
    max_new_tokens=512,
    do_sample=True,
    temperature=0.7,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
    return_full_text=False,
    # batch_size=4,
), total=len(prompts)):
    sequences.extend(seq)
    torch.cuda.empty_cache()

# zip() stops at the shorter input, so a count mismatch would silently drop or
# misalign rows in the output file. Fail before anything is written instead.
if len(sequences) != len(prompts):
    raise RuntimeError(
        f"Expected {len(prompts)} generations but collected {len(sequences)}; "
        "refusing to write misaligned output."
    )

# Pair each generation with its source row and keep all original fields.
continuations.extend(
    {**row, 'response': generation['generated_text']}
    for generation, row in zip(sequences, prompts)
)

with jsonlines.open('./output_augmented_v2_mpt30instruct.jsonl','w') as writer:
    writer.write_all(continuations)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2700/2700 [13:59:55<00:00, 18.67s/it]


In [4]:
# Display the tokenizer's repr.
# NOTE(review): the recorded output shows padding_side='right' although the
# tokenizer is loaded with padding_side='left' — confirm which is in effect.
tokenizer

GPTNeoXTokenizerFast(name_or_path='mosaicml/mpt-30b-instruct', vocab_size=50254, model_max_length=8192, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True)

In [5]:
import gcsfs

# Create a GCS filesystem client; no arguments are passed.
gs = gcsfs.GCSFileSystem(
)

# Open the remote object in text-write mode and dump the collected rows as JSON Lines.
with gs.open('gs://cohere-dev-central-2/tom/granular-eval/batch_controlled/output_augmented_v2_mpt30instruct.jsonl','w') as f:
    writer = jsonlines.Writer(f)
    writer.write_all(continuations)

OSError: Forbidden: https://storage.googleapis.com/upload/storage/v1/b/cohere-dev-central-2/o
Access denied.

Found existing installation: accelerate 0.21.0
Uninstalling accelerate-0.21.0:
  Successfully uninstalled accelerate-0.21.0


In [None]:
# Stop the named Compute Engine instance in the given zone via the gcloud CLI.
!gcloud compute instances stop tom-gpu-bigdisk --zone us-central1-a

Stopping instance(s) tom-gpu-bigdisk...⠼                                       

2700