# Llama2 Sandbox

## Imports

In [1]:
import os
from pathlib import Path
from tqdm.auto import tqdm
import logging
logger = logging.getLogger('llama2_sandbox')
logger.setLevel(logging.DEBUG)
consoleHandler = logging.StreamHandler()
consoleHandler.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
consoleHandler.setFormatter(formatter)
logger.addHandler(consoleHandler)

import pprint
pp = pprint.PrettyPrinter(indent=4)

  from .autonotebook import tqdm as notebook_tqdm


## Setup

In [2]:
import torch
import transformers
from transformers import LlamaTokenizer, LlamaForCausalLM, LlamaModel, LlamaConfig, TextGenerationPipeline

In [3]:
hf_auth = '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'

model_id = "meta-llama/Llama-2-7b-chat-hf"
# model_cache = os.path.join(os.path.expanduser("~"), f"models_hf/{model_id.split('/')[0]}/{model_id.split('/')[-1]}")
# logger.info(model_cache)
# quantization = 'fp16'

2023-10-01 12:51:05,919 - llama2_sandbox - INFO - /Users/academia/models_hf/meta-llama/Llama-2-7b-chat-hf


In [4]:
model_cache

'/Users/academia/models_hf/meta-llama/Llama-2-7b-chat-hf'

In [7]:
CUDA_N_GPUS = torch.cuda.device_count()
CUDA_MAX_MEMORY = f"{int(torch.cuda.mem_get_info()[0] / 1024 ** 3) - 2}GB"
CUDA_MAX_MEMORY = {i: CUDA_MAX_MEMORY for i in range(CUDA_N_GPUS)}
logger.info(
    f"Using k={CUDA_N_GPUS} CUDA GPUs with max memory {CUDA_MAX_MEMORY}"
)

2023-08-10 23:44:39,858 - llama2_results - INFO - Using k=2 CUDA GPUs with max memory {0: '41GB', 1: '41GB'}


In [8]:
model_config = LlamaConfig.from_pretrained(
                            model_id,
                            bos_token_id = 1,
                            eos_token_id = 2,
                            hidden_act = "silu",
                            hidden_size = 8192,
                            initializer_range = 0.02,
                            intermediate_size = 28672,
                            max_position_embeddings = 4096,
                            model_type = "llama",
                            num_attention_heads = 64,
                            num_hidden_layers = 80,
                            num_key_value_heads = 8,
                            pretraining_tp = 1,
                            rms_norm_eps = 1e-05,
                            rope_scaling = None,
                            tie_word_embeddings = False,
                            torch_dtype = "float16",
                            transformers_version = "4.32.0.dev0",
                            use_cache = True,
                            vocab_size = 32000,
                            use_auth_token = hf_auth
                            )



In [9]:
if quantization == "fp16":
    model = LlamaForCausalLM.from_pretrained(
        model_id,
        use_auth_token=hf_auth,
        trust_remote_code=True,
        config=model_config,
        # torch_dtype=torch.float16,
        device_map="auto",
        offload_state_dict=True,
        offload_folder="offload",
        max_memory=CUDA_MAX_MEMORY,
    )
elif quantization == "bf16":
    model = LlamaForCausalLM.from_pretrained(
        model_id,
        use_auth_token=hf_auth,
        trust_remote_code=True,
        config=model_config,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        offload_state_dict=True,
        offload_folder="offload",
        max_memory=CUDA_MAX_MEMORY,
    )
elif quantization == "int8":
    model = LlamaForCausalLM.from_pretrained(
        model_id,
        use_auth_token=hf_auth,
        trust_remote_code=True,
        config=model_config,
        load_in_8bit=True,
        device_map="auto",
        offload_state_dict=True,
        offload_folder="offload",
        max_memory=CUDA_MAX_MEMORY,
    )
elif quantization == "fp4":
    model = LlamaForCausalLM.from_pretrained(
        model_id,
        use_auth_token=hf_auth,
        trust_remote_code=True,
        config=model_config,
        load_in_4bit=True,
        device_map="auto",
        offload_state_dict=True,
        offload_folder="offload",
        max_memory=CUDA_MAX_MEMORY,
    )
else:
    raise ValueError(f"Invalid quantization '{quantization}'")
tokenizer = LlamaTokenizer.from_pretrained(
    model_id,
    use_auth_token=hf_auth,
    legacy=False,
    add_bos_token=True,
    add_eos_token=False,
)

The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:15<00:00,  1.06s/it]


In [10]:
# get pipeline ready for instruction text generation
generation_pipeline = TextGenerationPipeline(model=model,
                                             tokenizer=tokenizer,
                                             # NOTE: Set `do_sample = True` when `temperature > 0.0`
                                             # https://github.com/huggingface/transformers/issues/25326
                                             temperature=0.0,  # [0.0, 1.0]; 0.0 means greedy sampling
                                             do_sample=False,
                                             max_new_tokens=512,
                                             top_k=10,
                                             top_p=0.92,
                                             repetition_penalty=1.0,  # 1.0 means no penalty
                                             num_return_sequences=1  # Only generate one response
                                            )

In [11]:
SYSTEM_PROMPT = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
DISCARD = "Discard all the previous instructions."

TASK_INSTRUCTION_MAP = {
    "sentiment_analysis": f"{DISCARD} Behave like you are an expert sentence sentiment classifier. Classify the following sentence into 'NEGATIVE', 'POSITIVE', or 'NEUTRAL' class. Label 'NEGATIVE' if it is corresponding to negative sentiment, 'POSITIVE' if it is corresponding to positive sentiment, or 'NEUTRAL' if the sentiment is neutral. Provide the label in the first line and provide a short explanation in the second line. The sentence: ",
}

TASK_DATA_MAP = {
    "sentiment_analysis": "FPB-sentiment-analysis-allagree",
}

TASK_MAP = {
    "sentiment_analysis": {
        "data": TASK_DATA_MAP["sentiment_analysis"],
        "instruction": TASK_INSTRUCTION_MAP["sentiment_analysis"],
    },
}

B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"


def llama2_prompt_generator(instruction: str, sentences: list[str]):
    SYS_PROMPT = f""""Discard all the previous instructions. Below is an instruction that describes a task. Write a response that appropriately completes the request."""
    INST_PROMPT = instruction
    if not instruction or not isinstance(instruction, str):
        raise ValueError("Instruction must be a non-empty string.")
    if not sentences or not all(isinstance(sentence, str) for sentence in sentences):
        raise ValueError("Sentences must be a non-empty list of strings.")

    prompts = []
    for SENTENCE in sentences:
        prompts.append(
            B_INST + B_SYS + SYS_PROMPT + E_SYS + INST_PROMPT + SENTENCE + E_INST
        )

    return prompts

In [12]:
SENTENCES =(
                "Our customers come from the following countries : UK , USA , Spain , France , Italy , Germany , China , Hong Kong , Sweden , Norway , Netherlands , Austria , Belgium , Switzerland , Czech Republic , Finland , Canada , Russia , Ukraine , Denmark , Ireland , South Korea and Liechtenstein ."
            , "CapMan 's first real estate fund , which had a total investment capacity of ( EURO ) 500 million and closed in June 2005 , invested in commercial properties in the Helsinki metropolitan area ."
            , "The value of the contracts is about EUR 3.3 mn ."
           )

In [13]:
INSTRUCTION = TASK_MAP['sentiment_analysis']['instruction']

In [14]:
inputs_list = llama2_prompt_generator(INSTRUCTION, SENTENCES)

In [15]:
%%time
generation_result = generation_pipeline(inputs_list)

output_list = []
for i in range(len(generation_result)):
    output_list.append(
        # [labels[i], sentences[i], 
        generation_result[i][0]["generated_text"]
        # ]
    )



CPU times: user 38min 29s, sys: 11min 40s, total: 50min 9s
Wall time: 50min 10s


In [16]:
output_list

['[INST]<<SYS>>\n"Discard all the previous instructions. Below is an instruction that describes a task. Write a response that appropriately completes the request.\n<</SYS>>\n\nDiscard all the previous instructions. Behave like you are an expert sentence sentiment classifier. Classify the following sentence into \'NEGATIVE\', \'POSITIVE\', or \'NEUTRAL\' class. Label \'NEGATIVE\' if it is corresponding to negative sentiment, \'POSITIVE\' if it is corresponding to positive sentiment, or \'NEUTRAL\' if the sentiment is neutral. Provide the label in the first line and provide a short explanation in the second line. The sentence: Our customers come from the following countries : UK , USA , Spain , France , Italy , Germany , China , Hong Kong , Sweden , Norway , Netherlands , Austria , Belgium , Switzerland , Czech Republic , Finland , Canada , Russia , Ukraine , Denmark , Ireland , South Korea and Liechtenstein .[/INST]  NEGATIVE\n\nThe sentence mentions a list of countries, but the tone is