In [None]:
!pip install datasets accelerate peft bitsandbytes trl flash-attn

In [None]:
from peft import PeftModel, PeftConfig
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from transformers import pipeline
import torch

# Hugging Face Hub ids: the base checkpoint and the PEFT adapter applied on top of it.
base_model_id = "microsoft/Phi-3-mini-4k-instruct"
groundedai_eval_id = "grounded-ai/phi3-rag-relevance-judge"

# The tokenizer is taken from the base checkpoint, not from the adapter repo.
tokenizer = AutoTokenizer.from_pretrained(base_model_id)

# Load the base weights, then wrap them with the adapter described by `config`.
base_model = AutoModelForCausalLM.from_pretrained(base_model_id)
config = PeftConfig.from_pretrained(groundedai_eval_id)
model_peft = PeftModel.from_pretrained(base_model, groundedai_eval_id, config=config)

# Merge the adapter into the base model and move the result to the GPU.
merged_model = model_peft.merge_and_unload()
merged_model.to('cuda')

In [None]:
def format_input(text, query):
    """Build the relevance-judge prompt for one question / reference-text pair.

    Args:
        text: Reference passage; inserted after ``[Reference text]:``.
        query: Question; inserted after ``[Question]:``.

    Returns:
        The prompt string. It asks for a one-word answer, either "relevant"
        or "unrelated".
    """
    # Each entry is one line of the prompt. The leading spaces (six on some
    # lines, two on others) and the initial empty line are part of the
    # returned string, so they are spelled out explicitly here.
    prompt_lines = [
        "",
        "      You are comparing a reference text to a question and trying to determine if the reference text",
        "  contains information relevant to answering the question. Here is the data:",
        "      [BEGIN DATA]",
        "      ************",
        f"      [Question]: {query}",
        "      ************",
        f"      [Reference text]: {text}",
        "      ************",
        "      [END DATA]",
        "  Compare the Question above to the Reference text. You must determine whether the Reference text",
        "  contains information that can answer the Question. Please focus on whether the very specific",
        "  question can be answered by the information in the Reference text.",
        '  Your response must be single word, either "relevant" or "unrelated",',
        "  and should not contain any text or characters aside from that word.",
        '  "unrelated" means that the reference text does not contain an answer to the Question.',
        '  "relevant" means the reference text contains an answer to the Question.',
    ]
    return "\n".join(prompt_lines)

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Seed torch's RNG before building the pipeline.
torch.random.manual_seed(0)

# Text-generation pipeline over the merged model and the base-model tokenizer.
pipe = pipeline("text-generation", model=merged_model, tokenizer=tokenizer)

# Keyword arguments forwarded to `pipe(...)` for the general-purpose check:
# no sampling, and only the newly generated text is returned.
generation_args = dict(
    max_new_tokens=256,
    return_full_text=False,
    temperature=0.0,
    do_sample=False,
)

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'JambaForCausalLM', 'JetMoeForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MixtralForCausalLM', 'MptForCausalLM', 'MusicgenForCausalL

Check that the model can still perform general tasks

In [None]:
# Sanity check: send an ordinary chat question through the merged model.
messages = [{"role": "user", "content": 'Why is the sky blue?'}]

# Reuse the `generation_args` defined next to `pipe` instead of repeating the
# same dictionary here (256 new tokens, no sampling, generated text only).
output = pipe(messages, **generation_args)

You are not running the flash-attention implementation, expect numerical differences.


In [None]:
# Display the generated text of the first result returned by `pipe` above.
output[0]['generated_text']

" The sky appears blue to the human eye because of the way Earth's atmosphere scatters sunlight. Sunlight is made up of different colors of light, which are scattered in all directions by the gases and particles in the Earth's atmosphere. Blue light is scattered more than other colors because it travels as shorter, smaller waves. This phenomenon is known as Rayleigh scattering. When we look at the sky away from the sun, we see more scattered blue light, which is why the sky appears blue during the day."

In [None]:
def run_merged_model(text, query):
  """Ask the merged model whether a reference text is relevant to a question.

  Args:
    text: Reference passage to be judged.
    query: Question the passage may or may not answer.

  Returns:
    The generated text, stripped of surrounding whitespace and lower-cased.
    The prompt asks the model for the single word "relevant" or "unrelated".
  """
  # `format_input` is declared as (text, query). Pass by keyword so the passage
  # ends up under "[Reference text]" and the question under "[Question]"; the
  # previous positional call had the two transposed.
  prompt = format_input(text=text, query=query)
  messages = [
      {"role": "user", "content": prompt}
  ]

  pipe = pipeline(
      "text-generation",
      model=merged_model,
      tokenizer=tokenizer,
  )

  # Only a few new tokens are needed for a one-word verdict.
  generation_args = {
      "max_new_tokens": 4,
      "return_full_text": False,
      "temperature": 0.01,
      "do_sample": True,
  }

  output = pipe(messages, **generation_args)
  # Release cached GPU memory between calls.
  torch.cuda.empty_cache()
  return output[0]['generated_text'].strip().lower()

In [None]:
# Questions that are each judged against the same reference passage.
# The first string is kept exactly as written (including its spelling).
questions = [
    "What is quanitzation?",
    "Tell me about 4, 8, and 16 bit quantization.",
    "What is intel?",
    "How many parameters does Claude v3 have?",
]

In [None]:
# Reference passage shared by every question in the loop below.
reference_text = """How to further reduce GPU memory required for Llama 2 70B?
    Quantization is a method to reduce the memory footprint. Quantization is able to do this by reducing the precision of the model's parameters from floating-point to lower-bit representations,
    such as 8-bit integers. This process significantly decreases the memory and computational requirements, enabling more efficient deployment of the model, particularly on devices with limited resources.
    However, it requires careful management to maintain the model's performance, as reducing precision can potentially impact the accuracy of the outputs.
    In general, the consensus seems to be that 8 bit quantization achieves similar performance to using 16 bit. However, 4 bit quantization could have a noticeable impact to the model performance."""

# Print the judge's verdict for each question, one per line.
for q in questions:
  result = run_merged_model(reference_text, q)
  print(result)

relevant
relevant
unrelated
unrelated
