In [1]:
from py_standard.langchain_lit import LlmEmbedding, load_all_documents, create_chroma_vectorstore
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from transformers import AutoModelForCausalLM
from langchain.prompts import PromptTemplate

In [11]:
docs = load_all_documents('./documents')

In [10]:
EMB_MODEL = "bge-base-en"
#EMB_MODEL = "bge-large-zh-v1.5"
llm_embedd = LlmEmbedding(f"../models/{EMB_MODEL}")

In [None]:
vector_store = create_chroma_vectorstore(llm_embedd.embedding, "sample")
vector_store.add_documents(docs)

In [2]:
from transformers import T5ForConditionalGeneration, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

def load_t5_model_in_8bit(model_name_path):
   model = T5ForConditionalGeneration.from_pretrained(model_name_path, device_map="auto", load_in_8bit=True)                                                                 
   return model

def create_model_load_in_4bit_config():
   nf4_config = BitsAndBytesConfig(
      load_in_4bit=True,
      bnb_4bit_quant_type="nf4",
      bnb_4bit_use_double_quant=True,
      bnb_4bit_compute_dtype=torch.bfloat16,
   )
   return nf4_config

def load_t5_model_in_4bit(model_name_path, device_map="auto"):
   nf4_config = create_model_load_in_4bit_config()
   model = T5ForConditionalGeneration.from_pretrained(model_name_path, 
                                                      device_map=device_map, 
                                                      quantization_config=nf4_config)
   return model
   
   
def load_t5_model_in_cpu_and_gpu(model_name_path, device_map):
   quantization_config = BitsAndBytesConfig(llm_int8_enable_fp32_cpu_offload=True)
   model = T5ForConditionalGeneration.from_pretrained(
      model_name_path, 
      device_map=device_map, 
      quantization_config=quantization_config,
      offload_folder="offload", 
      offload_state_dict = True)
   return model

In [3]:
from accelerate import infer_auto_device_map, init_empty_weights
from transformers import AutoConfig, AutoModelForCausalLM, T5Config, AutoModelForSeq2SeqLM
import re

def get_model_device_map(model_name_path):
   config = AutoConfig.from_pretrained(model_name_path)
   with init_empty_weights():
      model = AutoModelForCausalLM.from_config(config)
   device_map = infer_auto_device_map(model)
   device_map = infer_auto_device_map(model, no_split_module_classes=["OPTDecoderLayer"])
   return device_map

def get_t5_device_map(model_name_path):
   config = AutoConfig.from_pretrained(model_name_path)
   with init_empty_weights():
      model = AutoModelForSeq2SeqLM.from_config(config)
   device_map = infer_auto_device_map(model)
   device_map = infer_auto_device_map(model, no_split_module_classes=["OPTDecoderLayer"])
   return device_map

# r'model\.layers\.\d+'
def adjust_device_map(device_map, module_name_pattern, percent=0.2, device='cpu'):
   pattern = re.compile(module_name_pattern)
   match_model_names = [key for key in device_map if pattern.fullmatch(key)]
   catch_model_names = match_model_names[:int(len(match_model_names) * percent)]
   for model_name in catch_model_names:
      device_map[model_name] = device
   return device_map

   
def to_device_map(device_map, source_device, to_device, percent=0.2):
   match_model_names = [key for key in device_map.keys() if key == source_device]
   catch_model_names = match_model_names[:int(len(match_model_names) * percent)]
   for model_name in catch_model_names:
      device_map[model_name] = to_device
   return device_map

# too slow....
def load_t5_model_in_offload(model_name_path):
   device_map = get_t5_device_map(model_name_path)
   for key in ['decoder.embed_tokens.weight', 
               'encoder.embed_tokens.weight', 
               'lm_head.weight', 
               'shared.weight']:
      device_map[key] = 'gpu'
   # device_map = adjust_device_map(device_map, r"encoder\.block\.\d+", 0.2)
   device_map = to_device_map(device_map, "disk", "gpu", 0.3)
   model = T5ForConditionalGeneration.from_pretrained(
      model_name_path,
      device_map=device_map, 
      offload_folder="offload", 
      offload_state_dict = True, 
      torch_dtype=torch.float16)
   return model

def adjust_t5_device_map(model_name_path):
   device_map = get_t5_device_map(model_name_path)
   for key in ['decoder.embed_tokens.weight', 
               'encoder.embed_tokens.weight', 
               'lm_head.weight', 
               'shared.weight']:
      device_map[key] = 'cuda'
   # device_map = adjust_device_map(device_map, r"encoder\.block\.\d+", 0.2)
   device_map = to_device_map(device_map, "disk", "cuda", 0.3)
   device_map = to_device_map(device_map, "disk", "cpu", 0.5)
   return device_map

In [None]:
MODEL_PATH = "../models/flan-ul2"

In [None]:
device_map = get_t5_device_map(MODEL_PATH)
device_map

In [None]:
MODEL_PATH = "../models/Chinese-Llama-2-7b"
device_map = get_model_device_map(MODEL_PATH)
device_map

In [6]:
MODEL_PATH = "../models/flan-ul2"
#llm_model = load_t5_model_in_offload(MODEL_PATH)

device_map = adjust_t5_device_map(MODEL_PATH)
#llm_model = load_t5_model_in_cpu_and_gpu(MODEL_PATH, device_map)

llm_model = load_t5_model_in_4bit(MODEL_PATH, "cuda")

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [16]:
def generate_qa_pair(context, question):
    global tokenizer, llm_model
    input_text = f"question: {question} context: {context}"
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(llm_model.device)
    outputs = llm_model.generate(input_ids)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

In [17]:
# FAIL 
answer = generate_qa_pair("What is your name?\nMy Name is Astro. Flash Created me.", "What is your name?")
answer

FP4 quantization state not initialized. Please call .cuda() or .to(device) on the LinearFP4 layer first.


AssertionError: 

In [7]:
MODEL_PATH = "../models/flan-ul2"

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

In [None]:
llm = AutoModelForCausalLM.from_pretrained(
   MODEL_PATH,
   device_map='auto',
   local_files_only=True,
   use_cache=True
)

In [12]:
llm = llm_model
llm_chain = load_qa_with_sources_chain(llm, chain_type="refine")

ValidationError: 2 validation errors for LLMChain
llm
  instance of Runnable expected (type=type_error.arbitrary_type; expected_arbitrary_type=Runnable)
llm
  instance of Runnable expected (type=type_error.arbitrary_type; expected_arbitrary_type=Runnable)

In [None]:
question = "What is your name?"
sub_docs = vector_store.similarity_search_with_score(question, k=5)
sub_docs

In [None]:
llm_chain({"input_documents":sub_docs, "question": question})

In [None]:
prompt_template = """Given the following extracted parts of a long document and a question, create final answer.
If you don't known the answer, just say that you don't known. Don't try to make up an answer.
Response in English.

QUESTION: {question} 
=========
{summaries}
=========
FINAL ANSWER IN English:"""

prompt = PromptTemplate(template=prompt_template, input_variables=["summaries", "question"])

In [None]:
llm_chain2 = load_qa_with_sources_chain(llm, chain_type="stuff")

question = "Who is US president?"
llm_chain2({"input_documents":sub_docs, "question": question})