In [1]:
from py_standard.langchain_lit import LlmEmbedding, load_all_documents, create_chroma_vectorstore
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from transformers import AutoModelForCausalLM
from langchain.prompts import PromptTemplate

In [2]:
docs = load_all_documents('./documents')

In [3]:
EMB_MODEL = "bge-base-en"
#EMB_MODEL = "bge-large-zh-v1.5"
llm_embedd = LlmEmbedding(f"../models/{EMB_MODEL}")

In [None]:
vector_store = create_chroma_vectorstore(llm_embedd.embedding, "sample")
vector_store.add_documents(docs)

In [16]:
from transformers import T5ForConditionalGeneration, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

def load_t5_model_in_8bit(model_name_path):
   model = T5ForConditionalGeneration.from_pretrained(model_name_path, device_map="auto", load_in_8bit=True)                                                                 
   return model

def create_model_load_in_4bit_config():
   nf4_config = BitsAndBytesConfig(
      load_in_4bit=True,
      bnb_4bit_quant_type="nf4",
      bnb_4bit_use_double_quant=True,
      bnb_4bit_compute_dtype=torch.bfloat16,
   )
   return nf4_config

def load_t5_model_in_4bit(model_name_path):
   nf4_config = create_model_load_in_4bit_config()
   model = T5ForConditionalGeneration.from_pretrained(model_name_path, 
                                                      device_map="auto", 
                                                      quantization_config=nf4_config)
   return model
   
   
def load_t5_model_in_cpu_and_gpu(model_name_path):
   quantization_config = BitsAndBytesConfig(llm_int8_enable_fp32_cpu_offload=True)
   device_map = {
      "transformer.word_embeddings": 0,
      "transformer.word_embeddings_layernorm": 0,
      "lm_head": "cpu",
      "transformer.h": 0,
      "transformer.ln_f": 0,
   }   
   model = T5ForConditionalGeneration.from_pretrained(model_name_path, 
                                                      device_map="auto", 
                                                      quantization_config=quantization_config)
   return model

In [39]:
from accelerate import infer_auto_device_map, init_empty_weights
from transformers import AutoConfig, AutoModelForCausalLM, T5Config, AutoModelForSeq2SeqLM
import re

def get_model_device_map(model_name_path):
   config = AutoConfig.from_pretrained(model_name_path)
   with init_empty_weights():
      model = AutoModelForCausalLM.from_config(config)
   device_map = infer_auto_device_map(model)
   device_map = infer_auto_device_map(model, no_split_module_classes=["OPTDecoderLayer"])
   return device_map

def get_t5_device_map(model_name_path):
   config = AutoConfig.from_pretrained(model_name_path)
   with init_empty_weights():
      model = AutoModelForSeq2SeqLM.from_config(config)
   device_map = infer_auto_device_map(model)
   device_map = infer_auto_device_map(model, no_split_module_classes=["OPTDecoderLayer"])
   return device_map

# r'model\.layers\.\d+'
def adjust_device_map(device_map, module_name_pattern, percent=0.2, device='cpu'):
   pattern = re.compile(module_name_pattern)
   match_model_names = [key for key in device_map if pattern.fullmatch(key)]
   catch_model_names = match_model_names[:int(len(match_model_names) * percent)]
   for model_name in catch_model_names:
      device_map[model_name] = device
   return device_map


def load_t5_model_in_split(model_name_path):
   device_map = get_t5_device_map(model_name_path)
   for key in ['decoder.embed_tokens.weight', 
               'encoder.embed_tokens.weight', 
               'lm_head.weight', 
               'shared.weight']:
      device_map[key] = 'cpu'
   device_map = adjust_device_map(device_map, r"encoder\.block\.\d+", 0.2)
   device_map = adjust_device_map(device_map, r"decoder\.block\.\d+", 0.2)
   model = T5ForConditionalGeneration.from_pretrained(
      model_name_path,
      device_map=device_map, 
      offload_folder="offload", 
      offload_state_dict = True, 
      torch_dtype=torch.float16)
   return model

In [25]:
MODEL_PATH = "../models/flan-ul2"

In [36]:
device_map = get_t5_device_map(MODEL_PATH)
device_map

{'shared': 0,
 'decoder.embed_tokens': 0,
 'encoder.embed_tokens': 0,
 'encoder.block.0': 0,
 'encoder.block.1': 0,
 'encoder.block.2': 0,
 'encoder.block.3': 0,
 'encoder.block.4': 0,
 'encoder.block.5': 0,
 'encoder.block.6': 0,
 'encoder.block.7': 0,
 'encoder.block.8': 0,
 'encoder.block.9': 0,
 'encoder.block.10': 0,
 'encoder.block.11': 0,
 'encoder.block.12': 0,
 'encoder.block.13': 0,
 'encoder.block.14': 0,
 'encoder.block.15': 0,
 'encoder.block.16': 0,
 'encoder.block.17': 0,
 'encoder.block.18': 0,
 'encoder.block.19': 0,
 'encoder.block.20': 0,
 'encoder.block.22': 'cpu',
 'encoder.block.23': 'cpu',
 'encoder.block.24': 'cpu',
 'encoder.block.25': 'cpu',
 'encoder.block.26': 'cpu',
 'encoder.block.27': 'cpu',
 'encoder.block.28': 'cpu',
 'encoder.block.29': 'cpu',
 'encoder.block.30': 'cpu',
 'encoder.block.31': 'cpu',
 'encoder.final_layer_norm': 'cpu',
 'encoder.dropout': 'cpu',
 'decoder.block.0': 'cpu',
 'decoder.block.1': 'cpu',
 'decoder.block.2': 'cpu',
 'decoder.bl

In [None]:
MODEL_PATH = "../models/Chinese-Llama-2-7b"
device_map = get_model_device_map(MODEL_PATH)
device_map

In [40]:
MODEL_PATH = "../models/flan-ul2"
llm_model = load_t5_model_in_split(MODEL_PATH)

Tied parameters are on different devices: {'decoder.embed_tokens.weight': 0, 'encoder.embed_tokens.weight': 0, 'lm_head.weight': 'disk', 'shared.weight': 0}. Please modify your custom device map or set `device_map='auto'`. 


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [3]:
MODEL_PATH = "../models/flan-ul2"

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

In [17]:
llm = load_t5_model_in_cpu_and_gpu(MODEL_PATH)

ValueError: The current `device_map` had weights offloaded to the disk. Please provide an `offload_folder` for them. Alternatively, make sure you have `safetensors` installed if the model you are using offers the weights in this format.

In [None]:
llm = AutoModelForCausalLM.from_pretrained(
   MODEL_PATH,
   device_map='auto',
   local_files_only=True,
   use_cache=True
)

In [None]:
llm_chain = load_qa_with_sources_chain(llm, chain_type="refine")

In [None]:
question = "What is your name?"
sub_docs = vector_store.similarity_search_with_score(question, k=5)
sub_docs

In [None]:
llm_chain({"input_documents":sub_docs, "question": question})

In [None]:
prompt_template = """Given the following extracted parts of a long document and a question, create final answer.
If you don't known the answer, just say that you don't known. Don't try to make up an answer.
Response in English.

QUESTION: {question} 
=========
{summaries}
=========
FINAL ANSWER IN English:"""

prompt = PromptTemplate(template=prompt_template, input_variables=["summaries", "question"])

In [None]:
llm_chain2 = load_qa_with_sources_chain(llm, chain_type="stuff")

question = "Who is US president?"
llm_chain2({"input_documents":sub_docs, "question": question})