In [13]:
import os
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline
)
from finetune_utils import load_finetune_config
from langchain_lit import load_markdown_documents, LlmEmbedding
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.schema.runnable import RunnablePassthrough


In [4]:
# Fixed settings referenced by later cells.
EMB_MODEL = "bge-base-en"  # directory name of the embedding model under ../models/
device = "cuda"  # target device for tokenized inputs

# Fine-tuning settings; config['model_name'] is read later to locate the base model.
config = load_finetune_config()


In [5]:
def load_vector_store():
    """Index the markdown documents under ``./data`` in a Chroma vector store.

    The documents are loaded with ``load_markdown_documents``, split by a
    ``RecursiveCharacterTextSplitter`` (``chunk_size=1000``,
    ``chunk_overlap=35``) and embedded with the model found at
    ``../models/{EMB_MODEL}``. Progress messages are printed before loading
    and before indexing.

    Returns:
        The vector store returned by ``Chroma.from_documents``.
    """
    print("loading data")
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=35)
    chunks = splitter.split_documents(load_markdown_documents("./data"))
    embedder = LlmEmbedding(f"../models/{EMB_MODEL}")

    print("loading vector")
    return Chroma.from_documents(documents=chunks, embedding=embedder.embedding)


In [8]:
# Resolve the on-disk locations of the base model and its QLoRA adapter.
model_name = config['model_name']
base_model = f"../models/{model_name}"
peft_model_id = f"./outputs/{model_name}-qlora"
# Keep the adapter directory only when it contains an adapter_config.json;
# otherwise None marks "no adapter available".
peft_model_id = (
    peft_model_id
    if os.path.exists(f"{peft_model_id}/adapter_config.json")
    else None
)


In [9]:
# 4-bit NF4 quantization with double quantization; compute is done in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load the base model from the local directory only (no hub download) and let
# device_map="auto" decide the placement of its weights.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    local_files_only=True,
)

# peft_model_id is None when no adapter_config.json was found on disk.
# load_adapter() rejects a None id, so only attach the adapter when one exists
# and otherwise keep using the plain base model.
if peft_model_id is not None:
    model.load_adapter(peft_model_id)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
# Load the tokenizer that ships with the base model.
tokenizer = AutoTokenizer.from_pretrained(base_model)
# Reuse the end-of-sequence token for padding
# (presumably the tokenizer defines no pad token of its own — TODO confirm).
tokenizer.pad_token = tokenizer.eos_token

In [11]:
# NOTE: this is the model's own GenerationConfig object, not a copy, so the
# assignments below also change the defaults used by anything that generates
# with `model` (including the pipeline built later).
generation_config = model.generation_config
generation_config.max_new_tokens = 4096
# Sampling is enabled, but the very low temperature keeps output near-greedy.
generation_config.do_sample = True
generation_config.temperature = 0.01
# top_p must lie in (0, 1]; 1.0 means "no nucleus filtering".
generation_config.top_p = 1.0
generation_config.num_return_sequences = 1
# Pad with EOS and stop on EOS, matching tokenizer.pad_token = tokenizer.eos_token.
generation_config.pad_token_id = tokenizer.eos_token_id
generation_config.eos_token_id = tokenizer.eos_token_id

In [12]:
# Text-generation pipeline around the already-loaded model and tokenizer.
task = "text-generation"
pipe = pipeline(
    task=task,
    model=model,
    tokenizer=tokenizer,
    temperature=0.01,
    # top_p must lie in (0, 1]; 1.0 means "no nucleus filtering".
    top_p=1.0,
    repetition_penalty=1.15,
    # Return the prompt together with the completion.
    return_full_text=True,
)

In [14]:
# Prompt for the RAG chain, wrapped in [INST] ... [/INST] markers.
# {context} and {question} are the two placeholders filled in at run time.
# The literal's whitespace (including trailing spaces) is part of the prompt.
prompt_template = """
### [INST] 
Instruction: Answer the question based on your 
gaming knowledge. 
If the answer cannot be found from the context, try to find the answer from your knowledge. 
If still unable to find the answer, respond with 'I don't know'.
Here is context to help:

{context}

### QUESTION:
{question} 

[/INST]
 """
 
# Bind the template text to its two input variables.
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
) 

In [15]:
# Wrap the transformers pipeline as a LangChain LLM ...
llm = HuggingFacePipeline(pipeline=pipe)
# ... and combine it with the prompt template into a chain.
llm_chain = LLMChain(llm=llm, prompt=prompt)


In [16]:
# Build the vector index, then expose it as a retriever.
vectorstore = load_vector_store()
# NOTE(review): no search_type is given here; 'fetch_k' is normally an MMR
# option and may be ignored by a plain similarity search — confirm.
retriever = vectorstore.as_retriever(search_kwargs={"k": 10, "fetch_k": 50})


loading data
loading vector


In [17]:
# RAG pipeline: the input is sent to the retriever to produce "context" and is
# passed through unchanged as "question"; both are then piped into llm_chain.
rag_chain = dict(context=retriever, question=RunnablePassthrough()) | llm_chain


In [18]:
def ask_qa(user_input):
    """Answer ``user_input`` with the RAG chain and print the first source.

    Args:
        user_input: Question text passed to ``rag_chain.invoke``.

    Returns:
        The generated answer, i.e. the ``'text'`` entry of the chain response.
    """
    resp = rag_chain.invoke(user_input)
    # The retriever can come back empty, and a document may carry no 'source'
    # entry; report None in those cases instead of raising IndexError/KeyError.
    retrieved = resp['context']
    source = retrieved[0].metadata.get('source') if retrieved else None
    print(f"{source=}")
    return resp['text']


In [19]:
# Ask a question through the RAG chain; ask_qa also prints the source of the
# first retrieved document.
answer = ask_qa("Who is Flash's Father?")
print(answer)

Number of requested results 10 is greater than number of elements in index 1, updating n_results = 1


source='data/Test.md'
 Flash's father is Jack.


In [20]:
# Presumably a question the indexed documents cannot answer, used to probe the
# prompt's fallback behaviour — TODO confirm intent.
answer = ask_qa("Use C# write HELLO string")
print(answer)

Number of requested results 10 is greater than number of elements in index 1, updating n_results = 1


source='data/Test.md'
 I don't understand what you are asking for. Can you please provide more information or clarify your question?


In [61]:
def ask(user_input):
    """Ask the model whether ``user_input`` expresses "I don't know".

    The text is inserted as context into a fixed Yes/No prompt, and the model
    is run directly through ``model.generate`` with the shared
    ``generation_config``.

    Args:
        user_input: Text to classify (typically an answer produced earlier).

    Returns:
        The model's reply with surrounding whitespace and a leading
        ``"### ANSWER:\\n"`` marker removed.
    """
    # NOTE(review): "I don't known" below looks like a typo for "I don't know";
    # it is left untouched because changing the prompt changes model behaviour.
    prompt_template2 = """
Here is context:

{context}

### QUESTION:
Does this content express a 'I don't known'? Please respond with 'Yes' or 'No'.
"""
    prompt = prompt_template2.format(context=user_input)
    encoding = tokenizer(prompt, return_tensors="pt").to(device)

    outputs = model.generate(
        input_ids=encoding.input_ids,
        attention_mask=encoding.attention_mask,
        generation_config=generation_config
    )

    # generate() returns prompt tokens followed by the new tokens. Decode only
    # the new ones: removing the prompt with str.replace() on the decoded text
    # fails whenever decoding does not reproduce the prompt byte-for-byte.
    prompt_length = encoding.input_ids.shape[-1]
    answer = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    answer = answer.strip().replace("### ANSWER:\n", "")
    return answer

In [62]:
# Classify the reply the RAG chain gave to the C# question; the bare `a` on the
# last line displays the result as the cell output.
a = ask("I don't understand what you are asking for. Can you please provide more information or clarify your question?")
a

'\n### ANSWER:\nNo.'