In [None]:
# pip install sentence_transformers
# pip install langchain
# pip install pypdf
# pip install typing-extensions
# pip install chromadb
# pip install unstructured

In [1]:
from langchain.llms import LlamaCpp

MODEL_PATH = "../models/dolphin-2.5-mixtral-8x7b.Q4_K_M.gguf"
MODEL_PATH = "/home/flash/models/dolphin-2.5-mixtral-8x7b.Q4_K_M.gguf"
MODEL_HOME = "../models"
MODEL_PATH = f"{MODEL_HOME}/amethyst-13b-mistral.Q4_K_M.gguf"
# 38min
# 40s
llm = LlamaCpp(
    model_path=MODEL_PATH,
    n_ctx=1024,
    n_gpu_layers=43, #43
    n_threads=15,
    n_batch=512,
    f16_kv=True,
    #callback_manager=callback_manager,
    verbose=True,
    temperature=0.1,
    max_tokens=1024
)

ggml_init_cublas: found 1 CUDA devices:
  Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6
llama_model_loader: loaded meta data with 21 key-value pairs and 363 tensors from ../models/amethyst-13b-mistral.Q4_K_M.gguf (version GGUF V2 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q4_K     [  5120, 32000,     1,     1 ]
llama_model_loader: - tensor    1:              blk.0.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]
llama_model_loader: - tensor    2:              blk.0.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]
llama_model_loader: - tensor    3:              blk.0.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]
llama_model_loader: - tensor    4:         blk.0.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.ffn_up.weight q4_K     [  5120, 13824,     1, 

In [4]:
def ask(user_input):
   prompt_template = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{user_prompt}

### Response:
"""
   prompt = prompt_template.format(user_prompt=user_input)
   resp = llm(prompt,  
      max_tokens=500,
      temperature=0.7,
      top_p=0.1,
      repeat_penalty=1.1)
   return resp

In [5]:
answer = ask("What is your name?")
answer

Llama.generate: prefix-match hit

llama_print_timings:        load time = 68099.97 ms
llama_print_timings:      sample time =     4.58 ms /     9 runs   (    0.51 ms per token,  1966.78 tokens per second)
llama_print_timings: prompt eval time =  3442.47 ms /    40 tokens (   86.06 ms per token,    11.62 tokens per second)
llama_print_timings:        eval time =  2229.00 ms /     8 runs   (  278.63 ms per token,     3.59 tokens per second)
llama_print_timings:       total time =  5698.75 ms


' My name is AI Assistant.'

In [4]:
from py_standard.langchain_lit import split_documents, convert_docs_to_splits, LlmEmbedding, load_markdown_documents
from py_standard.pdf_utils import load_pdf_documents_from_directory
from langchain.vectorstores import Chroma

EMB_MODEL = "bge-base-en"
#EMB_MODEL = "bge-large-zh-v1.5"

def load_all_documents():
   path = "./documents"
   txts = load_markdown_documents(path)
   pdfs = load_pdf_documents_from_directory(path)
   all_docs = txts + pdfs
   docs = split_documents(all_docs, 500)
   for idx, doc in enumerate(docs):
      doc.metadata['doc_id'] = idx+1
   return docs
   
def load_vectorstore(docs):
   llm_embedding = LlmEmbedding(f"../models/{EMB_MODEL}")
   vector_store = Chroma(
      collection_name = "sample",
      embedding_function=llm_embedding.embedding,
      collection_metadata={
         "hnsw:space": "cosine",
         "hnsw:search_ef": 100
      }
   )
   return vector_store


In [5]:
docs = load_all_documents()
len(docs)

7228

In [6]:
vector_store = load_vectorstore(docs)

In [13]:
def search_docs(user_query):
   global vector_store
   docs = vector_store.similarity_search_with_score(user_query, k=5)
   return docs

In [48]:
from langchain.chains import ConversationalRetrievalChain, RetrievalQA
from langchain.storage import InMemoryStore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.retrievers import ParentDocumentRetriever
from langchain.prompts import PromptTemplate
from IPython.display import display, Markdown, Latex


def create_parent_document_retriever(vector_store, docs):
   # store for parent
   store = InMemoryStore() 
   parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)
   child_splitter = RecursiveCharacterTextSplitter(chunk_size=500)
   big_chunks_retriever = ParentDocumentRetriever(
      vectorstore=vector_store,
      docstore=store,
      parent_splitter=parent_splitter,
      child_splitter=child_splitter,
   )
   big_chunks_retriever.add_documents(docs, ids=None)
   return big_chunks_retriever

def create_qa_chain1(llm, retriever):
   qa_chain = RetrievalQA.from_chain_type(llm,
                                          chain_type='stuff',
                                          retriever=retriever)
   return qa_chain

def ask_qa_chain1(qa_chain, user_question):
   resp = qa_chain(user_question)
   answer = resp['result']
   return answer

def create_qa_chain2(llm, retriever):
   question_prompt = PromptTemplate.from_template(
      """You are QA Bot. If you don't know the answer, just say that you don't know, don't try to make up an answer.""")
   question_prompt = PromptTemplate.from_template(
      """You will use the provided context to answer user questions.
Read the given context before answering questions and think step by step.
If you can not answer a user question based on the provided context, respond with \"I don't know.\".
Do not use any other information for answering user.""")
   qa_chain = ConversationalRetrievalChain.from_llm(
      llm=llm,
      retriever=retriever,
      condense_question_prompt=question_prompt,
      return_source_documents=True,
      verbose=False
   )
   return qa_chain

def ask_qa2(user_question):
   global qa_chain2
   history = []
   result = qa_chain2({"question": user_question, "chat_history": history})
   history = [(user_question, result["answer"])]
   return result["answer"]

In [None]:
full_retriever = create_parent_document_retriever(vector_store, docs)

In [38]:
qa_chain1 = create_qa_chain1(llm, full_retriever)

In [40]:
resp = qa_chain1("What is your name?")
answer = resp['result']
answer

Llama.generate: prefix-match hit

llama_print_timings:        load time =  2112.13 ms
llama_print_timings:      sample time =     2.39 ms /     6 runs   (    0.40 ms per token,  2509.41 tokens per second)
llama_print_timings: prompt eval time =     0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =   518.70 ms /     6 runs   (   86.45 ms per token,    11.57 tokens per second)
llama_print_timings:       total time =   534.37 ms


' My name is Allen.'

In [29]:
qa_chain2 = create_qa_chain2(llm, full_retriever)

In [None]:
# short document
sub_docs = vector_store.similarity_search_with_score("what is your name?", k=5)
sub_docs

In [25]:
# full document
sub_docs = full_retriever.get_relevant_documents("What is your name?")
sub_docs

[Document(page_content='What is your name?\nMy name is Allen.', metadata={'source': 'documents/t1.md', 'doc_id': 2}),
 Document(page_content='What is your name?\nMy name is Astro. Flash created me.\n\nOn August 12, 2023, the Eureka team released this feature. In Leo, specific handling was implemented to conceal transactions originating from specific hardcoded salary accounts across two pages (Txn and Sal Txn - Approval Sal Txn).\n\nWhat is your name?\nMy name is Jack.', metadata={'source': 'documents/t2.md', 'doc_id': 3})]

In [None]:
sub_docs = full_retriever.get_relevant_documents("What functionality was released on 8/12/2023?")
sub_docs

In [31]:
sub_docs = full_retriever.get_relevant_documents("What is Five-Count Baccarat?")
sub_docs

[Document(page_content='stituent parts. Once divided, the expression can be translated into another \nlanguage, serialized for remote execution, injected with an asynchronous \nexecution pattern, and much more. Essentially, LINQ providers allow forTABLE  14.2: Aggregate Functions on System.Linq.Enumerable\nComment Type Description\nCount() Provides a total count of the number of items within the \ncollection\nAverage() Calculates the average value for a numeric key selector', metadata={'source': 'documents/Essential.CSharp.4.0.pdf', 'page': 632, 'doc_id': 5848})]

In [35]:
answer = ask_qa2("What is your name?")
answer

Llama.generate: prefix-match hit

llama_print_timings:        load time =  2112.13 ms
llama_print_timings:      sample time =     3.31 ms /     6 runs   (    0.55 ms per token,  1810.50 tokens per second)
llama_print_timings: prompt eval time =     0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =   660.87 ms /     6 runs   (  110.14 ms per token,     9.08 tokens per second)
llama_print_timings:       total time =   680.38 ms


' My name is Allen.'

In [33]:
answer = ask_qa2("What functionality was released on 8/12/2023?")
answer

Llama.generate: prefix-match hit

llama_print_timings:        load time =  2112.13 ms
llama_print_timings:      sample time =    30.26 ms /    74 runs   (    0.41 ms per token,  2445.31 tokens per second)
llama_print_timings: prompt eval time =   652.21 ms /   105 tokens (    6.21 ms per token,   160.99 tokens per second)
llama_print_timings:        eval time =  5232.54 ms /    73 runs   (   71.68 ms per token,    13.95 tokens per second)
llama_print_timings:       total time =  6076.62 ms


" On August 12, 2023, the new version of a programming language called F# was released. This release included several significant updates and improvements to the language's features, such as introducing event functions, using events and observables, creating simple reactive applications, declarative event processing using LINQ, and declaring events in F#."

In [41]:
answer = ask_qa2("What is Five-Count Baccarat?")
answer

Llama.generate: prefix-match hit

llama_print_timings:        load time =  2112.13 ms
llama_print_timings:      sample time =    59.47 ms /   142 runs   (    0.42 ms per token,  2387.84 tokens per second)
llama_print_timings: prompt eval time =  1517.33 ms /   121 tokens (   12.54 ms per token,    79.75 tokens per second)
llama_print_timings:        eval time = 11707.75 ms /   141 runs   (   83.03 ms per token,    12.04 tokens per second)
llama_print_timings:       total time = 14026.87 ms


' Five-Count Baccarat is a variation of the popular card game Baccarat where players are allowed to make decisions based on the total number of cards in their hand, which is limited to five. The objective remains the same as traditional Baccarat: to bet on whether the player or banker will have a higher score, or if there will be a tie. However, in Five-Count Baccarat, players can choose to stand pat with a hand totaling 5 points by not drawing any additional cards. This rule change increases strategic depth and potential payouts for skilled players who can accurately predict when to hold onto their initial hand value of 5 points.'

In [44]:
# pip install rank_bm25
from langchain.retrievers import BM25Retriever, EnsembleRetriever

bm25_retriever = BM25Retriever.from_documents(docs)
bm25_retriever.k = 4

In [45]:
bm25_retriever.get_relevant_documents("what is your name?")

[Document(page_content='What is your name?\nMy name is Allen.', metadata={'source': 'documents/t1.md', 'doc_id': 2}),
 Document(page_content='What is your name?\nMy name is Astro. Flash created me.\n\nOn August 12, 2023, the Eureka team released this feature. In Leo, specific handling was implemented to conceal transactions originating from specific hardcoded salary accounts across two pages (Txn and Sal Txn - Approval Sal Txn).\n\nWhat is your name?\nMy name is Jack.', metadata={'source': 'documents/t2.md', 'doc_id': 3}),
 Document(page_content='What is your name?\nMy name is Astro. Flash created me.\n\nOn August 12, 2023, the Eureka team released this feature. In Leo, specific handling was implemented to conceal transactions originating from specific hardcoded salary accounts across two pages (Txn and Sal Txn - Approval Sal Txn).', metadata={'source': 'documents/test.md', 'doc_id': 4}),
 Document(page_content='them as with any other data types. This means that a list of functions is 

In [46]:
ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, full_retriever],
                                       weights=[0.5, 0.5])

In [47]:
qa_chain3 = create_qa_chain1(llm, ensemble_retriever)

In [49]:
answer = ask_qa_chain1(qa_chain3, "What is your name?")
answer

Llama.generate: prefix-match hit



' My name is Astro. Flash created me.'

llama_print_timings:        load time =  2112.13 ms
llama_print_timings:      sample time =     4.54 ms /    11 runs   (    0.41 ms per token,  2420.77 tokens per second)
llama_print_timings: prompt eval time =  2020.20 ms /   312 tokens (    6.48 ms per token,   154.44 tokens per second)
llama_print_timings:        eval time =   805.49 ms /    10 runs   (   80.55 ms per token,    12.41 tokens per second)
llama_print_timings:       total time =  2908.12 ms


####