In [None]:
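# Dependencies (install once into the kernel's environment, e.g. with %pip):
# pip install sentence_transformers
# pip install langchain
# pip install pypdf
# pip install typing-extensions
# pip install chromadb
# pip install unstructured
# pip install rank_bm25         # used by the BM25Retriever cells below
# pip install llama-index       # imported as llama_index below
# pip install llama-cpp-python  # backend required by langchain.llms.LlamaCpp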

In [1]:
from langchain.llms import LlamaCpp

# Directory holding the local GGUF model files (relative to this notebook).
MODEL_HOME = "../models"
# Model that is actually loaded. Alternative tried previously:
# dolphin-2.5-mixtral-8x7b.Q4_K_M.gguf (its assignments were dead because
# this one overwrote them before use, and one hard-coded an absolute /home path).
MODEL_PATH = f"{MODEL_HOME}/amethyst-13b-mistral.Q4_K_M.gguf"
# Author's timing notes: "38min" / "40s" (presumably load times of two setups - TODO confirm).
llm = LlamaCpp(
    model_path=MODEL_PATH,
    n_ctx=1024 * 4,      # 4096-token context window
    n_gpu_layers=43,
    n_threads=15,
    n_batch=512,
    f16_kv=True,
    verbose=False,
    temperature=0.1,
    max_tokens=2048,
)

ggml_init_cublas: found 1 CUDA devices:
  Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6
llama_model_loader: loaded meta data with 21 key-value pairs and 363 tensors from ../models/amethyst-13b-mistral.Q4_K_M.gguf (version GGUF V2 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q4_K     [  5120, 32000,     1,     1 ]
llama_model_loader: - tensor    1:              blk.0.attn_q.weight q4_K     [  5120,  5120,     1,     1 ]
llama_model_loader: - tensor    2:              blk.0.attn_k.weight q4_K     [  5120,  5120,     1,     1 ]
llama_model_loader: - tensor    3:              blk.0.attn_v.weight q6_K     [  5120,  5120,     1,     1 ]
llama_model_loader: - tensor    4:         blk.0.attn_output.weight q4_K     [  5120,  5120,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_gate.weight q4_K     [  5120, 13824,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.ffn_up.weight q4_K     [  5120, 13824,     1, 

In [74]:
def ask(user_input):
    """Send one instruction to the notebook-level ``llm`` and return its reply.

    The text is wrapped in an instruction/response prompt before the call.

    Args:
        user_input: Instruction text substituted into the prompt template.

    Returns:
        Whatever ``llm(...)`` returns for the formatted prompt.
    """
    instruction_template = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{user_prompt}

### Response:
"""
    full_prompt = instruction_template.format(user_prompt=user_input)
    # Per-call generation settings forwarded together with the prompt.
    sampling_options = dict(
        max_tokens=500,
        temperature=0.7,
        top_p=0.1,
        repeat_penalty=1.1,
    )
    return llm(full_prompt, **sampling_options)

In [5]:
# Smoke test: query the bare LLM through ask(), with no retrieval involved.
answer = ask("What is your name?")
answer

Llama.generate: prefix-match hit

llama_print_timings:        load time = 68099.97 ms
llama_print_timings:      sample time =     4.58 ms /     9 runs   (    0.51 ms per token,  1966.78 tokens per second)
llama_print_timings: prompt eval time =  3442.47 ms /    40 tokens (   86.06 ms per token,    11.62 tokens per second)
llama_print_timings:        eval time =  2229.00 ms /     8 runs   (  278.63 ms per token,     3.59 tokens per second)
llama_print_timings:       total time =  5698.75 ms


' My name is AI Assistant.'

In [None]:
# Query-expansion experiment: ask the LLM for questions similar to a given one.
answer = ask("Generate similar questions for \"What is payout of bet options in baccarat?\"")
answer

In [None]:
# Same experiment, phrased as an explicit request for four search queries.
question = "What is payout of bet options in baccarat?"
answer = ask(f"Generate multiple search queries related to: {question} \n OUTPUT (4 queries):")
answer

In [79]:
def convert_generate_queries(answer):
    """Split an LLM answer into a list of query strings, one per line.

    The original computed the split but never returned it, so callers
    always received ``None``.

    Args:
        answer: Raw text returned by the model, expected to hold one query
            per line. ``None`` or an empty string yields an empty list.

    Returns:
        list[str]: The non-blank lines of ``answer`` in order, with
        surrounding whitespace removed. Any list numbering such as "1." is
        kept as-is.
    """
    if not answer:
        return []
    stripped_lines = (line.strip() for line in answer.splitlines())
    return [line for line in stripped_lines if line]

[' 1. How are winnings calculated for different types of bets in baccarat?',
 '2. What is the house edge on various bets available in a game of baccarat?',
 '3. Can you explain the payout structure for banker, player, and tie bets in baccarat?',
 '4. How does the casino determine the amount to be paid out when placing different types of bets on baccarat?']

In [None]:
from py_standard.langchain_lit import split_documents, convert_docs_to_splits, LlmEmbedding, load_markdown_documents

# Name of the embedding model directory under ../models.
EMB_MODEL = "bge-base-en"
# Alternative embedding model (disabled):
#EMB_MODEL = "bge-large-zh-v1.5"

# Embedding wrapper from the project's py_standard package; later cells pass
# its `.embedding` attribute to retrievers and filters.
llm_embedd = LlmEmbedding(f"../models/{EMB_MODEL}")

In [24]:
from py_standard.langchain_lit import split_documents, convert_docs_to_splits, LlmEmbedding, load_markdown_documents
from py_standard.pdf_utils import load_pdf_documents_from_directory
from langchain.vectorstores import Chroma

def load_all_documents(path="./documents", chunk_size=500):
    """Load Markdown and PDF documents from a directory, split them, and number the chunks.

    Args:
        path: Directory handed to both the Markdown and the PDF loader.
            Defaults to the previously hard-coded ``"./documents"``.
        chunk_size: Size argument forwarded to ``split_documents``.
            Defaults to the previously hard-coded ``500``.

    Returns:
        The chunks returned by ``split_documents``; each chunk's
        ``metadata['doc_id']`` is set to its 1-based position in that list.
    """
    markdown_docs = load_markdown_documents(path)
    pdf_docs = load_pdf_documents_from_directory(path)
    chunks = split_documents(markdown_docs + pdf_docs, chunk_size)
    # doc_id starts at 1 and follows the split order (Markdown first, then PDFs).
    for doc_id, chunk in enumerate(chunks, start=1):
        chunk.metadata['doc_id'] = doc_id
    return chunks
  
def create_vectorstore(persist_directory=None):
    """Create the Chroma collection named "sample" used by the retrievers.

    Args:
        persist_directory: Forwarded to ``Chroma`` unchanged (``None`` by default).

    Returns:
        The ``Chroma`` instance, using the embedding function of a freshly
        constructed ``LlmEmbedding`` for ``EMB_MODEL``.
    """
    embedder = LlmEmbedding(f"../models/{EMB_MODEL}")
    # HNSW index settings passed as collection metadata.
    hnsw_settings = {
        "hnsw:space": "cosine",
        "hnsw:search_ef": 100
    }
    return Chroma(
        collection_name="sample",
        persist_directory=persist_directory,
        embedding_function=embedder.embedding,
        collection_metadata=hnsw_settings,
    )


In [3]:
# Load and split the whole corpus once; the last line displays the chunk count.
docs = load_all_documents()
len(docs)

7235

In [4]:
# No persist_directory is passed, so create_vectorstore() uses its default of None.
vector_store = create_vectorstore()

In [5]:
def search_docs(user_query, k=5):
    """Run a scored similarity search against the notebook-level ``vector_store``.

    Args:
        user_query: Query text to search for.
        k: Number of hits to request. Defaults to 5, the value that was
            previously hard-coded.

    Returns:
        The result of ``vector_store.similarity_search_with_score``.
    """
    # No ``global`` statement is needed: the store is only read, never rebound.
    return vector_store.similarity_search_with_score(user_query, k=k)

In [9]:
from langchain.storage._lc_store import create_kv_docstore
from langchain.storage import LocalFileStore

def create_persist_docstore(persist_directory="./results/docstore"):
    """Build a document store backed by files under ``persist_directory``.

    Args:
        persist_directory: Root directory given to ``LocalFileStore``.

    Returns:
        The store produced by ``create_kv_docstore`` on top of that file store.
    """
    return create_kv_docstore(LocalFileStore(persist_directory))

In [10]:
from langchain.chains import ConversationalRetrievalChain, RetrievalQA
from langchain.storage import InMemoryStore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.retrievers import ParentDocumentRetriever
from langchain.prompts import PromptTemplate
from IPython.display import display, Markdown, Latex


def create_parent_document_retriever(vector_store, docs):
    """Create a ``ParentDocumentRetriever`` and add ``docs`` to it.

    Args:
        vector_store: Vector store given to the retriever as ``vectorstore``.
        docs: Documents passed to ``add_documents`` (with ``ids=None``).

    Returns:
        The populated ``ParentDocumentRetriever``.
    """
    retriever = ParentDocumentRetriever(
        vectorstore=vector_store,
        # Parent chunks go to the file-backed docstore (an InMemoryStore()
        # was the earlier, non-persistent alternative).
        docstore=create_persist_docstore(),
        parent_splitter=RecursiveCharacterTextSplitter(chunk_size=2000),
        child_splitter=RecursiveCharacterTextSplitter(chunk_size=500),
    )
    retriever.add_documents(docs, ids=None)
    return retriever

def create_qa_chain1(llm, retriever):
    """Build a ``RetrievalQA`` chain of type "stuff" over ``retriever``.

    Args:
        llm: Language model passed to ``RetrievalQA.from_chain_type``.
        retriever: Retriever supplying the context documents.

    Returns:
        The chain returned by ``RetrievalQA.from_chain_type``.
    """
    chain_options = {"chain_type": 'stuff', "retriever": retriever}
    return RetrievalQA.from_chain_type(llm, **chain_options)

def ask_qa_chain1(qa_chain, user_question):
    """Call ``qa_chain`` with a question and return the ``'result'`` entry of its response.

    Args:
        qa_chain: Callable chain whose response is indexable by ``'result'``.
        user_question: Question passed to the chain as its only argument.

    Returns:
        The value stored under ``'result'`` in the chain's response.
    """
    return qa_chain(user_question)['result']

def create_qa_chain2(llm, retriever):
    """Build a ``ConversationalRetrievalChain`` that also returns its source documents.

    The earlier "You are QA Bot..." prompt was removed: it was assigned and
    then immediately overwritten, so it never reached the chain.

    Args:
        llm: Language model for the chain.
        retriever: Retriever supplying the context documents.

    Returns:
        The chain created by ``ConversationalRetrievalChain.from_llm``.
    """
    # NOTE(review): this text reads like answering instructions, yet it is
    # passed as ``condense_question_prompt`` (the prompt for rewriting a
    # follow-up question) and contains no template variables. Confirm whether
    # it was meant for the answering step instead.
    question_prompt = PromptTemplate.from_template(
        """You will use the provided context to answer user questions.
Read the given context before answering questions and think step by step.
If you can not answer a user question based on the provided context, respond with \"I don't know.\".
Do not use any other information for answering user.""")
    return ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=retriever,
        condense_question_prompt=question_prompt,
        return_source_documents=True,
        verbose=False,
    )

def ask_qa2(user_question, chat_history=None):
    """Ask the notebook-level ``qa_chain2`` a question and return its answer.

    The original reset the history on every call and assigned the updated
    history to a local that was never read, so no conversation context was
    ever carried over. Passing ``chat_history`` now makes that possible.

    Args:
        user_question: Question sent to the chain.
        chat_history: Optional list of ``(question, answer)`` tuples. When
            given, it is sent to the chain and the new exchange is appended
            to it in place. When omitted, an empty history is sent, which is
            the original behaviour.

    Returns:
        The ``"answer"`` entry of the chain's result.
    """
    history = chat_history if chat_history is not None else []
    result = qa_chain2({"question": user_question, "chat_history": history})
    answer = result["answer"]
    if chat_history is not None:
        # Record the exchange so the caller's next question sees it.
        chat_history.append((user_question, answer))
    return answer

In [8]:
# Build the parent-document retriever; the helper also adds `docs` to it.
full_retriever = create_parent_document_retriever(vector_store, docs)

In [38]:
# "stuff" RetrievalQA chain over the parent-document retriever.
qa_chain1 = create_qa_chain1(llm, full_retriever)

In [None]:
# Use the ask_qa_chain1() helper rather than unpacking resp['result'] by hand.
answer = ask_qa_chain1(qa_chain1, "What is your name?")
answer

In [29]:
# Conversational chain over the same retriever; ask_qa2() reads this global.
qa_chain2 = create_qa_chain2(llm, full_retriever)

In [None]:
# Short documents: scored hits taken straight from the vector store (top 5).
sub_docs = vector_store.similarity_search_with_score("what is your name?", k=5)
sub_docs

In [25]:
# Full documents: the same question answered through the ParentDocumentRetriever.
sub_docs = full_retriever.get_relevant_documents("What is your name?")
sub_docs

[Document(page_content='What is your name?\nMy name is Allen.', metadata={'source': 'documents/t1.md', 'doc_id': 2}),
 Document(page_content='What is your name?\nMy name is Astro. Flash created me.\n\nOn August 12, 2023, the Eureka team released this feature. In Leo, specific handling was implemented to conceal transactions originating from specific hardcoded salary accounts across two pages (Txn and Sal Txn - Approval Sal Txn).\n\nWhat is your name?\nMy name is Jack.', metadata={'source': 'documents/t2.md', 'doc_id': 3})]

In [None]:
# Retrieval check for a date-based question about the sample documents.
sub_docs = full_retriever.get_relevant_documents("What functionality was released on 8/12/2023?")
sub_docs

In [31]:
# Retrieval check for a baccarat question.
# NOTE: the recorded output for this cell is a chunk from the C# book PDF,
# i.e. an off-topic hit.
sub_docs = full_retriever.get_relevant_documents("What is Five-Count Baccarat?")
sub_docs

[Document(page_content='stituent parts. Once divided, the expression can be translated into another \nlanguage, serialized for remote execution, injected with an asynchronous \nexecution pattern, and much more. Essentially, LINQ providers allow forTABLE  14.2: Aggregate Functions on System.Linq.Enumerable\nComment Type Description\nCount() Provides a total count of the number of items within the \ncollection\nAverage() Calculates the average value for a numeric key selector', metadata={'source': 'documents/Essential.CSharp.4.0.pdf', 'page': 632, 'doc_id': 5848})]

In [35]:
# End-to-end question through the conversational chain (qa_chain2).
answer = ask_qa2("What is your name?")
answer

Llama.generate: prefix-match hit

llama_print_timings:        load time =  2112.13 ms
llama_print_timings:      sample time =     3.31 ms /     6 runs   (    0.55 ms per token,  1810.50 tokens per second)
llama_print_timings: prompt eval time =     0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =   660.87 ms /     6 runs   (  110.14 ms per token,     9.08 tokens per second)
llama_print_timings:       total time =   680.38 ms


' My name is Allen.'

In [None]:
# Same chain, date-based question.
answer = ask_qa2("What functionality was released on 8/12/2023?")
answer

In [None]:
# Same chain, baccarat question.
answer = ask_qa2("What is Five-Count Baccarat?")
answer

In [44]:
# pip install rank_bm25
from langchain.retrievers import BM25Retriever, EnsembleRetriever

# BM25 retriever built from the same split chunks as the vector store.
bm25_retriever = BM25Retriever.from_documents(docs)
# Set the retriever's k to 4 (presumably the number of hits returned - confirm in the langchain docs).
bm25_retriever.k = 4

In [25]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from llama_index import StorageContext, ServiceContext, VectorStoreIndex, SimpleDirectoryReader

def load_llama_index_documents(docs_directory):
    """Read .md, .txt and .pdf files under ``docs_directory`` with llama_index.

    Args:
        docs_directory: Directory passed to ``SimpleDirectoryReader`` as
            ``input_dir``; ``recursive=True`` is also passed.

    Returns:
        The result of the reader's ``load_data()``.
    """
    accepted_extensions = ['.md', '.txt', '.pdf']
    directory_reader = SimpleDirectoryReader(
        input_dir=docs_directory,
        required_exts=accepted_extensions,
        recursive=True,
    )
    return directory_reader.load_data()

def create_persist_bm25_retriever(vector_store, docs_directory):
    """Index the files under ``docs_directory`` with llama_index and return a BM25 retriever.

    Args:
        vector_store: Store passed to ``StorageContext.from_defaults``.
            NOTE(review): this presumably has to be a llama_index vector
            store; the langchain ``Chroma`` from ``create_vectorstore()`` may
            not be accepted - confirm.
        docs_directory: Directory read by ``load_llama_index_documents``.

    Returns:
        A llama_index BM25 retriever over the index's docstore, configured
        with ``similarity_top_k=5``.
    """
    # The notebook-level ``BM25Retriever`` comes from langchain and has no
    # ``from_defaults``; the docstore-based constructor used below belongs to
    # llama_index's class, so import that one explicitly under its own name.
    from llama_index.retrievers import BM25Retriever as LlamaIndexBM25Retriever

    documents = load_llama_index_documents(docs_directory)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    # Uses the notebook-level ``llm`` and ``llm_embedd`` objects.
    service_context = ServiceContext.from_defaults(
        llm=llm,
        embed_model=llm_embedd.embedding,
        chunk_size=500,
    )

    index = VectorStoreIndex.from_documents(
        documents,
        storage_context=storage_context,
        service_context=service_context,
    )

    return LlamaIndexBM25Retriever.from_defaults(
        docstore=index.docstore,
        similarity_top_k=5,
    )

In [None]:
# Separate vector store for the llama_index-based BM25 experiment.
bm25_vector_store = create_vectorstore()
# NOTE(review): hard-coded absolute path; other cells read from "./documents".
# This also rebinds `bm25_retriever`, which the BM25Retriever.from_documents cell assigns as well.
bm25_retriever = create_persist_bm25_retriever(bm25_vector_store, "/home/flash/documents/")

In [45]:
# Keyword-only retrieval for comparison with the vector-based results.
bm25_retriever.get_relevant_documents("what is your name?")

[Document(page_content='What is your name?\nMy name is Allen.', metadata={'source': 'documents/t1.md', 'doc_id': 2}),
 Document(page_content='What is your name?\nMy name is Astro. Flash created me.\n\nOn August 12, 2023, the Eureka team released this feature. In Leo, specific handling was implemented to conceal transactions originating from specific hardcoded salary accounts across two pages (Txn and Sal Txn - Approval Sal Txn).\n\nWhat is your name?\nMy name is Jack.', metadata={'source': 'documents/t2.md', 'doc_id': 3}),
 Document(page_content='What is your name?\nMy name is Astro. Flash created me.\n\nOn August 12, 2023, the Eureka team released this feature. In Leo, specific handling was implemented to conceal transactions originating from specific hardcoded salary accounts across two pages (Txn and Sal Txn - Approval Sal Txn).', metadata={'source': 'documents/test.md', 'doc_id': 4}),
 Document(page_content='them as with any other data types. This means that a list of functions is 

In [46]:
# Hybrid retrieval: BM25 and the parent-document retriever, weighted equally.
ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, full_retriever],
                                       weights=[0.5, 0.5])

In [47]:
# "stuff" RetrievalQA chain over the hybrid (ensemble) retriever.
qa_chain3 = create_qa_chain1(llm, ensemble_retriever)

In [49]:
# Ask the hybrid-retrieval chain the same sample question.
answer = ask_qa_chain1(qa_chain3, "What is your name?")
answer

Llama.generate: prefix-match hit



' My name is Astro. Flash created me.'

llama_print_timings:        load time =  2112.13 ms
llama_print_timings:      sample time =     4.54 ms /    11 runs   (    0.41 ms per token,  2420.77 tokens per second)
llama_print_timings: prompt eval time =  2020.20 ms /   312 tokens (    6.48 ms per token,   154.44 tokens per second)
llama_print_timings:        eval time =   805.49 ms /    10 runs   (   80.55 ms per token,    12.41 tokens per second)
llama_print_timings:       total time =  2908.12 ms


In [57]:
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.retrievers import ContextualCompressionRetriever

# LLM-based extractor used as the compressor for retrieved documents.
compressor = LLMChainExtractor.from_llm(llm)
# Wrap the ensemble retriever so its results pass through the compressor.
compression_retriever = ContextualCompressionRetriever(base_compressor=compressor,
                                                       base_retriever=ensemble_retriever)

# Author's note: "use this" (presumably marks this retriever as the preferred one - TODO confirm).

In [58]:
# Display the prompt template the extractor sends to the LLM.
compressor.llm_chain.prompt

PromptTemplate(input_variables=['context', 'question'], output_parser=NoOutputParser(), template='Given the following question and context, extract any part of the context *AS IS* that is relevant to answer the question. If none of the context is relevant return NO_OUTPUT. \n\nRemember, *DO NOT* edit the extracted parts of the context.\n\n> Question: {question}\n> Context:\n>>>\n{context}\n>>>\nExtracted relevant parts:')

In [None]:
# Retrieve through the compression retriever and display what it returns.
compressed_docs = compression_retriever.get_relevant_documents("What is your name?")
compressed_docs

In [83]:
# Turn langchain's global debug flag off (a later cell sets it to True).
import langchain
langchain.debug = False

In [None]:
# QA chain whose context comes from the compression retriever.
compressor_qa_chain = create_qa_chain1(llm, compression_retriever)
answer = ask_qa_chain1(compressor_qa_chain, "How do you conduct a code review?")
answer

In [60]:
from langchain.retrievers.document_compressors import LLMChainFilter

# LLM-based filter; the second line displays its prompt template.
_filter = LLMChainFilter.from_llm(llm)
_filter.llm_chain.prompt

PromptTemplate(input_variables=['context', 'question'], output_parser=BooleanOutputParser(), template="Given the following question and context, return YES if the context is relevant to the question and NO if it isn't.\n\n> Question: {question}\n> Context:\n>>>\n{context}\n>>>\n> Relevant (YES / NO):")

In [55]:
def pretty_print_docs(docs):
    """Print each document's ``page_content`` under a numbered heading.

    Documents are numbered from 1 and separated by a line of 100 dashes.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.
    """
    separator = "\n" + "-" * 100 + "\n"
    sections = []
    for position, document in enumerate(docs, start=1):
        sections.append(f"Document {position}:\n\n" + document.page_content)
    print(separator.join(sections))

In [65]:
# Second stage: apply the LLM filter on top of the extractor-based retriever.
filter_compression_retriever = ContextualCompressionRetriever(base_compressor=_filter, base_retriever=compression_retriever)

sub_docs = filter_compression_retriever.get_relevant_documents("What is your name?")
pretty_print_docs(sub_docs)

Llama.generate: prefix-match hit

llama_print_timings:        load time =  2112.13 ms
llama_print_timings:      sample time =     2.10 ms /     6 runs   (    0.35 ms per token,  2854.42 tokens per second)
llama_print_timings: prompt eval time =  1314.99 ms /    88 tokens (   14.94 ms per token,    66.92 tokens per second)
llama_print_timings:        eval time =   530.62 ms /     5 runs   (  106.12 ms per token,     9.42 tokens per second)
llama_print_timings:       total time =  1893.59 ms
Llama.generate: prefix-match hit

llama_print_timings:        load time =  2112.13 ms
llama_print_timings:      sample time =     4.46 ms /    11 runs   (    0.41 ms per token,  2465.26 tokens per second)
llama_print_timings: prompt eval time =   823.49 ms /    93 tokens (    8.85 ms per token,   112.93 tokens per second)
llama_print_timings:        eval time =   714.81 ms /    10 runs   (   71.48 ms per token,    13.99 tokens per second)
llama_print_timings:       total time =  1607.36 ms
Llama.gene

Document 1:

Consider using an explicit implementation if a member’s purpose is unclear on the implementing class. Explicit interface member implementation will uniquely distinguish a member.



llama_print_timings:        load time =  2112.13 ms
llama_print_timings:      sample time =     0.64 ms /     2 runs   (    0.32 ms per token,  3110.42 tokens per second)
llama_print_timings: prompt eval time =   464.36 ms /    43 tokens (   10.80 ms per token,    92.60 tokens per second)
llama_print_timings:        eval time =    36.93 ms /     1 runs   (   36.93 ms per token,    27.08 tokens per second)
llama_print_timings:       total time =   512.06 ms


In [None]:
from langchain.retrievers.document_compressors import EmbeddingsFilter

# NOTE(review): constructs `llm_embedd` again with the same argument as the
# earlier embedding cell; presumably redundant if that cell has been run.
llm_embedd = LlmEmbedding(f"../models/{EMB_MODEL}")

# Third stage: embeddings-based filter with a similarity threshold of 0.76.
embeddings_filter = EmbeddingsFilter(embeddings=llm_embedd.embedding, similarity_threshold=0.76)
embedd_compression_retriever = ContextualCompressionRetriever(base_compressor=embeddings_filter, 
                                                              base_retriever=filter_compression_retriever)


In [64]:
# Run the full three-stage retriever and show how many documents survive.
sub_docs = embedd_compression_retriever.get_relevant_documents("What is your name?")
len(sub_docs)

Llama.generate: prefix-match hit

llama_print_timings:        load time =  2112.13 ms
llama_print_timings:      sample time =     2.75 ms /     6 runs   (    0.46 ms per token,  2181.03 tokens per second)
llama_print_timings: prompt eval time =   700.03 ms /    88 tokens (    7.95 ms per token,   125.71 tokens per second)
llama_print_timings:        eval time =   368.79 ms /     5 runs   (   73.76 ms per token,    13.56 tokens per second)
llama_print_timings:       total time =  1085.64 ms
Llama.generate: prefix-match hit

llama_print_timings:        load time =  2112.13 ms
llama_print_timings:      sample time =     3.15 ms /     7 runs   (    0.45 ms per token,  2225.76 tokens per second)
llama_print_timings: prompt eval time =   609.27 ms /    93 tokens (    6.55 ms per token,   152.64 tokens per second)
llama_print_timings:        eval time =   406.22 ms /     6 runs   (   67.70 ms per token,    14.77 tokens per second)
llama_print_timings:       total time =  1033.63 ms
Llama.gene

1

In [66]:
from langchain.chains import LLMChain, HypotheticalDocumentEmbedder

# Hypothetical-document embedder built from the LLM and the base embedding
# model, using the prompt selected by the key "web_search".
embeddings = HypotheticalDocumentEmbedder.from_llm(llm,
                                                   llm_embedd.embedding,
                                                   prompt_key="web_search"
                                                   )

In [None]:
# Embed a sample query with the hypothetical-document embedder and display the result.
result = embeddings.embed_query("What is your name?")
result

In [72]:
# Custom prompt for generating the hypothetical answer.
prompt_template = """Please answer the user's question as related to Large Language Model
Question: {question}
Answer:"""

prompt = PromptTemplate(input_variables=["question"], template=prompt_template)

llm_chain = LLMChain(llm=llm, prompt=prompt)

# Turn on langchain's global debug flag for the cells that follow.
import langchain
langchain.debug = True

# Build the hypothetical-answer embedder from the custom chain.
# This rebinds `embeddings`, replacing the from_llm(...) instance created earlier.
embeddings = HypotheticalDocumentEmbedder(
    llm_chain=llm_chain,
    base_embeddings=llm_embedd.embedding
)

In [None]:
# Embed the sample query again, now with the custom-prompt embedder.
result = embeddings.embed_query("What is your name?")

In [71]:
# Index the corpus with the hypothetical-document embedder and run a sample query.
docsearch = Chroma.from_documents(docs, embeddings)
query = "What is your name?"
# Keep the hits under their own name: rebinding `docs` here would replace the
# loaded corpus with a handful of search results, so re-running this cell (or
# any cell that reads `docs`) would silently work on the wrong data.
hyde_docs = docsearch.similarity_search(query)
hyde_docs

Llama.generate: prefix-match hit

llama_print_timings:        load time =  2112.13 ms
llama_print_timings:      sample time =     2.89 ms /     7 runs   (    0.41 ms per token,  2419.63 tokens per second)
llama_print_timings: prompt eval time =     0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =   845.87 ms /     7 runs   (  120.84 ms per token,     8.28 tokens per second)
llama_print_timings:       total time =   866.78 ms


[Document(page_content='What is your name?\nMy name is Allen.', metadata={'doc_id': 2, 'source': 'documents/t1.md'}),
 Document(page_content='Hello, my name is Inigo Montoya\nFrom the Library of Wow! eBook', metadata={'doc_id': 4240, 'page': 208, 'source': 'documents/Essential.CSharp.4.0.pdf'}),
 Document(page_content='Hello, my name is Inigo Montoya\nFrom the Library of Wow! eBook', metadata={'doc_id': 4248, 'page': 210, 'source': 'documents/Essential.CSharp.4.0.pdf'}),
 Document(page_content='What is your name?\nMy name is Astro. Flash created me.\n\nOn August 12, 2023, the Eureka team released this feature. In Leo, specific handling was implemented to conceal transactions originating from specific hardcoded salary accounts across two pages (Txn and Sal Txn - Approval Sal Txn).\n\nWhat is your name?\nMy name is Jack.', metadata={'doc_id': 3, 'source': 'documents/t2.md'})]

####