In [63]:
import os
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'

In [64]:
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers import LanguageParser
from langchain_text_splitters import Language
from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Load documents
# mdloader = directory.DirectoryLoader("../../../z/ZcashFoundation/zebra/", glob="**/*.md", silent_errors=True, recursive=True)
# mddocs = lmdoader.load()
loader = GenericLoader.from_filesystem(
    "../../../z/ZcashFoundation/zebra/",
    glob="**/*",
    suffixes=[".rs", ".toml", ".yaml", ".md", ".json"],
    parser=LanguageParser(),
)
docs = loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)
vectorstore = Chroma.from_documents(documents=splits, 
                                    embedding=OpenAIEmbeddings())
retriever = vectorstore.as_retriever()

In [83]:
from langchain.prompts import ChatPromptTemplate

# RAG-Fusion
template = """You are a helpful assistant that generates multiple search queries based on a single input query. \n
Generate multiple search queries related to: {question} \n
Output (4 queries):"""
prompt_rag_fusion = ChatPromptTemplate.from_template(template)

In [84]:
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

generate_queries = (
    prompt_rag_fusion 
    | ChatOpenAI(temperature=0)
    | StrOutputParser() 
    | (lambda x: x.split("\n"))
)

In [87]:
from langchain.load import dumps, loads

def reciprocal_rank_fusion(results: list[list], k=60):
    """ Reciprocal_rank_fusion that takes multiple lists of ranked documents 
        and an optional parameter k used in the RRF formula """
    
    # Initialize a dictionary to hold fused scores for each unique document
    fused_scores = {}

    # Iterate through each list of ranked documents
    for docs in results:
        # Iterate through each document in the list, with its rank (position in the list)
        for rank, doc in enumerate(docs):
            # Convert the document to a string format to use as a key (assumes documents can be serialized to JSON)
            doc_str = dumps(doc)
            # If the document is not yet in the fused_scores dictionary, add it with an initial score of 0
            if doc_str not in fused_scores:
                fused_scores[doc_str] = 0
            # Retrieve the current score of the document, if any
            previous_score = fused_scores[doc_str]
            # Update the score of the document using the RRF formula: 1 / (rank + k)
            fused_scores[doc_str] += 1 / (rank + k)

    # Sort the documents based on their fused scores in descending order to get the final reranked results
    reranked_results = [
        (loads(doc), score)
        for doc, score in sorted(fused_scores.items(), key=lambda x: x[1], reverse=True)
    ]

    # Return the reranked results as a list of tuples, each containing the document and its fused score
    return reranked_results

question = "How do I build an orchard transaction?"
retrieval_chain_rag_fusion = generate_queries | retriever.map() | reciprocal_rank_fusion

In [88]:
docs = retrieval_chain_rag_fusion.invoke({"question": question})
len(docs)

6

In [89]:
docs

[(Document(page_content='orchard_tree.append(*orchard_note_commitment).unwrap();\n                            }\n                        }\n                        new_transactions.push(Arc::new(transaction));\n                    }\n                }', metadata={'content_type': 'functions_classes', 'language': 'rust', 'source': '../../../z/ZcashFoundation/zebra/zebra-chain/src/block/arbitrary.rs'}),
  0.09918032786885246),
 (Document(page_content='## State Management\n\n### Orchard\n- There is a single copy of the latest Orchard Note Commitment Tree for the finalized tip.\n- When finalizing a block, the finalized tip is updated with a serialization of the latest Orchard Note Commitment Tree. (The previous tree should be deleted as part of the same database transaction.)\n- Each non-finalized chain gets its own copy of the Orchard note commitment tree, cloned from the note commitment tree of the finalized tip or fork root.\n- When a block is added to a non-finalized chain tip, the Orch

In [90]:
from operator import itemgetter
from langchain_core.runnables import RunnablePassthrough

# RAG
template = """Answer the following question based on this context:

{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

llm = ChatOpenAI(temperature=0)

final_rag_chain = (
    {"context": retrieval_chain_rag_fusion, 
     "question": itemgetter("question")} 
    | prompt
    | llm
    | StrOutputParser()
)

final_rag_chain.invoke({"question":question})

'To build an orchard transaction, you can follow the consensus rules outlined in the Orchard Chain Value Pool section of the Zcash protocol. This includes introducing Action transfers, performing spends and outputs, calculating the Orchard balancing value (vbalanceOrchard), encoding vbalanceOrchard as the field valueBalanceOrchard in the transaction, and ensuring that transaction fields are described as per the Transaction Encoding and Consensus guidelines. Additionally, you may need to create orchard_nullifiers, orchard_anchors, orchard_note_commitment_tree, and orchard_note_commitment_subtree as specified in the state database upgrades documentation.'

In [91]:
final_rag_chain.invoke({"question": "What are different dependencies inside the repo?"})

'Some of the dependencies inside the repo include `tokio`, `tower`, `futures`, `color-eyre`, `tinyvec`, `humantime`, `owo-colors`, `spandoc`, `thiserror`, `tracing-subscriber`, `tracing-error`, `tracing`, `tempfile`, `bitflags`, `byteorder`, `bytes`, `chrono`, `dirs`, `hex`, `humantime-serde`, `indexmap`, `itertools`, `lazy_static`, `num-integer`, `ordered-map`, `pin-project`, `rand`, `rayon`, `regex`, `serde`, `howudoin`, `proptest`, `proptest-derive`, `elasticsearch`, `serde_json`, `zebra-chain`, `zebra-test`, among others.'

In [92]:
final_rag_chain.invoke({"question": "What are the important packages within the zebra repo?"})

'The important packages within the zebra repo include Zebra checkpoints, Docker Images updates, Continuous Integration and Deployment improvements, and Zebra Dependencies.'

In [94]:
docs = retrieval_chain_rag_fusion.invoke({"question": "What are the important packages within the zebra repo?"})
docs

[(Document(page_content='# Summary\n\n[Zebra](README.md)', metadata={'source': '../../../z/ZcashFoundation/zebra/book/src/SUMMARY.md'}),
  0.13118237599993285),
 (Document(page_content='- Update Zebra checkpoints (#5130)\n\n#### Docker Images\n\n- Breaking: Allow Docker users to specify a custom `zebrad` config file path (#5163, #5177)\n\n#### Continuous Integration and Deployment\n\n- Wait 1 day before creating cached state image updates  (#5088)\n- Delete cached state images older than 2 days, but keep a few recent images\n  (#5113, #5124, #5082, #5079)\n- Simplify GitHub actions caches (#5104)\n- Use 200GB disks for managed instances (#5084)\n- Improve test reliability and test output readability (#5014)\n\n#### Zebra Dependencies', metadata={'source': '../../../z/ZcashFoundation/zebra/CHANGELOG.md'}),
  0.09706420555177826),
 (Document(page_content='# User Documentation\n\nThis section contains details on how to install, run, and instrument Zebra.', metadata={'source': '../../../z/

In [96]:
retrieval_chain_rag_fusion.invoke({"question": "which elliptic curves are used?"})

[(Document(page_content='0xd6, 0xc3, 0x6e, 0xcf, 0x84, 0xd6, 0x93, 0x67, 0x2c, 0x53, 0xce, 0xd8, 0x79, 0x8c,\n                0xc8, 0xf1, 0xe5, 0x3b, 0x8a, 0x9d, 0xe7, 0xbb, 0xb5, 0xe8, 0xc5, 0xa4, 0x6c, 0x3a,\n                0x74, 0x12, 0xdf, 0x11, 0xc5, 0xda, 0x16, 0xb4, 0xdd, 0x22, 0x90, 0x1a, 0x59, 0x2b,\n                0x0e, 0x93, 0x29, 0x77, 0xba, 0x06, 0x67, 0x3d, 0x6f, 0xd0, 0x38, 0xac, 0xba, 0xa9,\n                0xbf, 0x79, 0xc1, 0x5b, 0xa6, 0x2b, 0x6e, 0x30, 0x74, 0xef, 0x95, 0x3b, 0x81, 0x4c,\n                0xf1, 0xbd, 0xf0, 0x15, 0x77, 0xed, 0x3e, 0x3f, 0xae, 0xf4, 0x71, 0x55, 0xc9, 0x1c,\n                0x68, 0xee, 0x32, 0x88, 0x1b, 0x73, 0x74, 0x94, 0xb3, 0xb4, 0x76, 0x08, 0x3b, 0x3b,\n                0xd1, 0x77, 0x93, 0xc4, 0x98, 0x93, 0x1e, 0xaa, 0x92, 0xb1, 0x7c, 0x7d, 0x10, 0x47,\n                0x58, 0xfc, 0x8b, 0x34, 0x93, 0xd2, 0x47, 0x41, 0x7f, 0x5e, 0xc1, 0x97, 0x9a, 0x35,\n                0x28, 0x93, 0xe9, 0x95, 0x63, 0xb6, 0xc3, 0xab, 0x95, 0xcc, 0x5a, 

In [97]:
final_rag_chain.invoke({"question": "which elliptic curves are used?"})

'The elliptic curves used are jubjub and bellman.'

In [98]:
final_rag_chain.invoke({"question": "where are the elliptic curves implemented in zebra?"})

'The elliptic curves are implemented in the Zebra zk-SNARK Parameters section of the Zebra documentation.'

In [99]:
final_rag_chain.invoke({"question": "explain how the repository is organized"})

'The repository is organized in a way that Docker images are used as a distribution mechanism for the software. The organization uses Docker Hub for end-user images and Google Artifact Registry to build external tools and test images. Additionally, the repository mentions the use of existing update/deployment mechanisms or the possibility of writing their own update/deployment mechanism to distribute the software. The focus is on solutions in the second category, which involves providing Docker images for easy deployment and updates.'

In [100]:
retrieval_chain_rag_fusion.invoke({"question": "explain how the repository is organized"})

[(Document(page_content='One solution in the second category is to publish Docker images.  This has a\nnumber of attractive features.  First, we already produce Docker images for our\nown cloud deployments, so there is little-to-no marginal effort required to\nproduce these for others as a distribution mechanism.  Second, providing Docker\nimages will make it easier for us to provide a collection of related software\nin the future (e.g., providing an easy-to-deploy Prometheus / Grafana instance,\nor a sidecar Tor instance).  Third, Docker has a solid upgrade story, and we\ncan instruct users to use the `:latest` version of the Docker image or steer\nthem to auto-update mechanisms like Watchtower.', metadata={'source': '../../../z/ZcashFoundation/zebra/book/src/dev/rfcs/drafts/xxxx-release-planning.md'}),
  0.13224043715846995),
 (Document(page_content='2.  Use an existing update / deployment mechanism to distribute our software;\n\n3.  Write our own update / deployment mechanism to dis

In [101]:
retrieval_chain_rag_fusion.invoke({"question": "explain how is the source code organized? what different packages are in the repository?"})

[(Document(page_content='2.  Use an existing update / deployment mechanism to distribute our software;\n\n3.  Write our own update / deployment mechanism to distribute our software.\n\nThe first category is mentioned for completeness, but we need to provide users\nwith a way to update their software.  Unfortunately, this means that standalone\nbinaries without an update mechanism are not a workable option for us.  The\nthird category is also unfavorable, because it creates a large amount of work\nfor a task that is not really the focus of our product.  This suggests that we\nfocus on solutions in the second category.', metadata={'source': '../../../z/ZcashFoundation/zebra/book/src/dev/rfcs/drafts/xxxx-release-planning.md'}),
  0.13012431484139575),
 (Document(page_content="This proposal is summarized above in the [guide-level\nexplanation](#guide-level-explanation).\n\n## Release Processes\n\nThe next question is what kind of release processes and automation we should\nuse.  Here are t

In [103]:
final_rag_chain.invoke({"question": "what is NetworkChainTipHeightEstimator all about?"})

"NetworkChainTipHeightEstimator is a struct in Rust that is used for estimating the height of the network chain tip based on various parameters such as current block time, current height, current target spacing, and next target spacings. It is used in the context of estimating the height of the chain tip in the Zcash Foundation's Zebra project."

In [104]:
retrieval_chain_rag_fusion.invoke({"question": "what is NetworkChainTipHeightEstimator all about?"})

[(Document(page_content='pub struct NetworkChainTipHeightEstimator {\n    current_block_time: DateTime<Utc>,\n    current_height: block::Height,\n    current_target_spacing: Duration,\n    next_target_spacings: vec::IntoIter<(block::Height, Duration)>,\n}', metadata={'content_type': 'functions_classes', 'language': 'rust', 'source': '../../../z/ZcashFoundation/zebra/zebra-chain/src/chain_tip/network_chain_tip_height_estimator.rs'}),
  0.13118237599993285),
 (Document(page_content='let estimator =\n            NetworkChainTipHeightEstimator::new(current_block_time, current_height, network);\n\n        let distance_to_tip = estimator.estimate_height_at(Utc::now()) - current_height;\n\n        Some((distance_to_tip, current_height))\n    }\n}', metadata={'content_type': 'functions_classes', 'language': 'rust', 'source': '../../../z/ZcashFoundation/zebra/zebra-chain/src/chain_tip.rs'}),
  0.12906625368285865)]