# Initial Setup

In [1]:
import os
from typing import List,Optional

# llama index imports
from llama_index.core import SimpleDirectoryReader,VectorStoreIndex,SummaryIndex, StorageContext, Settings, load_index_from_storage
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core.node_parser import SentenceSplitter, CodeSplitter, LangchainNodeParser
from llama_index.core.tools import FunctionTool,QueryEngineTool
from llama_index.core.vector_stores import MetadataFilters,FilterCondition
from llama_index.core.objects import ObjectIndex

# llama index agent imports
from llama_index.core.agent import FunctionCallingAgentWorker
from llama_index.core.agent import AgentRunner

# llama index llms and embeddings imports
from llama_index.llms.mistralai import MistralAI
from llama_index.embeddings.mistralai import MistralAIEmbedding
from llama_index.embeddings.fastembed import FastEmbedEmbedding

from langchain.text_splitter import RecursiveCharacterTextSplitter, Language

# tools
import nest_asyncio # to allow running async functions in jupyter
import chromadb # persistent storage for vectors
# import nbconvert
import tree_sitter
import tree_sitter_languages

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
nest_asyncio.apply() # to allow running async functions in jupyter

# setting flags
create_index = True # True: build new indexes from the notebooks; False: load previously persisted indexes

# configuration
# SECURITY: the API key must never be committed in the notebook. It is read from the
# environment instead; any key that was previously hardcoded here should be revoked.
MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY")
if not MISTRAL_API_KEY:
  raise RuntimeError("MISTRAL_API_KEY is not set; export it in the environment before starting the notebook kernel.")
embedding = "BAAI/bge-small-en-v1.5"
llm_model = "mistral-large-latest"
# chunk_size / chunk_overlap are handed to RecursiveCharacterTextSplitter in get_doc_tools
# (which measures length in characters by default, not lines) and to the global Settings below.
chunk_size = 2000
chunk_overlap = 0
language = "python"
data_path = "./data_python"

# setup the llm and embedding
embed_model = FastEmbedEmbedding(model_name=embedding)
Settings.embed_model = embed_model
Settings.chunk_size = chunk_size
Settings.chunk_overlap = chunk_overlap
# no api_key argument is passed, so the client is left to pick up MISTRAL_API_KEY from the environment
llm = MistralAI(model=llm_model)
Settings.llm = llm

# setup the persistent storage for vector store
db = chromadb.PersistentClient(path="./chroma_db_mistral_python")
chroma_collection = db.get_or_create_collection("code-agent")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 4997.98it/s]


# Utility functions

In [44]:
# for investigating node outputs

# #load documents
# file_path = "./data_python/tabular_classification_binary.ipynb"
# documents = SimpleDirectoryReader(input_files = [file_path]).load_data()
# print(f"length of nodes")
# # splitter = CodeSplitter(language="python", chunk_lines=chunk_size, chunk_lines_overlap=chunk_overlap, max_chars=max_chars)
# splitter = LangchainNodeParser(RecursiveCharacterTextSplitter().from_language(Language.PYTHON, chunk_size=5000, chunk_overlap=0))
# nodes = splitter.get_nodes_from_documents(documents)
# print(f"Length of nodes : {len(nodes)}")

length of nodes
Length of nodes : 52


In [11]:
# function for setting vector and summary tool from a document by creating new vector and summary index
def get_doc_tools(file_path:str, name:str) -> tuple:
  """Build a vector query tool and a summary query tool for one jupyter notebook.

  The notebook is loaded, split into nodes, and indexed twice: once in a
  VectorStoreIndex backed by the shared `storage_context`, and once in a
  SummaryIndex that is persisted to "./db_mistral_python".

  Args:
    file_path (str): path of the notebook file to load and index.
    name (str): suffix used for the tool names (`vector_tool_{name}` and `summary_tool_{name}`).

  Returns:
    tuple: `(vector_query_tool, summary_query_tool)`.
  """

  # load the notebook and split it into nodes
  documents = SimpleDirectoryReader(input_files = [file_path]).load_data()
  # splitter = CodeSplitter(language="python", chunk_lines=chunk_size, chunk_lines_overlap=chunk_overlap, max_chars=max_chars)
  splitter = LangchainNodeParser(RecursiveCharacterTextSplitter.from_language(Language.PYTHON, chunk_size=chunk_size, chunk_overlap=chunk_overlap))
  nodes = splitter.get_nodes_from_documents(documents)
  print(f"Length of nodes : {len(nodes)}")

  # instantiate Vector store index; the vectors go to the Chroma collection behind
  # `storage_context`, whose PersistentClient writes to its own on-disk path, so no
  # separate persist call with a hardcoded absolute path is made here.
  # NOTE(review): all notebooks share one Chroma collection, so this index may also
  # return nodes that belong to other notebooks -- confirm whether that is intended.
  vector_index = VectorStoreIndex(nodes,storage_context=storage_context)

  # build the query engine once (top 2 most similar nodes) instead of on every tool call
  vector_query_engine = vector_index.as_query_engine(similarity_top_k=2)

  # Vector store Auto retrieval query engine method
  def vector_query(query:str) -> str:
    """
    Use to answer questions about a specific part of the implementation in the notebook.

    query (str): the string query to be embedded
    """
    # the docstring above is what the tool exposes as its description, so keep it user-facing
    return vector_query_engine.query(query)

  # Prepare Vector Tool
  vector_query_tool = FunctionTool.from_defaults(name=f"vector_tool_{name}", fn=vector_query)

  # Prepare Summary Tool
  summary_index = SummaryIndex(nodes)
  # NOTE(review): every call persists to the same directory, so with several notebooks
  # a later call appears to overwrite the earlier summary index -- confirm.
  summary_index.storage_context.persist(persist_dir="./db_mistral_python") # save the summary index to disk
  summary_query_engine = summary_index.as_query_engine(response_mode="tree_summarize", use_async=True) # set summary query engine with tree summarization
  summary_query_tool = QueryEngineTool.from_defaults(
    name=f"summary_tool_{name}",
    query_engine=summary_query_engine,
    # the two sentences are separate literals; the first one must end with ". " so they do not run together
    description=(
      "Use ONLY IF you want to get a holistic approach for full implementation. "
      "DO NOT USE if you have question for specific implementation."
    ),
  )
  return vector_query_tool,summary_query_tool


In [12]:
# function for setting vector and summary tool from a document by loading vector and summary index from storage
def get_doc_tools_from_storage(file_path:str, name:str) -> tuple:
  """Build a vector query tool and a summary query tool from previously persisted indexes.

  The vector index is rebuilt on top of the shared `vector_store`, and the summary
  index is loaded from "./db_mistral_python".

  Args:
    file_path (str): not used here; accepted so the signature matches `get_doc_tools`.
    name (str): suffix used for the tool names (`vector_tool_{name}` and `summary_tool_{name}`).

  Returns:
    tuple: `(vector_query_tool, summary_query_tool)`.
  """

  #load vector store
  vector_index = VectorStoreIndex.from_vector_store(vector_store=vector_store, storage_context=storage_context)

  # Vector store Auto retrieval query engine method
  def vector_query(query:str, page_numbers:Optional[List[str]] = None) -> str:
    """
    query (str): the string query to be embedded
    page_numbers Optional[List[str]]: List of page numbers to be retrieved.
    Leave as NONE if we want to perform a vector search over all pages.
    Otherwise, filter by the set of specified pages.
    Always leave page_numbers as None UNLESS there is a specific page you want to search for.
    """
    # only build a metadata filter when pages were actually requested; an unfiltered
    # search passes filters=None instead of a filter object with no conditions
    filters = None
    if page_numbers:
      filters = MetadataFilters.from_dicts(
        [{"key":'page_label', "value":p} for p in page_numbers],
        condition=FilterCondition.OR,
      )

    # top 2 most similar nodes, optionally restricted to the requested pages
    query_engine = vector_index.as_query_engine(similarity_top_k=2, filters=filters)
    return query_engine.query(query)

  # Prepare Vector Tool
  vector_query_tool = FunctionTool.from_defaults(name=f"vector_tool_{name}", fn=vector_query)

  # Prepare Summary Tool
  storage_context_all = StorageContext.from_defaults(persist_dir="./db_mistral_python") # set storage context for summary index
  summary_index = load_index_from_storage(storage_context=storage_context_all) # load summary index from storage
  summary_query_engine = summary_index.as_query_engine(response_mode="tree_summarize", use_async=True) # set summary query engine with tree summarization
  summary_query_tool = QueryEngineTool.from_defaults(
    name=f"summary_tool_{name}",
    query_engine=summary_query_engine,
    # the two sentences are separate literals; the first one must end with a space so they do not run together
    description=(
      "Use ONLY IF you want to get a holistic summary of the documents. "
      "DO NOT USE if you have specified questions over the documents."
    ),
  )
  return vector_query_tool,summary_query_tool

In [13]:
# create list of vector and summary tools for all documents in the path
def get_doc_tools_from_path(path:str) -> list:
  """Collect the vector and summary tools for every `.ipynb` file directly inside `path`.

  Files are processed in sorted order so the resulting tool list is reproducible.
  The tool-name suffix is the file name without its `.ipynb` extension, with every
  character other than ASCII letters, digits, `_` and `-` replaced by `_`
  (function-calling APIs commonly restrict tool names to such characters).

  Depending on the module-level `create_index` flag the tools are built from fresh
  indexes (`get_doc_tools`) or from persisted ones (`get_doc_tools_from_storage`).

  Args:
    path (str): directory that contains the notebooks.

  Returns:
    list: flat list `[vector_tool, summary_tool, vector_tool, summary_tool, ...]`;
    empty when the directory holds no notebooks.

  Raises:
    ValueError: if two notebooks map to the same tool-name suffix.
  """
  initial_tools = []
  seen_names = set()

  for file in sorted(os.listdir(path)):
    if not file.endswith(".ipynb"):
      continue

    # strip only the final extension so "a.v2.ipynb" does not collapse to "a"
    stem = os.path.splitext(file)[0]
    name = "".join(ch if (ch.isascii() and ch.isalnum()) or ch in "_-" else "_" for ch in stem)

    # fail loudly instead of silently registering two tools under one name
    if name in seen_names:
      raise ValueError(f"Duplicate tool name '{name}' derived from notebook '{file}'")
    seen_names.add(name)

    notebook_path = os.path.join(path, file)
    if create_index:
      vector_query_tool,summary_query_tool = get_doc_tools(notebook_path,name)
    else:
      vector_query_tool,summary_query_tool = get_doc_tools_from_storage(notebook_path,name)
    initial_tools.extend([vector_query_tool,summary_query_tool])

  return initial_tools

# Agent Setup

In [14]:
# gather the tools of every notebook under data_path and index them as retrievable objects
initial_tools_for_data = get_doc_tools_from_path(data_path)
obj_index = ObjectIndex.from_objects(
  initial_tools_for_data,
  index_cls=VectorStoreIndex,
)
# the retriever hands back the 3 most similar tools for a query
obj_retriever = obj_index.as_retriever(similarity_top_k=3)

length of nodes
Length of nodes : 132


In [15]:
# setup single agent: tools are looked up through obj_retriever rather than passed as a fixed list
agent_worker = FunctionCallingAgentWorker.from_tools(
  tool_retriever=obj_retriever,
  llm=llm,
  # sentences in the prompt are separated by a space so the last two do not run together
  system_prompt="""You are an agent designed to answer queries over a given jupyter notebook. Please always use the tools provided to answer a question. Do not rely on prior knowledge.""",
  verbose=True,
)
agent = AgentRunner(agent_worker)

# Query

In [16]:
# alternative one-off query:
# response = agent.query("give me sample code for getting conformal prediction using crepes library from a dataframe")
# first question of the chat session
response = agent.chat(
  "how to train xgboost model for sepsis dataset?"
)
print(response)

Added user message to memory: how to train xgboost model for sepsis dataset?
=== Calling Function ===
Calling function: summary_tool_tabular_classification_binary with args: {"input": "how to train xgboost model for sepsis dataset?"}
=== Function Output ===
To train an XGBoost model for the Sepsis dataset, you would typically follow these steps:

1. **Data Preprocessing**: First, you need to preprocess your data. This could involve loading your dataset, handling missing values, encoding categorical variables, scaling numerical features, and separating the labels and features.

2. **Model Initialization**: Initialize the XGBoost classifier. In the provided context, the model is initialized with a random state of 42, an objective of 'binary:logistic' for binary classification, and other parameters such as learning rate, number of estimators, maximum depth, subsample, and colsample_bytree.

3. **Model Training**: Train the model using the preprocessed data. The `fit` method is used to tra

In [17]:
# show the agent's previous answer again (print applies str() itself)
print(response)

To train an XGBoost model for the Sepsis dataset, you would typically follow these steps:

1. **Data Preprocessing**: First, you need to preprocess your data. This could involve loading your dataset, handling missing values, encoding categorical variables, scaling numerical features, and separating the labels and features.

2. **Model Initialization**: Initialize the XGBoost classifier. In the provided context, the model is initialized with a random state of 42, an objective of 'binary:logistic' for binary classification, and other parameters such as learning rate, number of estimators, maximum depth, subsample, and colsample_bytree.

3. **Model Training**: Train the model using the preprocessed data. The `fit` method is used to train the model on the training data.

4. **Model Evaluation**: Evaluate the model's performance. This could involve calculating metrics such as ROC AUC and average precision, and plotting ROC and precision-recall curves.

5. **Prediction**: Use the trained mod

In [18]:
# follow-up question in the same chat session
response = agent.chat(
  "how to then compute uncertainty like conformal manually?"
)
print(response)

Added user message to memory: how to then compute uncertainty like conformal manually?
=== LLM Response ===
To compute uncertainty like conformal manually, you can follow these steps:

1. **Model Training**: Train your model on the training data.

2. **Calibration**: Calibrate the model using a calibration set. This involves computing the non-conformity scores for each example in the calibration set. The non-conformity score measures how different an example is from the examples in the training set.

3. **Compute Conformal Prediction Intervals**: Sort the non-conformity scores and compute the conformal prediction intervals. The conformal prediction interval for a new example is the range of predictions that includes a certain fraction of the examples in the calibration set with lower non-conformity scores.

4. **Prediction**: Use the trained model to make predictions on new data. For each prediction, compute the non-conformity score and use it to determine the prediction interval.

Her

In [19]:
# show the agent's follow-up answer again (print applies str() itself)
print(response)

To compute uncertainty like conformal manually, you can follow these steps:

1. **Model Training**: Train your model on the training data.

2. **Calibration**: Calibrate the model using a calibration set. This involves computing the non-conformity scores for each example in the calibration set. The non-conformity score measures how different an example is from the examples in the training set.

3. **Compute Conformal Prediction Intervals**: Sort the non-conformity scores and compute the conformal prediction intervals. The conformal prediction interval for a new example is the range of predictions that includes a certain fraction of the examples in the calibration set with lower non-conformity scores.

4. **Prediction**: Use the trained model to make predictions on new data. For each prediction, compute the non-conformity score and use it to determine the prediction interval.

Here is a simplified example based on the provided context:

```python
# Train the model
clf.fit(X_train, y_tra