# Initial Setup

In [1]:
import os
from typing import List,Optional

# llama index imports
from llama_index.core import SimpleDirectoryReader,VectorStoreIndex,SummaryIndex, StorageContext, Settings, load_index_from_storage
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core.node_parser import SentenceSplitter, CodeSplitter, LangchainNodeParser
from llama_index.core.tools import FunctionTool,QueryEngineTool
from llama_index.core.vector_stores import MetadataFilters,FilterCondition
from llama_index.core.objects import ObjectIndex
from llama_index.readers.file import IPYNBReader

# llama index agent imports
from llama_index.core.agent import FunctionCallingAgentWorker
from llama_index.core.agent import AgentRunner

# llama index llms and embeddings imports
from llama_index.llms.mistralai import MistralAI
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.mistralai import MistralAIEmbedding
from llama_index.embeddings.fastembed import FastEmbedEmbedding
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.embeddings.huggingface import HuggingFaceEmbedding


from langchain.text_splitter import RecursiveCharacterTextSplitter, Language

# tools
import nest_asyncio # to allow running async functions in jupyter
import chromadb # persistent storage for vectors
# import nbconvert
import tree_sitter
import tree_sitter_languages

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
nest_asyncio.apply() # to allow running async functions in jupyter

# setting flags
create_index = True # True: build and persist fresh indexes; False: reload the indexes persisted by a previous run

# configuration
# SECURITY: never hard-code API keys in a notebook (they end up in version control and shared copies).
# The key is read from the environment instead; any key that was ever committed here must be revoked and rotated.
MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY", "")
if not MISTRAL_API_KEY:
    raise RuntimeError("MISTRAL_API_KEY is not set. Export it in the environment before starting the kernel.")
embedding = "BAAI/bge-small-en-v1.5"
# embedding = "Qdrant/bm42-all-minilm-l6-v2-attentions"
# embedding = "mistral-embed"
# embedding = OllamaEmbedding(
#     model_name="llama2",
#     base_url="http://localhost:11434",
#     ollama_additional_kwargs={"mirostat": 0} 
# )
# embedding = "Salesforce/codet5p-110m-embedding"
llm_model = "mistral-large-latest"
# llm_model = "codellama"
# chunk_size / chunk_overlap are handed to RecursiveCharacterTextSplitter in get_doc_tools, which measures
# length in characters by default (the earlier "number of lines" wording dated from the CodeSplitter variant).
chunk_size = 2000 # maximum chunk length
chunk_overlap = 200 # overlap between consecutive chunks
language = "python"
data_path = "./data_python"

# setup the llm and embedding
embed_model = FastEmbedEmbedding(model_name=embedding)
# embed_model = MistralAIEmbedding(model_name=embedding, api_key=MISTRAL_API_KEY)
# embed_model =  HuggingFaceEmbedding(model_name=embedding)  
Settings.embed_model = embed_model
Settings.chunk_size = chunk_size
Settings.chunk_overlap = chunk_overlap
# pass the key explicitly rather than writing it back into os.environ
llm = MistralAI(model=llm_model, api_key=MISTRAL_API_KEY)
# llm = Ollama(model=llm_model, request_timeout=60.0)
Settings.llm = llm

# setup the persistent storage for vector store
db = chromadb.PersistentClient(path="./chroma_db_mistral_python")
chroma_collection = db.get_or_create_collection("code-agent")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

Fetching 5 files: 100%|██████████| 5/5 [00:00<?, ?it/s]


# Utility functions

In [18]:
# # Scratch cell (disabled): inspect how a single notebook is split into nodes

# #load documents
# file_path = "./data_python/tabular_classification_binary.ipynb"
# documents = SimpleDirectoryReader(input_files = [file_path]).load_data()
# print(f"length of nodes")
# # splitter = CodeSplitter(language="python", chunk_lines=chunk_size, chunk_lines_overlap=chunk_overlap, max_chars=max_chars)
# splitter = LangchainNodeParser(RecursiveCharacterTextSplitter().from_language(Language.PYTHON, chunk_size=2000, chunk_overlap=0))
# nodes = splitter.get_nodes_from_documents(documents)
# print(f"Length of nodes : {len(nodes)}")

In [11]:
# function for setting vector and summary tool from a document by creating new vector and summary index
def get_doc_tools(file_path:str, name:str) -> tuple:
  """Build a vector-query tool and a summary tool for one Jupyter notebook.

  The notebook is loaded, split into nodes, embedded into the shared vector
  store and also indexed in a SummaryIndex that is persisted to disk.

  Reads the module-level ``chunk_size``, ``chunk_overlap`` and ``storage_context``.

  Args:
    file_path: path of the notebook to index.
    name: suffix used for the tool names (``vector_tool_<name>``, ``summary_tool_<name>``).

  Returns:
    A ``(vector_query_tool, summary_query_tool)`` tuple.
  """

  # load the notebook and split it into nodes
  documents = SimpleDirectoryReader(input_files = [file_path]).load_data()
  # splitter = CodeSplitter(language="python", chunk_lines=chunk_size, chunk_lines_overlap=chunk_overlap, max_chars=max_chars)
  # from_language is a classmethod, so no throwaway splitter instance is needed
  splitter = LangchainNodeParser(RecursiveCharacterTextSplitter.from_language(Language.PYTHON, chunk_size=chunk_size, chunk_overlap=chunk_overlap))
  nodes = splitter.get_nodes_from_documents(documents)
  print(f"Length of nodes : {len(nodes)}")

  # instantiate the vector index on top of the shared storage context.
  # The Chroma collection behind it comes from a PersistentClient (see the setup cell), which writes to its
  # own directory, so the former explicit persist to a hard-coded "/content/chroma_db" path was dropped.
  vector_index = VectorStoreIndex(nodes,storage_context=storage_context)

  # build the query engine once instead of on every tool call (top 2 most similar nodes)
  vector_query_engine = vector_index.as_query_engine(similarity_top_k =2)

  # The docstring below is what FunctionTool exposes to the agent as the tool description,
  # so it states what the tool is for as well as its argument.
  def vector_query(query:str) -> str:
    """
    Answer a specific question about the notebook by retrieving the most relevant code chunks.
    query (str): the string query to be embedded
    """
    # NOTE(review): this returns the query engine's response object; the tool layer is expected to stringify it - confirm
    return vector_query_engine.query(query)

  # Prepare Vector Tool
  vector_query_tool = FunctionTool.from_defaults(name=f"vector_tool_{name}", fn=vector_query)

  # Prepare Summary Tool
  summary_index = SummaryIndex(nodes)
  # NOTE(review): every call persists to the same directory, so when several notebooks are indexed
  # the data of the later call presumably replaces the earlier one - confirm before relying on reload.
  summary_index.storage_context.persist(persist_dir="./db_mistral_python") # save the summary index to disk
  summary_query_engine = summary_index.as_query_engine(response_mode="tree_summarize", use_async=True) # set summary query engine with tree summarization
  # the two sentences are separated explicitly; adjacent literals used to fuse into "implementationDO NOT USE"
  summary_description = (
    "Use ONLY IF you want to get a holistic approach for full implementation. "
    "DO NOT USE if you have question for specific implementation."
  )
  summary_query_tool = QueryEngineTool.from_defaults(name=f"summary_tool_{name}",query_engine=summary_query_engine, description=summary_description) # set summary query tool with prompt
  return vector_query_tool,summary_query_tool


In [31]:
# function for setting vector and summary tool from a document by loading vector and summary index from storage
def get_doc_tools_from_storage(file_path:str, name:str) -> tuple:
  """Rebuild the vector-query and summary tools from previously persisted indexes.

  Reads the module-level ``vector_store`` and ``storage_context`` for the vector
  index, and loads the summary index from ``./db_mistral_python``.

  Args:
    file_path: unused here; kept so the signature matches ``get_doc_tools``.
    name: suffix used for the tool names (``vector_tool_<name>``, ``summary_tool_<name>``).

  Returns:
    A ``(vector_query_tool, summary_query_tool)`` tuple.
  """
  # NOTE(review): nothing below depends on file_path, so every name gets tools over the same
  # persisted data - confirm this is intended when more than one notebook is indexed.

  #load vector store
  vector_index = VectorStoreIndex.from_vector_store(vector_store=vector_store, storage_context=storage_context)

  # Vector store Auto retrieval query engine method
  # The docstring below is what FunctionTool exposes to the agent as the tool description.
  def vector_query(query:str, page_numbers:Optional[List[str]] = None) -> str:
    """
    Answer a specific question about the notebook by retrieving the most relevant code chunks.
    query (str): the string query to be embedded
    page_numbers Optional[List[str]]: List of page numbers to be retrieved.
    Leave as NONE if we want to perform a vector search over all pages. 
    Otherwise, filter by the set of specified pages.
    Always leave page_numbers as None UNLESS there is a specific page you want to search for.
    """
    # only build metadata filters when pages were requested; an empty filter set is
    # replaced by None so the vector store receives no filter clause at all
    filters = None
    if page_numbers:
      metadata_dicts = [{"key":'page_label', "value":p} for p in page_numbers]
      filters = MetadataFilters.from_dicts(metadata_dicts, condition=FilterCondition.OR)

    query_engine = vector_index.as_query_engine(similarity_top_k =2, filters = filters) # set vector query engine with similarity as top 2 results

    # NOTE(review): this returns the query engine's response object; the tool layer is expected to stringify it - confirm
    return query_engine.query(query)

  # Prepare Vector Tool
  vector_query_tool = FunctionTool.from_defaults(name=f"vector_tool_{name}", fn=vector_query)

  # Prepare Summary Tool
  storage_context_all = StorageContext.from_defaults(persist_dir="./db_mistral_python") # set storage context for summary index
  summary_index = load_index_from_storage(storage_context=storage_context_all) # load summary index from storage
  summary_query_engine = summary_index.as_query_engine(response_mode="tree_summarize", use_async=True) # set summary query engine with tree summarization
  # the two sentences are separated explicitly; adjacent literals used to fuse into "documents.DO NOT USE"
  summary_description = (
    "Use ONLY IF you want to get a holistic summary of the documents. "
    "DO NOT USE if you have specified questions over the documents."
  )
  summary_query_tool = QueryEngineTool.from_defaults(name=f"summary_tool_{name}",query_engine=summary_query_engine, description=summary_description) # set summary query tool with prompt
  return vector_query_tool,summary_query_tool

In [4]:
# create list of vector and summary tools for all documents in the path
def get_doc_tools_from_path(path:str) -> list:
  """Collect the vector and summary tools for every ``.ipynb`` file directly inside ``path``.

  Depending on the module-level ``create_index`` flag, the tools are either built
  from scratch (``get_doc_tools``) or reloaded (``get_doc_tools_from_storage``).

  Args:
    path: directory to scan (not recursive).

  Returns:
    A flat list ``[vector_tool, summary_tool, vector_tool, summary_tool, ...]``,
    ordered by file name; empty when the directory holds no notebooks.
  """
  tools = []
  # os.listdir() returns entries in arbitrary order; sort so the tool order is reproducible
  for file in sorted(os.listdir(path)):
    if not file.endswith(".ipynb"):
      continue
    # Strip only the final extension (split(".")[0] cut "a.v2.ipynb" down to "a" and made
    # dotted names collide), then replace anything outside [A-Za-z0-9_-] with "_" because the
    # name becomes part of a tool name, which function-calling APIs commonly restrict.
    stem = os.path.splitext(file)[0]
    name = "".join(ch if (ch.isascii() and ch.isalnum()) or ch in "_-" else "_" for ch in stem)
    file_path = os.path.join(path, file)
    if create_index:
      vector_query_tool,summary_query_tool = get_doc_tools(file_path,name)
    else:
      vector_query_tool,summary_query_tool = get_doc_tools_from_storage(file_path,name)
    # append directly; a dict keyed by name would duplicate tools if two names ever matched
    tools.extend([vector_query_tool,summary_query_tool])
  return tools

# Agent Setup

In [5]:
# Index every generated tool so the agent can look up the relevant ones per query
initial_tools_for_data = get_doc_tools_from_path(data_path)
obj_index = ObjectIndex.from_objects(
    initial_tools_for_data,
    index_cls=VectorStoreIndex,
)
# retriever that returns the 2 tools most similar to the query
obj_retriever = obj_index.as_retriever(similarity_top_k=2)

length of nodes
Length of nodes : 132


In [6]:
# setup single agent
# The worker looks its tools up through obj_retriever instead of receiving a fixed tool list.
agent_worker = FunctionCallingAgentWorker.from_tools(
    tool_retriever=obj_retriever,
    llm=llm,
    # sentences are joined with explicit spaces; the prompt previously read "question.Do not"
    system_prompt=(
        "You are an agent designed to answer queries over a given jupyter notebook. "
        "Please always use the tools provided to answer a question. "
        "Do not rely on prior knowledge."
    ),
    verbose=True,  # log each function call and its output
)
agent = AgentRunner(agent_worker)

# Query

In [7]:
# response = agent.query("give me sample code for getting conformal prediction using crepes library from a dataframe")
# First turn of the conversation; the agent keeps the chat history for follow-up questions.
response = agent.chat(
    "how to train xgboost model for sepsis dataset?"
)
print(response)  # print() applies str() to the response itself

Added user message to memory: how to train xgboost model for sepsis dataset?
=== Calling Function ===
Calling function: summary_tool_tabular_classification_binary with args: {"input": "how to train xgboost model for sepsis dataset?"}
=== Function Output ===
To train an XGBoost model for the Sepsis dataset, you would first need to load and preprocess your data. This might involve handling missing values, one-hot encoding categorical variables, and scaling numerical features. Once your data is prepared, you can split it into training and testing sets.

Next, you would initialize the XGBoost classifier. In the provided context, the classifier is initialized with the following parameters: `iterations=100, learning_rate=0.01, depth=7, verbose=1`.

After initializing the classifier, you can fit the model to your training data using the `fit` method: `clf.fit(X_train_proper, y_train_proper)`.

Once the model is trained, you can make predictions on your test data using the `predict` method: `p

In [8]:
# show the agent's final answer again, without the verbose tool-call log
print(response)

To train an XGBoost model for the Sepsis dataset, you would first need to load and preprocess your data. This might involve handling missing values, one-hot encoding categorical variables, and scaling numerical features. Once your data is prepared, you can split it into training and testing sets.

Next, you would initialize the XGBoost classifier. In the provided context, the classifier is initialized with the following parameters: `iterations=100, learning_rate=0.01, depth=7, verbose=1`.

After initializing the classifier, you can fit the model to your training data using the `fit` method: `clf.fit(X_train_proper, y_train_proper)`.

Once the model is trained, you can make predictions on your test data using the `predict` method: `predictions = clf.predict(X_test)`.

Finally, you can evaluate the performance of your model using appropriate metrics such as ROC AUC and Average Precision. In the provided context, these are calculated using the `roc_auc_score` and `average_precision_score`

In [9]:
# Follow-up turn: relies on the chat memory from the previous question ("then")
response = agent.chat(
    "how to then compute uncertainty like conformal manually?"
)
print(response)  # print() applies str() to the response itself

Added user message to memory: how to then compute uncertainty like conformal manually?
=== LLM Response ===
To compute uncertainty like conformal manually, you can follow these steps:

1. Train your model on the proper training set.
2. Calculate non-conformity scores for the calibration set. Non-conformity scores measure how different a new example is from the training data. In the context of binary classification, a common choice for non-conformity score is the absolute difference between the predicted probability and the true label.
3. Sort the non-conformity scores in increasing order and store them in a list.
4. For a new test example, calculate its non-conformity score.
5. To compute the p-value, calculate the fraction of calibration examples that have a non-conformity score greater than the test example.
6. The uncertainty of the prediction can be computed as 1 - p-value.

Here is a simplified example:

```python
# Train the model
clf.fit(X_train_proper, y_train_proper)

# Calcul

In [10]:
# show the agent's final answer again, without the verbose log
print(response)

To compute uncertainty like conformal manually, you can follow these steps:

1. Train your model on the proper training set.
2. Calculate non-conformity scores for the calibration set. Non-conformity scores measure how different a new example is from the training data. In the context of binary classification, a common choice for non-conformity score is the absolute difference between the predicted probability and the true label.
3. Sort the non-conformity scores in increasing order and store them in a list.
4. For a new test example, calculate its non-conformity score.
5. To compute the p-value, calculate the fraction of calibration examples that have a non-conformity score greater than the test example.
6. The uncertainty of the prediction can be computed as 1 - p-value.

Here is a simplified example:

```python
# Train the model
clf.fit(X_train_proper, y_train_proper)

# Calculate non-conformity scores for the calibration set
calibration_scores = []
for i in range(len(X_calib)):
    