# Initial Setup

In [1]:
import os
from typing import List,Optional

# llama index imports
from llama_index.core import SimpleDirectoryReader,VectorStoreIndex,SummaryIndex, StorageContext, Settings, load_index_from_storage
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core.node_parser import SentenceSplitter, CodeSplitter, LangchainNodeParser
from llama_index.core.tools import FunctionTool,QueryEngineTool
from llama_index.core.vector_stores import MetadataFilters,FilterCondition
from llama_index.core.objects import ObjectIndex
from llama_index.readers.file import IPYNBReader

# llama index agent imports
from llama_index.core.agent import FunctionCallingAgentWorker, ReActAgent
from llama_index.core.agent import AgentRunner

# llama index llms and embeddings imports
from llama_index.llms.mistralai import MistralAI
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.mistralai import MistralAIEmbedding
from llama_index.embeddings.fastembed import FastEmbedEmbedding
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# langchain imports
from langchain.text_splitter import RecursiveCharacterTextSplitter, Language

# tools
import nest_asyncio # to allow running async functions in jupyter
import chromadb # persistent storage for vectors
# import nbconvert
import tree_sitter
import tree_sitter_languages

  from .autonotebook import tqdm as notebook_tqdm


In [42]:
nest_asyncio.apply() # to allow running async functions in jupyter

# setting flags
create_index = True # True: build new indexes via get_doc_tools; False: load them via get_doc_tools_from_storage

# configuration
# Credentials must never be hardcoded in a notebook: read the key from the environment.
# NOTE(review): a literal key used to be committed on this line; treat it as leaked and revoke it.
MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY", "")
embedding = "BAAI/bge-small-en-v1.5"
# embedding = "Qdrant/bm42-all-minilm-l6-v2-attentions"
# embedding = "mistral-embed"
# embedding = OllamaEmbedding(
#     model_name="llama2",
#     base_url="http://localhost:11434",
#     ollama_additional_kwargs={"mirostat": 0}
# )
# embedding = "Salesforce/codet5p-110m-embedding"
# llm_model = "mistral-large-latest"
llm_model = "codellama"
# chunk_size / chunk_overlap are passed to RecursiveCharacterTextSplitter in get_doc_tools and
# to Settings below; that splitter presumably measures length in characters (its default), not lines.
chunk_size = 2000
chunk_overlap = 200
language = "python"
data_path = "./data_python"

# setup the llm and embedding
embed_model = FastEmbedEmbedding(model_name=embedding)
# embed_model = MistralAIEmbedding(model_name=embedding, api_key=MISTRAL_API_KEY)
# embed_model =  HuggingFaceEmbedding(model_name=embedding)
Settings.embed_model = embed_model
Settings.chunk_size = chunk_size
Settings.chunk_overlap = chunk_overlap
# os.environ["MISTRAL_API_KEY"] = MISTRAL_API_KEY
# llm = MistralAI(model=llm_model, temperature=0.0)
# temperature = 0.0 for deterministic results
llm = Ollama(model=llm_model, request_timeout=1200.0, base_url="http://localhost:11434", temperature=0.0)
Settings.llm = llm

# setup the persistent storage for vector store
db = chromadb.PersistentClient(path="./chroma_db_mistral_python")
chroma_collection = db.get_or_create_collection("code-agent")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

Fetching 5 files: 100%|██████████| 5/5 [00:00<?, ?it/s]


# Utility functions

In [18]:
# # for investigating node outputs

# #load documents
# file_path = "./data_python/tabular_classification_binary.ipynb"
# documents = SimpleDirectoryReader(input_files = [file_path]).load_data()
# print(f"length of nodes")
# # splitter = CodeSplitter(language="python", chunk_lines=chunk_size, chunk_lines_overlap=chunk_overlap, max_chars=max_chars)
# splitter = LangchainNodeParser(RecursiveCharacterTextSplitter().from_language(Language.PYTHON, chunk_size=2000, chunk_overlap=0))
# nodes = splitter.get_nodes_from_documents(documents)
# print(f"Length of nodes : {len(nodes)}")

In [3]:
# function for setting vector and summary tool from a document by creating new vector and summary index
def get_doc_tools(file_path:str, name:str) -> tuple:
  """Get vector query and summary query tools from a jupyter notebook.

  Loads ``file_path``, splits it into nodes, builds a new vector index (stored
  through the module-level ``storage_context``) and a new summary index
  (persisted to ``./db_mistral_python``), and wraps both as agent tools.

  Args:
    file_path (str): path of the notebook to index.
    name (str): suffix used in the tool names ``vector_tool_<name>`` and ``summary_tool_<name>``.

  Returns:
    tuple: ``(vector_query_tool, summary_query_tool)``.
  """

  #load documents
  documents = SimpleDirectoryReader(input_files = [file_path]).load_data()
  # splitter = CodeSplitter(language="python", chunk_lines=chunk_size, chunk_lines_overlap=chunk_overlap, max_chars=max_chars)
  splitter = LangchainNodeParser(RecursiveCharacterTextSplitter().from_language(Language.PYTHON, chunk_size=chunk_size, chunk_overlap=chunk_overlap))
  nodes = splitter.get_nodes_from_documents(documents)
  print(f"Length of nodes : {len(nodes)}")

  #instantiate Vector store
  vector_index = VectorStoreIndex(nodes,storage_context=storage_context)
  # NOTE(review): "/content/chroma_db" is an absolute path that differs from the path given to
  # chromadb.PersistentClient in the configuration cell; the Chroma client presumably persists on
  # its own - confirm whether this call is needed at all.
  vector_index.storage_context.vector_store.persist(persist_path="/content/chroma_db")

  # Vector store Auto retrieval query engine method
  def vector_query(query:str) -> str:
    """
    Answer a specific question about the notebook using vector similarity search.

    query (str): the string query to be embedded
    """
    query_engine = vector_index.as_query_engine(similarity_top_k =2) # set vector query engine with similarity as top 2 results

    response = query_engine.query(query)
    return response

  # Prepare Vector Tool
  vector_query_tool = FunctionTool.from_defaults(name=f"vector_tool_{name}", fn=vector_query)

  # Prepare Summary Tool
  summary_index = SummaryIndex(nodes)
  # NOTE(review): every call persists to the same directory, so when several notebooks are indexed
  # presumably only the last summary index remains on disk - TODO confirm.
  summary_index.storage_context.persist(persist_dir="./db_mistral_python") # save the summary index to disk
  summary_query_engine = summary_index.as_query_engine(response_mode="tree_summarize", use_async=True) # set summary query engine with tree summarization
  # the two adjacent literals are concatenated, so the first one must end with ". "
  summary_description = (
    "Use ONLY IF you want to get a holistic approach for full implementation. "
    "DO NOT USE if you have question for specific implementation."
  )
  summary_query_tool = QueryEngineTool.from_defaults(name=f"summary_tool_{name}", query_engine=summary_query_engine, description=summary_description) # set summary query tool with prompt
  return vector_query_tool,summary_query_tool


In [4]:
# function for setting vector and summary tool from a document by loading vector and summary index from storage
def get_doc_tools_from_storage(file_path:str, name:str) -> tuple:
  """Get vector query and summary query tools from previously persisted indexes.

  The vector index is rebuilt on top of the module-level ``vector_store`` and the
  summary index is loaded from ``./db_mistral_python``.

  Args:
    file_path (str): not used by this function; kept so the signature matches ``get_doc_tools``.
    name (str): suffix used in the tool names ``vector_tool_<name>`` and ``summary_tool_<name>``.

  Returns:
    tuple: ``(vector_query_tool, summary_query_tool)``.
  """

  #load vector store
  vector_index = VectorStoreIndex.from_vector_store(vector_store=vector_store, storage_context=storage_context)

  # Vector store Auto retrieval query engine method
  def vector_query(query:str, page_numbers:Optional[List[str]] = None) -> str:
    """
    query (str): the string query to be embedded
    page_numbers Optional[List[str]]: List of page numbers to be retrieved.
    Leave as NONE if we want to perform a vector search over all pages. 
    Otherwise, filter by the set of specified pages.
    Always leave page_numbers as None UNLESS there is a specific page you want to search for.
    """
    # Only build a metadata filter when pages were requested: an empty filter list can end up
    # as an empty `where` clause in the vector store query instead of meaning "no filter".
    filters = None
    if page_numbers:
      # NOTE(review): assumes the stored nodes carry a 'page_label' metadata key - confirm for notebooks
      metadata_dict = [{"key":'page_label', "value":p} for p in page_numbers]
      filters = MetadataFilters.from_dicts(metadata_dict, condition=FilterCondition.OR)

    query_engine = vector_index.as_query_engine(similarity_top_k =2, filters = filters) # set vector query engine with similarity as top 2 results

    response = query_engine.query(query)
    return response

  # Prepare Vector Tool
  vector_query_tool = FunctionTool.from_defaults(name=f"vector_tool_{name}", fn=vector_query)

  # Prepare Summary Tool
  storage_context_all = StorageContext.from_defaults(persist_dir="./db_mistral_python") # set storage context for summary index
  summary_index = load_index_from_storage(storage_context=storage_context_all) # load summary index from storage
  summary_query_engine = summary_index.as_query_engine(response_mode="tree_summarize", use_async=True) # set summary query engine with tree summarization
  # the two adjacent literals are concatenated, so the first one must end with a space
  summary_description = (
    "Use ONLY IF you want to get a holistic summary of the documents. "
    "DO NOT USE if you have specified questions over the documents."
  )
  summary_query_tool = QueryEngineTool.from_defaults(name=f"summary_tool_{name}", query_engine=summary_query_engine, description=summary_description) # set summary query tool with prompt
  return vector_query_tool,summary_query_tool

In [5]:
# create list of vector and summary tools for all documents in the path
def get_doc_tools_from_path(path:str) -> list:
  """Collect the vector and summary tools for every ``.ipynb`` file directly inside ``path``.

  Depending on the module-level ``create_index`` flag the tools are built from
  scratch (``get_doc_tools``) or loaded from storage (``get_doc_tools_from_storage``).

  Args:
    path (str): directory to scan (not recursive).

  Returns:
    list: flat list ``[vector_tool, summary_tool, ...]`` with two tools per notebook.
  """
  initial_tools = []
  # sorted() gives a reproducible tool order; os.listdir returns entries in arbitrary order
  for file in sorted(os.listdir(path)):
    if not file.endswith(".ipynb"):
      continue
    # Strip only the extension. Splitting on the first "." truncated names such as
    # "a.b.ipynb" to "a", so different notebooks could collide on the same tool name.
    # Remaining dots are replaced so the name stays a plain identifier-like string.
    name = os.path.splitext(file)[0].replace(".", "_")
    filename = os.path.join(path, file)
    if create_index:
      vector_query_tool,summary_query_tool = get_doc_tools(filename,name)
    else:
      vector_query_tool,summary_query_tool = get_doc_tools_from_storage(filename,name)
    initial_tools.extend([vector_query_tool,summary_query_tool])
  return initial_tools

# Agent Setup

In [22]:
# build the tools for every notebook under data_path, index them, and expose a retriever over them
initial_tools_for_data = get_doc_tools_from_path(path=data_path)
obj_index = ObjectIndex.from_objects(
    initial_tools_for_data,
    index_cls=VectorStoreIndex,
)
# the retriever hands the 2 most similar tools to the agent
obj_retriever = obj_index.as_retriever(similarity_top_k=2)

length of nodes
Length of nodes : 132


In [6]:
# # setup single agent
# agent_worker = FunctionCallingAgentWorker.from_tools(tool_retriever=obj_retriever, 
#                                                      llm=llm, 
#                                                      system_prompt="""You are an agent designed to answer queries over a given jupyter notebook. Please always use the tools provided to answer a question.Do not rely on prior knowledge.""", 
#                                                      verbose=True) 
# agent = AgentRunner(agent_worker)

In [43]:
# setup ReAct agent
# NOTE(review): ReActAgent.from_tools does not list a `system_prompt` parameter, so the prompt
# passed that way appears to be swallowed by **kwargs; extra instructions are supplied through
# `context` instead - confirm against the installed llama-index version.
# The former {question}/{context} template slots were never filled in and have been dropped.
react_context = """You are a proficient python developer. Respond with the syntactically correct code for the question asked. Make sure you follow these rules:
1. Use the tool outputs as context to understand the APIs and how to use them.
2. Ensure all the requirements in the question are met.
3. Ensure the output code syntax is correct.
4. All required dependencies should be imported above the code."""
agent = ReActAgent.from_tools(tool_retriever=obj_retriever,
                              llm=llm,
                              context=react_context,
                              verbose=True)

# Query

## Using CodeLlama (local, via Ollama)

In [44]:
response = await agent.achat("how to train xgboost model for sepsis dataset?")
print(str(response))

[1;3;38;5;200mThought: The current language of the user is English. I need to use a tool to help me answer the question.
Action: vector_tool_tabular_classification_binary
Action Input: {'query': 'how to train xgboost model for sepsis dataset?'}
[0m[1;3;34mObservation: To train an XGBoost model for the Sepsis dataset using the provided code, you can follow these steps:

1. Import the necessary libraries:
```python
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from xgboost import XGBClassifier
```
2. Load the Sepsis dataset and split it into training and testing sets:
```python
# load the Sepsis dataset
dataset = pd.read_csv("./dataset/Sepsis_Processed_IC.csv")

# split the data into training and testing sets
train_data, test_data = dataset.split(test_size=0.2, random_state=42)
```
3. Preprocess the data:
```python
# labels and features separation
X = train_data.drop(columns=['Target'], axis=1)
y = train_data['Target']

# one hot encoding the catego

In [45]:
# Print the reasoning steps of the most recent completed task.
# [-1] instead of [0]: index 0 is always the first task this agent completed, so re-running the
# cell after a later question would show stale steps.
# NOTE(review): assumes get_completed_tasks() returns tasks in creation order - confirm.
latest_task = agent.get_completed_tasks()[-1]
for reasoning_step in latest_task.extra_state["current_reasoning"]:
    print(f"{reasoning_step.__class__.__name__} -> {reasoning_step}\n")

ActionReasoningStep -> thought='The current language of the user is English. I need to use a tool to help me answer the question.' action='vector_tool_tabular_classification_binary' action_input={'query': 'how to train xgboost model for sepsis dataset?'}

ObservationReasoningStep -> observation='To train an XGBoost model for the Sepsis dataset using the provided code, you can follow these steps:\n\n1. Import the necessary libraries:\n```python\nimport pandas as pd\nfrom sklearn.preprocessing import OneHotEncoder, RobustScaler\nfrom xgboost import XGBClassifier\n```\n2. Load the Sepsis dataset and split it into training and testing sets:\n```python\n# load the Sepsis dataset\ndataset = pd.read_csv("./dataset/Sepsis_Processed_IC.csv")\n\n# split the data into training and testing sets\ntrain_data, test_data = dataset.split(test_size=0.2, random_state=42)\n```\n3. Preprocess the data:\n```python\n# labels and features separation\nX = train_data.drop(columns=[\'Target\'], axis=1)\ny = trai

In [46]:
# show the full answer text (print applies str() itself)
print(response)

To train an XGBoost model for the Sepsis dataset, you can follow these steps:

1. Import the necessary libraries:
```python
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from xgboost import XGBClassifier
```
2. Load the Sepsis dataset and split it into training and testing sets:
```python
# load the Sepsis dataset
dataset = pd.read_csv("./dataset/Sepsis_Processed_IC.csv")

# split the data into training and testing sets
train_data, test_data = dataset.split(test_size=0.2, random_state=42)
```
3. Preprocess the data:
```python
# labels and features separation
X = train_data.drop(columns=['Target'], axis=1)
y = train_data['Target']

# one hot encoding the category columns
category_columns = X.select_dtypes(include=['object']).columns
one_hot_encoder = OneHotEncoder(sparse_output=False)
X_encoded = one_hot_encoder.fit_transform(X[category_columns])
X_encoded = pd.DataFrame(X_encoded, columns=one_hot_encoder.get_feature_names_out(category_columns))
X = p

In [47]:
# follow-up question through the synchronous chat API
response = agent.chat("give me the code for computing conformal prediction manually on the trained model")
print(response)

[1;3;38;5;200mThought: (Implicit) I can answer without any more tools!
Answer: To compute conformal predictions manually on a trained XGBoost model, you can follow these steps:

1. Load the trained XGBoost model and the testing data:
```python
import xgboost as xgb
from sklearn.metrics import accuracy_score

# load the trained XGBoost model
model = xgb.XGBClassifier()
model.load_model("./models/xgb_model.json")

# load the testing data
test_data = pd.read_csv("./dataset/Sepsis_Processed_IC.csv")
```
2. Preprocess the testing data:
```python
# labels and features separation
X = test_data.drop(columns=['Target'], axis=1)
y = test_data['Target']

# one hot encoding the category columns
category_columns = X.select_dtypes(include=['object']).columns
one_hot_encoder = OneHotEncoder(sparse_output=False)
X_encoded = one_hot_encoder.fit_transform(X[category_columns])
X_encoded = pd.DataFrame(X_encoded, columns=one_hot_encoder.get_feature_names_out(category_columns))
X = pd.concat([X, X_encoded

In [49]:
# show the full answer text (print applies str() itself)
print(response)

To compute conformal predictions manually on a trained XGBoost model, you can follow these steps:

1. Load the trained XGBoost model and the testing data:
```python
import xgboost as xgb
from sklearn.metrics import accuracy_score

# load the trained XGBoost model
model = xgb.XGBClassifier()
model.load_model("./models/xgb_model.json")

# load the testing data
test_data = pd.read_csv("./dataset/Sepsis_Processed_IC.csv")
```
2. Preprocess the testing data:
```python
# labels and features separation
X = test_data.drop(columns=['Target'], axis=1)
y = test_data['Target']

# one hot encoding the category columns
category_columns = X.select_dtypes(include=['object']).columns
one_hot_encoder = OneHotEncoder(sparse_output=False)
X_encoded = one_hot_encoder.fit_transform(X[category_columns])
X_encoded = pd.DataFrame(X_encoded, columns=one_hot_encoder.get_feature_names_out(category_columns))
X = pd.concat([X, X_encoded], axis=1)
X.drop(columns=category_columns, inplace=True)

# convert bool to int

## Using the Mistral API

In [37]:
response = await agent.achat("how to train xgboost model for sepsis dataset?")
print(str(response))

[1;3;38;5;200mThought: The current language of the user is: English. I need to use a tool to help me answer the question.
Action: summary_tool_tabular_classification_binary
Action Input: {'input': 'how to train xgboost model for sepsis dataset?'}
[0m[1;3;34mObservation: To train an XGBoost model for the Sepsis dataset, you would first need to preprocess your data. This includes loading the dataset, separating labels and features, handling missing values, one-hot encoding categorical columns, converting boolean columns to integers, and standardizing numerical columns.

Once your data is preprocessed, you can split it into a training set and a test set using stratified sampling. Then, you can initialize the XGBoost model with specified parameters such as the number of estimators, learning rate, depth, and objective.

After initializing the model, you can train it on the training set using the `fit` method. Once the model is trained, you can make predictions on the test set using the `

In [38]:
# Print the reasoning steps of the most recent completed task.
# [-1] instead of [0]: index 0 is always the first task this agent completed, so re-running the
# cell after a later question would show stale steps.
# NOTE(review): assumes get_completed_tasks() returns tasks in creation order - confirm.
latest_task = agent.get_completed_tasks()[-1]
for reasoning_step in latest_task.extra_state["current_reasoning"]:
    print(f"{reasoning_step.__class__.__name__} -> {reasoning_step}\n")

ActionReasoningStep -> thought='The current language of the user is: English. I need to use a tool to help me answer the question.' action='summary_tool_tabular_classification_binary' action_input={'input': 'how to train xgboost model for sepsis dataset?'}

ObservationReasoningStep -> observation="To train an XGBoost model for the Sepsis dataset, you would first need to preprocess your data. This includes loading the dataset, separating labels and features, handling missing values, one-hot encoding categorical columns, converting boolean columns to integers, and standardizing numerical columns.\n\nOnce your data is preprocessed, you can split it into a training set and a test set using stratified sampling. Then, you can initialize the XGBoost model with specified parameters such as the number of estimators, learning rate, depth, and objective.\n\nAfter initializing the model, you can train it on the training set using the `fit` method. Once the model is trained, you can make prediction

In [39]:
# show the full answer text (print applies str() itself)
print(response)

To train an XGBoost model for the Sepsis dataset, you would first need to preprocess your data. This includes loading the dataset, separating labels and features, handling missing values, one-hot encoding categorical columns, converting boolean columns to integers, and standardizing numerical columns.

Once your data is preprocessed, you can split it into a training set and a test set using stratified sampling. Then, you can initialize the XGBoost model with specified parameters such as the number of estimators, learning rate, depth, and objective.

After initializing the model, you can train it on the training set using the `fit` method. Once the model is trained, you can make predictions on the test set using the `predict` method and calculate the prediction probabilities using the `predict_proba` method.

Finally, you can evaluate the model's performance by calculating metrics such as ROC AUC and average precision, and by plotting the ROC curve and precision-recall curve. If needed,

In [40]:
# follow-up question through the synchronous chat API
response = agent.chat("give me the code for computing conformal prediction manually on the trained model")
print(response)

[1;3;38;5;200mThought: The current language of the user is English. I need to provide the code for computing conformal prediction manually on a trained model. However, I don't have the necessary tools to generate the code directly. I will explain the process in a way that the user can write the code themselves.
Answer: To compute conformal prediction manually on a trained model, you can follow these steps:

1. Fit your model on the training data.
2. Compute non-conformity scores for the calibration set. Non-conformity scores measure how different a new example is from the examples used to train the model. In the case of classification, a common choice for non-conformity score is the difference between the maximum prediction probability and the probability assigned to the true class.
3. Sort the non-conformity scores computed in step 2 and store them in an array.
4. For a new test example, compute its non-conformity score using the same method as in step 2.
5. To compute the p-value fo

In [41]:
# show the full answer text (print applies str() itself)
print(response)

To compute conformal prediction manually on a trained model, you can follow these steps:

1. Fit your model on the training data.
2. Compute non-conformity scores for the calibration set. Non-conformity scores measure how different a new example is from the examples used to train the model. In the case of classification, a common choice for non-conformity score is the difference between the maximum prediction probability and the probability assigned to the true class.
3. Sort the non-conformity scores computed in step 2 and store them in an array.
4. For a new test example, compute its non-conformity score using the same method as in step 2.
5. To compute the p-value for the test example, calculate the fraction of calibration examples that have a non-conformity score greater than or equal to the test example's non-conformity score.
6. To obtain a prediction set for the test example, include all classes whose p-values are greater than a significance level alpha.

Here is a simplified ex