# Workbook for testing agent + tool functionality  

> Walkthroughs for testing and investigating agent functionality.

## Preliminaries

In [13]:

from alhazen.core import OllamaRunner
from alhazen.schema_sqla import *
from alhazen.tools.basic import *
from alhazen.tools.paperqa_emulation_tool import PaperQAEmulationTool
from alhazen.tools.metadata_extraction_tool import MetadataExtractionTool, MetadataExtractionWithRAGTool 
from alhazen.toolkit import AlhazenToolkit
from alhazen.utils.jats_text_extractor import NxmlDoc
from alhazen.utils.ceifns_db import Ceifns_LiteratureDb, create_ceifns_database, drop_ceifns_database
from alhazen.apps.chat import  AlhazenAgentChatBot

from langchain.callbacks.tracers import ConsoleCallbackHandler
from langchain.chat_models import ChatOllama
from langchain.docstore.document import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.pgvector import PGVector

from bs4 import BeautifulSoup,Tag,Comment,NavigableString
from databricks import sql
from datetime import datetime
from importlib_resources import files
import os
import pandas as pd
from pathlib import Path
import re
import requests

from sqlalchemy import create_engine, exists, func
from sqlalchemy.orm import sessionmaker, aliased

from time import time,sleep
from tqdm import tqdm
from urllib.request import urlopen
from urllib.parse import quote_plus, quote, unquote
from urllib.error import URLError, HTTPError
import yaml

In [14]:
# Short module-level aliases for the ORM classes used by the queries in this
# notebook, each created with sqlalchemy.orm.aliased(). The class names are not
# imported explicitly here — presumably they arrive via
# `from alhazen.schema_sqla import *` (confirm against that module).

# Collection / expression / item / fragment classes and the classes that relate
# them (HasMembers, MemberOf, HasRepresentation, HasPart — judging by the names
# these look like association tables; verify against the schema).
skc = aliased(ScientificKnowledgeCollection)
skc_p = aliased(ScientificKnowledgeCollectionProvenance)
skc_hm = aliased(ScientificKnowledgeCollectionHasMembers)
ske_mo = aliased(ScientificKnowledgeExpressionMemberOf)
ske = aliased(ScientificKnowledgeExpression)
ske_hr = aliased(ScientificKnowledgeExpressionHasRepresentation)
ski = aliased(ScientificKnowledgeItem)
ski_hp = aliased(ScientificKnowledgeItemHasPart)
skf = aliased(ScientificKnowledgeFragment)

# Note class and the "HasNotes" classes for each of the four levels above.
n = aliased(Note)
skc_hn = aliased(ScientificKnowledgeCollectionHasNotes)
ske_hn = aliased(ScientificKnowledgeExpressionHasNotes)
ski_hn = aliased(ScientificKnowledgeItemHasNotes)
skf_hn = aliased(ScientificKnowledgeFragmentHasNotes)

Remember to set the following environment variables before running this code:

* `ALHAZEN_DB_NAME` - the name of the PostgreSQL database in which you store information
* `LOCAL_FILE_PATH` - the location on disk where you save temporary files, downloaded models, or other data.

In [3]:
# Provide notebook defaults for the two required settings WITHOUT clobbering
# values the user has already exported: setdefault() only writes a key that is
# missing from the environment.
# NOTE(review): the fallback path is machine-specific; export LOCAL_FILE_PATH
# to point somewhere valid on your own machine.
os.environ.setdefault('ALHAZEN_DB_NAME', 'em_tech')
os.environ.setdefault('LOCAL_FILE_PATH', '/users/gully.burns/alhazen/')

# exist_ok=True makes the call a no-op when the directory is already there,
# replacing the separate exists() check (and the gap between check and create).
os.makedirs(os.environ['LOCAL_FILE_PATH'], exist_ok=True)

In [4]:
# Fail fast, with a question-style hint, when either required setting is
# absent from the environment; otherwise bind the values used by later cells.
if 'ALHAZEN_DB_NAME' not in os.environ:
    raise Exception('Which database do you want to use for this application?')
db_name = os.environ['ALHAZEN_DB_NAME']

if 'LOCAL_FILE_PATH' not in os.environ:
    raise Exception('Where are you storing your local literature database?')
loc = os.environ['LOCAL_FILE_PATH']

In [11]:
# Build the literature-database handle and the chat model, pass both to the
# toolkit, then list the class name of every tool the toolkit returns.
ldb = Ceifns_LiteratureDb(name=db_name, loc=loc)
llm = ChatOllama(model='mixtral:instruct')
tk = AlhazenToolkit(llm=llm, db=ldb)

print('AVAILABLE TOOLS')
for t in tk.get_tools():
    print(f'\t{type(t).__name__}')

AVAILABLE TOOLS
	AddCollectionFromEPMCTool
	DescribeCollectionCompositionTool
	DeleteCollectionTool
	RetrieveFullTextTool
	MetadataExtractionTool
	SimpleExtractionWithRAGTool
	PaperQAEmulationTool
	CheckExpressionTool
	IntrospectionTool


In [10]:
# One row per collection: its id, its name, and the number of membership rows
# attached to it, ordered by collection id.
q = (
    ldb.session
    .query(skc.id, skc.name, func.count(skc_hm.has_members_id))
    .filter(skc.id == skc_hm.ScientificKnowledgeCollection_id)
    .group_by(skc.id, skc.name)
    .order_by(skc.id)
)
corpora_df = pd.DataFrame(
    q.all(),
    columns=['Corpus ID', 'Corpus Name', 'Paper Count'],
)

# Count of all expression rows, regardless of which collection (if any) they
# belong to; .first() returns a one-element row, hence the [0].
paper_count = ldb.session.query(func.count(ske.id)).first()
print('Count of all papers in database: %d' % paper_count[0])

# Last expression of the cell: rendered as the cell's rich output.
corpora_df

Count of all papers in database: 2648


Unnamed: 0,Corpus ID,Corpus Name,Paper Count
0,0,CryoET Portal (10000-10010),10
1,1,Hierarchical phase-contrast tomography,131
2,2,Cryo-Electron Tomography,2516


In [8]:
# Roll back whatever transaction is currently open on the shared session.
# NOTE(review): presumably kept here to recover the session after a failed
# statement in an earlier run — confirm it is still needed.
ldb.session.rollback()

In [9]:
# Delete every expression that is not a member of any collection.
# The outer join keeps expressions that have no matching membership row; for
# those rows the joined `member_of_id` is NULL, which the filter selects.
# (`== None` is intentional: SQLAlchemy turns it into an IS NULL test.)
q = (
    ldb.session.query(ske)
    .outerjoin(ske_mo, ske.id == ske_mo.ScientificKnowledgeExpression_id)
    .filter(ske_mo.member_of_id == None)  # noqa: E711
)

# Materialise the result ONCE. Calling q.all() separately for the count and for
# the loop would run the query twice and load every row twice, and the printed
# count could differ from the set actually processed.
orphans = q.all()
print(len(orphans))

try:
    for e in tqdm(orphans):
        # commit_this=False: presumably defers the commit so that all deletions
        # are committed together below — confirm in Ceifns_LiteratureDb.
        ldb.delete_expression(e.id, commit_this=False)
    ldb.session.commit()
except Exception:
    # Leave the session usable (no half-applied batch) and surface the error.
    ldb.session.rollback()
    raise


29411


100%|██████████| 29411/29411 [06:11<00:00, 79.22it/s]


In [12]:
# Run an agent in Jupyter: instantiate the chat bot. The cells below use its
# `agent_executor` attribute and its `get_introspection_tool()` method.
cb = AlhazenAgentChatBot()

In [8]:
# Send a natural-language request through the agent executor and trace every
# chain step to the console via ConsoleCallbackHandler.
# CAUTION: the request asks the agent to remove a collection.
out = cb.agent_executor.invoke(
    {'input': 'remove the collection with the id "cryoet_neurons"'},
    config={'callbacks': [ConsoleCallbackHandler()]},
)

[32;1m[1;3m[chain/start][0m [1m[1:chain:AgentExecutor] Entering Chain run with input:
[0m{
  "input": "remove the collection with the id \"cryoet_neurons\""
}


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m[chain/start][0m [1m[1:chain:AgentExecutor > 2:chain:RunnableSequence] Entering Chain run with input:
[0m{
  "input": ""
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:AgentExecutor > 2:chain:RunnableSequence > 3:chain:RunnableAssign<agent_scratchpad>] Entering Chain run with input:
[0m{
  "input": ""
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:AgentExecutor > 2:chain:RunnableSequence > 3:chain:RunnableAssign<agent_scratchpad> > 4:chain:RunnableParallel<agent_scratchpad>] Entering Chain run with input:
[0m{
  "input": ""
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:AgentExecutor > 2:chain:RunnableSequence > 3:chain:RunnableAssign<agent_scratchpad> > 4:chain:RunnableParallel<agent_scratchpad> > 5:chain:RunnableLambda] Entering Chain run with input:
[0m{
  "i

In [12]:
# Call the introspection tool's `agent` directly, with no intermediate steps,
# and display whatever invoke() returns as the cell output.
t = cb.get_introspection_tool().agent
t.invoke(
    {
        'intermediate_steps': [],
        'input': 'How can I evaluate the quality of an embedding of a single cell transcriptomics dataset?',
    }
)

AgentAction(tool='add_collection_from_epmc_query', tool_input={'query': 'evaluation of embeddings for single-cell transcriptomics datasets'}, log=' Thought: To answer this question, I would need to find scientific papers related to the evaluation of embeddings for single-cell transcriptomics datasets. I can use the `add_collection_from_epmc_query` tool to search for relevant papers and create a collection for further analysis.\n\nAction:\n```json\n{\n  "action": "add_collection_from_epmc_query",\n  "action_input": {\n    "query": "evaluation of embeddings for single-cell transcriptomics datasets"\n  }\n}\n```\n')