# RNAquarium  

> Using LLMs to extract information from RNA studies in Zebrafish

## Basics

### Python Imports

Setting python imports, environment variables, and other crucial set up parameters here.  

In [None]:
from alhazen.core import get_langchain_chatmodel, MODEL_TYPE
from alhazen.agent import AlhazenAgent
from alhazen.schema_sqla import *
from alhazen.core import get_langchain_chatmodel, MODEL_TYPE
from alhazen.tools.basic import AddCollectionFromEPMCTool, DeleteCollectionTool
from alhazen.tools.paperqa_emulation_tool import PaperQAEmulationTool
from alhazen.tools.metadata_extraction_tool import * 
from alhazen.tools.protocol_extraction_tool import *
from alhazen.toolkit import *
from alhazen.utils.jats_text_extractor import NxmlDoc

from alhazen.utils.jats_text_extractor import NxmlDoc
from alhazen.utils.ceifns_db import Ceifns_LiteratureDb, create_ceifns_database, drop_ceifns_database, list_databases
from alhazen.utils.searchEngineUtils import *


from langchain.callbacks.tracers import ConsoleCallbackHandler
from langchain.docstore.document import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.pgvector import PGVector
from langchain_community.chat_models.ollama import ChatOllama
from langchain_google_vertexai import ChatVertexAI
from langchain_openai import ChatOpenAI

from bs4 import BeautifulSoup,Tag,Comment,NavigableString
from databricks import sql
from datetime import datetime
from importlib_resources import files
import os
import pandas as pd
from pathlib import Path
import re
import requests

from sqlalchemy import create_engine, exists, func, or_, and_, not_, desc, asc
from sqlalchemy.orm import sessionmaker, aliased

from time import time,sleep
from tqdm import tqdm
from urllib.request import urlopen
from urllib.parse import quote_plus, quote, unquote
from urllib.error import URLError, HTTPError
import yaml

In [None]:
# Using Aliases like this massively simplifies the use of SQLAlchemy
IR = aliased(InformationResource)

SKC = aliased(ScientificKnowledgeCollection)
SKC_HM = aliased(ScientificKnowledgeCollectionHasMembers)
SKE = aliased(ScientificKnowledgeExpression)
SKE_XREF = aliased(ScientificKnowledgeExpressionXref)
SKE_IRI = aliased(ScientificKnowledgeExpressionIri)
SKE_HR = aliased(ScientificKnowledgeExpressionHasRepresentation)
SKE_MO = aliased(ScientificKnowledgeExpressionMemberOf)
SKI = aliased(ScientificKnowledgeItem)
SKI_HP = aliased(ScientificKnowledgeItemHasPart)
SKF = aliased(ScientificKnowledgeFragment)

N = aliased(Note)
NIA = aliased(NoteIsAbout)
SKC_HN = aliased(ScientificKnowledgeCollectionHasNotes)
SKE_HN = aliased(ScientificKnowledgeExpressionHasNotes)
SKI_HN = aliased(ScientificKnowledgeItemHasNotes)
SKF_HN = aliased(ScientificKnowledgeFragmentHasNotes)

### Environment Variables

Remember to set environmental variables for this code:

* `ALHAZEN_DB_NAME` - the name of the PostGresQL database you are storing information into
* `LOCAL_FILE_PATH` - the location on disk where you save temporary files, downloaded models or other data.   

In [None]:
os.environ['ALHAZEN_DB_NAME'] = 'rnaquarium'
os.environ['LOCAL_FILE_PATH'] = '/users/gully.burns/alhazen/'

In [None]:
if os.path.exists(os.environ['LOCAL_FILE_PATH']) is False:
    os.makedirs(os.environ['LOCAL_FILE_PATH'])
    
if os.environ.get('ALHAZEN_DB_NAME') is None: 
    raise Exception('Which database do you want to use for this application?')
db_name = os.environ['ALHAZEN_DB_NAME']

if os.environ.get('LOCAL_FILE_PATH') is None: 
    raise Exception('Where are you storing your local literature database?')
loc = os.environ['LOCAL_FILE_PATH']

### Setup utils, agents, and tools 

In [None]:
ldb = Ceifns_LiteratureDb(loc=loc, name=db_name)
llm = ChatOllama(model='mixtral:instruct') 
llm2 = ChatOpenAI(model='gpt-4-1106-preview') 
llm2 = ChatOpenAI(model='gpt-4-1106-preview') 
#llm3 = ChatVertexAI(model_name="gemini-pro", convert_system_message_to_human=True)

cb = AlhazenAgent(llm, llm)
print('AGENT TOOLS')
for t in cb.tk.get_tools():
    print('\t'+type(t).__name__)

test_tk = MetadataExtractionToolkit(db=ldb, llm=llm2)
print('\nTESTING TOOLS')
for t in test_tk.get_tools():
    print('\t'+type(t).__name__)

AGENT TOOLS
	AddCollectionFromEPMCTool
	AddAuthorsToCollectionTool
	DescribeCollectionCompositionTool
	DeleteCollectionTool
	RetrieveFullTextTool
	RetrieveFullTextToolForACollection
	MetadataExtraction_EverythingEverywhere_Tool
	SimpleExtractionWithRAGTool
	PaperQAEmulationTool
	ProcotolExtractionTool
	CheckExpressionTool

TESTING TOOLS
	MetadataExtraction_EverythingEverywhere_Tool
	MetadataExtraction_RAGOnSections_Tool
	SimpleExtractionWithRAGTool


## Building the database


### Scripts to Build / Delete the database

If you need to restore a deleted database from backup, use the following shell commands:

```
$ createdb em_tech
$ psql -d em_tech -f /local/file/path/em_tech/backup<date_time>.sql
```

In [None]:
drop_ceifns_database(os.environ['ALHAZEN_DB_NAME'])

In [None]:
create_ceifns_database(os.environ['ALHAZEN_DB_NAME'])

### Build CEIFNS database from 900 dois in database

In [None]:
import local_resources.data_files.rnaquarium as rnaquarium
from alhazen.utils.queryTranslator import QueryTranslator, QueryType

df = pd.read_csv(files(rnaquarium).joinpath('RNAquarium_paper_list.tsv'), sep='\t')
dois = df['DOI'].to_list()
df

Unnamed: 0,Key,Item Type,Publication Year,Author,Title,Publication Title,ISBN,ISSN,DOI,Url,...,Unnamed: 163,Unnamed: 164,Unnamed: 165,Unnamed: 166,Unnamed: 167,Unnamed: 168,Unnamed: 169,Unnamed: 170,Unnamed: 171,Unnamed: 172
0,2FYDLCGI,journalArticle,2014,"Han, Peidong; Zhou, Xiao-Hai; Chang, Nannan; X...",Hydrogen peroxide primes heart regeneration wi...,Cell Research,,1748-7838,10.1038/cr.2014.108,,...,,,,,,,,,,
1,8RZKQRRU,journalArticle,2014,"Marín-Juez, Rubén; Jong-Raadsen, Susanne; Yang...",Hyperinsulinemia induces insulin resistance an...,The Journal of Endocrinology,,1479-6805,10.1530/JOE-14-0178,,...,,,,,,,,,,
2,CPWU52VI,journalArticle,2014,"Benard, Erica L.; Roobol, Stefan J.; Spaink, H...",Phagocytosis of mycobacteria by zebrafish macr...,Developmental and Comparative Immunology,,1879-0089,10.1016/j.dci.2014.07.022,,...,,,,,,,,,,
3,E2XPRJPW,journalArticle,2014,"Howarth, Deanna L.; Lindtner, Claudia; Vacaru,...",Activating transcription factor 6 is necessary...,PLoS genetics,,1553-7404,10.1371/journal.pgen.1004335,,...,,,,,,,,,,
4,RYI6EAFJ,journalArticle,2014,"Desvignes, Thomas; Beam, Michael J.; Batzel, P...",Expanding the annotation of zebrafish microRNA...,Gene,,1879-0038,10.1016/j.gene.2014.05.036,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
906,MNQQTVH8,journalArticle,2022,"Zhang, Zhicong; Ji, Fengyu; Jiang, Shouwen; Wu...",Scale Development-Related Genes Identified by ...,Fishes,,2410-3888,10.3390/fishes7020064,https://www.mdpi.com/2410-3888/7/2/64,...,,,,,,,,,,
907,E9E2BJC8,journalArticle,2022,"Hu, Ming-Liang; Wu, Bao-Sheng; Xu, Wen-Jie; Zh...",Gene expression responses in zebrafish to shor...,Zoological Research,,2095-8137,10.24272/j.issn.2095-8137.2021.352,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8...,...,,,,,,,,,,
908,ME3REM8F,journalArticle,2022,"Herrera-Rivero, Marisol; Gandhi, Shrey; Witten...",Cardiac chamber-specific genetic alterations s...,Genomics,,8887543,10.1016/j.ygeno.2022.110320,https://linkinghub.elsevier.com/retrieve/pii/S...,...,,,,,,,,,,
909,ICZFTJFQ,journalArticle,2021,"Gandhi, Shrey; Witten, Anika; De Majo, Federic...",Evolutionarily conserved transcriptional lands...,Genomics,,8887543,10.1016/j.ygeno.2021.09.002,https://linkinghub.elsevier.com/retrieve/pii/S...,...,,,,,,,,,,


In [None]:
addEMPCCollection_tool = [t for t in cb.tk.get_tools() if isinstance(t, AddCollectionFromEPMCTool)][0]
step = 40
for start_i in range(800, len(dois), step):
    query = ' OR '.join(['doi:\"'+dois[i]+'\"' for i in range(start_i, start_i+step) if i < len(dois)])
    addEMPCCollection_tool.run({'id': '0', 'name':'RNAquarium Papers', 'query':query, 'full_text':True})

In [None]:
# Compare contents of database to the list of dois
missing_list = []
titles = []
for doi in dois:
    row = df[df['DOI']==doi]
    doi_in_db = ldb.session.query(SKE).filter(SKE.id=='doi:'+doi.lower()).all()
    if len(doi_in_db) == 0:
        print('DOI: '+doi)
        print('\t%s (%d) %s %s'%(row['Author'].iloc[0],row['Publication Year'].iloc[0],row['Title'].iloc[0],row['Journal Abbreviation'].iloc[0]))
        missing_list.append(doi)
        titles.append(row['Title'].iloc[0])
print('%d Missing DOIs'%(len(missing_list)))

0 Missing DOIs


Use OpenAlex as filler to add papers that were missed on EPMC

In [None]:

from alhazen.utils.searchEngineUtils import load_paper_from_openalex, read_references_from_openalex 
from pyalex import config, Works, Work
config.email = "gully.burns@chanzuckerberg.com"

import requests
import os
ldb.session.rollback()
corpus = ldb.session.query(SKC).filter(SKC.id=='0').first()
count = 0
print(len(corpus.has_members))

papers_to_index = []
for i, doi in enumerate(missing_list):
    p = load_paper_from_openalex(doi)
    ldb.session.add(p)
    corpus.has_members.append(p)
    p.member_of.append(corpus)
    for item in p.has_representation:
        for f in item.has_part:
            #f.content = '\n'.join(self.sent_detector.tokenize(f.content))
            f.part_of = item.id
            ldb.session.add(f)
        item.represented_by = p.id
        ldb.session.add(item)
    papers_to_index.append(p)
    ldb.session.flush()

ldb.embed_expression_list(papers_to_index)

ldb.session.commit()

945


NameError: name 'missing_list' is not defined

#### Get full text copies of all the papers about CryoET


In [None]:
cb.db.session.rollback()
error_doi = 'doi:10.3389/fcell.2021.699796'
cb.agent_executor.invoke({'input':'Retrieve full text for the paper with doi="'+error_doi+'".'})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m{
  "action": "retrieve_full_text_for_paper_id",
  "action_input": {
    "paper_id": "doi:10.3389/fcell.2021.699796"
  }
}[0mlist index out of range
[33;1m[1;3m{'response': 'The full text paper with doi:`doi:10.3389/fcell.2021.699796` is available in the database.'}[0m
[32;1m[1;3m[0m

[1m> Finished chain.[0m


{'input': 'Retrieve full text for the paper with doi="doi:10.3389/fcell.2021.699796".',
 'output': {'response': 'The full text paper with doi:`doi:10.3389/fcell.2021.699796` is available in the database.'},
 'intermediate_steps': [(AgentAction(tool='retrieve_full_text_for_paper_id', tool_input={'paper_id': 'doi:10.3389/fcell.2021.699796'}, log='{\n  "action": "retrieve_full_text_for_paper_id",\n  "action_input": {\n    "paper_id": "doi:10.3389/fcell.2021.699796"\n  }\n}'),
   {'response': 'The full text paper with doi:`doi:10.3389/fcell.2021.699796` is available in the database.'})]}

In [None]:
cb.db.session.rollback()
cb.agent_executor.invoke({'input':'Retrieve full text for the collection with id="0".'})

## Analyze Collections

In [None]:
q = ldb.session.query(SKC.id, SKC.name, SKE.id, SKI.type) \
        .filter(SKC.id==SKC_HM.ScientificKnowledgeCollection_id) \
        .filter(SKC_HM.has_members_id==SKE.id) \
        .filter(SKE.id==SKE_HR.ScientificKnowledgeExpression_id) \
        .filter(SKE_HR.has_representation_id==SKI.id) 
df = pd.DataFrame(q.all(), columns=['id', 'collection name', 'doi', 'item type'])    
df.pivot_table(index=['id', 'collection name'], columns='item type', values='doi', aggfunc=lambda x: len(x.unique()))

Unnamed: 0_level_0,item type,CitationRecord,HTMLFullText,JATSFullText,PDFFullText
id,collection name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,RNAquarium Papers,945,1,610,666


In [None]:
# show all rows in dataframe
pd.set_option('display.max_rows', None)

piv_df = df.pivot_table(index=['doi'], 
                        columns='item type', values='collection name', 
                        aggfunc=lambda x: len(x.unique()))
piv_df.fillna(0, inplace=True)
piv_df = piv_df.reset_index(drop=False).sort_values(by='doi')
print(len(piv_df[(piv_df['JATSFullText']==0.0) & (piv_df['PDFFullText']==0.0)]))
piv_df[(piv_df['JATSFullText']==0.0) & (piv_df['PDFFullText']==0.0)]



68


item type,doi,CitationRecord,HTMLFullText,JATSFullText,PDFFullText
6,doi:10.1002/dvdy.24188,1.0,0.0,0.0,0.0
7,doi:10.1002/dvdy.24600,1.0,0.0,0.0,0.0
13,doi:10.1002/glia.24075,1.0,0.0,0.0,0.0
26,doi:10.1007/s00424-017-2009-8,1.0,0.0,0.0,0.0
31,doi:10.1007/s11427-020-1878-8,1.0,0.0,0.0,0.0
32,doi:10.1007/s11427-021-2223-4,1.0,0.0,0.0,0.0
35,doi:10.1016/bs.adgen.2016.04.004,1.0,0.0,0.0,0.0
39,doi:10.1016/j.aquatox.2013.10.006,1.0,0.0,0.0,0.0
42,doi:10.1016/j.aquatox.2019.105290,1.0,0.0,0.0,0.0
45,doi:10.1016/j.bbrc.2015.03.128,1.0,0.0,0.0,0.0


## Tests + Checks 


### Agent tool selection + execution + interpretation

In [None]:
cb.agent_executor.invoke({'input':'Hi who are you and what can you do?'})


## Run MetaData Extraction Chain over listed papers

Here, we run various versions of the metadata extraction tool to examine performance over the cryoet dataset. 

In [None]:
import local_resources.data_files.cryoet_portal_metadata as cryoet_portal_metadata
str(files(cryoet_portal_metadata).joinpath('temp'))[0:-4]

In [None]:
import local_resources.data_files.cryoet_portal_metadata as cryoet_portal_metadata

# Get the metadata extraction tool
t2 = [t for t in test_tk.get_tools() if isinstance(t, MetadataExtraction_EverythingEverywhere_Tool)][0]

# Hack to get the path to the metadata directory as a string
metadata_dir = str(files(cryoet_portal_metadata).joinpath('temp'))[0:-4]

# Compile the answers from the metadata directory
t2.compile_answers('cryoet', metadata_dir)

# Create a dataframe to store previously extracted metadata
df = pd.DataFrame()
for d in [d for d_id in dois for d in dois[d_id]]:
    item_types = set()
    d_id = 'doi:'+d
    df = pd.concat([df, t2.build_report(d_id, 'cryoet')]) 
     
# Iterate over papers to run the metadata extraction tool
for d in [d for d_id in dois for d in dois[d_id]]:
    item_types = set()
    d_id = 'doi:'+d

    # Skip if the doi is already in the database
    if d_id in df.doi.unique():
        continue

    # Run the metadata extraction tool on the doi
    t2.run(tool_input={'paper_id': d_id, 'extraction_type': 'cryoet'})

    # Add the results to the dataframe
    df = pd.concat([df, t2.build_report(d_id, 'cryoet')]) 
    

In [None]:
t2 = [t for t in test_tk.get_tools() if isinstance(t, MetadataExtraction_EverythingEverywhere_Tool)][0]
for i, row in empiar_gs_df.iterrows():
    d_id = 'doi:'+row.doi
    if d_id in df.doi.unique():
        continue
    t2.run(tool_input={'paper_id': d_id, 'extraction_type': 'cryoet'})
    df = pd.concat([df, t2.build_report(d_id, 'cryoet')])

In [None]:
# USE WITH CAUTION - this will delete all extracted metadata notes in the database
# clear all notes across papers listed in `dois` list
for d in [d for d_id in dois for d in dois[d_id]]:
    d_id = 'doi:'+d
    e = ldb.session.query(SKE).filter(SKE.id==d_id).first()
    notes_to_delete = []
    for n in ldb.read_notes_about_x(e):
        notes_to_delete.append(n.id)
    for n in notes_to_delete:
        ldb.delete_note(n)

## Protocol Modeling + Extraction

In [None]:
ldb = Ceifns_LiteratureDb(loc=loc, name=db_name)
slm = ChatOllama(model='stablelm-zephyr') 
llm = ChatOllama(model='mixtral:instruct') 
llm2 = ChatOpenAI(model='gpt-4-1106-preview') 
d = ("This tool attempts to draw a protocol design from the description of a scientific paper.")
t = ProcotolExtractionTool(db=ldb, llm=llm2, description=d)
t.run(tool_input={'paper_id': 'doi:10.1101/2022.04.12.488077', 'extraction_type': 'cryoet'})