# Analyses of CryoET Methods and Terminology  

> Methods to extract metadata and study the structure of scientific protocols based on all available online data and knowledge.

## Introduction to CryoET

Cryo-electron Tomography (CryoET) involves rapidly freezing biological samples in their natural state to preserve their three-dimensional structure without the need for staining or crystallization. This methodology allows researchers to visualize proteins and other biomolecules at near-atomic resolution.

This digital library is based on capturing all papers that mention the technique in their titles, abstracts, or methods sections and then analyzing the various methods used and their applications. Our focus is on supporting the work of the Chan Zuckerberg Imaging Institute, [CZII](https://www.czimaginginstitute.org/) on developing [the CryoET data portal](https://cryoetdataportal.czscience.com/), an open source repository for CryoET-based data. 

## Basics

### Python Imports

Setting python imports, environment variables, and other crucial set up parameters here.  

In [2]:
from alhazen.core import get_langchain_chatmodel, MODEL_TYPE
from alhazen.agent import AlhazenAgent
from alhazen.schema_sqla import *
from alhazen.core import get_langchain_chatmodel, MODEL_TYPE
from alhazen.tools.basic import AddCollectionFromEPMCTool, DeleteCollectionTool
from alhazen.tools.paperqa_emulation_tool import PaperQAEmulationTool
from alhazen.tools.metadata_extraction_tool import * 
from alhazen.tools.protocol_extraction_tool import *
from alhazen.toolkit import *
from alhazen.utils.jats_text_extractor import NxmlDoc

from alhazen.utils.jats_text_extractor import NxmlDoc
from alhazen.utils.ceifns_db import Ceifns_LiteratureDb, create_ceifns_database, drop_ceifns_database, list_databases
from alhazen.utils.searchEngineUtils import *


from langchain.callbacks.tracers import ConsoleCallbackHandler
from langchain.docstore.document import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.pgvector import PGVector
from langchain_community.chat_models.ollama import ChatOllama
from langchain_google_vertexai import ChatVertexAI
from langchain_openai import ChatOpenAI

from bs4 import BeautifulSoup,Tag,Comment,NavigableString
from databricks import sql
from datetime import datetime
from importlib_resources import files
import os
import pandas as pd
from pathlib import Path
import re
import requests

from sqlalchemy import create_engine, exists, func, or_, and_, not_, desc, asc
from sqlalchemy.orm import sessionmaker, aliased

from time import time,sleep
from tqdm import tqdm
from urllib.request import urlopen
from urllib.parse import quote_plus, quote, unquote
from urllib.error import URLError, HTTPError
import yaml

In [3]:
# Using Aliases like this massively simplifies the use of SQLAlchemy
IR = aliased(InformationResource)

SKC = aliased(ScientificKnowledgeCollection)
SKC_HM = aliased(ScientificKnowledgeCollectionHasMembers)
SKE = aliased(ScientificKnowledgeExpression)
SKE_XREF = aliased(ScientificKnowledgeExpressionXref)
SKE_IRI = aliased(ScientificKnowledgeExpressionIri)
SKE_HR = aliased(ScientificKnowledgeExpressionHasRepresentation)
SKE_MO = aliased(ScientificKnowledgeExpressionMemberOf)
SKI = aliased(ScientificKnowledgeItem)
SKI_HP = aliased(ScientificKnowledgeItemHasPart)
SKF = aliased(ScientificKnowledgeFragment)

N = aliased(Note)
NIA = aliased(NoteIsAbout)
SKC_HN = aliased(ScientificKnowledgeCollectionHasNotes)
SKE_HN = aliased(ScientificKnowledgeExpressionHasNotes)
SKI_HN = aliased(ScientificKnowledgeItemHasNotes)
SKF_HN = aliased(ScientificKnowledgeFragmentHasNotes)

### Environment Variables

Remember to set environmental variables for this code:

* `ALHAZEN_DB_NAME` - the name of the PostGresQL database you are storing information into
* `LOCAL_FILE_PATH` - the location on disk where you save temporary files, downloaded models or other data.   

In [4]:
os.environ['ALHAZEN_DB_NAME'] = 'em_tech'
os.environ['LOCAL_FILE_PATH'] = '/users/gully.burns/alhazen/'

In [5]:
if os.path.exists(os.environ['LOCAL_FILE_PATH']) is False:
    os.makedirs(os.environ['LOCAL_FILE_PATH'])
    
if os.environ.get('ALHAZEN_DB_NAME') is None: 
    raise Exception('Which database do you want to use for this application?')
db_name = os.environ['ALHAZEN_DB_NAME']

if os.environ.get('LOCAL_FILE_PATH') is None: 
    raise Exception('Where are you storing your local literature database?')
loc = os.environ['LOCAL_FILE_PATH']

### Setup utils, agents, and tools 

In [6]:
ldb = Ceifns_LiteratureDb(loc=loc, name=db_name)
llm = ChatOllama(model='mixtral:instruct') 
llm2 = ChatOpenAI(model='gpt-4-1106-preview') 
llm2 = ChatOpenAI(model='gpt-4-1106-preview') 
#llm3 = ChatVertexAI(model_name="gemini-pro", convert_system_message_to_human=True)

cb = AlhazenAgent(llm, llm)
print('AGENT TOOLS')
for t in cb.tk.get_tools():
    print('\t'+type(t).__name__)

test_tk = MetadataExtractionToolkit(db=ldb, llm=llm2)
print('\nTESTING TOOLS')
for t in test_tk.get_tools():
    print('\t'+type(t).__name__)

AGENT TOOLS
	AddCollectionFromEPMCTool
	AddAuthorsToCollectionTool
	DescribeCollectionCompositionTool
	DeleteCollectionTool
	RetrieveFullTextTool
	RetrieveFullTextToolForACollection
	MetadataExtraction_EverythingEverywhere_Tool
	SimpleExtractionWithRAGTool
	PaperQAEmulationTool
	ProcotolExtractionTool
	CheckExpressionTool

TESTING TOOLS
	MetadataExtraction_EverythingEverywhere_Tool
	MetadataExtraction_RAGOnSections_Tool
	SimpleExtractionWithRAGTool


### Set Evaluation Dataset

These are cases directly taken from `*.yaml` files that  

In [6]:
dois = {10000: ['10.1101/2022.04.12.488077'], 
        10001: ['10.1101/2022.04.12.488077'], 
        10003: ['10.1038/s41586-022-05255-2', '10.1038/s41592-020-01054-7'], 
        10004: ['10.1101/2023.04.28.538734'], 
        10005: ['10.1038/s41594-022-00861-0'], 
        10006: ['10.1038/s41586-020-2665-2'], 
        10007: [], 
        10008: ['10.1038/s41586-022-04971-z'], 
        10009: ['10.1126/science.abm6704'], 
        10010: ['10.1083/jcb.202204093', '10.1101/2022.01.23.477440']}

## Building the database


### Scripts to Build / Delete the database

If you need to restore a deleted database from backup, use the following shell commands:

```
$ createdb em_tech
$ psql -d em_tech -f /local/file/path/em_tech/backup<date_time>.sql
```

In [None]:
drop_ceifns_database(os.environ['ALHAZEN_DB_NAME'])

In [None]:
create_ceifns_database(os.environ['ALHAZEN_DB_NAME'])

### Build CEIFNS database from queries

#### Retrieve Gold Standard experimental metadata from EMPIAR database.

1. Download the entire database to a local file from: `https://www.ebi.ac.uk/emdb/search/database:EMPIAR`
2. Save the location in a temporary variable: `empiar_metadata_path`
3. Process the downloaded file for (A) EMDB ids, (B) DOI values for publications.


In [10]:
import json
from jsonpath_ng import jsonpath, parse

# local_path to the file downloaded from the EMPIAR search results: 
# https://www.ebi.ac.uk/emdb/api/empiar/search/database:EMPIAR?wt=json&download=true
# download the file and save it to a local path
url = "https://www.ebi.ac.uk/emdb/api/empiar/search/database:EMPIAR?wt=json&download=true"
empiar_metadata_path = loc+db_name+'/EMPIAR_search_results.json'
response = requests.get(url, stream=True)
with open(empiar_metadata_path, "wb") as handle:
    for data in response.iter_content():
        handle.write(data)

with open(empiar_metadata_path, 'r') as f:
    empiar_metadata = json.load(f)
empiar_dataset_ids = list(empiar_metadata.keys())
d = {}
for empiar_id in empiar_dataset_ids:
    d[empiar_id] = {'dois':[], 'emd_ids': []}
    for citation in empiar_metadata.get(empiar_id, {}).get('citation', []):
        if citation.get('doi') is not None:
            d[empiar_id]['dois'].append(citation.get('doi'))
    for emd_id in empiar_metadata.get(empiar_id, {}).get('cross_references'):
        d[empiar_id]['emd_ids'].append(emd_id.get('name'))    

def get_nested(data, *args):
    if args and data:
        element  = args[0]
        if element:
            value = data.get(element)
            return value if len(args) == 1 else get_nested(value, *args[1:])

# get metadata from the EMDB entries for each case
metadlist = []

# jsonpath expressions to identify specific metadata from the EMDB entries
# focus mainly on the specimen preparation (grids, buffers, vitrification, etc.)
sd_jp = 'structure_determination_list.structure_determination[*]'
method_jp = parse(sd_jp + '.method')
agg_state_jp = parse(sd_jp + '.aggregation_state')
specprep_list_jp = sd_jp + '.specimen_preparation_list.specimen_preparation[*]'
buffer_jp = parse(specprep_list_jp + '.buffer.ph') 
grid_model_jp = parse(specprep_list_jp + '.grid.model')
grid_material_jp = parse(specprep_list_jp + '.grid.model') 
grid_mesh_jp = parse(specprep_list_jp + '.grid.mesh')
grid_support_topology_jp = parse(specprep_list_jp + '.grid.support_film[*].film_topology')
grid_pretreatment_jp = parse(specprep_list_jp + '.grid.pretreatment.type_')
grid_vitrification_cryogen_jp = parse(specprep_list_jp + '.grid.vitrification.cryogen_name')
grid_vit_ctemp_jp = specprep_list_jp + '.grid.vitrification.chamber_temperature.'
grid_vit_chumid_jp = specprep_list_jp + '.grid.vitrification.chamber_humidity'

jp_method = parse('structure_determination_list.structure_determination[*]')
for k,v in d.items():
    print(k,v)
    for emd_id in v['emd_ids']:
        emd_exp = requests.get('https://www.ebi.ac.uk/emdb/api/entry/experiment/'+emd_id)
        if emd_exp.status_code == 200:
            emd = emd_exp.json()
            methods = ', '.join([m.value for m in method_jp.find(emd)])
            agg_state = ', '.join([m.value for m in agg_state_jp.find(emd)])
            buffer = ', '.join([str(m.value) for m in buffer_jp.find(emd)])
            grid_model = ', '.join([m.value for m in grid_model_jp.find(emd)])
            grid_material = ', '.join([m.value for m in grid_material_jp.find(emd)])
            grid_mesh = ', '.join([str(m.value) for m in grid_mesh_jp.find(emd)])
            grid_support_topology = ', '.join([m.value for m in grid_support_topology_jp.find(emd)])
            grid_pretreatment = ', '.join([m.value for m in grid_pretreatment_jp.find(emd)])
            grid_vitrification_cryogen = ', '.join([m.value for m in grid_vitrification_cryogen_jp.find(emd)])
            grid_support_topology = ', '.join([m.value for m in grid_support_topology_jp.find(emd)])

            grid_vit_ctemp_units = [m.value for m in parse(grid_vit_ctemp_jp+'.units').find(emd)]
            grid_vit_ctemp_values = [str(m.value) for m in parse(grid_vit_ctemp_jp+'.valueOf_').find(emd)]
            grid_vit_ctemp = ', '.join(zip(grid_vit_ctemp_values, grid_vit_ctemp_units))

            grid_vit_chumid_units = [m.value for m in parse(grid_vit_ctemp_jp+'.units').find(emd)]
            grid_vit_chumid_values = [str(m.value) for m in parse(grid_vit_ctemp_jp+'.valueOf_').find(emd)]
            grid_vit_chumid = ', '.join(zip(grid_vit_chumid_values, grid_vit_chumid_units))

            for doi in v['dois']:
                metadlist.append({'doi':doi, 
                                  'emd_id': emd_id, 
                                  'methods': methods, 
                                  'agg_state': agg_state, 
                                  'buffer': buffer, 
                                  'grid_model': grid_model, 
                                  'grid_material': grid_material, 
                                  'grid_mesh': grid_mesh, 
                                  'grid_support_topology': grid_support_topology, 
                                  'grid_pretreatment': grid_pretreatment, 
                                  'grid_vitrification_cryogen': grid_vitrification_cryogen, 
                                  'grid_vit_ctemp': grid_vit_ctemp, 
                                  'grid_vit_chumid': grid_vit_chumid})
        else:
            print('ERROR: ', emd_exp.status_code)

EMPIAR-11069 {'dois': ['10.1101/2022.07.15.498668'], 'emd_ids': ['EMD-15189', 'EMD-15214', 'EMD-15215']}
EMPIAR-11327 {'dois': ['10.1101/2022.09.20.507954', '10.1126/sciadv.ade9674'], 'emd_ids': ['EMD-13953', 'EMD-14799', 'EMD-14800', 'EMD-14873']}
EMPIAR-11434 {'dois': ['10.1016/j.neuron.2022.08.006'], 'emd_ids': ['EMD-25401']}
EMPIAR-11435 {'dois': ['10.1038/s41586-022-05258-z'], 'emd_ids': ['EMD-24378']}
EMPIAR-11448 {'dois': ['10.1016/j.neuron.2022.08.006'], 'emd_ids': ['EMD-25402']}
EMPIAR-11460 {'dois': [], 'emd_ids': ['EMD-15721']}
EMPIAR-11490 {'dois': [], 'emd_ids': ['EMD-15346']}
EMPIAR-11491 {'dois': [], 'emd_ids': ['EMD-15345']}
EMPIAR-11493 {'dois': ['10.1016/j.neuron.2022.08.006'], 'emd_ids': ['EMD-25403']}
EMPIAR-11501 {'dois': ['10.1101/2022.11.03.515015'], 'emd_ids': ['EMD-16087']}
EMPIAR-11542 {'dois': ['10.1042/bcj20230450', '10.1101/2023.02.02.526626'], 'emd_ids': ['EMD-18991']}
EMPIAR-11545 {'dois': ['10.1038/s42004-024-01100-x'], 'emd_ids': ['EMD-17046', 'EMD-1704

In [11]:
empiar_df = pd.DataFrame(metadlist)
empiar_df.to_csv(loc+db_name+'/empiar_metadata.csv', index=False)
empiar_dois = sorted(empiar_df['doi'].unique())

In [19]:
empiar_df = pd.read_csv(loc+db_name+'/empiar_metadata.csv')
empiar_df

Unnamed: 0,doi,emd_id,methods,agg_state,buffer,grid_model,grid_material,grid_mesh,grid_support_topology,grid_pretreatment,grid_vitrification_cryogen,grid_vit_ctemp,grid_vit_chumid
0,10.1101/2022.07.15.498668,EMD-15189,singleParticle,particle,7.4,Quantifoil R2/1,Quantifoil R2/1,200.0,HOLEY ARRAY,GLOW DISCHARGE,,,
1,10.1101/2022.07.15.498668,EMD-15214,singleParticle,particle,7.4,Quantifoil R2/1,Quantifoil R2/1,,HOLEY ARRAY,GLOW DISCHARGE,,,
2,10.1101/2022.07.15.498668,EMD-15215,singleParticle,particle,7.4,Quantifoil R2/1,Quantifoil R2/1,,HOLEY ARRAY,GLOW DISCHARGE,,,
3,10.1101/2022.09.20.507954,EMD-13953,singleParticle,particle,8.0,Quantifoil R2/1,Quantifoil R2/1,300.0,HOLEY ARRAY,GLOW DISCHARGE,,,
4,10.1126/sciadv.ade9674,EMD-13953,singleParticle,particle,8.0,Quantifoil R2/1,Quantifoil R2/1,300.0,HOLEY ARRAY,GLOW DISCHARGE,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2967,10.1073/pnas.1307382110,EMD-5447,singleParticle,particle,7.4,,,,,,,,
2968,10.1126/science.1259530,EMD-2788,singleParticle,particle,7.4,,,,,,,,
2969,10.7554/elife.03080,EMD-2660,singleParticle,particle,7.4,,,,,,,,
2970,10.1038/nprot.2016.124,EMD-3228,subtomogramAveraging,particle,,,,,,,,,


In [13]:

in_count = 0
out_count = 0
for doi in empiar_dois:
    d_id = 'doi:'+doi
    if ldb.session.query(exists().where(SKE.id == d_id)).scalar():
        in_count += 1
    else:
        out_count += 1
print(in_count, out_count)


145 690


In [None]:
addEMPCCollection_tool = [t for t in cb.tk.get_tools() if isinstance(t, AddCollectionFromEPMCTool)][0]
step = 20
for start_i in range(0, len(empiar_dois), step):
    query = ' OR '.join(['doi:"'+empiar_dois[i]+'"' for i in range(start_i, start_i+step)])
    addEMPCCollection_tool.run({'id': '3', 'name':'EMDB Papers', 'query':query, 'full_text':True})

In [38]:
def join_set(x):
    out = ''
    try:
        out = ' '.join(set(x))
    except:
        pass
    return out

# identify papers that we have full text for in EMPIAR
q = ldb.session.query(SKE.id) \
        .distinct() \
        .filter(SKC.id==SKC_HM.ScientificKnowledgeCollection_id) \
        .filter(SKC_HM.has_members_id==SKE.id) \
        .filter(SKE.id==SKE_HR.ScientificKnowledgeExpression_id) \
        .filter(SKE_HR.has_representation_id==SKI.id) \
        .filter(SKC.id == '3') \
        .filter(or_(SKI.type == 'JATSFullText', SKI.type == 'PDFFullText')) 
dois_to_include = [d[0][4:] for d in q.all()]    

empiar_gold_standard = []
for i, row in empiar_df.iterrows():
    if row.doi in dois_to_include:
        empiar_gold_standard.append( row.to_dict() )
empiar_gold_standard_df = pd.DataFrame(empiar_gold_standard)

empiar_gs_df = empiar_gold_standard_df.groupby(['doi']).agg({'methods': join_set, 'agg_state': join_set, 'buffer': join_set, 
                                              'grid_model': join_set, 'grid_material': join_set, 'grid_mesh': join_set, 
                                              'grid_support_topology': join_set, 'grid_pretreatment': join_set, 'grid_vitrification_cryogen': join_set, 
                                              'grid_vit_ctemp': join_set, 'grid_vit_chumid': join_set}).reset_index()
empiar_gs_df

Unnamed: 0,doi,methods,agg_state,buffer,grid_model,grid_material,grid_mesh,grid_support_topology,grid_pretreatment,grid_vitrification_cryogen,grid_vit_ctemp,grid_vit_chumid
0,10.1002/1873-3468.13916,tomography,particle,,,,,,,,,
1,10.1007/s10974-017-9477-5,subtomogramAveraging,tissue,,Quantifoil R3.5/1,Quantifoil R3.5/1,,HOLEY ARRAY,GLOW DISCHARGE,,,
2,10.1016/j.cell.2021.01.033,subtomogramAveraging,particle,,,,,,,,,
3,10.1016/j.celrep.2020.02.003,tomography,threeDArray,,,,,LACEY,,,,
4,10.1016/j.celrep.2023.112107,tomography,cell,,Quantifoil R2/2,Quantifoil R2/2,,HOLEY,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
89,10.3389/fmolb.2021.663121,subtomogramAveraging,cell,,C-flat-2/2,C-flat-2/2,,,,,,
90,10.7554/elife.34257,tomography,particle,,,,,,,,,
91,10.7554/elife.52286,subtomogramAveraging tomography,cell,,,,,,,,,
92,10.7554/elife.53990,helical,filament,,Quantifoil R2/1,Quantifoil R2/1,,,GLOW DISCHARGE,,,


#### Import papers from DOIs pertaining to CryoET-Portal records `10000-10010`

The [CryoET Data portal](https://chanzuckerberg.github.io/cryoet-data-portal/python-api.html) system is based on submitted data to our curation team, accompanied by papers referenced by DOIs. Each dataset is assigned an ID value associated with DOIs. 

In [None]:
# use the EMPCSearchTool to run a query for the dois mentioned
query = ' OR '.join(['doi:"'+d+'"' for d_id in dois for d in dois[d_id] ])
addEMPCCollection_tool = [t for t in cb.tk.get_tools() if isinstance(t, AddCollectionFromEPMCTool)][0]
addEMPCCollection_tool.run(tool_input={'id': '0', 'name':'CryoET Portal (10000-10010)', 'query':query, 'full_text':True})

In [13]:
cb.agent_executor.invoke({'input':'Describe the contents of the collection with id="0".'})



[1m> Entering new AgentExecutor chain...[0m


KeyboardInterrupt: 

In [1]:
cb.agent_executor.invoke({'input':'Write a short essay titled "Using CryoET to study the structure of the SARS-CoV-2 virus". Use the collection with id="2".'})

NameError: name 'cb' is not defined

#### Extend Database to include all CryoET papers

In [None]:
import local_resources.queries.em_tech as em_tech_queries
from alhazen.utils.queryTranslator import QueryTranslator, QueryType

cols_to_include = ['ID', 'CORPUS_NAME', 'QUERY']
df = pd.read_csv(files(em_tech_queries).joinpath('EM_Methods.tsv'), sep='\t')
df = df.drop(columns=[c for c in df.columns if c not in cols_to_include])
df

In [None]:
qt = QueryTranslator(df.sort_values('ID'), 'ID', 'QUERY', 'CORPUS_NAME')
(corpus_ids, epmc_queries) = qt.generate_queries(QueryType.epmc, sections=['TITLE_ABS', 'METHODS'])
corpus_names = df['CORPUS_NAME']

addEMPCCollection_tool = [t for t in cb.tk.get_tools() if isinstance(t, AddCollectionFromEPMCTool)][0]
for (id, name, query) in zip(corpus_ids, corpus_names, epmc_queries):
    if id == 3:
        continue
    addEMPCCollection_tool.run(tool_input={'id': id, 'name':name, 'query':query, 'full_text':False})

#### Get full text copies of all the papers about CryoET


In [None]:
cb.agent_executor.invoke({'input':'Get full text copies of all papers in the collection with id="1".'})

## Analyze Collections

In [10]:
q = ldb.session.query(SKC.id, SKC.name, SKE.id, SKI.type) \
        .filter(SKC.id==SKC_HM.ScientificKnowledgeCollection_id) \
        .filter(SKC_HM.has_members_id==SKE.id) \
        .filter(SKE.id==SKE_HR.ScientificKnowledgeExpression_id) \
        .filter(SKE_HR.has_representation_id==SKI.id) 
df = pd.DataFrame(q.all(), columns=['id', 'collection name', 'doi', 'item type'])    
df.pivot_table(index=['id', 'collection name'], columns='item type', values='doi', aggfunc=lambda x: len(x.unique()))

NameError: name 'ldb' is not defined

## Tests + Checks 


### Agent tool selection + execution + interpretation

In [7]:
cb.agent_executor.invoke({'input':'Hi who are you and what can you do?'})




[1m> Entering new AgentExecutor chain...[0m


KeyboardInterrupt: 

## Run MetaData Extraction Chain over listed papers

Here, we run various versions of the metadata extraction tool to examine performance over the cryoet dataset. 

In [None]:
import local_resources.data_files.cryoet_portal_metadata as cryoet_portal_metadata
str(files(cryoet_portal_metadata).joinpath('temp'))[0:-4]

In [None]:
import local_resources.data_files.cryoet_portal_metadata as cryoet_portal_metadata

# Get the metadata extraction tool
t2 = [t for t in test_tk.get_tools() if isinstance(t, MetadataExtraction_EverythingEverywhere_Tool)][0]

# Hack to get the path to the metadata directory as a string
metadata_dir = str(files(cryoet_portal_metadata).joinpath('temp'))[0:-4]

# Compile the answers from the metadata directory
t2.compile_answers('cryoet', metadata_dir)

# Create a dataframe to store previously extracted metadata
df = pd.DataFrame()
for d in [d for d_id in dois for d in dois[d_id]]:
    item_types = set()
    d_id = 'doi:'+d
    df = pd.concat([df, t2.build_report(d_id, 'cryoet')]) 
     
# Iterate over papers to run the metadata extraction tool
for d in [d for d_id in dois for d in dois[d_id]]:
    item_types = set()
    d_id = 'doi:'+d

    # Skip if the doi is already in the database
    if d_id in df.doi.unique():
        continue

    # Run the metadata extraction tool on the doi
    t2.run(tool_input={'paper_id': d_id, 'extraction_type': 'cryoet'})

    # Add the results to the dataframe
    df = pd.concat([df, t2.build_report(d_id, 'cryoet')]) 
    

In [None]:
# USE WITH CAUTION - this will delete all extracted metadata notes in the database
# clear all notes across papers listed in `dois` list
for d in [d for d_id in dois for d in dois[d_id]]:
    d_id = 'doi:'+d
    e = ldb.session.query(SKE).filter(SKE.id==d_id).first()
    notes_to_delete = []
    for n in ldb.read_notes_about_x(e):
        notes_to_delete.append(n.id)
    for n in notes_to_delete:
        ldb.delete_note(n)

## Protocol Modeling + Extraction

In [None]:
ldb = Ceifns_LiteratureDb(loc=loc, name=db_name)
slm = ChatOllama(model='stablelm-zephyr') 
llm = ChatOllama(model='mixtral:instruct') 
llm2 = ChatOpenAI(model='gpt-4-1106-preview') 
d = ("This tool attempts to draw a protocol design from the description of a scientific paper.")
t = ProcotolExtractionTool(db=ldb, llm=llm2, description=d)
t.run(tool_input={'paper_id': 'doi:10.1101/2022.04.12.488077', 'extraction_type': 'cryoet'})