# Methods Metadata Extraction Tool   

> LangChain tools that execute zero-shot metadata extraction over a local database of previously imported full-text papers.

In [None]:
#| default_exp tools.metadata_extraction_tool

In [5]:
#| hide
from nbdev import *

In [1]:
from alhazen.core import OllamaRunner
from alhazen.utils.ceifns_db import Ceifns_LiteratureDb
from alhazen.tools.basic import EMPCSearchTool 
from alhazen.tools.metadata_extraction_tool import MetadataExtractionTool 
from alhazen.toolkit import AlhazenToolkit
import os

# Configure where Alhazen keeps its local files and which database to open.
# setdefault() keeps any value already exported in the environment, so the
# notebook can run on another machine without editing this cell; the literals
# below are only fallback defaults.
os.environ.setdefault('LOCAL_FILE_PATH', '/Users/gburns/alhazen/')
os.environ.setdefault('ALHAZEN_DB_NAME', 'em_tech')

# Open the literature database at the configured location and name.
db = Ceifns_LiteratureDb(loc=os.environ['LOCAL_FILE_PATH'], name=os.environ['ALHAZEN_DB_NAME'])

# OllamaRunner is constructed with 'mixtral' (presumably the Ollama model name);
# `llm` is a shortcut to the runner's `llm` attribute.
ollr = OllamaRunner('mixtral')
llm  = ollr.llm

# Toolkit built from the database and the runner; later cells pull tools from it.
tk = AlhazenToolkit(db=db, ollr=ollr)

In [2]:
# Pick the first MetadataExtractionTool exposed by the toolkit and show its description.
extraction_tools = [tool for tool in tk.get_tools() if isinstance(tool, MetadataExtractionTool)]
t = extraction_tools[0]
t.description

'Input to this tool is a doi identifier, a search term for section titles in the paper, and the name of a type of experiment (drawn from a predefined list). The tool will execute an LLM over the paper to extract metadata from available text  and then insert the metadata into the database. The output is a string that returns a completion message (either positive or an error report).'

In [3]:
import yaml
import local_resources.linkml as linkml
import local_resources.data_files.cryoet_portal_metadata as cryoet_portal_metadata
from importlib_resources import files
import re

def dict_generator(indict, pre=None):
    """Flatten a nested structure into root-to-leaf paths.

    Yields one list per leaf: the keys leading to the leaf (prefixed by the
    items of `pre`, if given) followed by the leaf value itself. Lists and
    tuples add no path element of their own; each of their items is walked
    under the parent key. A non-dict `indict` yields `pre + [indict]`.
    """
    prefix = pre[:] if pre else []
    if not isinstance(indict, dict):
        yield prefix + [indict]
        return
    for key, value in indict.items():
        path = prefix + [key]
        if isinstance(value, dict):
            yield from dict_generator(value, path)
        elif isinstance(value, (list, tuple)):
            for item in value:
                yield from dict_generator(item, path)
        else:
            yield path + [value]

# Collect the publication DOIs listed for each CryoET portal dataset.
# Result shape: {dataset_identifier: [doi, ...]}
dois = {}
for i in range(10000, 10011):
    d_id = None
    yaml_text = files(cryoet_portal_metadata).joinpath(str(i)+'.yaml').read_text()
    d = yaml.safe_load(yaml_text)
    for l in dict_generator(d):
        # Each `l` is a flattened path: [key, key, ..., leaf_value].
        if 'dataset_identifier' in l:
            d_id = l[-1]
        # Publications are only recorded once an identifier has been seen, so this
        # relies on 'dataset_identifier' being yielded before 'dataset_publications'.
        if 'dataset_publications' in l and d_id:
            found = dois.setdefault(d_id, [])
            publications = l[-1]
            if not isinstance(publications, str):
                # A null / non-string leaf carries no DOI text and would make the
                # string handling below raise; keep the (possibly empty) entry.
                continue
            # Drop the 'doi:' prefixes, split the comma-separated list, and keep
            # only the pieces that contain a '/'.
            found.extend(
                ref.strip()
                for ref in publications.replace('doi:', '').split(',')
                if '/' in ref
            )
print(dois)

{10000: ['10.1101/2022.04.12.488077'], 10001: ['10.1101/2022.04.12.488077'], 10003: ['10.1038/s41586-022-05255-2', '10.1038/s41592-020-01054-7'], 10004: ['10.1101/2023.04.28.538734'], 10005: ['10.1038/s41594-022-00861-0'], 10006: ['10.1038/s41586-020-2665-2'], 10007: [], 10008: ['10.1038/s41586-022-04971-z'], 10009: ['10.1126/science.abm6704'], 10010: ['10.1083/jcb.202204093', '10.1101/2022.01.23.477440']}


In [5]:
# Run the extraction tool on the publications of the first dataset only:
# the slice stops the outer loop after the first `dois` entry.
for k, v in list(dois.items())[:1]:
    for doi in v:
        print(doi)
        tool_input = {'paper_id': doi, 'section_name': 'method', 'extraction_type': 'cryoet'}
        t.run(tool_input=tool_input)


10.1101/2022.04.12.488077
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence] Entering Chain run with input:
[0m{
  "section_text": "Online Methods\nYeast cell culture\nS. pombe K972 Sp h- wt haploid was kindly provided by C. Haering, originally from P. Nurse.\nCells were recovered from frozen stock by streaking on YE5S agar plates (YES Broth, Formedium, 20 g agarose/L) and incubated at 30 degreesC for 1-3 days.\nColonies were re-streaked on fresh YES agar plates and incubated 1-3 days at 30 degreesC.\nSingle colonies were inoculated in 5 mL of YES medium (YES Broth, Formedium, PCM0302, FM0618/8573) and grown at 30 degreesC, 170 rpm overnight (NCU-Shaker mini, Benchmark).\nOn the next day, cultures were grown to their log phase at OD600 of 0.5 - 0.6 and diluted beforehand in YES if necessary.\nVitrification of yeast cells\nA Leica EM GP (Leica Microsystems) was utilized to vitrify yeast cells at liquid nitrogen temperature.\nYeast cells were either diluted to OD600 of 0.2-0.

KeyboardInterrupt: 

In [66]:
# Name of the entry in 'metadata_extraction.yaml' whose prompt elements are loaded below.
prompt_element_spec_name = 'cryoet'
run_name = 'metadata_extraction_' + prompt_element_spec_name.replace(' ', '_')

# Build the three prompt templates via generate_llama2_prompt_template().
pts = PromptTemplateRegistry()
pts.load_prompts_from_yaml('metadata_extraction.yaml')
(step_identification_prompt_template,
 metadata_extraction_prompt_template,
 metadata_extraction2_prompt_template) = [
    pts.get_prompt_template(prompt_name).generate_llama2_prompt_template()
    for prompt_name in ('protocol step identification',
                        'metadata extraction',
                        'metadata extraction2')
]

# Load the additional prompt elements for the chosen spec from the packaged yaml file.
# The file is implicitly assumed to be well formed: a missing spec name or a
# missing key raises an exception here.
prompt_elements_yaml = files(prompt_elements).joinpath('metadata_extraction.yaml').read_text()
prompt_elements_dict = yaml.safe_load(prompt_elements_yaml).get(prompt_element_spec_name)
(method_goal,
 methodology,
 all_protocol_steps,
 all_protocol_step_codes,
 metadata_specs) = [
    prompt_elements_dict[element_key]
    for element_key in ('method goal',
                        'methodology',
                        'all protocol steps',
                        'all protocol step codes',
                        'metadata specs')
]


In [67]:
# Display the prompt template built for the 'metadata extraction2' prompt.
metadata_extraction2_prompt_template

PromptTemplate(input_variables=['section_text'], template="<s>[INST]\n                <<SYS>>You are an expert biological scientist, skilled at reading scientific papers.<</SYS>>\n                Read the text of a section from the methods section of a research paper shown below  (delimited with triple backticks).\nSection Text:- ```{section_text}```\nThe text is taken from a scientific paper that uses Cryo-Electron Tomography (CryoET) to study the microscopic structure of a biological sample.\nExtract metadata from the section text based on the following list of questions. Record answers as the specified fields of the output. Do not extract any other metadata. \n1. What is the type of biological sample being used? Select from the following list:- cell culture, tissue, organoid, organ, whole organism, virus, micro-organism, other. Record this value in the 'biological_sample_type' field of the output. Record any supporting sentences from the section text in the 'biological_sample_type_o

In [71]:
# Run the single-prompt extraction chain (prompt -> LLM -> JSON parser) over the
# section text and, when something is returned, store it as a JSON Note on `ske`.
# (Output suppression via suppress_stdout_stderr() is currently disabled.)
s2 = {'section_text': text}
extract2_lcel = metadata_extraction2_prompt_template | met.llm | JsonEnclosedByTextOutputParser()
out2 = extract2_lcel.invoke(s2, config={'callbacks': [ConsoleCallbackHandler()]})
if out2 is None:
    print('out2 is None')
else:
    # Serialize the result and attach it to the fragment as a new note.
    note_content = json.dumps(out2)
    n = Note(
        id=uuid.uuid4().hex[0:10],
        type='NoteAboutFragment',
        name=run_name,
        content=note_content,
        creation_date=datetime.now(),
        format='json')
    ske.has_notes.append(n)
    lldb.session.add(n)
    lldb.session.flush()
# The commit runs whether or not a note was created.
lldb.session.commit()

[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence] Entering Chain run with input:
[0m{
  "section_text": "Online Methods\nYeast cell culture\nS. pombe K972 Sp h- wt haploid was kindly provided by C. Haering, originally from P. Nurse.\nCells were recovered from frozen stock by streaking on YE5S agar plates (YES Broth, Formedium, 20 g agarose/L) and incubated at 30 degreesC for 1-3 days.\nColonies were re-streaked on fresh YES agar plates and incubated 1-3 days at 30 degreesC.\nSingle colonies were inoculated in 5 mL of YES medium (YES Broth, Formedium, PCM0302, FM0618/8573) and grown at 30 degreesC, 170 rpm overnight (NCU-Shaker mini, Benchmark).\nOn the next day, cultures were grown to their log phase at OD600 of 0.5 - 0.6 and diluted beforehand in YES if necessary.\nVitrification of yeast cells\nA Leica EM GP (Leica Microsystems) was utilized to vitrify yeast cells at liquid nitrogen temperature.\nYeast cells were either diluted to OD600 of 0.2-0.4 in YES medium or, follow

In [70]:
# Print the value returned by the most recent extraction chain invocation.
print(out2)

None


In [37]:
# Run the extraction chain once per metadata spec; every non-None result is
# printed and stored as a JSON Note attached to `ske`.
protocol_step = None
attempts = 0
for spec in metadata_specs:
    s2 = dict(
        section_text=text,
        methodology=methodology,
        method_goal=method_goal,
        metadata_specification=spec.get('spec'),
        metadata_name=spec.get('name'),
    )

    out2 = met.extract_lcel.invoke(s2, config={'callbacks': [ConsoleCallbackHandler()]})
    if out2 is None:
        # Nothing returned for this spec; move on to the next one.
        continue

    # Serialize the result and attach it to the fragment as a new note.
    note_content = json.dumps(out2)
    print(note_content)
    n = Note(
        id=uuid.uuid4().hex[0:10],
        type='NoteAboutFragment',
        name=run_name,
        content=note_content,
        creation_date=datetime.now(),
        format='json')
    ske.has_notes.append(n)
    lldb.session.add(n)
    lldb.session.flush()
# Single commit after all specs have been processed.
lldb.session.commit()

[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence] Entering Chain run with input:
[0m{
  "section_text": "Online Methods\nYeast cell culture\nS. pombe K972 Sp h- wt haploid was kindly provided by C. Haering, originally from P. Nurse.\nCells were recovered from frozen stock by streaking on YE5S agar plates (YES Broth, Formedium, 20 g agarose/L) and incubated at 30 degreesC for 1-3 days.\nColonies were re-streaked on fresh YES agar plates and incubated 1-3 days at 30 degreesC.\nSingle colonies were inoculated in 5 mL of YES medium (YES Broth, Formedium, PCM0302, FM0618/8573) and grown at 30 degreesC, 170 rpm overnight (NCU-Shaker mini, Benchmark).\nOn the next day, cultures were grown to their log phase at OD600 of 0.5 - 0.6 and diluted beforehand in YES if necessary.\nVitrification of yeast cells\nA Leica EM GP (Leica Microsystems) was utilized to vitrify yeast cells at liquid nitrogen temperature.\nYeast cells were either diluted to OD600 of 0.2-0.4 in YES medium or, follow

In [36]:
# Roll back the database session (presumably a SQLAlchemy session — confirm),
# discarding changes that have not been committed.
lldb.session.rollback()

In [34]:
from sqlalchemy import delete
# WARNING: destructive. delete(Note) is built without a WHERE clause, so executing
# it targets every Note row; the commit then makes the removal permanent.
lldb.session.execute(delete(Note))
lldb.session.commit()

In [20]:
# These two values are set here but not used by the query below.
item_type = 'JATSFullText'
expression_id = '10.1101/2022.04.12.488077'

# Fetch every Note with an unfiltered query and print each one.
q = lldb.session.query(Note)
notes = q.all()
for n in notes:
    print(n)
