# Methods Metadata Extraction Tool   

> LangChain tools that execute zero-shot metadata extraction over full-text papers previously imported into our local database.

In [7]:
#| default_exp tools.metadata_extraction_tool

In [2]:
#| hide
from nbdev import *

In [1]:
#| export

import local_resources.linkml as linkml

from alhazen.core import OllamaRunner, PromptTemplateRegistry, get_langchain_llm, get_cached_gguf, \
    get_langchain_embeddings, GGUF_LOOKUP_URL, MODEL_TYPE
from alhazen.tools.basic import AlhazenToolMixin
from alhazen.utils.output_parsers import JsonEnclosedByTextOutputParser

from alhazen.utils.ceifns_db import *

from alhazen.schema_sqla import ScientificKnowledgeCollection, ScientificKnowledgeExpression, \
    ScientificKnowledgeFragment, Note, ScientificKnowledgeCollection, \
    ScientificKnowledgeExpression, ScientificKnowledgeCollectionHasMembers, \
    ScientificKnowledgeItem, ScientificKnowledgeExpressionHasRepresentation, \
    ScientificKnowledgeFragment, ScientificKnowledgeItemHasPart, \
    InformationResource

from langchain.callbacks.tracers import ConsoleCallbackHandler
from langchain.schema import OutputParserException
from langchain.pydantic_v1 import BaseModel, Field, root_validator
from langchain.schema.prompt_template import format_document
from langchain.tools import BaseTool, StructuredTool

from importlib_resources import files
import local_resources.prompt_elements as prompt_elements

from datetime import datetime
from importlib_resources import files
import json
import os
import re
from sqlalchemy import create_engine, exists
from sqlalchemy.orm import sessionmaker
from time import time,sleep
from urllib.request import urlopen
from urllib.parse import quote_plus, quote, unquote
import uuid
import yaml

In [2]:
#| export

class MetadataExtractionToolSchema(BaseModel):
    '''Input arguments accepted by the metadata extraction tool.

    The fields mirror the parameters of `MetadataExtractionTool._run`, so that
    tool input such as
    `{'paper_id': ..., 'section_name': 'method', 'extraction_type': 'cryoet'}`
    validates against this schema.
    '''
    paper_id: str = Field(description="should be the identifier (e.g., the DOI) of a paper in the local literature database")
    section_name: str = Field(description="should be a lowercase substring of the names of the paper sections to analyze (e.g., 'method')")
    extraction_type: str = Field(description="should be the name of the metadata extraction specification to run (e.g., 'cryoet')")
    item_type: str = Field(default='JATSFullText', description="should be the type of the full-text item whose fragments will be read")

class MetadataExtractionTool(BaseTool, AlhazenToolMixin):
    '''Runs a specified metadata extraction pipeline over a research paper that has been loaded in the local literature database.'''
    name = 'metadata_extraction'
    description = 'Runs a specified metadata extraction pipeline over a research paper that has been loaded in the local literature database.'
    args_schema = MetadataExtractionToolSchema
    
    def _run(self, paper_id, section_name, extraction_type, item_type='JATSFullText'):
        '''Runs the metadata extraction pipeline over a specified paper.

        Args:
            paper_id: Identifier of the paper; matched as a substring of the
                expression id and passed to `self.db.list_fragments_for_paper`.
            section_name: Substring selecting which fragments to read; compared
                case-insensitively against each fragment's name.
            extraction_type: Key of the extraction specification inside the
                packaged 'metadata_extraction.yaml' prompt-elements file.
            item_type: Item type handed to `self.db.list_fragments_for_paper`.

        Returns:
            A 'Final Answer: ...' status string describing whether the
            extraction completed or failed.

        Raises:
            ValueError: If no expression matches `paper_id`, or if
                `extraction_type` has no entry in the prompt-elements file.
        '''

        if self.db.session is None:
            session_class = sessionmaker(bind=self.db.engine)
            self.db.session = session_class()

        ske = self.db.session.query(ScientificKnowledgeExpression) \
                .filter(ScientificKnowledgeExpression.id.like('%'+paper_id+'%')).first()
        if ske is None:
            # Fail before running the LLM: the notes below need an expression to attach to.
            raise ValueError("No paper matching '%s' was found in the local database." % paper_id)

        # 1. Load the text of the paper from the local database
        # Fragment names are lowercased, so lowercase the query to make the match case-insensitive.
        section_key = section_name.lower()
        text = '\n'.join([f.content
                          for f in self.db.list_fragments_for_paper(paper_id, item_type)
                          if section_key in f.name.lower()])

        # 2. Build LangChain elements
        pts = PromptTemplateRegistry()
        pts.load_prompts_from_yaml('metadata_extraction.yaml')
        prompt_elements_yaml = files(prompt_elements).joinpath('metadata_extraction.yaml').read_text()
        prompt_elements_dict = yaml.safe_load(prompt_elements_yaml).get(extraction_type)
        if prompt_elements_dict is None:
            raise ValueError("Unknown extraction type '%s'." % extraction_type)
        method_goal = prompt_elements_dict['method goal']
        methodology = prompt_elements_dict['methodology']
        metadata_specs = prompt_elements_dict.get('metadata specs', [])
        metadata_extraction_prompt_template = pts.get_prompt_template('metadata extraction').generate_llama2_prompt_template()
        run_name = 'metadata_extraction_' + extraction_type.replace(' ', '_') + ':' + paper_id
        extract_lcel = metadata_extraction_prompt_template | self.llm | JsonEnclosedByTextOutputParser()

        # 3. Compile the extraction questions
        # Each spec yields two output fields: the value and its supporting text.
        question_text_list = [("%d. %s Record this value in the '%s' field of the output. "
                                "Record any supporting sentences from the section text in the"
                                " '%s_original_text' field of the output.")
                                %(i+1, spec.get('spec'), spec.get('name'), spec.get('name'))
                                for i, spec in enumerate(metadata_specs)]
        questions_output_specification = '\n'.join(question_text_list)
        questions_output_specification += '\nGenerate only JSON formatted output with %d fields:\n'%(len(metadata_specs)*2)
        questions_output_specification += ", ".join(['%s, %s_original_text'%(spec.get('name'), spec.get('name')) for spec in metadata_specs])

        # 4. Assemble chain input
        s1 = {'section_text': text,
                'methodology': methodology,
                'method_goal': method_goal,
                'questions_output_specification': questions_output_specification}
        
        # 5. Run the chain with a JsonEnclosedByTextOutputParser failsafe loop:
        #    retry while the parser yields nothing or raises, up to max_attempts times.
        max_attempts = 5
        output = None
        attempts = 0
        while output is None and attempts < max_attempts:
            try:
                output = extract_lcel.invoke(s1, config={'callbacks': [ConsoleCallbackHandler()]})
            except OutputParserException as e:
                attempts += 1
                print(e)
                print('Retrying...')
                continue

            if output is None:
                attempts += 1
                continue

            for spec in metadata_specs:
                vname = spec.get('name')
                # Skip any question for which the model did not return both fields.
                if vname not in output or vname + '_original_text' not in output:
                    continue
                question = spec.get('spec')
                answer = output.get(vname)
                original_text = output.get(vname+'_original_text')
                note_content = json.dumps({
                    'question': question,
                    'answer': answer,
                    'original_text': original_text}, indent=4)
                # add a note about the paper's expression
                n = Note(
                    id=uuid.uuid4().hex[0:10],
                    type='NoteAboutExpression',
                    name=run_name+':'+vname,
                    content=note_content,
                    creation_date=datetime.now(),
                    format='json')
                n.is_about.append(ske)
                self.db.session.add(n)
                self.db.session.flush()

        # commit the changes to the database
        self.db.session.commit()

        if output is None:
            # Every attempt failed; do not claim that the extraction completed.
            return "Final Answer: metadata extraction of an experiment of type '%s' from %s failed after %d attempts."%(methodology, paper_id, attempts)

        return "Final Answer: completed metadata extraction of an experiment of type '%s' from %s."%(methodology, paper_id)

## How to execute the tool

In [None]:

from alhazen.core import OllamaRunner
from alhazen.utils.ceifns_db import Ceifns_LiteratureDb
from alhazen.tools.basic import EMPCSearchTool 
from alhazen.tools.metadata_extraction_tool import MetadataExtractionTool 
from alhazen.toolkit import AlhazenToolkit
import os

# Root directory under which all Alhazen data will be stored.
os.environ['LOCAL_FILE_PATH'] = '/tmp/alhazen/'

# Name of the database used for this application.
# The name identifies (A) a postgres database, and (B) a subdirectory of the folder above where the system
# will store local files (such as full text papers, ontology files, etc.)
os.environ['DATABASE_NAME'] = 'em_tech'

# Set up the toolkit: a literature database handle plus an Ollama runner for the 'mixtral' model.
# The names db, ollr, llm and tk are module-level so that later cells can use them.
db = Ceifns_LiteratureDb(loc=os.environ['LOCAL_FILE_PATH'], name=os.environ['DATABASE_NAME'])
ollr = OllamaRunner('mixtral')
llm  = ollr.llm
tk = AlhazenToolkit(db=db, ollr=ollr)


In [None]:

# Pick the first MetadataExtractionTool offered by the toolkit and show its description.
t = [tool for tool in tk.get_tools() if isinstance(tool, MetadataExtractionTool)][0]
t.description

In [None]:
# The commented-out commands below run a predefined set of 20 extraction questions
# over de Teresa et al. 2022 (https://www.biorxiv.org/content/10.1101/2022.04.12.488077v1)
#doi = '10.1101/2022.04.12.488077'
#t.run(tool_input={'paper_id':doi, 'section_name':'method', 'extraction_type':'cryoet'})
