# Data set web-search

> Can we run through the papers in a corpus and characterize the existance of datasets described in the paper.

## Basics

### Python Imports

Setting python imports, environment variables, and other crucial set up parameters here.  

In [1]:
from alhazen.aliases import *
from alhazen.core import lookup_chat_models
from alhazen.agent import AlhazenAgent
from alhazen.schema_sqla import *
from alhazen.core import lookup_chat_models
from alhazen.tools.basic import AddCollectionFromEPMCTool, DeleteCollectionTool
from alhazen.tools.paperqa_emulation_tool import PaperQAEmulationTool
from alhazen.tools.metadata_extraction_tool import * 
from alhazen.tools.protocol_extraction_tool import *
from alhazen.tools.tiab_classifier_tool import *
from alhazen.tools.tiab_extraction_tool import *
from alhazen.tools.tiab_mapping_tool import *
from alhazen.toolkit import *
from alhazen.utils.jats_text_extractor import NxmlDoc

from alhazen.utils.ceifns_db import Ceifns_LiteratureDb, create_ceifns_database, drop_ceifns_database, backup_ceifns_database

from alhazen.utils.searchEngineUtils import *


from langchain.callbacks.tracers import ConsoleCallbackHandler
from langchain.docstore.document import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.pgvector import PGVector
from langchain_community.chat_models.ollama import ChatOllama
from langchain_google_vertexai import ChatVertexAI
from langchain_openai import ChatOpenAI

from bs4 import BeautifulSoup,Tag,Comment,NavigableString
from databricks import sql
from datetime import datetime
from importlib_resources import files
import os
import pandas as pd
from pathlib import Path
import re
import requests

from sqlalchemy import text, create_engine, exists, func, or_, and_, not_, desc, asc
from sqlalchemy.orm import sessionmaker, aliased

from time import time,sleep
from tqdm import tqdm
from urllib.request import urlopen
from urllib.parse import quote_plus, quote, unquote
from urllib.error import URLError, HTTPError
import uuid
import yaml

### Environment Variables

Remember to set environmental variables for this code:

* `ALHAZEN_DB_NAME` - the name of the PostGresQL database you are storing information into
* `LOCAL_FILE_PATH` - the location on disk where you save temporary files, downloaded models or other data.   

In [2]:
if os.environ.get('LOCAL_FILE_PATH') is None: 
    raise Exception('Where are you storing your local literature database?')
if os.path.exists(os.environ['LOCAL_FILE_PATH']) is False:
    os.makedirs(os.environ['LOCAL_FILE_PATH'])    

loc = os.environ['LOCAL_FILE_PATH']
db_name = 'data_downloader'

### Setup utils, agents, and tools 

In [3]:
ldb = Ceifns_LiteratureDb(loc=loc, name=db_name)
llms_lookup = lookup_chat_models()
print(llms_lookup.keys())

  warn_deprecated(
                    stop was transferred to model_kwargs.
                    Please confirm that stop is what you intended.


dict_keys(['ollama_llama3', 'ollama_mixtral', 'gemini1.0', 'databricks_dbrx', 'databricks_mixtral', 'databricks_llama3', 'groq_mixtral', 'groq_llama3', 'gpt4_1106', 'gpt35'])


In [4]:
llm = llms_lookup.get('gpt4_1106')

cb = AlhazenAgent(llm, llm, db_name=db_name)
print('AGENT TOOLS')
for t in cb.tk.get_tools():
    print('\t'+type(t).__name__)



AGENT TOOLS
	AddCollectionFromEPMCTool
	AddAuthorsToCollectionTool
	DescribeCollectionCompositionTool
	DeleteCollectionTool
	RetrieveFullTextTool
	RetrieveFullTextToolForACollection
	MetadataExtraction_EverythingEverywhere_Tool
	MetadataExtraction_MethodsSectionOnly_Tool
	SimpleExtractionWithRAGTool
	PaperQAEmulationTool
	ProcotolEntitiesExtractionTool
	CheckExpressionTool
	TitleAbstractClassifier_OneDocAtATime_Tool
	TitleAbstractDiscourseMappingTool
	TitleAbstractExtraction_OneDocAtATime_Tool


### Set Evaluation Dataset

These are cases directly taken from `*.yaml` files that  

#### Identify cases from the CZI CryoET Portal.


In [5]:
dois = {10000: ['10.1101/2022.04.12.488077'], 
        10001: ['10.1101/2022.04.12.488077'], 
        10003: ['10.1038/s41586-022-05255-2', '10.1038/s41592-020-01054-7'], 
        10004: ['10.1101/2023.04.28.538734'], 
        10005: ['10.1038/s41594-022-00861-0'], 
        10006: ['10.1038/s41586-020-2665-2'], 
        10007: [], 
        10008: ['10.1038/s41586-022-04971-z'], 
        10009: ['10.1126/science.abm6704'], 
        10010: ['10.1083/jcb.202204093', '10.1101/2022.01.23.477440']}
dois_flattened = [doi for doi_list in dois.values() for doi in doi_list]
dois_flattened = list(set(dois_flattened))
dois_flattened

['10.1038/s41586-022-05255-2',
 '10.1038/s41594-022-00861-0',
 '10.1038/s41586-020-2665-2',
 '10.1038/s41586-022-04971-z',
 '10.1101/2023.04.28.538734',
 '10.1126/science.abm6704',
 '10.1101/2022.04.12.488077',
 '10.1083/jcb.202204093',
 '10.1038/s41592-020-01054-7',
 '10.1101/2022.01.23.477440']

## Building the database


### Scripts to Build / Delete the database

If you need to restore a deleted database from backup, use the following shell commands:

```
$ createdb em_tech
$ psql -d em_tech -f /local/file/path/em_tech/backup<date_time>.sql
```

In [None]:
drop_ceifns_database(db_name)

In [None]:
loc = os.environ['LOCAL_FILE_PATH']
current_date_time = datetime.now()
formatted_date_time = f'{current_date_time:%Y-%m-%d-%H-%M-%S}'
backup_path = loc+'/'+db_name+'/backup'+formatted_date_time+'.sql'
backup_ceifns_database(db_name, backup_path)

In [8]:
create_ceifns_database(db_name)

100%|██████████| 309/309 [00:00<00:00, 3216.28it/s]


### Build CEIFNS database from queries

#### Add a collection based on EMPIAR papers

In [12]:
addEMPCCollection_tool = [t for t in cb.tk.get_tools() if isinstance(t, AddCollectionFromEPMCTool)][0]
step = 20
for start_i in range(0, len(dois_flattened), step):
    query = ' OR '.join(['doi:"'+dois_flattened[i]+'"' for i in range(start_i, start_i+step) if i<len(dois_flattened)])
    addEMPCCollection_tool.run({'id': '1', 'name':'CryoET Examples', 'query':query, 'full_text':True})

https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=doi:"10.1101/2022.01.23.477440" OR doi:"10.1038/s41592-020-01054-7" OR doi:"10.1038/s41586-020-2665-2" OR doi:"10.1101/2022.04.12.488077" OR doi:"10.1126/science.abm6704" OR doi:"10.1038/s41586-022-05255-2" OR doi:"10.1038/s41586-022-04971-z" OR doi:"10.1083/jcb.202204093" OR doi:"10.1038/s41594-022-00861-0" OR doi:"10.1101/2023.04.28.538734", 10 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:01<00:00,  1.22s/it]


 Returning 10


100%|██████████| 10/10 [00:00<00:00, 223.20it/s]
100%|██████████| 9/9 [00:00<00:00, 664.94it/s]


Indexing 9 documents


  warn_deprecated(


# Read papers and extract information about datasets referred to in the paper

In [10]:
system_prompt = """
    You are helpful assistant, helping a scientist to extract information from scientific literature.
"""
prompt = """
    Read the following Markdown from scientific publication (delimited with triple backticks).

    Markdown Code:- ```{markdown_code}```

    From this code, identify any text paragraphs that refer to data sets with URLS links specified by '(URL)[TEXT]' Markdown fragments. 

    You will generate output as a JSON object with "TEXT", and "LINKS" as the keys. 

    The value of "TEXT" will be a list of paragraphs from the Markdown code that describe links to external biological datasets.
    The value of "LINKS" will be a list of dicts where each entry has two attributes: URL and TEXT extracted from each URL link.
    
    Do not provide additional explanation or context for the answer.
   
    Do not include any other response other than a JSON object.
"""

In [11]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from splinter import Browser
from selenium.webdriver.chrome.service import Service
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from alhazen.utils.output_parsers import JsonEnclosedByTextOutputParser
from random import randint
from bs4 import BeautifulSoup
from html import unescape
import markdownify

def simplify_html_to_markdown(html_text):
    soup = BeautifulSoup(html_text, 'html.parser')

    # Remove script and style tags
    for script in soup(["script", "style"]):
        script.extract()

    # Get the text content and convert HTML tags to Markdown
    text = soup.get_text()
    #text = re.sub(r'<h([1-6])>(.*?)</h\1>', r'#\1 \2', text)
    #text = re.sub(r'<b>(.*?)</b>', r'**\1**', text)
    #text = re.sub(r'<i>(.*?)</i>', r'*\1*', text)
    #text = re.sub(r'<ul>(.*?)</ul>', lambda m: '\n'.join(['- ' + line.strip() for line in re.split(r'</?li>', m.group(1)) if line.strip()]), text, flags=re.DOTALL)
    #text = re.sub(r'<ol>(.*?)</ol>', lambda m: '\n'.join([str(i+1) + '. ' + line.strip() for i, line in enumerate(re.split(r'</?li>', m.group(1))) if line.strip()]), text, flags=re.DOTALL)
    #text = re.sub(r'<blockquote>(.*?)</blockquote>', r'> \1', text, flags=re.DOTALL)
    #text = re.sub(r'<hr\s?/?>', r'\n---\n', text)
    #text = re.sub(r'<br\s?/?>', r'\n', text)
    #text = re.sub(r'<p>(.*?)</p>', r'\1\n\n', text)

    # Convert HTML entities to their corresponding characters
    text = unescape(text)
    text = re.sub(r'\(', '\\(', text)
    text = re.sub(r'\)', '\\)', text)

    # Convert links
    for link in soup.find_all('a'):
        href = link.get('href')
        text = text.replace(str(link), f'[{link.text}]({href})')

    return text.strip()

def read_about_available_data_from_doidotorg(doi, llm, headless=False):

    if headless:
        browser = Browser(headless=True)
    else:
        browser = Browser()
    doi = doi.replace('https://doi.org/', '').replace('doi', '')
    print ('https://doi.org/'+doi)
    stem_list = doi.split('/')
    stem = '/'.join(stem_list[0:len(stem_list)-1])
    hrefs = set()
    pt = ChatPromptTemplate.from_messages([
        ("system", system_prompt),
        ("human", prompt)])
    lcel = pt | llm | JsonEnclosedByTextOutputParser()

    try:
        # visit doi.org page
        browser.visit('https://doi.org/'+doi)

        html = browser.html
        md = markdownify.markdownify(html)
        s1 = {'markdown_code': md}
        
        output = lcel.invoke(s1, config={'callbacks': [ConsoleCallbackHandler()]})
        print(output)
            
    except Exception as e:
        print(e)
    finally:
        browser.quit()

read_about_available_data_from_doidotorg('10.1038/s41586-022-05255-2', llm, headless=False)

https://doi.org/10.1038/s41586-022-05255-2
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence] Entering Chain run with input:
[0m{
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 2:prompt:ChatPromptTemplate] Entering Prompt run with input:
[0m{
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RunnableSequence > 2:prompt:ChatPromptTemplate] [4ms] Exiting Prompt run with output:
[0m[outputs]
[32;1m[1;3m[llm/start][0m [1m[1:chain:RunnableSequence > 3:llm:ChatOpenAI] Entering LLM run with input:
[0m{
  "prompts": [
  ]
}
[36;1m[1;3m[llm/end][0m [1m[1:chain:RunnableSequence > 3:llm:ChatOpenAI] [323.11s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "```json\n{\n  \"TEXT\": [\n    \"Maps have been deposited in the EMDB under accession codes [13234](http://www.ebi.ac.uk/pdbe/entry/EMD-13234), [13272](http://www.ebi.ac.uk/pdbe/entry/EMD-13272), [13273](http://www.ebi.ac.uk/pdbe/entry/EMD-13273), [13274](http://www.ebi.ac.u

KeyboardInterrupt: 

In [None]:
self = ldb
collection_id = '2'

q1 = self.session.query(SKE, SKI) \
        .filter(SKC_HM.ScientificKnowledgeCollection_id == collection_id) \
        .filter(SKC_HM.has_members_id == SKE.id) \
        .filter(SKE.id == SKE_HR.ScientificKnowledgeExpression_id) \
        .filter(SKE_HR.has_representation_id == SKI.id) \
        .filter(SKI.id == SKI_HP.ScientificKnowledgeItem_id) \
        .filter(SKI_HP.has_part_id == SKF.id) \
        .filter(SKI.type == 'CitationRecord') \
        .filter(or_(SKE.type == 'ScientificPrimaryResearchArticle', SKE.type == 'ScientificPrimaryResearchPreprint')) 

for ske, ski in tqdm(q1.all()):
    b = ''
    om = ''
    rc = ''  

    fragments = []
    for f in ski.has_part:
      if f.type in ['title', 'abstract']:
        fragments.append(f)

    # USE AN LLM HERE INSTEAD OF A DEEP LEARNING CLASSIFER


    for skf in sorted(fragments, key=lambda f: f.offset):
        for s in self.sent_detector.tokenize(skf.content):
            m = classifier(skf.content)
            l = lookup.get(m[0].get('label'))
            if l == 'BACKGROUND':
                if len(b) > 0:
                    b += '\n'
                b += s
            elif l == 'OBJECTIVE' or l == 'METHODS':
                if len(om) > 0:
                    om += '\n'
                om += s
            else: 
                if len(rc) > 0:
                    rc += '\n'
                rc += s
    skf_stem = ske.id+'.'+ski.type+'.'
    if len(b) > 0:
        f_b = ScientificKnowledgeFragment(id=str(uuid.uuid4().hex)[:10], 
                type='background_sentences', offset=-1, length=len(b),
                name=skf_stem+'background', content=b)
        self.session.add(f_b)
        ski.has_part.append(f_b)
        f_b.part_of = ski.id    
    if len(om) > 0:
        f_om = ScientificKnowledgeFragment(id=str(uuid.uuid4().hex)[:10], 
                type='objective_methods_sentences', offset=-1, length=len(om),
                name=skf_stem+'objective_methods', content=om)
        self.session.add(f_om)
        ski.has_part.append(f_om)
        f_om.part_of = ski.id
    if len(rc) > 0:
        f_rc = ScientificKnowledgeFragment(id=str(uuid.uuid4().hex)[:10], 
                type='results_conclusions_sentences', offset=-1, length=len(rc),
                name=skf_stem+'results_conclusions', content=rc)
        self.session.add(f_rc)
        ski.has_part.append(f_rc)
        f_rc.part_of = ski.id
    self.session.flush()
self.session.commit()

In [None]:
self = ldb
collection_id = '2'
#self.session.rollback()
q2 = self.session.query(SKF) \
        .filter(SKC_HM.ScientificKnowledgeCollection_id == collection_id) \
        .filter(SKC_HM.has_members_id == SKE.id) \
        .filter(SKE.id == SKE_HR.ScientificKnowledgeExpression_id) \
        .filter(SKE_HR.has_representation_id == SKI.id) \
        .filter(SKI.id == SKI_HP.ScientificKnowledgeItem_id) \
        .filter(SKI_HP.has_part_id == SKF.id) \
        .filter(SKI.type == 'CitationRecord') \
        .filter(or_(SKF.type == 'results_conclusions_sentences', \
                SKF.type == 'objective_methods_sentences', \
                SKF.type == 'background_sentences'))
for skf in tqdm(q2.all()):
    self.delete_fragment(skf.id)
   


In [None]:
self = ldb
collection_id = '2'
#self.session.rollback()
q2 = self.session.query(SKE, SKF) \
        .filter(SKC_HM.ScientificKnowledgeCollection_id == collection_id) \
        .filter(SKC_HM.has_members_id == SKE.id) \
        .filter(SKE.id == SKE_HR.ScientificKnowledgeExpression_id) \
        .filter(SKE_HR.has_representation_id == SKI.id) \
        .filter(SKI.id == SKI_HP.ScientificKnowledgeItem_id) \
        .filter(SKI_HP.has_part_id == SKF.id) \
        .filter(SKI.type == 'CitationRecord') \
        .filter(SKF.type == 'objective_methods_sentences') \
        .order_by(desc(SKE.publication_date)) \
        .order_by(SKF.name)

for ske, skf in tqdm(q2.all()):
    print(skf)


#### Get full text copies of all the papers about CryoET


In [None]:
cb.agent_executor.invoke({'input':'Get full text copies of all papers in the collection with id="2".'})

In [None]:
ldb.create_new_collection_from_sample('5', 'EMPIAR CryoET Papers Tests', '4', 20, ['ScientificPrimaryResearchArticle', 'ScientificPrimaryResearchPreprint'])

## Analyze Collections

In [None]:
q = ldb.session.query(SKC.id, SKC.name, SKE.id, SKI.type) \
        .filter(SKC.id==SKC_HM.ScientificKnowledgeCollection_id) \
        .filter(SKC_HM.has_members_id==SKE.id) \
        .filter(SKE.id==SKE_HR.ScientificKnowledgeExpression_id) \
        .filter(SKE_HR.has_representation_id==SKI.id) 
df = pd.DataFrame(q.all(), columns=['id', 'collection name', 'doi', 'item type'])
df.pivot_table(index=['id', 'collection name'], columns='item type', values='doi', aggfunc=lambda x: len(x.unique())).fillna(0)   

### Survey + Run Classifications over Papers

In [None]:
# USE WITH CAUTION - this will delete all extracted metadata notes in the database
# clear all notes across papers listed in `dois` list
l = []
q = ldb.session.query(N, SKE) \
        .filter(N.id == NIA.Note_id) \
        .filter(NIA.is_about_id == SKE.id) \
        .filter(N.type == 'TiAbClassificationNote__cryoet_study_types') \

output = []        
print(len(q.all()))
for n, ske in q.all():
    ldb.delete_note(n.id)    
print(len(q.all()))
    

In [None]:
t = [t for t in cb.tk.get_tools() if isinstance(t, TitleAbstractClassifier_OneDocAtATime_Tool)][0]
t.run({'collection_id': '5', 'classification_type':'cryoet_study_types', 'repeat_run':True})

In [None]:
t = [t for t in cb.tk.get_tools() if isinstance(t, TitleAbstractClassifier_OneDocAtATime_Tool)][0]
t.run({'collection_id': '2', 'classification_type':'cryoet_study_types'})

In [None]:
l = []
ldb.session.rollback()
q = ldb.session.query(N, SKE) \
        .join(NIA, NIA.Note_id == N.id) \
        .join(SKE, SKE.id == NIA.is_about_id) \
        .join(SKC_HM, SKE.id == SKC_HM.has_members_id) \
        .filter(N.type == 'TiAbClassificationNote__cryoet_study_types') \
        .filter(SKC_HM.ScientificKnowledgeCollection_id == '5') \
        .order_by(SKE.id, N.provenance)

output = []        
for n, ske in q.all():
        tup = json.loads(n.content)
        tup['doi'] = 'http://doi.org/'+re.sub('doi:', '', ske.id)
        tup['year'] = ske.publication_date.year
        tup['month'] = ske.publication_date.month
        tup['ref'] = ske.content
        output.append(tup)
df = pd.DataFrame(output).sort_values(['year', 'month'], ascending=[False, False])
df.to_csv(loc+'/'+db_name+'/cryoet_study_types.tsv', sep='\t')
df


In [None]:
study_type_lookup = {'A': 'Viral Pathogens', 
                     'B': "Mutated protein structure", 
                     'C': 'Bacterial pathogens', 
                     'D': 'Plant cells', 
                     'E': 'Material science', 
                     'F': 'Intracellular Transport Structure', 
                     'G': 'Synapses or Vesicle Release', 
                     'H': 'Other Intracellular Structure', 
                     'I': 'Cellular Processes',
                     'J': 'Dynamics of molecular interactions',    
                     'K': 'New CryoET imaging methods', 
                     'L': 'New data analysis methods'}

addEMPCCollection_tool = [t for t in cb.tk.get_tools() if isinstance(t, AddCollectionFromEPMCTool)][0]
step = 20

for k in study_type_lookup.keys():
    df1 = df[df['cryoet_study_type_code'] == k]
    dois_to_add = [re.sub('http://doi.org/', 'doi:', r.doi) for i, r in df1.iterrows()]

    c_id = '2.'+k
    c_name = 'CryoET - ' + study_type_lookup[k]

    corpus = None
    all_existing_query = ldb.session.query(SKC).filter(SKC.id==c_id)
    for c in all_existing_query.all():
      corpus = c
    if corpus is None:      
      corpus = ScientificKnowledgeCollection(id=c_id,
                                           type='skem:ScientificKnowledgeCollection',
                                           name=c_name,
                                           has_members=[])
    ldb.session.add(corpus)
    ldb.session.flush()

    for doi in tqdm(dois_to_add):
        p = ldb.session.query(SKE) \
            .filter(SKE.id==doi).first()
        if p is None:
          continue
        ldb.session.add(p)
        corpus.has_members.append(p)
        p.member_of.append(corpus)
        ldb.session.flush()
ldb.session.commit()

In [None]:
delete_collection_tool = [t for t in cb.tk.get_tools() if isinstance(t, DeleteCollectionTool)][0]
 
for k in study_type_lookup.keys():
    print(k)
    delete_collection_tool.run({'collection_id': '2.'+k})


### Survey + Run Extractions over Papers

In [None]:
t = [t for t in cb.tk.get_tools() if isinstance(t, TitleAbstractExtraction_OneDocAtATime_Tool)][0]
t.run({'collection_id': '5', 'extraction_type':'cryoet'})

## Tests + Checks 


### Agent tool selection + execution + interpretation

In [None]:
cb.agent_executor.invoke({'input':'Hi who are you and what can you do?'})


## Run MetaData Extraction Chain over listed papers

Here, we run various versions of the metadata extraction tool to examine performance over the cryoet dataset. 

In [9]:
q = ldb.session.query(SKE.id) \
        .filter(SKC.id==SKC_HM.ScientificKnowledgeCollection_id) \
        .filter(SKC_HM.has_members_id==SKE.id) \
        .filter(SKC.id=='5')  
dois = [e.id for e in q.all()]
dois


['doi:10.1002/1873-3468.13916',
 'doi:10.1007/s10974-017-9477-5',
 'doi:10.1016/j.celrep.2023.112107',
 'doi:10.1016/j.str.2020.07.018',
 'doi:10.1038/s41467-020-18777-y',
 'doi:10.1038/s41467-022-29322-4',
 'doi:10.1038/s41477-020-00811-y',
 'doi:10.1038/s41586-023-05904-0',
 'doi:10.1038/s41592-019-0630-5',
 'doi:10.1038/s41594-018-0027-7',
 'doi:10.1073/pnas.2209823119',
 'doi:10.1101/2020.06.26.173476',
 'doi:10.1101/2022.03.08.482579',
 'doi:10.1101/2022.04.07.487557',
 'doi:10.1101/2023.08.18.553799',
 'doi:10.1101/230276',
 'doi:10.1126/sciadv.ade2727',
 'doi:10.1126/science.abd5223',
 'doi:10.1371/journal.pone.0266035',
 'doi:10.1371/journal.ppat.1008883']

In [10]:
# need to count tokens submitted to the server as a way of tracking usage. 

import transformers
import torch

from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", device='mps', token=os.environ['HF_API_KEY'])
prompt = "The methods section of the paper is as follows:"
tokenized = tokenizer(prompt, return_tensors="pt")
print(len(tokenized["input_ids"][0]))



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


11


In [11]:
import tiktoken

# How long are methods sections in the CryoET papers?
ldb.session.rollback()
q = ldb.session.query(SKE.id) \
        .filter(SKC.id==SKC_HM.ScientificKnowledgeCollection_id) \
        .filter(SKC_HM.has_members_id==SKE.id) \
        .filter(SKC.id=='2') \
        .filter(or_(SKE.type=='ScientificPrimaryResearchArticle', SKE.type=='ScientificPrimaryResearchPreprint'))

encoding = tiktoken.encoding_for_model('gpt-3.5-turbo')

tups = []
for e in tqdm(q.all()):
    item_types = set()
    item_type = None
    for i in ldb.list_items_for_expression(e.id):
        item_types.add(i.type)
    for i_type in item_types:
        if i_type == 'CitationRecord':
            continue
        item_type = i_type
        break
    if item_type is None:
        continue

    fragments = [f.content for f in ldb.list_fragments_for_paper(e.id, item_type, fragment_types=['section'])]
    on_off = False
    text = ''
    all_text = ''
    for t in fragments:
        all_text += t
        l1 = t.split('\n')[0].lower()
        if 'method' in l1:
            on_off = True
        elif 'results' in l1 or 'discussion' in l1 or 'conclusion' in l1 or 'acknowledgements' in l1 \
                or 'references' in l1 or 'supplementary' in l1 or 'appendix' in l1 or 'introduction' in l1 or 'abstract' in l1 or 'cited' in l1:
            on_off = False
        if on_off:
            if len(text) > 0:
                text += '\n\n'
            text += t

    all_text_length = len(tokenizer(all_text, return_tensors="pt")['input_ids'][0])
    text_length = len(tokenizer(text, return_tensors="pt")['input_ids'][0])
    tups.append({'doi':e.id, 'doc_length': all_text_length, 'method_length': text_length})
df_length = pd.DataFrame(tups)
df_length 

100%|██████████| 2284/2284 [00:50<00:00, 45.39it/s]


Unnamed: 0,doi,doc_length,method_length
0,doi:10.1101/2022.04.12.488077,27084,6420
1,doi:10.1083/jcb.202204093,17912,3472
2,doi:10.1038/s41594-022-00861-0,16528,3378
3,doi:10.1038/s41586-022-05255-2,16484,6224
4,doi:10.1101/2023.04.28.538734,13304,5424
...,...,...,...
1581,doi:10.1038/ncomms5131,12159,4646
1582,doi:10.1038/ncomms13916,12686,7662
1583,doi:10.1074/jbc.m702025200,5892,1
1584,doi:10.1038/nature04719,2633,1


In [None]:
print(len(df_length[df_length['method_length']>8000]))
print(len(df_length[df_length['method_length']<8000]))

# Plot the distribution of the lengths of the methods sections 
import seaborn as sns  
import matplotlib.pyplot as plt
def plot_length_distribution(df_length):
    plt.hist(df_length, bins=10)
    plt.xlabel('Length')
    plt.ylabel('Frequency')
    plt.title('Distribution of Lengths')
    plt.show()

plot_length_distribution(df_length['method_length'])



In [13]:
t2 = [t for t in test_tk.get_tools() if isinstance(t, MetadataExtraction_MethodsSectionOnly_Tool)][0]


In [None]:

for i, r in tqdm(df_length.iterrows()):
    if len(df[df['doi']==r['doi']]) > 0:
        continue
    # Run the metadata extraction tool on the doi
    try: 
        t2.run(tool_input={'paper_id': r['doi'], 'extraction_type': 'cryoet', 'run_label': 'test_llama3'})
    except Exception as e:
        print(e)
        continue


In [6]:
# Create a dataframe to store previously extracted metadata
#for d in [d for d_id in dois_to_include for d in dois_to_include[d_id]]:
df2 = pd.DataFrame()
for i, r in tqdm(df_length.iterrows()):
        item_types = set()
        l = t2.read_metadata_extraction_notes(r['doi'], 'cryoet', 'test')
        if(len(l) == 0):
            continue
        df2 = pd.concat([df2, pd.DataFrame(l)]) 


NameError: name 'df_length' is not defined

In [14]:
# Create a dataframe to store previously extracted metadata
#for d in [d for d_id in dois_to_include for d in dois_to_include[d_id]]:
df = pd.DataFrame()
for i, r in tqdm(df_length.iterrows()):
    if r['method_length'] < 8000:
        item_types = set()
        l = t2.read_metadata_extraction_notes(r['doi'], 'cryoet', 'test_llama3')
        if(len(l) == 0):
            continue
        df = pd.concat([df, pd.DataFrame(l)]) 
df

1586it [00:07, 203.18it/s]


Unnamed: 0,sample_type,sample_preparation_type,sample_preparation_buffer_ph,grid_vitrification_cryogen,sample_preparation_cryo_protectant,grid_model,grid_material,grid_mesh,grid_support_topology,grid_vit_ctemp,grid_vit_chumid,organism_name,tissue,cell_type,cell_strain,cell_component,doi,extraction_type,run_label
0,cell,tomography,not present,[ETHANE],[BSA],Quantifoil R1/2,COPPER,200,HOLEY,liquid nitrogen temperature,99%,S. pombe,,S. pombe,K972 Sp h- wt haploid,"[ribosomes, FAS, NPC, mitochondria, vesicle, t...",doi:10.1101/2022.04.12.488077,cryoet,test_llama3
0,cell,tomography,not present,"[ethane, propane]",DMSO,R (1/4) Carbon 200-mesh gold EM grids,GOLD,200,HOLEY,37 degreesC,100%,mouse,,mouse embryonic fibroblasts,MEFmtGFP,"mitochondria, ER",doi:10.1083/jcb.202204093,cryoet,test_llama3
0,[micro-organism],tomography,not present,[ETHANE],,Quantifoil R 2/2 Au 200 mesh,GOLD,200,HOLEY,not present,95%,Mus musculus,not present,sperm,C57Bl/6J,"[axoneme, central pair complex, microtubule do...",doi:10.1038/s41594-022-00861-0,cryoet,test_llama3
0,cell,tomography,not present,not present,not present,not present,not present,not present,not present,not present,not present,Mycoplasma pneumoniae,,not present,M129,ribosomes,doi:10.1038/s41586-022-05255-2,cryoet,test_llama3
0,[micro-organism],tomography,7.3,[NITROGEN],Ficoll 400,not present,COPPER,"[75, 50, 100/400]",HOLEY ARRAY,not present,not present,"[Caenorhabditis elegans, Drosophila melanogaster]",,L1 larvae,"[AM140, NK2476]","[80S ribosome, tRNAs, elongation factor]",doi:10.1101/2023.04.28.538734,cryoet,test_llama3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,not present,tomography,not present,not present,not present,not present,not present,not present,not present,not present,not present,not present,not present,not present,not present,not present,doi:10.1093/bioinformatics/bty620,cryoet,test_llama3
0,"[cell, tissue]",tomography,7.25,"[ETHANE, NITROGEN]",dextran,Quantifoil-R5/20,GOLD,200,HOLEY,31-36.5 degreesC,not present,Rattus norvegicus,hippocampal,"[neuronal, P19, HeLa, PtK2, rat HTC]",not present,"[microtubules, luminal particles]",doi:10.1083/jcb.200606074,cryoet,test_llama3
0,micro-organism,helical,7.4,[ETHANE],,C-Flat,COPPER,not present,HOLEY,liquid nitrogen temperature,not present,Saccharomyces cerevisiae,,yeast,W303 or S288C,"plasma membrane, eisosomes",doi:10.1083/jcb.201104040,cryoet,test_llama3
0,micro-organism,subtomogram averaging,8.0,[ETHANE],,"Holey carbon Quantifoil copper grids (R2/2, 20...",COPPER,200,HOLEY,-180 degreesC,not present,,,,not present,,doi:10.1038/ncomms5131,cryoet,test_llama3


In [None]:
df[df['doi']=='doi:10.1101/2022.04.12.488077']




In [None]:
df2[df2['doi']=='doi:10.1101/2022.04.12.488077']


In [None]:
t2 = [t for t in test_tk.get_tools() if isinstance(t, MetadataExtraction_MethodsSectionOnly_Tool)][0]
metadata_dir = '/Users/gully.burns/alhazen/em_tech/empiar/'
t2.compile_answers('cryoet', metadata_dir)
t2.write_answers_as_notes('cryoet', metadata_dir)
#sorted(list(set([doi for q in t2.examples for doi in t2.examples[q]])))

In [None]:
import local_resources.data_files.cryoet_portal_metadata as cryoet_portal_metadata

# Get the metadata extraction tool
t2 = [t for t in test_tk.get_tools() if isinstance(t, MetadataExtraction_MethodsSectionOnly_Tool)][0]

# Hack to get the path to the metadata directory as a string
#metadata_dir = str(files(cryoet_portal_metadata).joinpath('temp'))[0:-4]
metadata_dir = '/Users/gully.burns/alhazen/em_tech/empiar/'

# Compile the answers from the metadata directory
t2.compile_answers('cryoet', metadata_dir)

# Create a dataframe to store previously extracted metadata
#for d in [d for d_id in dois_to_include for d in dois_to_include[d_id]]:
df = pd.DataFrame()
for d in [d for d in dois]:
    item_types = set()
    l = t2.read_metadata_extraction_notes(d, 'cryoet', 'test')
    df = pd.concat([df, pd.DataFrame(l)]) 
     
# Iterate over papers to run the metadata extraction tool
#for d in [d for d_id in dois_to_include for d in dois_to_include[d_id]]:
for d in [d for d in dois]:
    item_types = set()

    # Skip if the doi is already in the database
    if len(df)>0 and d in df.doi.unique():
        continue

    # Run the metadata extraction tool on the doi
    t2.run(tool_input={'paper_id': d, 'extraction_type': 'cryoet', 'run_label': 'test_llama3'})

    # Add the results to the dataframe    
    l2 = t2.read_metadata_extraction_notes(d, 'cryoet', 'test')
    df = pd.concat([df, pd.DataFrame(l2)]) 

In [None]:

# Create a dataframe to store previously extracted metadata
#for d in [d for d_id in dois_to_include for d in dois_to_include[d_id]]:
t2 = [t for t in test_tk.get_tools() if isinstance(t, MetadataExtraction_MethodsSectionOnly_Tool)][0]
df_final = pd.DataFrame()
for d in [d for d in dois]:
    item_types = set()
    l = t2.read_metadata_extraction_notes(d, 'cryoet', 'test')
    df_final = pd.concat([df_final, pd.DataFrame(l)]) 
df_final

In [None]:
import local_resources.data_files.cryoet_portal_metadata as cryoet_portal_metadata
from rapidfuzz import fuzz
# Get the metadata extraction tool
t2 = [t for t in test_tk.get_tools() if isinstance(t, MetadataExtraction_MethodsSectionOnly_Tool)][0]

#for d in [d for d_id in dois_to_include for d in dois_to_include[d_id]]:
l = []
for d in [d for d in dois]:
    item_types = set()
    pred1 = t2.read_metadata_extraction_notes(d, 'cryoet', 'test')
    pred2 = t2.read_metadata_extraction_notes(d, 'cryoet', 'test_dbrx')
    gold = t2.read_metadata_extraction_notes(d, 'cryoet', 'gold') 
    if pred1 is None or pred2 is None or gold is None or \
            len(pred1)==0 or len(pred2)==0 or len(gold)!=1:
        continue
    for k in gold[0]:
        g_case = gold[0][k]
        if g_case=='' or g_case is None:
            continue    
        for j, p_case in enumerate(pred1):
            sim = fuzz.ratio(str(g_case), str(p_case.get(k,''))) / 100.0
            print(k, str(g_case), str(p_case.get(k,'')), sim)

In [None]:
import local_resources.data_files.cryoet_portal_metadata as cryoet_portal_metadata
from rapidfuzz import fuzz
# Get the metadata extraction tool
t2 = [t for t in test_tk.get_tools() if isinstance(t, MetadataExtraction_MethodsSectionOnly_Tool)][0]

df = t2.report_metadata_extraction_for_collection('5', 'cryoet', 'test').set_index('doi')
df.to_csv(loc+'/'+db_name+'/reports/cryoet_metadata_gpt4.tsv', sep='\t')

In [None]:
ldb.create_zip_archive_of_full_text_files('5', loc+'/'+db_name+'/full_text_files.zip')

In [None]:
q3 = ldb.session.query(SKE.id, N.name, N.provenance, N.content) \
        .filter(N.id == NIA.Note_id) \
        .filter(NIA.is_about_id == SKE.id) \
        .filter(N.type == 'MetadataExtractionNote') 
l = []
for row in q3.all():
    paper = row[0]
    name = row[1]
#    provenance = json.loads(row[2])
    result = json.loads(row[3])
    kv = {k:result[k] for k in result}
    kv['DOI'] = paper
    kv['run'] = name
    l.append(kv)
# create a dataframe from the list of dictionaries with DOI as the index column
if len(l)>0:
    df = pd.DataFrame(l).set_index(['DOI', 'run'])
else: 
    df = pd.DataFrame()
df 

In [None]:
# USE WITH CAUTION - this will delete all extracted metadata notes in the database
# clear all notes across papers listed in `dois` list
for row in q3.all():
    d_id = row[0]
    e = ldb.session.query(SKE).filter(SKE.id==d_id).first()
    notes_to_delete = []
    for n in ldb.read_notes_about_x(e):
        notes_to_delete.append(n.id)
    for n in notes_to_delete:
        ldb.delete_note(n)

## Protocol Modeling + Extraction

In [None]:
ldb = Ceifns_LiteratureDb(loc=loc, name=db_name)
slm = ChatOllama(model='stablelm-zephyr') 
llm = ChatOllama(model='mixtral:instruct') 
llm2 = ChatOpenAI(model='gpt-4-1106-preview') 
llm3 = ChatOpenAI(model='gpt-3.5-turbo') 
d = ("This tool attempts to draw a protocol design from the description of a scientific paper.")

In [None]:
t1 = ProcotolEntitiesExtractionTool(db=ldb, llm=llm3, description=d)
entities = t1.run(tool_input={'paper_id': 'doi:10.1101/2022.04.12.488077'})
entities

In [None]:
t2 = ProcotolProcessesExtractionTool(db=ldb, llm=llm3, description=d)
processes = t2.run(tool_input={'paper_id': 'doi:10.1101/2022.04.12.488077'})
processes.get('data')