# Building a database of published works relevant to CryoET-Portal  

> Pragmatic tools for constructing databases of scientific works based on queries defined with Boolean Logic.

## Examples of CEIFNS database code to be used later

In [None]:
# Don't export this but leave it in the notebook code for future reference
from databricks import sql

def get_nxml_from_scipubstore_doi(doi, base_file_path):
    
    if os.environ.get('DB_TOKEN') is None:  
        msg = 'Error attempting to query Databricks for URL data, did you set the DB_TOKEN environment variable?'
        raise Exception(msg)

    get_ft_url_from_doi_sql = '''
        SELECT DISTINCT p.pmc_id, p.doi, YEAR(p.publication_date) as year, p.title, p.abstract, p.full_text_format, p.full_text_url, a.last_name
        FROM scipubstore.ingestion.papers as p 
            JOIN scipubstore.ingestion.authors as a on (p.paper_id=a.paper_id) 
        WHERE p.doi = '{}' and a.author_index=1
        ORDER BY p.full_text_url DESC
    '''.format(doi)

    df = pd.DataFrame()
    with sql.connect(server_hostname = 'czi-shared-infra-czi-sci-general-prod-databricks.cloud.databricks.com',
                        http_path = '/sql/1.0/warehouses/1c4df94f2f1a6305',
                        access_token = 'databricks-access-token-here') as connection:

        with connection.cursor() as cursor:
            cursor.execute(get_ft_url_from_doi_sql)
            result = cursor.fetchall()

    df = pd.DataFrame([row.asDict() for row in result])
    if df.shape[0] == 0:
        return('No paper found with that DOI')
    
    title = df['title'].values[0]
    first_author = df['last_name'].values[0]
    year = df['year'].values[0]  
    url = df['full_text_url'].values[0]
    xml = requests.get(url).text

    file_path = Path(base_file_path + '/' + doi + '.nxml')
    parent_dir = file_path.parent
    if os.path.exists(parent_dir) is False:
        os.makedirs(parent_dir)
    with open(file_path, 'w') as f:
        f.write(xml)

## Preliminaries

In [5]:
#| default_exp utils.db

In [20]:
#| hide
from nbdev import *

In [1]:
#| export

import local_resources.linkml as linkml
import local_resources.data_files.cryoet_portal_metadata as cryoet_portal_metadata

from alhazen.utils.airtableUtils import AirtableUtils
from alhazen.utils.searchEngineUtils import ESearchQuery, EuroPMCQuery
from alhazen.utils.queryTranslator import QueryTranslator, QueryType
from alhazen.schema_sqla import ScientificKnowledgeCollection, ScientificKnowledgeExpression, \
    ScientificKnowledgeFragment, Note
from alhazen.schema_sqla import ScientificKnowledgeCollection, \
    ScientificKnowledgeExpression, ScientificKnowledgeCollectionHasMembers, \
    ScientificKnowledgeItem, ScientificKnowledgeExpressionHasRepresentation, \
    ScientificKnowledgeFragment, ScientificKnowledgeItemHasPart, \
    InformationResource
import alhazen.schema_python as linkml_py
from alhazen.utils.jats_text_extractor import NxmlDoc
from alhazen.utils.pdf_research_article_text_extractor import LAPDFBlockLoader, HuridocsPDFLoader
from alhazen.utils.ceifns_db import *

from langchain.text_splitter import CharacterTextSplitter

from bs4 import BeautifulSoup,Tag,Comment,NavigableString
from databricks import sql
from datetime import datetime
from importlib_resources import files
import os
import pandas as pd
from pathlib import Path
import re
import requests
from sqlalchemy import create_engine, exists
from sqlalchemy.orm import sessionmaker
from time import time,sleep
from tqdm import tqdm
from urllib.request import urlopen
from urllib.parse import quote_plus, quote, unquote
from urllib.error import URLError, HTTPError
import yaml

## Develop queries about CryoET-Portal datasets from `10000-10010`

### Use local YAML files to identify dois for consideration

In [2]:
from alhazen.utils.web_robot import *

def dict_generator(indict, pre=None):
    pre = pre[:] if pre else []
    if isinstance(indict, dict):
        for key, value in indict.items():
            if isinstance(value, dict):
                for d in dict_generator(value, pre + [key]):
                    yield d
            elif isinstance(value, list) or isinstance(value, tuple):
                for v in value:
                    for d in dict_generator(v, pre + [key]):
                        yield d
            else:
                yield pre + [key, value]
    else:
        yield pre + [indict]

dois = {}
for i in range(10000, 10011):
    d_id = None
    yaml_text = files(cryoet_portal_metadata).joinpath(str(i)+'.yaml').read_text()
    d = yaml.safe_load(yaml_text)
    for l in dict_generator(d):
        if 'dataset_identifier' in l:
            d_id = l[-1]
        if 'dataset_publications' in l and d_id:
            dlist = [d.strip() for d in re.sub('doi:', '', l[-1]).split(',') if '/' in d]
            if dois.get(d_id):
                dois[d_id].extend(dlist)
            else:
                dois[d_id] = dlist
print(dois)

{10000: ['10.1101/2022.04.12.488077'], 10001: ['10.1101/2022.04.12.488077'], 10003: ['10.1038/s41586-022-05255-2', '10.1038/s41592-020-01054-7'], 10004: ['10.1101/2023.04.28.538734'], 10005: ['10.1038/s41594-022-00861-0'], 10006: ['10.1038/s41586-020-2665-2'], 10007: [], 10008: ['10.1038/s41586-022-04971-z'], 10009: ['10.1126/science.abm6704'], 10010: ['10.1083/jcb.202204093', '10.1101/2022.01.23.477440']}


### Build a collection from DOIs in `CryoETPortal: 10000 - 10010`

In [3]:
db = Ceifns_LiteratureDb('/Users/gburns/alhazen/', 'em_tech' )

In [4]:
query = ' OR '.join(['doi:"'+d+'"' for d_id in dois for d in dois[d_id] ])
print(query)
try: 
    db.add_corpus_from_epmc_query(0, 'cryoet portal 10000-10010', query)
except Exception as e:
    print(e)

doi:"10.1101/2022.04.12.488077" OR doi:"10.1101/2022.04.12.488077" OR doi:"10.1038/s41586-022-05255-2" OR doi:"10.1038/s41592-020-01054-7" OR doi:"10.1101/2023.04.28.538734" OR doi:"10.1038/s41594-022-00861-0" OR doi:"10.1038/s41586-020-2665-2" OR doi:"10.1038/s41586-022-04971-z" OR doi:"10.1126/science.abm6704" OR doi:"10.1083/jcb.202204093" OR doi:"10.1101/2022.01.23.477440"
https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=doi:"10.1101/2022.04.12.488077" OR doi:"10.1101/2022.04.12.488077" OR doi:"10.1038/s41586-022-05255-2" OR doi:"10.1038/s41592-020-01054-7" OR doi:"10.1101/2023.04.28.538734" OR doi:"10.1038/s41594-022-00861-0" OR doi:"10.1038/s41586-020-2665-2" OR doi:"10.1038/s41586-022-04971-z" OR doi:"10.1126/science.abm6704" OR doi:"10.1083/jcb.202204093" OR doi:"10.1101/2022.01.23.477440", 10 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:01<00:00,  1.34s/it]


 Returning 10


100%|██████████| 10/10 [00:00<00:00, 100.79it/s]


### Get PDFs from DOIs using a web-robot

In [5]:
dir = db.loc+db.name+'/ft'
for doi in [d for d_id in dois for d in dois[d_id]]:
    print(doi)
    #if os.path.exists(dir+'/'+doi+'.nxml') is False and \
    #        os.path.exists(dir+'/'+doi+'.pdf') is False: 
    #    retrieve_full_text_links_from_biorxiv(doi, '/Users/gburns/alhazen/em_tech/nxml')    
    
    if os.path.exists(dir+'/'+doi+'.pdf') is False:
        retrieve_pdf_from_doidotorg(doi, dir)

10.1101/2022.04.12.488077
10.1101/2022.04.12.488077
10.1038/s41586-022-05255-2
10.1038/s41592-020-01054-7
10.1101/2023.04.28.538734
10.1038/s41594-022-00861-0
10.1038/s41586-020-2665-2
10.1038/s41586-022-04971-z
10.1126/science.abm6704
10.1083/jcb.202204093
10.1101/2022.01.23.477440


### Add Items from available NXML/JATS + PDF files   

In [5]:
db.session.rollback()

In [5]:
dir = '/Users/gburns/alhazen/em_tech/ft'
db = Ceifns_LiteratureDb('/Users/gburns/alhazen/', 'em_tech' )
for doi in [d for d_id in dois for d in dois[d_id]]:
    # open file and see if the tag <body> is present
    nxml_flag = False
    pdf_flag = os.path.exists(dir+'/'+doi+'.pdf')
    if os.path.exists(dir+'/'+doi+'.nxml') is True:
        with open(dir+'/'+doi+'.nxml', 'r') as file:
            soup = BeautifulSoup(file, "lxml-xml")
            if soup.body is not None:
                nxml_flag = True
    print(doi+', nxml: '+str(nxml_flag)+', pdf: '+str(pdf_flag))
    ske = db.get_expression_by_doi(doi)
    db.add_full_text_for_expression(ske, nxml_flag, False)#pdf_flag)
db.session.commit()

10.1101/2022.04.12.488077, nxml: True, pdf: True
10.1101/2022.04.12.488077, nxml: True, pdf: True
10.1038/s41586-022-05255-2, nxml: True, pdf: True
10.1038/s41592-020-01054-7, nxml: True, pdf: True
10.1101/2023.04.28.538734, nxml: True, pdf: True
10.1038/s41594-022-00861-0, nxml: True, pdf: True
10.1038/s41586-020-2665-2, nxml: False, pdf: True
10.1038/s41586-022-04971-z, nxml: False, pdf: True
10.1126/science.abm6704, nxml: False, pdf: True
10.1083/jcb.202204093, nxml: True, pdf: True
10.1101/2022.01.23.477440, nxml: True, pdf: True


### Check out each fragment

In [6]:
import tiktoken
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

In [7]:
from langchain.vectorstores import PGVector
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.docstore.document import Document 

q = db.session.query(ScientificKnowledgeExpression, ScientificKnowledgeItem) \
            .filter(ScientificKnowledgeExpression.id == ScientificKnowledgeExpressionHasRepresentation.ScientificKnowledgeExpression_id) \
            .filter(ScientificKnowledgeExpressionHasRepresentation.has_representation_id == ScientificKnowledgeItem.id) 
 
for ske, ski in q.all():
    print(ske.id, ski.type, len(ski.content), len(encoding.encode(ski.content)))

doi:10.1101/2022.04.12.488077 CitationRecord 1299 250
doi:10.1083/jcb.202204093 CitationRecord 2136 377
doi:10.1038/s41594-022-00861-0 CitationRecord 1130 222
doi:10.1038/s41586-022-05255-2 CitationRecord 1564 303
doi:10.1101/2023.04.28.538734 CitationRecord 1152 219
doi:10.1126/science.abm6704 CitationRecord 989 208
doi:10.1038/s41586-022-04971-z CitationRecord 1849 436
doi:10.1038/s41592-020-01054-7 CitationRecord 1173 222
doi:10.1101/2022.01.23.477440 CitationRecord 2066 369
doi:10.1038/s41586-020-2665-2 CitationRecord 1259 282
doi:10.1101/2022.04.12.488077 JATSFullText 127643 31170
doi:10.1101/2022.04.12.488077 JATSFullText 127643 31170
doi:10.1038/s41586-022-05255-2 JATSFullText 83870 20407
doi:10.1038/s41592-020-01054-7 JATSFullText 107745 24317
doi:10.1101/2023.04.28.538734 JATSFullText 69655 16761
doi:10.1038/s41594-022-00861-0 JATSFullText 83838 20653
doi:10.1083/jcb.202204093 JATSFullText 105420 24286
doi:10.1101/2022.01.23.477440 JATSFullText 76974 16786


In [8]:
db.session.query(ScientificKnowledgeFragment.id).count()

288

In [9]:
from langchain.vectorstores import PGVector
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.docstore.document import Document 

q = db.session.query(ScientificKnowledgeExpression, ScientificKnowledgeItem, ScientificKnowledgeFragment) \
            .filter(ScientificKnowledgeExpression.id == ScientificKnowledgeExpressionHasRepresentation.ScientificKnowledgeExpression_id) \
            .filter(ScientificKnowledgeExpressionHasRepresentation.has_representation_id == ScientificKnowledgeItem.id) \
            .filter(ScientificKnowledgeItem.id == ScientificKnowledgeItemHasPart.ScientificKnowledgeItem_id) \
            .filter(ScientificKnowledgeItemHasPart.has_part_id == ScientificKnowledgeFragment.id) \
            .order_by(ScientificKnowledgeExpression.id, ScientificKnowledgeFragment.offset)
 
documents = []
for ske, ski, skf in q.all():
    print(ske.id, ski.type, len(ski.content), len(encoding.encode(ski.content)))#), skf.content)

#    documents.append(
#        Document(page_content=skf.content, 
#                 metadata={'doi': ske.id,
#                           'citation': ske.name, 
#                           'pub_date': ske.publication_date.strftime('%Y-%m-%d'),
#                           'id': skf.id, 
#                           'section_title': skf.name,
#                           'offset': skf.offset,
#                           'length': skf.length}))
#   
#[d.page_content for d in documents]


doi:10.1038/s41586-020-2665-2 CitationRecord 1259 282
doi:10.1038/s41586-020-2665-2 CitationRecord 1259 282
doi:10.1038/s41586-022-04971-z CitationRecord 1849 436
doi:10.1038/s41586-022-04971-z CitationRecord 1849 436
doi:10.1038/s41586-022-05255-2 CitationRecord 1564 303
doi:10.1038/s41586-022-05255-2 CitationRecord 1564 303
doi:10.1038/s41586-022-05255-2 JATSFullText 83870 20407
doi:10.1038/s41586-022-05255-2 JATSFullText 83870 20407
doi:10.1038/s41586-022-05255-2 JATSFullText 83870 20407
doi:10.1038/s41586-022-05255-2 JATSFullText 83870 20407
doi:10.1038/s41586-022-05255-2 JATSFullText 83870 20407
doi:10.1038/s41586-022-05255-2 JATSFullText 83870 20407
doi:10.1038/s41586-022-05255-2 JATSFullText 83870 20407
doi:10.1038/s41586-022-05255-2 JATSFullText 83870 20407
doi:10.1038/s41586-022-05255-2 JATSFullText 83870 20407
doi:10.1038/s41586-022-05255-2 JATSFullText 83870 20407
doi:10.1038/s41586-022-05255-2 JATSFullText 83870 20407
doi:10.1038/s41586-022-05255-2 JATSFullText 83870 20407


In [None]:
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=100, chunk_overlap=0)
docs = text_splitter.split_documents(documents)

In [17]:
model_name = "BAAI/bge-small-en"
model_kwargs = {"device": "mps"}
encode_kwargs = {"normalize_embeddings": True}
embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)

CONNECTION_STRING = 'postgresql+psycopg2:///'+db.name
COLLECTION_NAME = db.name
vectorestore = PGVector.from_documents(
    embedding=embeddings,
    documents=docs,
    collection_name='ALHAZEN_TEST2',
    connection_string=CONNECTION_STRING,
)

## Develop broad queries for CryoET-Portal across the field of all papers mentioning CryoET in Pubmed

In [2]:
db = LocalLiteratureDb('/Users/gburns/alhazen/', 'em_tech' )
if db.session is None:
    session_class = sessionmaker(bind=db.engine)
    db.session = session_class()

In [3]:
from io import StringIO

EM_QUERIES_TSV = '''
ID,NAME,QUERY
0,Hierarchical phase-contrast tomography,Hierarchical phase-contrast tomography | HIP-CT | Hierarchical phase contrast tomography
1,Cryo-Electron Tomography,Cryoelectron Tomography | Cryo Electron Tomography | Cryo-Electron Tomography | Cryo-ET | CryoET
2,Volume Electron Microscopy,Volume Electron Microscopy | Volume EM | (serial section & (electron microscopy | EM | transmission electron microscopy | TEM | scanning electron microscopy | SEM | electron tomography )) | (serial block-face & (SEM | scanning electron microscopy)) | (focused ion beam & (SEM | scanning electron microscopy)) | (automated serial & (TEM | transmission electron microscopy)) | ( massively parallel imaging & (SEM | scanning electron microscopy)) | multibeam SEM | FAST-SEM | cryo-TEM
'''
EM_QUERIES_TSV = '''
ID,NAME,QUERY
0,Cryo-Electron Tomography,Cryoelectron Tomography | Cryo Electron Tomography | Cryo-Electron Tomography | Cryo-ET | CryoET
'''

cdf = pd.read_csv(StringIO(EM_QUERIES_TSV), sep=',')
qs = QuerySpec('EM Technology', 'ID', 'QUERY', 'NAME', {}, ['TITLE','ABSTRACT', 'METHODS'])
qt = QueryTranslator(cdf.sort_values('ID'), 'ID', 'QUERY', 'NAME')

db.add_corpus_from_epmc(qt, None, sections=qs.sections)


100%|██████████| 1/1 [00:00<00:00, 1057.30it/s]


100%|██████████| 1/1 [00:00<00:00, 1061.58it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE:"Cryoelectron Tomography" OR ABSTRACT:"Cryoelectron Tomography" OR METHODS:"Cryoelectron Tomography") OR (TITLE:"Cryo Electron Tomography" OR ABSTRACT:"Cryo Electron Tomography" OR METHODS:"Cryo Electron Tomography") OR (TITLE:"Cryo-Electron Tomography" OR ABSTRACT:"Cryo-Electron Tomography" OR METHODS:"Cryo-Electron Tomography") OR (TITLE:"Cryo-ET" OR ABSTRACT:"Cryo-ET" OR METHODS:"Cryo-ET") OR (TITLE:"CryoET" OR ABSTRACT:"CryoET" OR METHODS:"CryoET")), 2463 European PMC PAPERS FOUND


100%|██████████| 3/3 [00:37<00:00, 12.66s/it]


 Returning 2463


  .filter(ScientificKnowledgeCollection.id==p_id).first()
100%|██████████| 2463/2463 [00:03<00:00, 712.98it/s]


In [None]:
db.add_full_text_for_collection(0)

# Run MetaData Extraction Chain

In [10]:
from alhazen.tools.metadata_extraction_tool import MetadataExtractionTool
from alhazen.utils.ceifns_db import Ceifns_LiteratureDb

db = Ceifns_LiteratureDb('/Users/gburns/alhazen/', 'em_tech')
met = MetadataExtractionTool(db, 'cryoet')

In [11]:
met.run('10.1101/2022.04.12.488077', 'method')

In [9]:
item_type = 'JATSFullText'
expression_id = '10.1101/2022.04.12.488077'
q = (db.session.query(ScientificKnowledgeExpression, ScientificKnowledgeItem, ScientificKnowledgeFragment) 
            .filter(ScientificKnowledgeExpression.id == ScientificKnowledgeExpressionHasRepresentation.ScientificKnowledgeExpression_id)
            .filter(ScientificKnowledgeExpressionHasRepresentation.ScientificKnowledgeExpression_id == expression_id)
            .filter(ScientificKnowledgeExpressionHasRepresentation.has_representation_id == ScientificKnowledgeItem.id)
            .filter(ScientificKnowledgeItem.id == ScientificKnowledgeItemHasPart.ScientificKnowledgeItem_id)
            .filter(ScientificKnowledgeItem.type == item_type) \
            .filter(ScientificKnowledgeItemHasPart.has_part_id == ScientificKnowledgeFragment.id))
skfs = []
for ske, ski, skf in q.all():
    skfs.append(skf)
skfs

[]

In [25]:
print(completed_list)

['epmid:36690741', 'epmid:37176000', 'epmid:37485371', 'epmid:33771860', 'epmid:35508170', 'epmid:30688648', 'epmid:37040766', 'epmid:33298442', 'epmid:36305590', 'epmid:35982043', 'epmid:33028835', 'epmid:35017666', 'epmid:32270040', 'epmid:32241888', 'epmid:24813625', 'epmid:33199282', 'epmid:34468314', 'epmid:35862756', 'epmid:33154161', 'epmid:34643180']


In [27]:
from tqdm import tqdm
for id in tqdm(df.id.to_list()):
    if id in completed_list: 
        continue
    print(id)
    met.run(id, 'method')
    db.session.commit()

  0%|          | 0/25 [00:00<?, ?it/s]

epmid:36311290
epmid:33950014
epmid:32341341
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence] Entering Chain run with input:
[0m{
  "section_text": "Strains and growth conditions\nThe list of Vibrio cholerae strains and construction contains the wild-type: Vibrio cholerae C6706, the strain PM6: Deltavca1088, the strain PM7: Deltavca1093, Deltavca1094, Deltavca1095 (DeltaF7), and the strain PM18: Deltavca1092. V. cholerae deletion strains were generated using standard allele exchange34 with the following plasmids.\nPlasmid for deletion of vca1093, vca1094 and vca1095 (pPM045) was constructed by PCR amplification of the up- and down-stream regions of vca1093 and vca1095, respectively. PCR1 was performed with primers CCCCCTCTAGAAATTGGCTAATCCCTCCTAAACTC/AATCTTGCGCAGTTGTTCCATATC and C6706 chromosomal DNA as template. PCR2 was performed with primers GATATGGAACAACTGCGCAAGATT CGCTTAAGCACCACTGCCGAA/CCCCCTCTAGACATCATCAAATTCGTCGTCATGC and C6706 chromosomal DNA as template. A third P

 92%|█████████▏| 23/25 [18:56<01:38, 49.42s/it]

[36;1m[1;3m[llm/end][0m [1m[1:chain:RunnableSequence > 3:llm:Ollama] [19.76s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": " {\"protocol_step\": G}\n\nThe text does not describe any of the specific steps of the protocol listed (A-F), but rather mentions a Nature Research Reporting Summary that provides further information on the research design. Therefore, the appropriate response is \"G) other protocol steps, not listed above.\"",
        "generation_info": {
          "model": "llama2:70b",
          "created_at": "2023-11-20T07:41:29.421935Z",
          "response": "",
          "done": true,
          "context": [
            29961,
            25580,
            29962,
            529,
            29879,
            24566,
            25580,
            29962,
            13,
            18884,
            3532,
            14816,
            29903,
            6778,
            3492,
            526,
            385,
            17924,
 

 96%|█████████▌| 24/25 [28:38<01:20, 80.56s/it]

[36;1m[1;3m[llm/end][0m [1m[1:chain:RunnableSequence > 3:llm:Ollama] [29.49s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": " {\n\"protocol_step\": \"G\"\n}\n\nThe text described the procedures followed as part of the protocol, specifically the step of milling the sample using a focussed ion beam (FIB) to create a thin, electron-transparent lamella. This step is not listed among the options A-F, so the corresponding value for the 'protocol_step' field is \"G\" for \"other protocol steps, not listed above\".",
        "generation_info": {
          "model": "llama2:70b",
          "created_at": "2023-11-20T07:51:11.3315Z",
          "response": "",
          "done": true,
          "context": [
            29961,
            25580,
            29962,
            529,
            29879,
            24566,
            25580,
            29962,
            13,
            18884,
            3532,
            14816,
            29903,
            67

100%|██████████| 25/25 [48:27<00:00, 116.28s/it]

[36;1m[1;3m[llm/end][0m [1m[1:chain:RunnableSequence > 3:llm:Ollama] [22.33s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": " {\n\"metadata_name\": \"reconstruction_method\",\n\"metadata_value\": \"Cryo-Electron Tomography (CryoET)\",\n\"original_text\": \"The C1-IgM model was built in the deposited cryo-ET 3D reconstruction (EMDB-4878) (17) using Pymol version 2.3.0.\"\n}",
        "generation_info": {
          "model": "llama2:70b",
          "created_at": "2023-11-20T08:10:59.832541Z",
          "response": "",
          "done": true,
          "context": [
            29961,
            25580,
            29962,
            529,
            29879,
            24566,
            25580,
            29962,
            13,
            18884,
            3532,
            14816,
            29903,
            6778,
            3492,
            526,
            385,
            17924,
            4768,
            5996,
            9638,
      




In [1]:
import json
from alhazen.schema_sqla import ScientificKnowledgeFragmentHasNotes, Note

q1 = db.session.query(ScientificKnowledgeExpression) \
            .filter(ScientificKnowledgeExpression.id == ScientificKnowledgeExpressionHasRepresentation.ScientificKnowledgeExpression_id) \
            .filter(ScientificKnowledgeExpressionHasRepresentation.has_representation_id == ScientificKnowledgeItem.id) \
            .filter(ScientificKnowledgeItem.id == ScientificKnowledgeItemHasPart.ScientificKnowledgeItem_id ) \
            .filter(ScientificKnowledgeItemHasPart.has_part_id == ScientificKnowledgeFragment.id ) \
            .filter(ScientificKnowledgeFragment.id == ScientificKnowledgeFragmentHasNotes.ScientificKnowledgeFragment_id) \
            .filter(ScientificKnowledgeFragmentHasNotes.has_notes_id == Note.id) \
            .filter(ScientificKnowledgeItem.type == 'FullTextPaper') 
completed_list = [e.id for e in q1.all()]
print(completed_list)

NameError: name 'db' is not defined

In [29]:
print(len(completed_list))

23


In [5]:
import json
q1 = db.session.query(ScientificKnowledgeItem) \
            .filter(ScientificKnowledgeExpression.id == ScientificKnowledgeExpressionHasRepresentation.ScientificKnowledgeExpression_id) \
            .filter(ScientificKnowledgeExpressionHasRepresentation.has_representation_id == ScientificKnowledgeItem.id) \
            .filter(ScientificKnowledgeItem.type == 'FullTextPaper') \
            .filter(ScientificKnowledgeExpression.id.like('%36690741%')) 
i = q1.first()
l = []  
for f in i.has_part:
    for n in f.has_notes:
        if 'cryoet' in n.name:
            d = json.loads(n.content)
            d['section'] = f.name
            d['offset'] = f.offset
            d['length'] = f.length
            l.append(d)
df = pd.DataFrame(l)
df


Unnamed: 0,metadata_name,metadata_value,original_text,section,offset,length
0,organism_name,S. pombe K972 Sp h- wild-type haploid cells,S. pombe K972 Sp h- wild-type haploid cells we...,Methods >> Yeast cell culture,47657,621
1,cell_strain,S. pombe K972 Sp h- wild-type haploid cells,S. pombe K972 Sp h- wild-type haploid cells we...,Methods >> Yeast cell culture,47657,621
2,organism_name,S. pombe K972 Sp h- wild-type haploid cells,S. pombe K972 Sp h- wild-type haploid cells we...,Methods >> Yeast cell culture,47657,621
3,cell_strain,S. pombe K972 Sp h- wild-type haploid cells,S. pombe K972 Sp h- wild-type haploid cells we...,Methods >> Yeast cell culture,47657,621
4,sample_preparation,recovered from frozen stock by streaking on YE...,S. pombe K972 Sp h- wild-type haploid cells we...,Methods >> Yeast cell culture,47657,621
...,...,...,...,...,...,...
93,reconstruction_method,threefold cross-validation scheme,"For ribosome and FAS localization, and for mem...",Methods >> Cross validation and performance ev...,63957,1649
94,reconstruction_software,not present,,Methods >> Cross validation and performance ev...,63957,1649
95,reconstruction_method,non-standard threefold cross-validation scheme,"For ribosome and FAS localization, and for mem...",Methods >> Cross validation and performance ev...,63957,1649
96,reconstruction_software,not present,,Methods >> Cross validation and performance ev...,63957,1649


In [18]:
import json
q1 = db.session.query(ScientificKnowledgeExpression, ScientificKnowledgeItem) \
            .filter(ScientificKnowledgeExpression.id == ScientificKnowledgeExpressionHasRepresentation.ScientificKnowledgeExpression_id) \
            .filter(ScientificKnowledgeExpressionHasRepresentation.has_representation_id == ScientificKnowledgeItem.id) \
            .filter(ScientificKnowledgeItem.type == 'FullTextPaper')  
papers = {}
for e,i in q1.all():
    for f in i.has_part:
        for n in f.has_notes:
            if 'cryoet' in n.name:
                papers[e.id] = e.content
print(len(papers))
print(papers)

23
{'epmid:36690741': 'de Teresa-Trueba I, Goetz SK, Mattausch A, Stojanovska F, Zimmerli CE, Toro-Nahuelpan M, Cheng DWC, Tollervey F, Pape C, Beck M, Diz-Muñoz A, Kreshuk A, Mahamid J, Zaugg JB. (2023) Convolutional networks for supervised mining of molecular patterns within cellular context.', 'epmid:37176000': 'Kaplan M, Yao Q, Jensen GJ. (2023) Structure and Assembly of the <i>Proteus mirabilis</i> Flagellar Motor by Cryo-Electron Tomography.', 'epmid:37485371': 'Shepherd DC, Kaplan M, Vankadari N, Kim KW, Larson CL, Dutka P, Beare PA, Krzymowski E, Heinzen RA, Jensen GJ, Ghosal D. (2023) Morphological remodeling of <i>Coxiella burnetii</i> during its biphasic developmental cycle revealed by cryo-electron tomography.', 'epmid:33771860': 'Mageswaran SK, Yang WY, Chakrabarty Y, Oikonomou CM, Jensen GJ. (2021) A cryo-electron tomography workflow reveals protrusion-mediated shedding on injured plasma membrane. ', 'epmid:35508170': 'Nicolas WJ, Fäßler F, Dutka P, Schur FKM, Jensen G, M

In [28]:
from alhazen.tools.metadata_extraction_tool import MetadataExtractionTool

loc = '/Users/gburns/alhazen/em_tech/reports/'
f = 'gjensen_papers.xlsx'
os.makedirs(loc, exist_ok=True)
with pd.ExcelWriter(f'{loc}{f}') as excel_writer:
    met = MetadataExtractionTool(db, 'cryoet')
    for id in papers:
        print(id)
        try:
            id2 = re.sub('epmid:', '', id)
            first_author = re.compile('^(.*?)[ \,\(]').search(papers[id]).group(1)
            year = re.compile('\((.*?)\)').search(papers[id]).group(1)
            cite  = f'{first_author}_{year}_{id2}'
            df, df_pivot = met.tabulate_fragments(id)
            df_pivot.to_excel(excel_writer, sheet_name=cite)
        except Exception as ex:
            print(ex)
            continue

epmid:36690741
Index contains duplicate entries, cannot reshape
epmid:37176000
epmid:37485371
epmid:33771860
epmid:35508170
epmid:30688648
epmid:37040766
epmid:33298442
epmid:36305590
epmid:35982043
epmid:33028835
epmid:35017666
epmid:32270040
epmid:32241888
epmid:24813625
epmid:33199282
epmid:34468314
epmid:35862756
epmid:33154161
epmid:34643180
epmid:32341341
epmid:33199285
epmid:32849513


In [45]:
extractions_pivot = df.pivot(index='metadata_name', columns=['offset', 'section'], values='metadata_value').fillna('')
extractions_pivot

offset,47657,48282,50095,51322,51843,52896,54693,55177,55547,56083,56753,58468,60817,63122,63957
section,Methods >> Yeast cell culture,Methods >> Vitrification,Methods >> Cryo-ET,Methods >> Tomogram reconstruction,"Methods >> Ground truth annotation for organelles, cytoplasm, and membranes",Methods >> Ground truth particle annotation in VPP data,Methods >> Ground truth particle annotation in defocus data,Methods >> Comparison of cryo-ET-derived particle numbers with proteomics,Methods >> NPC manual localization,Methods >> Voxel-level representation of ground truth,Methods >> Cytoskeletal filaments segmentation and subtomogram averaging,Methods >> Subtomogram analysis for ribosomes and FAS,Methods >> CNN pre- and post-processing,Methods >> Evaluation metrics,Methods >> Cross validation and performance evaluation
metadata_name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
annotated_entities,,,,,"mitochondria, vesicle, tube, ER, nuclear envel...",ribosomes and FAS,ribosomes,ribosomes and FAS,NPCs,,,,,,
annotated_software,,,,,Amira49,pyTOM60,not present,not present,not present,,,,,,
annotation_methods,,,,,"manual segmentation, 2D CNN, manual correction",,"manually, template matching, FAS manual pickin...",ground truth annotations,manually,,,,,,
biological_sample_type,cell culture,yeast cells,,,,,,,,,,,,,
camera_manufacturer,,,Gatan,,,,,,,,,,,,
camera_model,,,K2 Summit direct detection camera (Gatan),,,,,,,,,,,,
cell_strain,S. pombe K972 Sp h- wild-type haploid cells,not present,,,,,,,,,,,,,
cryoet_acceleration_voltage,,,not present,,,,,,,,,,,,
cryoet_pixel_spacing,,,3.45 A,,,,,,,,,,,,
microscope_name,,,Titan Krios,,,,,,,,,,,,


In [30]:
for n in db.list_notes_for_fragments_in_paper('cryoet', '36690741'):
    print(n)

In [18]:
item_type = 'FullTextPaper'
paper_id = '36690741'
q1 = db.session.query(ScientificKnowledgeItem) \
            .filter(ScientificKnowledgeExpressionHasRepresentation.has_representation_id == ScientificKnowledgeItem.id) \
            .filter(ScientificKnowledgeExpressionHasRepresentation.ScientificKnowledgeExpression_id.like('%'+str(paper_id)+'%')) 

i = q1.all()
print(i)

[ScientificKnowledgeItem(representation_of=epmid:36690741,creation_date=None,content=Convolutional networks for supervised mining of molecular patterns within cellular context.
Cryo-electron tomograms capture a wealth of structural information on the molecular constituents of cells and tissues. We present DeePiCt (deep picker in context), an open-source deep-learning framework for supervised segmentation and macromolecular complex localization in cryo-electron tomography. To train and benchmark DeePiCt on experimental data, we comprehensively annotated 20 tomograms of Schizosaccharomyces pombe for ribosomes, fatty acid synthases, membranes, nuclear pore complexes, organelles, and cytosol. By comparing DeePiCt to state-of-the-art approaches on this dataset, we show its unique ability to identify low-abundance and low-density complexes. We use DeePiCt to study compositionally distinct subpopulations of cellular ribosomes, with emphasis on their contextual association with mitochondria an

In [4]:
from alhazen.schema_sqla import InformationContentEntity, ScientificKnowledgeFragment

#ScientificKnowledgeFragment.__mapper_args__ = {
#    'concrete': True,
#    'polymorphic_identity': 'ScientificKnowledgeFragment', 
#    'polymorphic_load': 'inline'}
#ScientificKnowledgeFragment.__mapper__.polymorphic_load = 'inline'
#ScientificKnowledgeFragment.__mapper__.polymorphic_identity = 'ScientificKnowledgeFragment'
#ScientificKnowledgeFragment.__mapper__.concrete = True
ScientificKnowledgeFragment.__mapper__.polymorphic_load

In [None]:
import urllib

os.environ['NCBI_API_KEY'] = 'd086451c882fabace54d7b049b6fb8481908'

def get_id(ice, t):
    idmap = {k[:k.find(':')]:k[k.find(':')+1:] for k in ice.xref} 
    return idmap.get(t)

path = '/users/gburns/alhazen/em_tech/nxml_files/'
dois = set()
for ske in db.session.query(ScientificKnowledgeExpression).all():
    doi = get_id(ske, 'doi')
    if doi is None or doi in dois:
        continue
    dois.add(doi)
    if os.path.exists(path+doi+'.nxml'):
        print('Skipping %s, already exists'%(doi))
        continue
    print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
    print('ID: %s'%(ske.id))  
    print('DOI: %s'%(doi))  
    print(ske.content)
    try:   
        get_nxml_from_pubmed_doi(doi, path)
        dois.add(doi)
    except URLError as ue:
        dois.remove(doi)
    except HTTPError as he:
        dois.remove(p.doi)


In [None]:

pdf_path = '/Users/gburns/alhazen/em_tech/pdf_files/'
nxml_path = '/Users/gburns/alhazen/em_tech/nxml_files/'

l = []
for p in db.session.query(ScientificKnowledgeExpression).all():

    has_full_text = False
    has_full_text_methods = False
    has_pdf = False

    doi = doi = get_id(p, 'doi')
    if doi is None:
        continue
    
    pdf_file_path = pdf_path+doi+'.pdf'
    if os.path.exists(pdf_file_path):
        has_pdf = True

    nxml_file_path = nxml_path+doi+'.nxml'
    if os.path.exists(nxml_file_path):
        with open(nxml_file_path, 'r') as f:
            xml = f.read()
            soup = BeautifulSoup(xml, "lxml")
        body = soup.find_all('body')
        if body:
            has_full_text = True
        d = NxmlDoc(doi, xml)
        m = '\n'.join([d.read_section_text(sec) for sec in d.search_section_titles('methods')])
        if(len(m) > 0):
            has_full_text_methods = True

    l.append({'id':p.id, 'reference':p.content, 'doi':doi, 'pub_date':p.publication_date, 'has_full_text':has_full_text, 'has_full_text_methods':has_full_text_methods, 'has_pdf':has_pdf})

df = pd.DataFrame(l)
df        

In [6]:
from alhazen.schema_sqla import ScientificKnowledgeCollection, \
    ScientificKnowledgeExpression, ScientificKnowledgeCollectionHasMembers, \
    ScientificKnowledgeItem, ScientificKnowledgeExpressionHasRepresentation, \
    ScientificKnowledgeFragment, ScientificKnowledgeItemHasPart, \
    InformationResource

q = db.session.query(ScientificKnowledgeExpression) \
                .filter(ScientificKnowledgeExpression.id == ScientificKnowledgeExpressionHasRepresentation.ScientificKnowledgeExpression_id) \
                .filter(ScientificKnowledgeExpressionHasRepresentation.has_representation_id == ScientificKnowledgeItem.id) \
                .filter(ScientificKnowledgeExpressionHasRepresentation.has_representation_id == ScientificKnowledgeItem.id)

l = []
for e in q.all():
    l.append({'citation':e.content, 
              'items':'|'.join([i.type for i in e.has_representation])})
df = pd.DataFrame(l)
df

Unnamed: 0,citation,items
0,(2023) Streamlined structure determination by...,CitationRecord|FullTextPaper
1,(2023) Visualizing the membrane disruption ac...,CitationRecord|FullTextPaper
2,(2023) Preparing <i>Arabidopsis thaliana</i> ...,CitationRecord|FullTextPaper
3,(2023) Cryo-electron tomography to study vira...,CitationRecord|FullTextPaper
4,(2023) <i>In situ</i>cryo-electron tomography...,CitationRecord
...,...,...
2453,(2019) Orthobunyavirus spike architecture and...,CitationRecord
2454,(2014) Membrane deformation and scission by t...,CitationRecord
2455,(2016) A live RSV vaccine with engineered the...,CitationRecord
2456,(2015) Electron Tomography: A Three-Dimension...,CitationRecord


In [None]:
df[df.has_full_text_methods]

In [None]:
print( df[df.has_pdf==True].shape ) 
print( df[df.has_full_text==True].shape ) 
print( df[df.has_full_text_methods==True].shape ) 
print( df.shape ) 


In [None]:
years = [row.pub_date.year for i, row in df.iterrows() if row.pub_date.year>2000]

# draw histogram of publication years
import matplotlib.pyplot as plt
import numpy as np
plt.hist(years, bins=np.arange(min(years), max(years) + 1, 1))


In [None]:
year1 = [row.pub_date.year for i, row in df.iterrows() if row.pub_date.year>2000 and (row.has_full_text | row.has_pdf)]
year2 = [row.pub_date.year for i, row in df.iterrows() if row.pub_date.year>2000]


# draw histogram of publication years
import matplotlib.pyplot as plt
import numpy as np
plt.hist(years, bins=np.arange(min(years), max(years) + 1, 1))


years = [row.pub_date.year for i, row in df.iterrows() if row.pub_date.year>2000 and row.has_full_text==True]

# draw histogram of publication years
import matplotlib.pyplot as plt
import numpy as np
plt.hist(years, bins=np.arange(min(years), max(years) + 1, 1))


In [None]:

db = Ceifns_LiteratureDb('em_literature', 'EuropePMC papers based on EM keywords', '/tmp/alhazen/')
if db.session is None:
    session_class = sessionmaker(bind=db.engine)
    db.session = session_class()

print([c.name for c in db.list_corpora()])

path = '/tmp/alhazen/nxml_files/'
methods = []
for i, p in enumerate(db.list_corpus_publications('1')):
    nxml_path = path+p.doi+'.nxml'
    if os.path.exists(nxml_path):
        with open(nxml_path, 'r') as f:
            xml = f.read()
            d = NxmlDoc(p.doi, xml)
            #print(p.doi)
            #print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
            #for sec in d.search_section_titles('methods'):
            #    print(d.read_section_text(sec)) 
            #print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
            m = '\n'.join([d.read_section_text(sec) for sec in d.search_section_titles('methods')])
            if(len(m) > 0):
                methods.append(m)

lengths = [(len(m) - len(re.sub('\s+','',m))) for m in methods]
lengths

In [None]:
len(lengths)


In [None]:
import base64
from IPython.display import Image, display
import matplotlib.pyplot as plt

def mm(graph):
  graphbytes = graph.encode("ascii")
  base64_bytes = base64.b64encode(graphbytes)
  base64_string = base64_bytes.decode("ascii")
  display(
    Image(
      url="https://mermaid.ink/img/"
      + base64_string
    )
  )



In [None]:
engine = create_engine("sqlite:////tmp/alhazen/sciknow.db")
session_class = sessionmaker(bind=engine)
session = session_class()
p = linkml_sqla.ScientificPrimaryResearchPreprint(id=100, title='Test', abstract='Test abstract', iri='10.1234/1234', doi='10.1234/1234', publication_date=datetime.now())
session.add(p)
session.commit()

In [None]:
!rm /tmp/alhazen/tmp.db

In [None]:
!sqlite3 /tmp/alhazen/tmp.db "SELECT * FROM Work;" ".exit"

In [None]:
for w in session.query(Work).where(Work.id!=100):
    print(p.title)
