# Building databases of published works  

> Pragmatic tools for constructing databases of scientific works based on queries defined with Boolean Logic.

In [5]:
#| default_exp utils.db

In [6]:
#| hide
from nbdev import *

In [1]:
#| export

import local_resources.linkml as linkml

from alhazen.utils.airtableUtils import AirtableUtils
from alhazen.utils.searchEngineUtils import ESearchQuery, EuroPMCQuery
from alhazen.utils.queryTranslator import QueryTranslator, QueryType
from alhazen.schema_sqla import ScientificKnowledgeCollection, ScientificKnowledgeExpression, \
    ScientificKnowledgeFragment, Note
from alhazen.schema_sqla import ScientificKnowledgeCollection, \
    ScientificKnowledgeExpression, ScientificKnowledgeCollectionHasMembers, \
    ScientificKnowledgeItem, ScientificKnowledgeExpressionHasRepresentation, \
    ScientificKnowledgeFragment, ScientificKnowledgeItemHasPart, \
    InformationResource

import alhazen.schema_python as linkml_py
from alhazen.utils.jats_text_extractor import NxmlDoc
from alhazen.utils.local_literature_db import *

from bs4 import BeautifulSoup,Tag,Comment,NavigableString
from databricks import sql
from datetime import datetime
from importlib_resources import files
import os
import pandas as pd
from pathlib import Path
import re
import requests
from sqlalchemy import create_engine, exists
from sqlalchemy.orm import sessionmaker
from time import time,sleep
from tqdm import tqdm
from urllib.request import urlopen
from urllib.parse import quote_plus, quote, unquote
from urllib.error import URLError, HTTPError

In [21]:
# Open the local 'em_tech' literature database. The root directory can be overridden
# with the ALHAZEN_HOME environment variable so the notebook is not tied to one
# user's machine; the original location remains the default.
db = LocalLiteratureDb(os.environ.get('ALHAZEN_HOME', '/Users/gburns/alhazen/'), 'em_tech')
# Attach a SQLAlchemy session bound to the database engine only when none is present yet.
if db.session is None:
    session_class = sessionmaker(bind=db.engine)
    db.session = session_class()

In [3]:
from io import StringIO

# Complete set of EM-technology queries (one row per technique).
# It used to be assigned to EM_QUERIES_TSV and then immediately overwritten, which made it
# dead code; it now has its own name so it can be swapped in deliberately.
EM_QUERIES_ALL_TSV = '''
ID,NAME,QUERY
0,Hierarchical phase-contrast tomography,Hierarchical phase-contrast tomography | HIP-CT | Hierarchical phase contrast tomography
1,Cryo-Electron Tomography,Cryoelectron Tomography | Cryo Electron Tomography | Cryo-Electron Tomography | Cryo-ET | CryoET
2,Volume Electron Microscopy,Volume Electron Microscopy | Volume EM | (serial section & (electron microscopy | EM | transmission electron microscopy | TEM | scanning electron microscopy | SEM | electron tomography )) | (serial block-face & (SEM | scanning electron microscopy)) | (focused ion beam & (SEM | scanning electron microscopy)) | (automated serial & (TEM | transmission electron microscopy)) | ( massively parallel imaging & (SEM | scanning electron microscopy)) | multibeam SEM | FAST-SEM | cryo-TEM
'''
# Reduced query set that is actually loaded below (Cryo-ET only).
# Note: despite the `_TSV` suffix the table is comma-separated.
EM_QUERIES_TSV = '''
ID,NAME,QUERY
0,Cryo-Electron Tomography,Cryoelectron Tomography | Cryo Electron Tomography | Cryo-Electron Tomography | Cryo-ET | CryoET
'''

# Parse the query table; `|` and `&` inside the QUERY column are the Boolean operators.
cdf = pd.read_csv(StringIO(EM_QUERIES_TSV), sep=',')
# `qs` is only used for its list of sections passed to the corpus loader below.
qs = QuerySpec('EM Technology', 'ID', 'QUERY', 'NAME', {}, ['TITLE','ABSTRACT', 'METHODS'])
qt = QueryTranslator(cdf.sort_values('ID'), 'ID', 'QUERY', 'NAME')

# Run the translated queries against European PMC and add the results to the database.
db.add_corpus_from_epmc(qt, None, sections=qs.sections)


100%|██████████| 1/1 [00:00<00:00, 1057.30it/s]


100%|██████████| 1/1 [00:00<00:00, 1061.58it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE:"Cryoelectron Tomography" OR ABSTRACT:"Cryoelectron Tomography" OR METHODS:"Cryoelectron Tomography") OR (TITLE:"Cryo Electron Tomography" OR ABSTRACT:"Cryo Electron Tomography" OR METHODS:"Cryo Electron Tomography") OR (TITLE:"Cryo-Electron Tomography" OR ABSTRACT:"Cryo-Electron Tomography" OR METHODS:"Cryo-Electron Tomography") OR (TITLE:"Cryo-ET" OR ABSTRACT:"Cryo-ET" OR METHODS:"Cryo-ET") OR (TITLE:"CryoET" OR ABSTRACT:"CryoET" OR METHODS:"CryoET")), 2463 European PMC PAPERS FOUND


100%|██████████| 3/3 [00:37<00:00, 12.66s/it]


 Returning 2463


  .filter(ScientificKnowledgeCollection.id==p_id).first()
100%|██████████| 2463/2463 [00:03<00:00, 712.98it/s]


In [20]:
# Discard whatever is pending on the session (useful after a failed cell).
db.session.rollback()

In [4]:
# Persist the pending changes held by the session.
db.session.commit()

In [12]:
# Never embed the NCBI API key in the notebook: take it from the environment and
# prompt for it (without echo) only when it has not been configured.
if not os.environ.get('NCBI_API_KEY'):
    from getpass import getpass
    os.environ['NCBI_API_KEY'] = getpass('NCBI API key: ')

# Ask the database to add full text for the papers of collection 0.
db.add_full_text_for_collection(0)

https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?api_key=d086451c882fabace54d7b049b6fb8481908&db=pmc&term=10.1101/2023.07.17.549278[doi]&retmode=xml
No paper found with that DOI
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?api_key=d086451c882fabace54d7b049b6fb8481908&db=pmc&term=10.1101/2023.09.05.556310[doi]&retmode=xml
No paper found with that DOI
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?api_key=d086451c882fabace54d7b049b6fb8481908&db=pmc&term=10.1101/2023.10.10.561643[doi]&retmode=xml
No paper found with that DOI
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?api_key=d086451c882fabace54d7b049b6fb8481908&db=pmc&term=10.1016/j.coviro.2023.101338[doi]&retmode=xml
No paper found with that DOI


rewritetex: likely error invoking catdvi (empty output)
rewritetex: likely error invoking catdvi (empty output)
rewritetex: likely error invoking catdvi (empty output)
rewritetex: likely error invoking catdvi (empty output)
rewritetex: likely error invoking catdvi (empty output)
rewritetex: likely error invoking catdvi (empty output)
rewritetex: likely error invoking catdvi (empty output)
rewritetex: likely error invoking catdvi (empty output)
rewritetex: likely error invoking catdvi (empty output)
rewritetex: likely error invoking catdvi (empty output)
rewritetex: likely error invoking catdvi (empty output)
rewritetex: likely error invoking catdvi (empty output)
rewritetex: likely error invoking catdvi (empty output)
rewritetex: likely error invoking catdvi (empty output)
rewritetex: likely error invoking catdvi (empty output)
rewritetex: likely error invoking catdvi (empty output)


https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?api_key=d086451c882fabace54d7b049b6fb8481908&db=pmc&term=10.1002/1873-3468.14726[doi]&retmode=xml
No paper found with that DOI
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?api_key=d086451c882fabace54d7b049b6fb8481908&db=pmc&term=10.1101/2023.11.10.566563[doi]&retmode=xml
No paper found with that DOI
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?api_key=d086451c882fabace54d7b049b6fb8481908&db=pmc&term=10.1109/tvcg.2022.3186146[doi]&retmode=xml
No paper found with that DOI
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?api_key=d086451c882fabace54d7b049b6fb8481908&db=pmc&term=10.1101/2023.10.05.560879[doi]&retmode=xml
No paper found with that DOI
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?api_key=d086451c882fabace54d7b049b6fb8481908&db=pmc&term=10.1038/s41592-023-02045-0[doi]&retmode=xml
No paper found with that DOI
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?ap

AttributeError: 'NoneType' object has no attribute 'TOP_SECTION'

In [6]:
# Persist the pending changes held by the session.
db.session.commit()

In [6]:
# Discard whatever is pending on the session (useful after a failed cell).
db.session.rollback()

In [22]:
# Select the expression(s) whose id contains 36690741.
q = db.session.query(ScientificKnowledgeExpression) \
                .filter(ScientificKnowledgeExpression.id.like('%36690741%')) 

# Never embed the NCBI API key in the notebook: take it from the environment and
# prompt for it (without echo) only when it has not been configured.
if not os.environ.get('NCBI_API_KEY'):
    from getpass import getpass
    os.environ['NCBI_API_KEY'] = getpass('NCBI API key: ')

l = []
for e in q.all():
    # Record the item types *before* full text is requested, so the table shows the prior state.
    l.append({'id':e.id, 
              'xrefs':e.xref,
              'citation':e.content, 
              'items':'|'.join([i.type for i in e.has_representation])})
    db.add_full_text_for_expression(e)
    # Commit after every expression so earlier results are kept if a later one fails.
    db.session.commit()
df = pd.DataFrame(l)
df

Unnamed: 0,id,xrefs,citation,items
0,epmid:36690741,"[doi:10.1038/s41592-022-01746-2, epmid:36690741]","de Teresa-Trueba I, Goetz SK, Mattausch A, Sto...",CitationRecord


In [23]:

# Expressions that have at least one representation of type 'FullTextPaper'.
# (The join predicate on has_representation_id used to be listed twice; once is enough.)
q = db.session.query(ScientificKnowledgeExpression) \
                .filter(ScientificKnowledgeExpression.id == ScientificKnowledgeExpressionHasRepresentation.ScientificKnowledgeExpression_id) \
                .filter(ScientificKnowledgeExpressionHasRepresentation.has_representation_id == ScientificKnowledgeItem.id) \
                .filter(ScientificKnowledgeItem.type == 'FullTextPaper')
# One row per expression; 'items' lists the types of all of its representations.
l = [{'id': e.id,
      'citation': e.content,
      'items': '|'.join([i.type for i in e.has_representation])}
     for e in q.all()]
df = pd.DataFrame(l)
df

Unnamed: 0,id,citation,items
0,epmid:36690741,"de Teresa-Trueba I, Goetz SK, Mattausch A, Sto...",CitationRecord|FullTextPaper


In [24]:
# NOTE(review): `i` is not defined in this cell; it is whatever an earlier-executed cell
# left in the kernel. In the recorded run it was a list, hence the AttributeError below.
print(i.type)

AttributeError: 'list' object has no attribute 'type'

In [25]:
# Short aliases for the two linking classes used to join expression -> item -> fragment.
expr_item = ScientificKnowledgeExpressionHasRepresentation
item_part = ScientificKnowledgeItemHasPart

# Every (expression, item, fragment) triple for the full-text version of paper 36690741.
q = (
    db.session.query(ScientificKnowledgeExpression, ScientificKnowledgeItem, ScientificKnowledgeFragment)
    .filter(ScientificKnowledgeExpression.id == expr_item.ScientificKnowledgeExpression_id)
    .filter(expr_item.has_representation_id == ScientificKnowledgeItem.id)
    .filter(ScientificKnowledgeItem.id == item_part.ScientificKnowledgeItem_id)
    .filter(item_part.has_part_id == ScientificKnowledgeFragment.id)
    .filter(ScientificKnowledgeItem.type == 'FullTextPaper')
    .filter(ScientificKnowledgeExpression.id.like('%36690741%'))
)

# Flatten each triple into one table row describing the fragment.
l = [
    dict(
        expression=expression.id,
        citation=expression.content,
        fragment_type=fragment.type,
        offset=fragment.offset,
        length=fragment.length,
        section_name=fragment.name,
        fragment_text=fragment.content,
    )
    for expression, _item, fragment in q.all()
]
df = pd.DataFrame(l)
df

Unnamed: 0,expression,citation,fragment_type,offset,length,section_name,fragment_text
0,epmid:36690741,"de Teresa-Trueba I, Goetz SK, Mattausch A, Sto...",section,173,1478,TIAB,Convolutional networks for supervised mining o...
1,epmid:36690741,"de Teresa-Trueba I, Goetz SK, Mattausch A, Sto...",section,5300,3656,Main,Main\nCryo-electron tomography (cryo-ET) produ...
2,epmid:36690741,"de Teresa-Trueba I, Goetz SK, Mattausch A, Sto...",section,28977,1249,Results >> DeePiCt domain generalization acros...,DeePiCt domain generalization across acquisiti...
3,epmid:36690741,"de Teresa-Trueba I, Goetz SK, Mattausch A, Sto...",section,30231,4509,Results >> DeePiCt predictions result in high-...,DeePiCt predictions result in high-quality sub...
4,epmid:36690741,"de Teresa-Trueba I, Goetz SK, Mattausch A, Sto...",section,34746,2701,Results >> DeePiCt-predicted ribosomes reveal ...,DeePiCt-predicted ribosomes reveal functional ...
5,epmid:36690741,"de Teresa-Trueba I, Goetz SK, Mattausch A, Sto...",section,37452,1984,Results >> DeePiCt reveals ribosome,DeePiCt reveals ribosome-mitochondria associat...
6,epmid:36690741,"de Teresa-Trueba I, Goetz SK, Mattausch A, Sto...",section,39440,3224,Results >> Trained networks can be readily app...,Trained networks can be readily applied to oth...
7,epmid:36690741,"de Teresa-Trueba I, Goetz SK, Mattausch A, Sto...",section,42672,4969,Discussion,Discussion\nOur DeePiCt workflow facilitates a...
8,epmid:36690741,"de Teresa-Trueba I, Goetz SK, Mattausch A, Sto...",section,47648,7,Methods,Methods
9,epmid:36690741,"de Teresa-Trueba I, Goetz SK, Mattausch A, Sto...",section,47657,621,Methods >> Yeast cell culture,Yeast cell culture\nS. pombe K972 Sp h- wild-t...


In [26]:
from alhazen.tools.metadata_extraction_tool import MetadataExtractionTool
from alhazen.utils.local_literature_db import LocalLiteratureDb


# Re-open the 'em_tech' literature database. The root directory can be overridden with the
# ALHAZEN_HOME environment variable; the original location remains the default.
db = LocalLiteratureDb(os.environ.get('ALHAZEN_HOME', '/Users/gburns/alhazen/'), 'em_tech')
# Metadata extraction tool working on that database, labelled 'cryoet'.
met = MetadataExtractionTool(db, 'cryoet')

In [27]:
# Run the extraction tool on paper 36690741.
# NOTE(review): 'method' presumably selects which kind of section is processed — confirm against the tool.
met.run('36690741', 'method')

[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence] Entering Chain run with input:
[0m{
  "section_text": "Yeast cell culture\nS. pombe K972 Sp h- wild-type haploid cells were recovered from frozen stock by streaking on YES agar plates (YES Broth, Formedium, 20 g agarose per liter) and incubated at 30 degreesC for 1-3 days. Colonies were restreaked on fresh YES agar plates and incubated 1-3 days at 30 degreesC. Single colonies were inoculated in 5 ml YES medium (YES Broth, Formedium, PCM0302, FM0618/8573) and grown at 30 degreesC, 170 r.p.m. overnight (NCU-Shaker mini, Benchmark). On the next day, cultures were grown to their log phase at an optical density at 600 nm of 0.5-0.6 and diluted beforehand in YES if necessary.",
  "methodology": "Cryo-Electron Tomography (CryoET)",
  "method_goal": "to study the microscopic structure of a biological sample",
  "all_protocol_steps": "(A) the preparation of a biological sample for imaging (such as cells, tissue, a virus, a microorga

In [44]:
import json
# Full-text item of paper 36690741.
q1 = db.session.query(ScientificKnowledgeItem) \
            .filter(ScientificKnowledgeExpression.id == ScientificKnowledgeExpressionHasRepresentation.ScientificKnowledgeExpression_id) \
            .filter(ScientificKnowledgeExpressionHasRepresentation.has_representation_id == ScientificKnowledgeItem.id) \
            .filter(ScientificKnowledgeItem.type == 'FullTextPaper') \
            .filter(ScientificKnowledgeExpression.id.like('%36690741%')) 
i = q1.first()
l = []  
# `first()` returns None when no full-text item matches; treat that as "no fragments"
# instead of failing on `None.has_part`.
for f in (i.has_part if i is not None else []):
    for n in f.has_notes:
        # Keep only notes whose name mentions 'cryoet'; their content is a JSON object.
        if 'cryoet' in n.name:
            d = json.loads(n.content)
            # Tag the extraction with the fragment it came from.
            d.update(section=f.name, offset=f.offset, length=f.length)
            l.append(d)
df = pd.DataFrame(l)
df


Unnamed: 0,metadata_name,metadata_value,original_text,section,offset,length
0,organism_name,S. pombe K972 Sp h- wild-type haploid cells,S. pombe K972 Sp h- wild-type haploid cells we...,Methods >> Yeast cell culture,47657,621
1,cell_strain,S. pombe K972 Sp h- wild-type haploid cells,S. pombe K972 Sp h- wild-type haploid cells we...,Methods >> Yeast cell culture,47657,621
2,sample_preparation,recovered from frozen stock by streaking on YE...,S. pombe K972 Sp h- wild-type haploid cells we...,Methods >> Yeast cell culture,47657,621
3,biological_sample_type,cell culture,S. pombe K972 Sp h- wild-type haploid cells we...,Methods >> Yeast cell culture,47657,621
4,biological_sample_type,yeast cells,Yeast cells were either diluted to optical den...,Methods >> Vitrification,48282,741
5,cell_strain,not present,Vitrification Yeast cells were either diluted ...,Methods >> Vitrification,48282,741
6,tilt_maximum,-40degrees,Up to 14 tilt series were collected on a singl...,Methods >> Cryo-ET,50095,1223
7,cryoet_pixel_spacing,3.45 A,A calibrated pixel size of 3.45 A was used for...,Methods >> Cryo-ET,50095,1223
8,camera_model,K2 Summit direct detection camera (Gatan),Cryo-ET acquisition parameters are summarized ...,Methods >> Cryo-ET,50095,1223
9,microscope_setup,A) an energy filter,A calibrated pixel size of 3.45 A was used for...,Methods >> Cryo-ET,50095,1223


In [42]:
# Pivot the extractions: one row per metadata_name, one column per (offset, first_line) pair.
# NOTE(review): the `df` built by the notes cell shows no 'first_line' column, and the output
# recorded under this cell does not look like a pivot — presumably the cell was edited after
# it last ran; confirm before relying on it.
extractions_pivot = df.pivot(index='metadata_name', columns=['offset', 'first_line'], values='metadata_value').fillna('')
extractions_pivot

Unnamed: 0,section,offset,length,metadata_value
0,Methods >> Yeast cell culture,47657,2484,S. pombe K972 Sp h- wild-type haploid cells\nS...
1,Methods >> Vitrification,48282,1482,yeast cells\nnot present
2,Methods >> Cryo-ET,50095,11007,-40degrees\n3.45 A\nK2 Summit direct detection...
3,Methods >> Tomogram reconstruction,51322,1551,weighted back projection\n4-times-binned tilt ...
4,Methods >> Ground truth annotation for organel...,51843,3144,"manual segmentation, 2D CNN, manual correction..."
5,Methods >> Ground truth particle annotation in...,52896,3584,pyTOM60\nribosomes and FAS
6,Methods >> Ground truth particle annotation in...,54693,1440,"manually, template matching, FAS manual pickin..."
7,Methods >> Comparison of cryo-ET-derived parti...,55177,1098,not present\nribosomes and FAS\nground truth a...
8,Methods >> NPC manual localization,55547,1596,not present\nNPCs\nmanually
9,Methods >> Voxel-level representation of groun...,56083,1998,not present\nnot present\nnovaSTA (10.5281/zen...


In [30]:
# Print every note the database returns for ('cryoet', paper 36690741), one per line.
for n in db.list_notes_for_fragments_in_paper('cryoet', '36690741'):
    print(n)

In [18]:
item_type = 'FullTextPaper'
paper_id = '36690741'

# SQL LIKE pattern matching any expression id that contains the paper id.
paper_pattern = '%' + str(paper_id) + '%'
has_rep = ScientificKnowledgeExpressionHasRepresentation

# All items (of any type) that represent a matching expression.
q1 = (
    db.session.query(ScientificKnowledgeItem)
    .filter(has_rep.has_representation_id == ScientificKnowledgeItem.id)
    .filter(has_rep.ScientificKnowledgeExpression_id.like(paper_pattern))
)

i = q1.all()
print(i)

[ScientificKnowledgeItem(representation_of=epmid:36690741,creation_date=None,content=Convolutional networks for supervised mining of molecular patterns within cellular context.
Cryo-electron tomograms capture a wealth of structural information on the molecular constituents of cells and tissues. We present DeePiCt (deep picker in context), an open-source deep-learning framework for supervised segmentation and macromolecular complex localization in cryo-electron tomography. To train and benchmark DeePiCt on experimental data, we comprehensively annotated 20 tomograms of Schizosaccharomyces pombe for ribosomes, fatty acid synthases, membranes, nuclear pore complexes, organelles, and cytosol. By comparing DeePiCt to state-of-the-art approaches on this dataset, we show its unique ability to identify low-abundance and low-density complexes. We use DeePiCt to study compositionally distinct subpopulations of cellular ribosomes, with emphasis on their contextual association with mitochondria an

In [4]:
from alhazen.schema_sqla import InformationContentEntity, ScientificKnowledgeFragment

# The commented-out lines are earlier attempts to override the mapper configuration of
# ScientificKnowledgeFragment; they are inactive. The cell only displays the mapper's
# current `polymorphic_load` setting.
#ScientificKnowledgeFragment.__mapper_args__ = {
#    'concrete': True,
#    'polymorphic_identity': 'ScientificKnowledgeFragment', 
#    'polymorphic_load': 'inline'}
#ScientificKnowledgeFragment.__mapper__.polymorphic_load = 'inline'
#ScientificKnowledgeFragment.__mapper__.polymorphic_identity = 'ScientificKnowledgeFragment'
#ScientificKnowledgeFragment.__mapper__.concrete = True
ScientificKnowledgeFragment.__mapper__.polymorphic_load

In [None]:
import urllib

# Never embed the NCBI API key in the notebook: take it from the environment and
# prompt for it (without echo) only when it has not been configured.
if not os.environ.get('NCBI_API_KEY'):
    from getpass import getpass
    os.environ['NCBI_API_KEY'] = getpass('NCBI API key: ')

def get_id(ice, t):
    """Return the identifier of type `t` from an entity's cross-references.

    Each entry of `ice.xref` is expected to look like ``'<prefix>:<value>'``
    (e.g. ``'doi:10.1038/s41592-022-01746-2'``). The entry is split at its
    FIRST colon, so values that themselves contain colons are preserved.

    Args:
        ice: any object exposing an `xref` attribute (iterable of strings, or None).
        t: the prefix to look up, e.g. ``'doi'`` or ``'epmid'``.

    Returns:
        The value for prefix `t`, or None when there is no such cross-reference.
        If several entries share the prefix, the last one wins.
    """
    idmap = {}
    for xref in (ice.xref or []):
        prefix, colon, value = xref.partition(':')
        # Entries without a colon carry no prefix; skip them rather than
        # inventing a key from the truncated string.
        if colon:
            idmap[prefix] = value
    return idmap.get(t)

# Directory where downloaded NXML files are stored, one '<doi>.nxml' per paper.
# NOTE(review): this path starts with lower-case '/users', unlike the '/Users/...' paths
# used elsewhere in the notebook — confirm on case-sensitive file systems.
path = '/users/gburns/alhazen/em_tech/nxml_files/'
# DOIs already handled in this pass, so each DOI is attempted at most once.
dois = set()
for ske in db.session.query(ScientificKnowledgeExpression).all():
    doi = get_id(ske, 'doi')
    if doi is None or doi in dois:
        continue
    dois.add(doi)
    if os.path.exists(path+doi+'.nxml'):
        print('Skipping %s, already exists'%(doi))
        continue
    print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
    print('ID: %s'%(ske.id))  
    print('DOI: %s'%(doi))  
    print(ske.content)
    try:   
        get_nxml_from_pubmed_doi(doi, path)
    except URLError as ue:
        # HTTPError is a subclass of URLError, so this single handler covers both.
        # Forget the DOI again so that it is not counted as fetched, and report the
        # failure instead of swallowing it silently.
        dois.discard(doi)
        print('Download failed for %s: %s'%(doi, ue))


In [None]:

# Local folders holding '<doi>.pdf' and '<doi>.nxml' files.
pdf_path = '/Users/gburns/alhazen/em_tech/pdf_files/'
nxml_path = '/Users/gburns/alhazen/em_tech/nxml_files/'

# One row per expression with a DOI, recording which local artefacts exist for it.
l = []
for p in db.session.query(ScientificKnowledgeExpression).all():

    doi = get_id(p, 'doi')
    if doi is None:
        continue

    has_full_text = False
    has_full_text_methods = False
    has_pdf = os.path.exists(pdf_path+doi+'.pdf')

    nxml_file_path = nxml_path+doi+'.nxml'
    if os.path.exists(nxml_file_path):
        with open(nxml_file_path, 'r') as f:
            xml = f.read()
        # Use the XML parser: the HTML parser ('lxml') wraps every document in
        # <html><body>, which made the <body> check below succeed for any file.
        soup = BeautifulSoup(xml, "lxml-xml")
        has_full_text = soup.find('body') is not None
        # Concatenate the text of every section whose title matches 'methods'.
        d = NxmlDoc(doi, xml)
        m = '\n'.join([d.read_section_text(sec) for sec in d.search_section_titles('methods')])
        has_full_text_methods = len(m) > 0

    l.append({'id':p.id, 'reference':p.content, 'doi':doi, 'pub_date':p.publication_date, 'has_full_text':has_full_text, 'has_full_text_methods':has_full_text_methods, 'has_pdf':has_pdf})

df = pd.DataFrame(l)
df        

In [6]:
from alhazen.schema_sqla import ScientificKnowledgeCollection, \
    ScientificKnowledgeExpression, ScientificKnowledgeCollectionHasMembers, \
    ScientificKnowledgeItem, ScientificKnowledgeExpressionHasRepresentation, \
    ScientificKnowledgeFragment, ScientificKnowledgeItemHasPart, \
    InformationResource

# Expressions that have at least one representation.
# (The join predicate on has_representation_id used to be listed twice; once is enough.)
q = db.session.query(ScientificKnowledgeExpression) \
                .filter(ScientificKnowledgeExpression.id == ScientificKnowledgeExpressionHasRepresentation.ScientificKnowledgeExpression_id) \
                .filter(ScientificKnowledgeExpressionHasRepresentation.has_representation_id == ScientificKnowledgeItem.id)

# One row per expression; 'items' lists the types of all of its representations.
l = [{'citation': e.content,
      'items': '|'.join([i.type for i in e.has_representation])}
     for e in q.all()]
df = pd.DataFrame(l)
df

Unnamed: 0,citation,items
0,(2023) Streamlined structure determination by...,CitationRecord|FullTextPaper
1,(2023) Visualizing the membrane disruption ac...,CitationRecord|FullTextPaper
2,(2023) Preparing <i>Arabidopsis thaliana</i> ...,CitationRecord|FullTextPaper
3,(2023) Cryo-electron tomography to study vira...,CitationRecord|FullTextPaper
4,(2023) <i>In situ</i>cryo-electron tomography...,CitationRecord
...,...,...
2453,(2019) Orthobunyavirus spike architecture and...,CitationRecord
2454,(2014) Membrane deformation and scission by t...,CitationRecord
2455,(2016) A live RSV vaccine with engineered the...,CitationRecord
2456,(2015) Electron Tomography: A Three-Dimension...,CitationRecord


In [None]:
# Rows for which a non-empty methods text was found.
# NOTE(review): this needs the `df` with a 'has_full_text_methods' column; another cell rebinds
# `df` to a frame with only 'citation'/'items', so the result depends on execution order.
df[df.has_full_text_methods]

In [None]:
# Shape of the subset selected by each availability flag, followed by the shape of the whole table.
for flag in ('has_pdf', 'has_full_text', 'has_full_text_methods'):
    print(df[df[flag] == True].shape)
print(df.shape)


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Publication years after 2000; rows without a publication date are skipped
# rather than failing on `.year`.
years = [d.year for d in df['pub_date'] if pd.notna(d) and d.year > 2000]

# Histogram with one bin per calendar year. The edges must run to max+1 (hence `+ 2`
# for the exclusive arange stop); otherwise the last two years share the final bin.
plt.hist(years, bins=np.arange(min(years), max(years) + 2, 1))


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Publication years (after 2000) of: papers with full text or a PDF, all papers,
# and papers with full text. Rows without a publication date are skipped.
dated = [row for row in df.itertuples() if pd.notna(row.pub_date) and row.pub_date.year > 2000]
year1 = [row.pub_date.year for row in dated if (row.has_full_text or row.has_pdf)]
year2 = [row.pub_date.year for row in dated]
years = [row.pub_date.year for row in dated if row.has_full_text == True]

# One bin per calendar year, shared by all series so that the bars line up.
# Edges run to max+1 (hence `+ 2`); otherwise the last two years share the final bin.
bins = np.arange(min(year2), max(year2) + 2, 1)

# Previously `year1`/`year2` were computed but a stale `years` was plotted instead.
plt.hist(year2, bins=bins, label='all papers')
plt.hist(year1, bins=bins, label='full text or PDF')
plt.hist(years, bins=bins, label='full text')
plt.legend()


In [None]:

# NOTE(review): this constructor call passes three arguments, whereas other cells call
# LocalLiteratureDb(<root dir>, <name>) — confirm which signature is current.
db = LocalLiteratureDb('em_literature', 'EuropePMC papers based on EM keywords', '/tmp/alhazen/')
if db.session is None:
    session_class = sessionmaker(bind=db.engine)
    db.session = session_class()

print([c.name for c in db.list_corpora()])

# Collect the concatenated 'methods' text of every publication in corpus '1'
# that has a local '<doi>.nxml' file.
path = '/tmp/alhazen/nxml_files/'
methods = []
for p in db.list_corpus_publications('1'):
    nxml_path = path+p.doi+'.nxml'
    if not os.path.exists(nxml_path):
        continue
    with open(nxml_path, 'r') as f:
        xml = f.read()
    d = NxmlDoc(p.doi, xml)
    m = '\n'.join([d.read_section_text(sec) for sec in d.search_section_titles('methods')])
    if len(m) > 0:
        methods.append(m)

# Number of whitespace characters in each methods text (total length minus the length with
# all whitespace removed). Raw string: '\s' is not a valid escape in a normal string literal.
lengths = [(len(m) - len(re.sub(r'\s+','',m))) for m in methods]
lengths

In [None]:
# Number of publications for which a non-empty methods text was collected.
len(lengths)


In [None]:
import base64
from IPython.display import Image, display
import matplotlib.pyplot as plt

def mm(graph):
  """Render a Mermaid diagram in the notebook via the mermaid.ink image service.

  Args:
    graph: Mermaid diagram source text.

  The source is UTF-8 encoded (so non-ASCII labels work) and then encoded with the
  URL-safe base64 alphabet, because the result is placed in the URL path where the
  '+' and '/' of standard base64 would be misinterpreted.
  """
  graphbytes = graph.encode("utf8")
  base64_bytes = base64.urlsafe_b64encode(graphbytes)
  base64_string = base64_bytes.decode("ascii")
  display(
    Image(
      url="https://mermaid.ink/img/"
      + base64_string
    )
  )



In [None]:
# Open a session on the SQLite file /tmp/alhazen/sciknow.db and insert one test preprint.
engine = create_engine("sqlite:////tmp/alhazen/sciknow.db")
session_class = sessionmaker(bind=engine)
session = session_class()
# NOTE(review): `linkml_sqla` is not imported anywhere in the visible notebook (the imports
# bind `linkml` and `linkml_py`) — presumably a leftover from an older module layout; confirm.
p = linkml_sqla.ScientificPrimaryResearchPreprint(id=100, title='Test', abstract='Test abstract', iri='10.1234/1234', doi='10.1234/1234', publication_date=datetime.now())
session.add(p)
session.commit()

In [None]:
# Shell: delete the scratch SQLite file /tmp/alhazen/tmp.db.
!rm /tmp/alhazen/tmp.db

In [None]:
# Shell: list all rows of the Work table in /tmp/alhazen/tmp.db.
# NOTE(review): the preceding cell removes tmp.db and the SQLAlchemy engine above points at
# sciknow.db — confirm which file is meant to be inspected here.
!sqlite3 /tmp/alhazen/tmp.db "SELECT * FROM Work;" ".exit"

In [None]:
# Print the title of every Work except the test record with id 100.
# (The loop used to print `p.title` — the same stale object on every iteration —
# instead of the row being visited.)
for w in session.query(Work).where(Work.id!=100):
    print(w.title)
