# Building a chatbot Q/A over the Rare As One Cycle 1 & 2 literature.  

> Pragmatic tools for constructing databases of scientific works based on queries defined with Boolean Logic.

## Preliminaries

Here we set up libraries and methods to create and query the local Postgres database we will be using to store our information from the Alhazen tools and agent

In [1]:
from alhazen.core import OllamaRunner
from alhazen.schema_sqla import * 
from alhazen.tools.basic import *
from alhazen.tools.metadata_extraction_tool import *
from alhazen.toolkit import AlhazenToolkit
from alhazen.utils.jats_text_extractor import NxmlDoc
from alhazen.utils.ceifns_db import Ceifns_LiteratureDb, create_ceifns_database, drop_ceifns_database

from langchain.docstore.document import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.pgvector import PGVector

from bs4 import BeautifulSoup,Tag,Comment,NavigableString
from databricks import sql
from datetime import datetime, timedelta
from importlib_resources import files
import os
import pandas as pd
from pathlib import Path
import re
import requests
from sqlalchemy import create_engine, exists, func
from sqlalchemy.orm import sessionmaker, aliased

from time import time, sleep
from tqdm import tqdm
from urllib.request import urlopen
from urllib.parse import quote_plus, quote, unquote
from urllib.error import URLError, HTTPError
import yaml

In [2]:
# Using Aliases like this massively simplifies the use of SQLAlchemy
IR = aliased(InformationResource)

SKC = aliased(ScientificKnowledgeCollection)
SKC_PROV = aliased(ScientificKnowledgeCollectionProvenance)
SKC_HM = aliased(ScientificKnowledgeCollectionHasMembers)
SKE = aliased(ScientificKnowledgeExpression)
SKE_XREF = aliased(ScientificKnowledgeExpressionXref)
SKE_HR = aliased(ScientificKnowledgeExpressionHasRepresentation)
SKI = aliased(ScientificKnowledgeItem)
SKI_HP = aliased(ScientificKnowledgeItemHasPart)
SKF = aliased(ScientificKnowledgeFragment)

N = aliased(Note)
SKC_HN = aliased(ScientificKnowledgeCollectionHasNotes)
SKE_HN = aliased(ScientificKnowledgeExpressionHasNotes)
SKI_HN = aliased(ScientificKnowledgeItemHasNotes)
SKF_HN = aliased(ScientificKnowledgeFragmentHasNotes)

Remember to set environmental variables for this code:

* `ALHAZEN_DB_NAME` - the name of the Postgres database you are storing information into
* `LOCAL_FILE_PATH` - the location on disk where you save files for your digital library, downloaded models or other data.   

In [3]:
os.environ['ALHAZEN_DB_NAME'] = 'rare_as_one_diseases'

# Set this path to the location on disk that you'd like to store all Alhazen related files. This
os.environ['LOCAL_FILE_PATH'] = '/users/gully.burns/alhazen/'

if os.path.exists(os.environ['LOCAL_FILE_PATH']) is False:
    os.makedirs(os.environ['LOCAL_FILE_PATH'])

In [4]:
if os.environ.get('ALHAZEN_DB_NAME') is None: 
    raise Exception('Which database do you want to use for this application?')
db_name = os.environ['ALHAZEN_DB_NAME']

if os.environ.get('LOCAL_FILE_PATH') is None: 
    raise Exception('Where are you storing your local literature database?')
loc = os.environ['LOCAL_FILE_PATH']

Run this command to destroy your current database 

**USE WITH CAUTION**

In [13]:
drop_ceifns_database(os.environ['ALHAZEN_DB_NAME'])

Database has been dropped successfully !!


Run this command to create a new, empty database. 

In [14]:
create_ceifns_database(os.environ['ALHAZEN_DB_NAME'])

100%|██████████| 315/315 [00:00<00:00, 3446.51it/s]


This command lists all the tools the Alhazen agent system has access to

In [15]:
ldb = Ceifns_LiteratureDb(loc=loc, name=db_name)
ollr = OllamaRunner('mixtral')
llm  = ollr.llm

tk = AlhazenToolkit(db=ldb, ollr=ollr)
print('AVAILABLE TOOLS')
for t in tk.get_tools():
    print('\t'+type(t).__name__)

AVAILABLE TOOLS
	AddCollectionFromEPMCTool
	DescribeCollectionCompositionTool
	DeleteCollectionTool
	RetrieveFullTextTool
	MetadataExtractionTool


# Build paper collections

This section will build a literature collection across each of the diseases in the Rare As One Cohorts for cycle 1 and 2. 



In [16]:
os.environ['NCBI_API_KEY'] = 'd086451c882fabace54d7b049b6fb8481908'

What diseases are we querying the literature for?

In [18]:
import local_resources.data_files.rao_grantees as rao_files
from alhazen.utils.queryTranslator import QueryTranslator, QueryType

cols_to_include = ['ID', 'CORPUS_NAME', 'TERMS']
df = pd.read_csv(files(rao_files).joinpath('CZI_RAO_diseases.tsv'), sep='\t')
df = df.drop(columns=[c for c in df.columns if c not in cols_to_include])

df

Unnamed: 0,ID,CORPUS_NAME,TERMS
0,1,Adult Polyglucosan Body Disease,adult polyglucosan body disease | adult polygl...
1,2,Creatine transporter deficiency,creatine transporter deficiency | guanidinoace...
2,3,AGAT deficiency,"GATM deficiency | ""AGAT deficiency"" | ""arginin..."
3,4,Guanidinoacetate methyltransferase deficiency,guanidinoacetate methyltransferase deficiency ...
4,5,CLOVES Syndrome,CLOVES syndrome | (congenital lipomatous overg...
...,...,...,...
76,78,TBCK Syndrome,TBCK Syndrome | TBCK Encephalopathy | TBCK-ass...
77,79,Dyskeratosis congenita,dyskeratosis congenita | Zinsser-Engman-Cole s...
78,80,Telomere syndrome,telomere syndrome | short telomere syndrome
79,81,The Stiff Person Syndrome,stiff man syndrome | stiff person syndrome | M...


This command iterates over the list of different collections and runs a query for each one on the European website by processing the `TERMS` column from the  dataframe with the `QueryTranslator` utility. This generates a search query in boolean logic that searches the `TITLE_ABS` field in the remote database (See https://www.ebi.ac.uk/europepmc/webservices/rest/fields for possible fields to search).

In [19]:
qt = QueryTranslator(df.sort_values('ID'), 'ID', 'TERMS', 'CORPUS_NAME')
(corpus_ids, epmc_queries) = qt.generate_queries(QueryType.epmc, sections=['TITLE_ABS'])
corpus_names = df['CORPUS_NAME']

addEMPCCollection_tool = [t for t in tk.get_tools() if isinstance(t, AddCollectionFromEPMCTool)][0]
for (id, name, query) in zip(corpus_ids, corpus_names, epmc_queries):
    #if id != 81:
    #    continue
    addEMPCCollection_tool.run(tool_input={'id': id, 'name':name, 'query':query, 'full_text':False})

100%|██████████| 81/81 [00:00<00:00, 2948.68it/s]
100%|██████████| 81/81 [00:00<00:00, 16096.78it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"adult polyglucosan body disease") OR (TITLE_ABS:"adult polyglucosan body neuropathy")), 104 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:03<00:00,  3.12s/it]


 Returning 92


100%|██████████| 92/92 [00:00<00:00, 508.47it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"creatine transporter deficiency") OR (TITLE_ABS:"guanidinoacetate methyltransferase deficiency") OR (TITLE_ABS:"AGAT deficiency") OR (TITLE_ABS:"cerebral creatine deficiency syndrome 1") OR (TITLE_ABS:"X-linked creatine deficiency syndrome") OR (TITLE_ABS:"Cerebral Creatine Deficiency Syndromes") OR (TITLE_ABS:"creatine transporter defect") OR (TITLE_ABS:"SLC6A8 deficiency") OR (TITLE_ABS:"X-linked creatine transporter deficiency") OR (TITLE_ABS:"X-linked creatine deficiency") OR (TITLE_ABS:"guanidinoacetate N-methyltransferase activity disease") OR (TITLE_ABS:"GAMT deficiency") OR (TITLE_ABS:"glycine amidinotransferase activity disease") OR (TITLE_ABS:"arginine:glycine amidinotransferase deficiency") OR (TITLE_ABS:"GATM deficiency")), 319 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:05<00:00,  5.04s/it]


 Returning 300


100%|██████████| 300/300 [00:00<00:00, 444.91it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"GATM deficiency") OR (TITLE_ABS:"AGAT deficiency") OR (TITLE_ABS:"arginine:glycine amidinotransferase deficiency") OR (TITLE_ABS:"L-arginine:glycine amidinotransferase deficiency")), 38 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:02<00:00,  2.51s/it]


 Returning 34


100%|██████████| 34/34 [00:00<00:00, 315.09it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"guanidinoacetate methyltransferase deficiency") OR (TITLE_ABS:"GAMT deficiency")), 141 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:07<00:00,  7.39s/it]


 Returning 129


100%|██████████| 129/129 [00:00<00:00, 499.98it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"CLOVES syndrome") OR (TITLE_ABS:"CLOVE syndrome") OR ((TITLE_ABS:"congenital lipomatous overgrowth") AND (TITLE_ABS:"vascular malformation epidermal") AND (TITLE_ABS:"nevi-spinal") AND (TITLE_ABS:"syndrome")) OR ((TITLE_ABS:"congenital lipomatous overgrowth") AND (TITLE_ABS:"vascular malformations") AND (TITLE_ABS:"Epidermal nevi") AND (TITLE_ABS:"abnormalities") AND ((TITLE_ABS:"skeletal") OR (TITLE_ABS:"spinal"))) OR ((TITLE_ABS:"congenital lipomatous overgrowth") AND (TITLE_ABS:"vascular malformation") AND (TITLE_ABS:"epidermal nevi"))), 95 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:04<00:00,  4.61s/it]


 Returning 94


100%|██████████| 94/94 [00:00<00:00, 473.15it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"Fibroadipose overgrowth") OR (TITLE_ABS:"Fibroadipose hyperplasia")), 17 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:01<00:00,  1.76s/it]


 Returning 17


100%|██████████| 17/17 [00:00<00:00, 352.81it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=(TITLE_ABS:"Hemihyperplasia Multiple Lipomatosis"), 12 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:01<00:00,  1.51s/it]


 Returning 10


100%|██████████| 10/10 [00:00<00:00, 446.61it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=(TITLE_ABS:"Macrodactyly"), 419 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:06<00:00,  6.81s/it]


 Returning 331


100%|██████████| 331/331 [00:00<00:00, 547.38it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=(TITLE_ABS:"Megalencephaly-Capillary Malformation"), 59 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:02<00:00,  2.73s/it]


 Returning 57


100%|██████████| 57/57 [00:00<00:00, 382.73it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"Congenital Hyperinsulinism") OR (TITLE_ABS:"familial hyperinsulinism") OR (TITLE_ABS:"Neonatal hyperinsulinism") OR (TITLE_ABS:"hyperinsulinemic hypoglycemia") OR (TITLE_ABS:"Nesidioblastosis") OR (TITLE_ABS:"hyperinsulinemia of infancy") OR (TITLE_ABS:"hereditary hyperinsulinism") OR (TITLE_ABS:"congenital hyperinsulinism") OR (TITLE_ABS:"familial hyperinsulinemic hypoglycemia") OR (TITLE_ABS:"Hyperinsulinism Hyperammonemia Syndrome") OR (TITLE_ABS:"Glucokinase Hyperinsulinism")), 2149 European PMC PAPERS FOUND


100%|██████████| 3/3 [00:23<00:00,  7.90s/it]


 Returning 1805


100%|██████████| 1805/1805 [00:03<00:00, 478.28it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"Chronic Recurrent Multifocal Osteomyelitis") OR (TITLE_ABS:"chronic non-bacterial osteomyelitis") OR (TITLE_ABS:"chronic recurrent multifocal osteomyelitis") OR (TITLE_ABS:"chronic multifocal osteomyelitis") OR (TITLE_ABS:"chronic nonbacterial osteomyelitis")), 787 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:08<00:00,  8.28s/it]


 Returning 662


100%|██████████| 662/662 [00:01<00:00, 543.92it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"congenital muscular dystrophy") OR (TITLE_ABS:"congenital MD") OR (TITLE_ABS:"Bethlem myopathy") OR (TITLE_ABS:"scleroatonic muscular dystrophy") OR (TITLE_ABS:"Ullrich scleroatonic muscular dystrophy") OR (TITLE_ABS:"scleroatonic Ullrich disease") OR (TITLE_ABS:"Rigid spine syndrome") OR (TITLE_ABS:"Walker-Warburg Syndrome")), 2198 European PMC PAPERS FOUND


100%|██████████| 3/3 [00:24<00:00,  8.19s/it]


 Returning 1848


100%|██████████| 1848/1848 [00:04<00:00, 460.67it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"Muscle-eye-brain disease") OR (TITLE_ABS:"Santavuori congenital muscular dystrophy")), 193 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:07<00:00,  7.10s/it]


 Returning 173


100%|██████████| 173/173 [00:00<00:00, 524.28it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"Leyden-Mobius") AND (TITLE_ABS:"muscular dystrophy")), 4 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:00<00:00,  1.28it/s]


 Returning 1


100%|██████████| 1/1 [00:00<00:00, 330.16it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"hereditary hemorrhagic telangiectasia") OR (TITLE_ABS:"Osler-Weber-Rendu disease") OR (TITLE_ABS:"Rendu-Osler-Weber disease") OR (TITLE_ABS:"Osler hemorrhagic telangiectasia syndrome") OR (TITLE_ABS:"Rendu-Osler disease") OR ((TITLE_ABS:"telangiectasia") AND (TITLE_ABS:"hereditary hemorrhagic"))), 2918 European PMC PAPERS FOUND


100%|██████████| 3/3 [00:28<00:00,  9.52s/it]


 Returning 2238


100%|██████████| 2238/2238 [00:05<00:00, 434.57it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=(TITLE_ABS:"Landau-Kleffner syndrome"), 445 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:05<00:00,  5.01s/it]


 Returning 336


100%|██████████| 336/336 [00:00<00:00, 544.75it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"GRIN1 Mutation") OR (TITLE_ABS:"acquired epileptiform aphasia") OR (TITLE_ABS:"acquired epileptic aphasia") OR (TITLE_ABS:"Rolandic epilepsy") OR (TITLE_ABS:"acquired aphasia with epilepsy")), 591 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:05<00:00,  5.68s/it]


 Returning 465


100%|██████████| 465/465 [00:00<00:00, 523.60it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=(((TITLE_ABS:"epileptic encephalopathy") AND (TITLE_ABS:"GRIN2B")) OR ((TITLE_ABS:"epileptic encephalopathy") AND (TITLE_ABS:"early infantile") AND (TITLE_ABS:"27"))), 14 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:01<00:00,  1.54s/it]


 Returning 13


100%|██████████| 13/13 [00:00<00:00, 348.57it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"GRIN2B") AND (TITLE_ABS:"autosomal dominant") AND ((TITLE_ABS:"intellectual disability") OR (TITLE_ABS:"mental retardation"))), 2 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:00<00:00,  1.28it/s]


 Returning 1


100%|██████████| 1/1 [00:00<00:00, 321.25it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"GRIN2A") AND ((TITLE_ABS:"intellectual disability") OR (TITLE_ABS:"Early-onset epileptic encephalopathy")) AND ((TITLE_ABS:"mutation") OR (TITLE_ABS:"mutate"))), 10 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:01<00:00,  1.43s/it]


 Returning 10


100%|██████████| 10/10 [00:00<00:00, 312.80it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=(((TITLE_ABS:"epileptic encephalopathy") AND (TITLE_ABS:"GRIN2D")) OR ((TITLE_ABS:"epileptic encephalopathy") AND (TITLE_ABS:"early infantile") AND (TITLE_ABS:"46"))), 12 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:01<00:00,  1.50s/it]


 Returning 11


100%|██████████| 11/11 [00:00<00:00, 368.39it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=(((TITLE_ABS:"GRIA1") OR (TITLE_ABS:"GRIA2") OR (TITLE_ABS:"GRIA3") OR (TITLE_ABS:"GRIA4")) AND ((TITLE_ABS:"disorders") OR (TITLE_ABS:"patient") OR (TITLE_ABS:"disease"))), 189 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:04<00:00,  4.38s/it]


 Returning 182


100%|██████████| 182/182 [00:00<00:00, 518.95it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=(((TITLE_ABS:"GRID1") OR (TITLE_ABS:"GRID2")) AND ((TITLE_ABS:"disorders") OR (TITLE_ABS:"patient") OR (TITLE_ABS:"disease"))), 55 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:02<00:00,  2.75s/it]


 Returning 55


100%|██████████| 55/55 [00:00<00:00, 492.58it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=(((TITLE_ABS:"GRIK1") OR (TITLE_ABS:"GRIK2") OR (TITLE_ABS:"GRIK3") OR (TITLE_ABS:"GRIK4") OR (TITLE_ABS:"GRIK5")) AND ((TITLE_ABS:"disorders") OR (TITLE_ABS:"patient") OR (TITLE_ABS:"disease"))), 153 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:04<00:00,  4.03s/it]


 Returning 143


100%|██████████| 143/143 [00:00<00:00, 518.96it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=(((TITLE_ABS:"GRIN2B") OR (TITLE_ABS:"GRIN2A") OR (TITLE_ABS:"GRIN2D") OR (TITLE_ABS:"GRIN1") OR (TITLE_ABS:"GRIN2C") OR (TITLE_ABS:"GRIN3A") OR (TITLE_ABS:"GRIN3B")) AND ((TITLE_ABS:"disorders") OR (TITLE_ABS:"patient") OR (TITLE_ABS:"disease"))), 585 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:07<00:00,  7.28s/it]


 Returning 561


100%|██████████| 561/561 [00:01<00:00, 525.84it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"adenosine monophosphate deaminase deficiency") OR (TITLE_ABS:"AMP deaminase deficiency") OR (TITLE_ABS:"myoadenylate deaminase deficiency")), 142 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:03<00:00,  3.21s/it]


 Returning 120


100%|██████████| 120/120 [00:00<00:00, 479.11it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"DDX3X-related intellectual disability") OR (TITLE_ABS:"DDX3X non-syndromic X-linked intellectual disability") OR ((TITLE_ABS:"DDX3X") AND (TITLE_ABS:"intellectual disability"))), 43 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:02<00:00,  2.73s/it]


 Returning 42


100%|██████████| 42/42 [00:00<00:00, 394.60it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"fibrolamellar hepatocellular carcinoma") OR (TITLE_ABS:"oncocytic hepatocellular tumor") OR (TITLE_ABS:"liver cell fibrolamellar carcinoma") OR (TITLE_ABS:"polygonal cell type hepatocellular carcinoma with fibrous Stroma") OR (TITLE_ABS:"fibrolamellar cancer") OR (TITLE_ABS:"hepatocellular carcinoma with increased stromal fibrosis") OR (TITLE_ABS:"hepatocellular fibrolamellar carcinoma") OR (TITLE_ABS:"fibrolamellar carcinoma") OR (TITLE_ABS:"fibrolamellar variant of hepatocellular carcinoma") OR (TITLE_ABS:"eosinophilic hepatocellular carcinoma with lamellar fibrosis") OR (TITLE_ABS:"polygonal cell hepatocellular carcinoma with fibrous stroma") OR (TITLE_ABS:"FL-HCC") OR (TITLE_ABS:"fibrolamellar oncocytic hepatoma") OR (TITLE_ABS:"eosinophilic glassy cell hepatoma") OR (TITLE_ABS:"fibrolamellar hepatocarcinoma") OR (TITLE_ABS:"fibrolamellar carcinoma of li

100%|██████████| 1/1 [00:07<00:00,  7.37s/it]


 Returning 631


100%|██████████| 631/631 [00:01<00:00, 533.13it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"GLUT1 deficiency syndrome") OR (TITLE_ABS:"GLUT1 deficient") OR (TITLE_ABS:"De Vivo disease")), 202 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:04<00:00,  4.29s/it]


 Returning 193


100%|██████████| 193/193 [00:00<00:00, 508.42it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"Hermansky-Pudlak syndrome") OR (TITLE_ABS:"Kotzot-Richter syndrome")), 742 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:07<00:00,  7.09s/it]


 Returning 641


100%|██████████| 641/641 [00:01<00:00, 531.43it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"infantile neuroaxonal dystrophy") OR (TITLE_ABS:"Seitelberger disease") OR (TITLE_ABS:"phospholipase A2-associated neurodegeneration") OR (TITLE_ABS:"KARAK syndrome")), 272 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:04<00:00,  4.51s/it]


 Returning 243


100%|██████████| 243/243 [00:00<00:00, 523.90it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"autosomal dominant mental retardation 32") OR (TITLE_ABS:"Arboleda-Tham") OR ((TITLE_ABS:"KAT6A") AND ((TITLE_ABS:"syndrome") OR (TITLE_ABS:"mental retardation")))), 46 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:02<00:00,  2.56s/it]


 Returning 46


100%|██████████| 46/46 [00:00<00:00, 438.25it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"KIF1A-related disorder") OR (TITLE_ABS:"KIF1A hereditary spastic paraplegia") OR (TITLE_ABS:"hereditary spastic paraplegia % KIF1A") OR (TITLE_ABS:"KIF1A disorder") OR (TITLE_ABS:"KIF1A Missense Variant") OR (TITLE_ABS:"KIF1A gene Missense Variant") OR (TITLE_ABS:"Spastic paraplegia 30") OR (TITLE_ABS:"NESCAV syndrome")), 8 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:01<00:00,  1.29s/it]


 Returning 8


100%|██████████| 8/8 [00:00<00:00, 237.93it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"HSN2C") OR ((TITLE_ABS:"hereditary sensory neuropathy") AND ((TITLE_ABS:"2c") OR (TITLE_ABS:"IIC"))) OR ((TITLE_ABS:"hereditary sensory and autonomic neuropathy") AND ((TITLE_ABS:"type 2") OR (TITLE_ABS:"II")))), 75 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:02<00:00,  2.81s/it]


 Returning 68


100%|██████████| 68/68 [00:00<00:00, 248.14it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"KIF1A hereditary spastic paraplegia") OR (TITLE_ABS:"SPG30") OR (TITLE_ABS:"spastic paraplegia 30")), 21 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:01<00:00,  1.68s/it]


 Returning 21


100%|██████████| 21/21 [00:00<00:00, 299.42it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"Lennox-Gastaut syndrome") OR (TITLE_ABS:"Lennox syndrome")), 1638 European PMC PAPERS FOUND


100%|██████████| 2/2 [00:14<00:00,  7.46s/it]


 Returning 1351


100%|██████████| 1351/1351 [00:02<00:00, 500.28it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"Li-Fraumeni syndrome") OR (TITLE_ABS:"SBLA syndrome") OR (TITLE_ABS:"Li Fraumeni syndrome") OR (TITLE_ABS:"Li-Fraumeni familial cancer susceptibility syndrome") OR (TITLE_ABS:"sarcoma family syndrome of Li and Fraumeni") OR ((TITLE_ABS:"syndrome") AND (TITLE_ABS:"sarcoma") AND (TITLE_ABS:"breast") AND (TITLE_ABS:"leukaemia") AND (TITLE_ABS:"adrenal gland"))), 1383 European PMC PAPERS FOUND


100%|██████████| 2/2 [00:14<00:00,  7.01s/it]


 Returning 1190


100%|██████████| 1190/1190 [00:02<00:00, 515.25it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"Gorham-Stout disease") OR (TITLE_ABS:"kaposiform lymphangiomatosis") OR (TITLE_ABS:"osteolysis massive") OR (TITLE_ABS:"diffuse cystic angiomatosis of bone") OR (TITLE_ABS:"massive osteolysis") OR (TITLE_ABS:"Gorham disease") OR (TITLE_ABS:"progressive massive osteolysis") OR (TITLE_ABS:"idiopathic massive osteolysis") OR (TITLE_ABS:"vanishing bone disease") OR (TITLE_ABS:"Gorham syndrome") OR (TITLE_ABS:"disseminated lymphangiomatosis") OR (TITLE_ABS:"disseminated lymphangioma") OR (TITLE_ABS:"generalized lymphatic anomaly") OR (TITLE_ABS:"diffuse lymphangioma") OR (TITLE_ABS:"disseminated lymphatic malformation") OR (TITLE_ABS:"diffuse lymphangiomatosis") OR (TITLE_ABS:"Central Conducting Lymphatic Anomalies") OR (TITLE_ABS:"Central Conducting Lymphatic Anomaly")), 750 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:06<00:00,  6.48s/it]


 Returning 622


100%|██████████| 622/622 [00:01<00:00, 532.89it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"generalized lymphatic anomaly") OR (TITLE_ABS:"disseminated lymphangiomatosis") OR (TITLE_ABS:"diffuse lymphangiomatosis") OR (TITLE_ABS:"diffuse lymphangioma")), 130 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:03<00:00,  3.41s/it]


 Returning 119


100%|██████████| 119/119 [00:00<00:00, 506.97it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=(TITLE_ABS:"Kaposiform lymphangiomatosis"), 56 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:02<00:00,  2.47s/it]


 Returning 54


100%|██████████| 54/54 [00:00<00:00, 375.59it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=(TITLE_ABS:"necrotizing enterocolitis"), 8966 European PMC PAPERS FOUND


100%|██████████| 9/9 [01:24<00:00,  9.38s/it]


 Returning 7909


100%|██████████| 7909/7909 [00:48<00:00, 163.77it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"trisomy 8p") OR (TITLE_ABS:"partial deletion of the short arm of chromosome 8") OR (TITLE_ABS:"recombinant 8 syndrome") OR (TITLE_ABS:"Duplication 8p") OR (TITLE_ABS:"trisomy type 8p") OR (TITLE_ABS:"San Luis Valley syndrome") OR (TITLE_ABS:"Recombinant chromosome 8 syndrome") OR (TITLE_ABS:"San Luis Valley recombinant chromosome 8 syndrome") OR (TITLE_ABS:"Rec8 syndrome") OR ((TITLE_ABS:"syndrome") AND (TITLE_ABS:"8p") AND ((TITLE_ABS:"inverted") OR (TITLE_ABS:"duplication") OR (TITLE_ABS:"deletion")))), 133 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:03<00:00,  3.33s/it]


 Returning 109


100%|██████████| 109/109 [00:00<00:00, 532.53it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"8p inverted duplication deletion syndrome") OR ((TITLE_ABS:"8p") AND (TITLE_ABS:"Invdupdel"))), 13 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:01<00:00,  1.47s/it]


 Returning 13


100%|██████████| 13/13 [00:00<00:00, 330.52it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"partial deletion of chromosome 8p") OR (TITLE_ABS:"partial monosomy of chromosome 8p") OR (TITLE_ABS:"partial deletion of the short arm of chromosome type 8") OR (TITLE_ABS:"8p monosomy")), 4 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:00<00:00,  1.01it/s]


 Returning 3


100%|██████████| 3/3 [00:00<00:00, 307.91it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=(TITLE_ABS:"schizophrenia 6"), 58 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:02<00:00,  2.41s/it]


 Returning 41


100%|██████████| 41/41 [00:00<00:00, 490.19it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"San Luis Valley syndrome") OR (TITLE_ABS:"Recombinant chromosome 8 syndrome") OR (TITLE_ABS:"Recombinant 8 syndrome")), 13 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:01<00:00,  1.11s/it]


 Returning 12


100%|██████████| 12/12 [00:00<00:00, 348.31it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"8p chromosome") AND ((TITLE_ABS:"deletion") OR (TITLE_ABS:"monosomy") OR (TITLE_ABS:"trisomy") OR (TITLE_ABS:"duplications") OR (TITLE_ABS:"translocations") OR (TITLE_ABS:"rings") OR (TITLE_ABS:"mosaicism") OR (TITLE_ABS:"isochromosome") OR (TITLE_ABS:"inversion"))), 10 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:01<00:00,  1.24s/it]


 Returning 10


100%|██████████| 10/10 [00:00<00:00, 318.29it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=(TITLE_ABS:"Primary sclerosing cholangitis"), 5902 European PMC PAPERS FOUND


100%|██████████| 6/6 [01:02<00:00, 10.38s/it]


 Returning 5075


100%|██████████| 5075/5075 [00:20<00:00, 249.29it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"systemic-onset juvenile idiopathic arthritis") OR (TITLE_ABS:"systemic juvenile idiopathic arthritis") OR (TITLE_ABS:"Stills disease")), 1184 European PMC PAPERS FOUND


100%|██████████| 2/2 [00:13<00:00,  6.57s/it]


 Returning 986


100%|██████████| 986/986 [00:02<00:00, 452.73it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"tango2") AND ((TITLE_ABS:"mutation") OR (TITLE_ABS:"disease") OR (TITLE_ABS:"deficiency") OR (TITLE_ABS:"disorder"))), 46 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:02<00:00,  2.59s/it]


 Returning 44


100%|██████████| 44/44 [00:00<00:00, 435.14it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"SLC13A5 deficiency") OR (TITLE_ABS:"SLC13A5 deficient") OR (TITLE_ABS:"SLC13A5 Deficiency") OR (TITLE_ABS:"EIEE25") OR (TITLE_ABS:"EIEE-25") OR (TITLE_ABS:"Kohlschutter-Tonz Syndrome")), 47 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:02<00:00,  2.30s/it]


 Returning 46


100%|██████████| 46/46 [00:00<00:00, 483.25it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"Pearson syndrome") OR (TITLE_ABS:"multiple mitochondrial DNA deletion syndrome") OR (TITLE_ABS:"sideroblastic Anemia with marrow cell vacuolization and exocrine pancreatic dysfunction") OR (TITLE_ABS:"Pearson marrow-pancreas syndrome") OR (TITLE_ABS:"Pearson's syndrome") OR (TITLE_ABS:"sideroblastic anemia with marrow cell vacuolization and exocrine pancreatic dysfunction") OR (TITLE_ABS:"multiple mtDNA deletion syndrome")), 216 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:04<00:00,  4.21s/it]


 Returning 190


100%|██████████| 190/190 [00:00<00:00, 527.27it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"multiple mitochondrial DNA deletion syndrome") OR (TITLE_ABS:"multiple mtDNA deletion syndrome")), 2 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:00<00:00,  1.20it/s]


 Returning 1


100%|██████████| 1/1 [00:00<00:00, 121.37it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"epithelioid hemangioendothelioma") OR (TITLE_ABS:"epithelioid angioendothelioma") OR (TITLE_ABS:"malignant epithelioid hemangioendothelioma") OR (TITLE_ABS:"epithelioid angiosarcoma")), 1731 European PMC PAPERS FOUND


100%|██████████| 2/2 [00:14<00:00,  7.38s/it]


 Returning 1491


100%|██████████| 1491/1491 [00:03<00:00, 449.90it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=(TITLE_ABS:"Snyder-Robinson syndrome"), 48 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:02<00:00,  2.14s/it]


 Returning 46


100%|██████████| 46/46 [00:00<00:00, 444.36it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"4H syndrome") OR (TITLE_ABS:"4H Leukodystrophy")), 60 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:02<00:00,  2.87s/it]


 Returning 56


100%|██████████| 56/56 [00:00<00:00, 482.58it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=(((TITLE_ABS:"Usher syndrome") AND (TITLE_ABS:"type 1F")) OR ((TITLE_ABS:"USHER syndrome") AND (TITLE_ABS:"type IF"))), 712 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:06<00:00,  6.84s/it]


 Returning 650


100%|██████████| 650/650 [00:01<00:00, 513.36it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"leukoencephalopathy with brain stem and spinal cord involvement") OR ((TITLE_ABS:"leukoencephalopathy") AND ((TITLE_ABS:"brain") OR (TITLE_ABS:"spinal cord")))), 3154 European PMC PAPERS FOUND


100%|██████████| 4/4 [00:33<00:00,  8.31s/it]


 Returning 2744


100%|██████████| 2744/2744 [00:07<00:00, 372.00it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"Nemaline Myopathy") OR (TITLE_ABS:"nemaline rod myopathy") OR (TITLE_ABS:"nemaline body disease") OR (TITLE_ABS:"rod myopathy")), 916 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:13<00:00, 13.36s/it]


 Returning 779


100%|██████████| 779/779 [00:01<00:00, 518.51it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"Cerebral cavernous malformation") OR (TITLE_ABS:"cavernous angiomatous malformations") OR (TITLE_ABS:"cerebral capillary malformations")), 539 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:07<00:00,  7.14s/it]


 Returning 506


100%|██████████| 506/506 [00:00<00:00, 523.31it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"CACNA1A") AND ((TITLE_ABS:"early") OR (TITLE_ABS:"infant")) AND ((TITLE_ABS:"epileptic") OR (TITLE_ABS:"epilepsy"))), 36 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:02<00:00,  2.28s/it]


 Returning 35


100%|██████████| 35/35 [00:00<00:00, 476.70it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=(TITLE_ABS:"Lafora"), 700 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:07<00:00,  7.13s/it]


 Returning 568


100%|██████████| 568/568 [00:01<00:00, 456.66it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"VCP myopathy") OR (TITLE_ABS:"VCP Disease")), 47 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:02<00:00,  2.79s/it]


 Returning 47


100%|██████████| 47/47 [00:00<00:00, 463.02it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"scn2a") AND ((TITLE_ABS:"epilepsy") OR (TITLE_ABS:"seizure"))), 382 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:11<00:00, 11.41s/it]


 Returning 354


100%|██████████| 354/354 [00:00<00:00, 519.15it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"scn2a") AND ((TITLE_ABS:"epilepsy") OR (TITLE_ABS:"seizure"))), 382 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:06<00:00,  6.75s/it]


 Returning 354


100%|██████████| 354/354 [00:00<00:00, 452.72it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=(TITLE_ABS:"sarcoidosis"), 27991 European PMC PAPERS FOUND


100%|██████████| 28/28 [03:52<00:00,  8.31s/it]


 Returning 18861


100%|██████████| 18861/18861 [04:47<00:00, 65.61it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=(TITLE_ABS:"Hereditary pancreatitis"), 583 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:06<00:00,  6.20s/it]


 Returning 457


100%|██████████| 457/457 [00:00<00:00, 522.64it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"Schuurs-Hoeijmakers syndrome") OR (TITLE_ABS:"PACS1 Syndrome")), 27 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:01<00:00,  1.75s/it]


 Returning 27


100%|██████████| 27/27 [00:00<00:00, 477.25it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"primary ciliary dyskinesia") OR (TITLE_ABS:"ciliary dyskinesia primary") OR (TITLE_ABS:"Dextrocardia-bronchiectasis-sinusitis syndrome") OR (TITLE_ABS:"kartageners syndrome") OR (TITLE_ABS:"Primary ciliary dyskinesia and situs inversus") OR (TITLE_ABS:"Siewert syndrome") OR (TITLE_ABS:"Kartagener syndrome") OR (TITLE_ABS:"immotile ciliary syndrome") OR (TITLE_ABS:"Kartagener's syndrome") OR (TITLE_ABS:"ciliary motility disorder") OR ((TITLE_ABS:"syndrome") AND (TITLE_ABS:"bronchiectasis") AND (TITLE_ABS:"chronic sinusitis") AND (TITLE_ABS:"dextrocardia"))), 2631 European PMC PAPERS FOUND


100%|██████████| 3/3 [00:25<00:00,  8.44s/it]


 Returning 2072


100%|██████████| 2072/2072 [00:04<00:00, 455.42it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=(TITLE_ABS:"Progressive Familial Intrahepatic Cholestasis"), 726 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:08<00:00,  8.28s/it]


 Returning 678


100%|██████████| 678/678 [00:01<00:00, 504.75it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"craniopharyngioma") OR (TITLE_ABS:"Rathke's pouch tumor") OR (TITLE_ABS:"cystoma") OR (TITLE_ABS:"Rathke pouch neoplasm")), 4308 European PMC PAPERS FOUND


100%|██████████| 5/5 [00:45<00:00,  9.06s/it]


 Returning 3222


100%|██████████| 3222/3222 [00:08<00:00, 362.72it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"Recurrent Respiratory Papillomatosis") OR (TITLE_ABS:"glottal papillomatosis") OR (TITLE_ABS:"tracheal papillomatosis")), 1011 European PMC PAPERS FOUND


100%|██████████| 2/2 [00:09<00:00,  4.97s/it]


 Returning 900


100%|██████████| 900/900 [00:01<00:00, 469.37it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"laryngeal papillomatosis") OR ((TITLE_ABS:"Recurrent Respiratory Papillomatosis") AND ((TITLE_ABS:"larynx") OR (TITLE_ABS:"laryngeal")))), 832 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:06<00:00,  6.81s/it]


 Returning 603


100%|██████████| 603/603 [00:01<00:00, 512.73it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"Shwachman Diamond") OR (TITLE_ABS:"Shwachman-Bodian-Diamond")), 573 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:06<00:00,  6.39s/it]


 Returning 533


100%|██████████| 533/533 [00:01<00:00, 318.02it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=(TITLE_ABS:"Smith-Kingsmore syndrome"), 10 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:01<00:00,  1.37s/it]


 Returning 10


100%|██████████| 10/10 [00:00<00:00, 312.45it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"Tatton Brown Rahman Syndrome") OR (TITLE_ABS:"DNMT3A Overgrowth Syndrome")), 44 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:02<00:00,  2.41s/it]


 Returning 43


100%|██████████| 43/43 [00:00<00:00, 366.52it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"TBCK Syndrome") OR (TITLE_ABS:"TBCK Encephalopathy") OR (TITLE_ABS:"TBCK-associated encephalopathy") OR (TITLE_ABS:"TBCK Encephaloneuropathy") OR (TITLE_ABS:"TBCK Mutation")), 8 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:01<00:00,  1.25s/it]


 Returning 8


100%|██████████| 8/8 [00:00<00:00, 381.37it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"dyskeratosis congenita") OR (TITLE_ABS:"Zinsser-Engman-Cole syndrome")), 1144 European PMC PAPERS FOUND


100%|██████████| 2/2 [00:12<00:00,  6.11s/it]


 Returning 1017


100%|██████████| 1017/1017 [00:02<00:00, 496.64it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"telomere syndrome") OR (TITLE_ABS:"short telomere syndrome")), 45 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:02<00:00,  2.59s/it]


 Returning 45


100%|██████████| 45/45 [00:00<00:00, 361.72it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"stiff man syndrome") OR (TITLE_ABS:"stiff person syndrome") OR (TITLE_ABS:"Moersch-Woltman syndrome")), 1132 European PMC PAPERS FOUND


100%|██████████| 2/2 [00:14<00:00,  7.00s/it]


 Returning 902


100%|██████████| 902/902 [00:01<00:00, 500.25it/s]


https://www.ebi.ac.uk/europepmc/webservices/rest/search?format=JSON&pageSize=1000&synonym=TRUE&resultType=core&query=((TITLE_ABS:"isolated pontocerebellar hypoplasia") OR (TITLE_ABS:"nonsyndromic pontocerebellar hypoplasia") OR (TITLE_ABS:"pontocerebellar hypoplasia") OR (TITLE_ABS:"pontoneocerebellar atrophy") OR (TITLE_ABS:"pontoneocerebllar hypoplasia")), 380 European PMC PAPERS FOUND


100%|██████████| 1/1 [00:05<00:00,  5.56s/it]


 Returning 366


100%|██████████| 366/366 [00:00<00:00, 487.91it/s]


Query the database for the numbers of papers returned

In [20]:
q = ldb.session.query(SKC.id, SKC.name, func.count(SKC_HM.has_members_id)) \
    .filter(SKC.id==SKC_HM.ScientificKnowledgeCollection_id) \
    .group_by(SKC.id, SKC.name) \
    .order_by(SKC.id.cast(Integer))
corpora_df = pd.DataFrame(q.all(), columns=['Corpus ID', 'Corpus Name', 'Paper Count'])

paper_count = ldb.session.query(func.count(SKE.id)).first()
print('Count of all papers in database: %d'%(paper_count[0]))

corpora_df

Count of all papers in database: 64861


Unnamed: 0,Corpus ID,Corpus Name,Paper Count
0,1,Adult Polyglucosan Body Disease,92
1,2,Creatine transporter deficiency,300
2,3,AGAT deficiency,34
3,4,Guanidinoacetate methyltransferase deficiency,129
4,5,CLOVES Syndrome,94
...,...,...,...
76,78,TBCK Syndrome,8
77,79,Dyskeratosis congenita,1015
78,80,Telomere syndrome,44
79,81,The Stiff Person Syndrome,901


In [10]:
ldb.session.rollback()

In [24]:
ft_retriever  = [t for t in tk.get_tools() if isinstance(t, RetrieveFullTextTool)][0]

for i, c in corpora_df.iterrows():
    if c['Corpus ID'] != '81':
        continue
    print(c['Corpus Name'])
    ft_count = 0
    no_ft_count = 0
    doi_list = [e.id for e in ldb.list_expressions(collection_id=c['Corpus ID'])]
    for doi in doi_list:
        d2 = doi.replace('doi:', '')
        path = loc+db_name+'/ft/'
        nxml_file_path = path+'/'+d2+'.nxml'
        pdf_file_path = path+'/'+d2+'.pdf'
        html_file_path = path+'/'+d2+'.html'
        if os.path.exists(nxml_file_path) or  \
                os.path.exists(pdf_file_path) or \
                os.path.exists(html_file_path):
            ft_count += 1
        try: 
            no_ft_count += 1
            #print('\t'+doi)
            ft_retriever.run(tool_input={'paper_id': doi})
        except Exception as e:
            print(e)
    print(ft_count)
    print(no_ft_count)

The Stiff Person Syndrome
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?api_key=d086451c882fabace54d7b049b6fb8481908&db=pmc&term=013163/aim.0016[doi]&retmode=xml
No paper found with that DOI
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?api_key=d086451c882fabace54d7b049b6fb8481908&db=pmc&term=013163/aim.0016[doi]&retmode=xml
No paper found with that DOI
min() arg is an empty sequence
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?api_key=d086451c882fabace54d7b049b6fb8481908&db=pmc&term=10.1001/archneur.1960.00450040098012[doi]&retmode=xml
No paper found with that DOI
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?api_key=d086451c882fabace54d7b049b6fb8481908&db=pmc&term=10.1001/archneur.1960.00450040098012[doi]&retmode=xml
No paper found with that DOI
min() arg is an empty sequence
https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?api_key=d086451c882fabace54d7b049b6fb8481908&db=pmc&term=10.1001/archneur.1971.00480310050004[doi]&

KeyboardInterrupt: 

In [28]:
q = ldb.session.query(SKE.id, SKI.id, SKI.type, SKF.id, SKF.type, SKF.offset, SKF.content) \
    .filter(SKC.id==SKC_HM.ScientificKnowledgeCollection_id) \
    .filter(SKC_HM.has_members_id==SKE.id) \
    .filter(SKE.id==SKE_HR.ScientificKnowledgeExpression_id) \
    .filter(SKE_HR.has_representation_id==SKI.id) \
    .filter(SKI.id==SKI_HP.ScientificKnowledgeItem_id) \
    .filter(SKI_HP.has_part_id==SKF.id) \
    .filter(SKE_HR.has_representation_id==SKI.id) \
    .filter(SKF.type=='section') \
    .filter(SKI.type.like('%FullText')) \
    .order_by(SKE.id, SKF.offset)
items_df = pd.DataFrame(q.all(), columns=['doi', 'item_id', 'item_type', 'fragment_id', 'fragment_type', 'offset', 'content'])

items_df

Unnamed: 0,doi,item_id,item_type,fragment_id,fragment_type,offset,content
0,doi:10.1001/archinte.1994.00420110133015,7e6a350d2d,PDFFullText,7e6a350d2d.0,section,0,•\n
1,doi:10.1001/archinte.1994.00420110133015,7e6a350d2d,PDFFullText,7e6a350d2d.1,section,2,Triple Threat Broad Eligibility\n
2,doi:10.1001/archinte.1994.00420110133015,7e6a350d2d,PDFFullText,7e6a350d2d.2,section,36,Concerns\n
3,doi:10.1002/ccr3.2538,36af029190,JATSFullText,36af029190.0,section,2727,1\nINTRODUCTION\nAutoantibodies to glutamic ac...
4,doi:10.1002/ccr3.2538,36af029190,JATSFullText,36af029190.1,section,3532,2\nCASE REPORT\nA 61-year-old right-handed wom...
...,...,...,...,...,...,...,...
682,doi:10.3389/fnmol.2018.00291,9e6ac0056e,JATSFullText,9e6ac0056e.19,section,81666,Mouse Models With Defective Trafficking of Gly...
683,doi:10.3389/fnmol.2018.00291,9e6ac0056e,JATSFullText,9e6ac0056e.20,section,94537,Outlook\nThis review summarizes the traffickin...
684,doi:10.3389/fnmol.2018.00291,9e6ac0056e,JATSFullText,9e6ac0056e.21,section,97042,Ethics Statement\nExperiments were approved by...
685,doi:10.3389/fnmol.2018.00291,9e6ac0056e,JATSFullText,9e6ac0056e.22,section,97315,"Author Contributions\nNS, VR, and CV performed..."


# Index the abstracts and run some simple semantic queries

Here we index each paper's title and abstract to build a simple question / answer interface.

In [15]:
ldb.session.rollback()

In [32]:
for i, c in tqdm(corpora_df.iterrows()):
    if c['Corpus ID'] != '81':
        continue
    expressions = ldb.list_expressions(collection_id=c['Corpus ID'])    
    ldb.embed_expression_list(expressions)

81it [01:27,  1.07s/it]


In [21]:
question = 'What is known about genetics underlying Stiff Person Syndrome?'

ldb.query_vectorindex(question, k=10, collection_name='ScienceKnowledgeItem_FullText')

[(Document(page_content='Introduction\nStiff-person syndrome (SPS) is an uncommon disorder characterized by progressive stiffness, rigidity, and painful spasm affecting axial muscle. It can lead to significant debilitation and affects ambulation. Usually, it is associated with autoimmunity as it has a significant overlap with autoantibody in type 1 diabetes. Hypopituitarism, especially hypocortisolism, can lead to axial muscle stiffness and rigidity, similar to an SPS. This case highlights a patient with pituitary adenoma and panhypopituitarism with a stiff person-like syndrome as the initial presentation.', metadata={'c_ids': '81', 'e_id': 'doi:10.1159/000522253', 'e_type': 'ClinicalCaseReport', 'i_id': 'a427205ffe', 'i_type': 'JATSFullText', 'f_id': 'a427205ffe.0', 'citation': 'Goh KG, Yusof Khan AHK, Nasruddin A. (2022) Stiff Person-Like Syndrome: An Unusual Presentation of Pituitary Macroadenoma with Panhypopituitarism.'}),
  0.11609077453613281),
 (Document(page_content='1\nINTROD

## ATTEMPTING TO RECONSTRUCT PAPER-QA PIPELINE IN OUR SYSTEM.

1. Embed paper sections + question
2. Given the question, summarize the retrieved paper sections relative to the question
3. Score and select relevant passages
4. Put summaries into prompt
5. Generate answer with prompt


In [30]:
 
os.environ['PGVECTOR_CONNECTION_STRING'] = "postgresql+psycopg2:///"+ldb.name
vectorstore = PGVector.from_existing_index(
        embedding = ldb.embed_model, 
        collection_name = 'ScienceKnowledgeItem') 
retriever = vectorstore.as_retriever(search_kwargs={'k':15, 'filter': {'skc_ids': 81}})
#retriever = vectorstore.as_retriever()


In [31]:
retriever.invoke(question)

[Document(page_content='[Stiff-person syndrome: a clinical observation].\n\nStiff-person syndrome (SPS) is a rare chronic neurological disease characterized by progressing muscle rigidity and painful muscle spasms. The signs of SPS are pain and stiffness in spinal, abdominal and cervical muscles, increased muscle tonus in extensor muscles of extremities, constant stiffness of paravertebral and abdominal muscles and muscle spasms. A clinical case of a SPS patient T., aged 23 years, is presented. The peculiarity of this case is additional left-sided peripheral upper extremity monoparesis, which is most likely associated with the development of left-sided compression-ischemic brachial plexopathy resulted from profound muscular tonic syndrome in the neck and shoulder girdles.', metadata={'skc_ids': '81', 'ske_id': 'doi:10.17116/jnevro201911906196', 'ski_id': 'fb81803b8f', 'skf_id': 'fb81803b8f.0', 'type': 'CitationRecord', 'citation': 'Isaeva NV, Prokopenko SV, Rodikov MV, Abroskina MV, On

In [12]:
from langchain.schema import format_document
from langchain_core.messages import AIMessage, HumanMessage, get_buffer_string
from langchain_core.runnables import RunnableParallel

In [29]:
from langchain.schema.runnable import RunnableParallel, RunnablePassthrough, RunnableLambda
from operator import itemgetter
from langchain.chat_models import ChatOllama
from langchain.schema import get_buffer_string, OutputParserException, format_document
from langchain.callbacks.tracers import ConsoleCallbackHandler
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from alhazen.utils.output_parsers import JsonEnclosedByTextOutputParser

#from paperqa.prompts import summary_prompt as paperqa_summary_prompt, qa_prompt as paperqa_qa_prompt, select_paper_prompt, citation_prompt, default_system_prompt

hum_p = '''First, read through the following JSON encoding of {k} research articles: 

Each document has three attributes: (A) a digital object identifier ('DOI') code, (B) a CITATION string containing the authors, publication year, title and publication location, and the (C) CONTENT field with the title and abstract of the paper.  

```json:{context}```

Then, generate a JSON list of summaries of each article in order to help answer the following question:

Question: {question}

Do NOT directly answer the question, instead summarize to give evidence to help answer the question. 
Focus on specific details, including numbers, equations, or specific quotes. 
Reply "Not applicable" if text is irrelevant. 
Restrict each summary to {summary_length} words. 
Also, provide a score from 1-10 indicating relevance to question. Do not explain your score. 

Write this answer as JSON formatted output. Provide a list of {k} dict objects with the following fields: DOI, SUMMARY, RELEVANCE SCORE. 

Do not provide additional explanation for the answer.
Do not include any other response other than a JSON object.
'''
sys_p = '''Answer in a direct and concise tone. Your audience is an expert, so be highly specific. If there are ambiguous terms or acronyms, first define them.'''

DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="'DOI': '{ske_id}', CITATION: '{citation}', CONTENT:'{page_content}'")
def combine_documents(
    docs, document_prompt=DEFAULT_DOCUMENT_PROMPT, document_separator="},{\n"
):
    doc_strings = [format_document(doc, document_prompt) for doc in docs]
    return '[{'+document_separator.join(doc_strings)+'}]'

template = ChatPromptTemplate.from_messages([
            ("system", sys_p),
            ("human", hum_p)])

qa_chain = (
    RunnableParallel({
        "k": itemgetter("k"),
        "question": itemgetter("question"),
        "summary_length": itemgetter("summary_length"),
        "context": itemgetter("question") | retriever | combine_documents,
    })
    | {
        "summary": template | ChatOllama(model='mixtral') | JsonEnclosedByTextOutputParser(),
        "context": itemgetter("context"),
    }
)

input = {'question': question, 'summary_length': 1000, 'k':5}    
out = qa_chain.invoke(input, config={'callbacks': [ConsoleCallbackHandler()]})
print(json.dumps(out, indent=4))




[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence] Entering Chain run with input:
[0m{
  "question": "What clinical indicators are present for patients suffering from stiff person syndrome (SPS)?",
  "summary_length": 1000,
  "k": 5
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel] Entering Chain run with input:
[0m{
  "question": "What clinical indicators are present for patients suffering from stiff person syndrome (SPS)?",
  "summary_length": 1000,
  "k": 5
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel > 3:chain:RunnableLambda] Entering Chain run with input:
[0m{
  "question": "What clinical indicators are present for patients suffering from stiff person syndrome (SPS)?",
  "summary_length": 1000,
  "k": 5
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel > 3:chain:RunnableLambda] [0ms] Exiting Chain run with output:
[0m{
  "output": 5
}
[32;1m[

# Run Fast HuggingFace Deep Learning Tools over corpus 

In [80]:
from transformers import pipeline, AutoModel, AutoTokenizer
import torch

model_path = '/Users/gully.burns/Documents/2024H1/models/discourse_tagger'
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-v1.1", 
                                          truncation=True, 
                                          max_length=512)
labels = ['BACKGROUND', 'OBJECTIVE', 'METHODS', 'RESULTS', 'CONCLUSIONS']
lookup = {'LABEL_%d'%(i):l for i, l in enumerate(labels)}
model = AutoModel.from_pretrained(model_path)
model.eval()

classifier = pipeline("text-classification", 
                      model = model_path, 
                      tokenizer=tokenizer, 
                      truncation=True,
                      batch_size=8,
                      device='mps')


In [85]:
# Try an out-of-the-box classifier on the data for discourse tagging.
from transformers import pipeline

ldb.session.rollback()
one_year_ago = (datetime.now() - timedelta(days=1*365))

q = ldb.session.query(SKE, SKF) \
    .filter(SKC.id==SKC_HM.ScientificKnowledgeCollection_id) \
    .filter(SKC_HM.has_members_id==SKE.id) \
    .filter(SKE.id==SKE_HR.ScientificKnowledgeExpression_id) \
    .filter(SKE_HR.has_representation_id==SKI.id) \
    .filter(SKI.id==SKI_HP.ScientificKnowledgeItem_id) \
    .filter(SKI_HP.has_part_id==SKF.id) \
    .filter(SKE_HR.has_representation_id==SKI.id) \
    .filter(SKI.type == 'CitationRecord' ) \
    .order_by(SKE.id)

#   .filter(SKC.name == 'The Stiff Person Syndrome' ) \
#   .filter(SKE.publication_date >= one_year_ago) \

s_list = []
for e, f in q.all():
    for i, s in enumerate(ldb.sent_detector.tokenize(f.content)):
        s_list.append([e.id, f.id, i, s])
sent_df = pd.DataFrame(s_list, columns=['doi', 'f_id', 's_id', 'text'])
sent_df

Unnamed: 0,doi,f_id,s_id,text
0,doi:/s0034-98872008000200015,9f3919db23.0,0,[Cholangiocarcinoma].
1,doi:/s0034-98872008000200015,9f3919db23.1,0,Cholangiocarcinoma is a malignant lesion of th...
2,doi:/s0034-98872008000200015,9f3919db23.1,1,Its incidence and prevalence are low.
3,doi:/s0034-98872008000200015,9f3919db23.1,2,It appears from the sixth decade of life and t...
4,doi:/s0034-98872008000200015,9f3919db23.1,3,It is most frequently found in the confluence ...
...,...,...,...,...
538367,doi:huon.2005.49.1.0065,9f08f2e243.1,9,Of the 59 primary epithelial tumours 62.7% was...
538368,doi:huon.2005.49.1.0065,9f08f2e243.1,10,The differential diagnosis and management are ...
538369,doi:huon.2005.49.1.0065,9f08f2e243.1,11,The prognosis of pleomorphic adenomas depends ...
538370,doi:huon.2005.49.1.0065,9f08f2e243.1,12,In cases of suspected malignant epithelial tum...


In [86]:

# Predict multipe texts on single CPU and time the inference duration
start = time()

df = sent_df

preds = classifier([row.text for i, row in df.iterrows()])
pred_df = pd.DataFrame(preds)
df['label'] = [lookup[row.label] for i, row in pred_df.iterrows()]
df['score'] = [row.score for i, row in pred_df.iterrows()]

end = time()

print('Prediction time:', str(timedelta(seconds=end-start)))

Prediction time: 0:26:27.240843


In [100]:
df

Unnamed: 0,doi,f_id,s_id,text,label,score
0,doi:/s0034-98872008000200015,9f3919db23.0,0,[Cholangiocarcinoma].,BACKGROUND,0.556335
1,doi:/s0034-98872008000200015,9f3919db23.1,0,Cholangiocarcinoma is a malignant lesion of th...,BACKGROUND,0.710530
2,doi:/s0034-98872008000200015,9f3919db23.1,1,Its incidence and prevalence are low.,BACKGROUND,0.769202
3,doi:/s0034-98872008000200015,9f3919db23.1,2,It appears from the sixth decade of life and t...,BACKGROUND,0.772743
4,doi:/s0034-98872008000200015,9f3919db23.1,3,It is most frequently found in the confluence ...,BACKGROUND,0.726111
...,...,...,...,...,...,...
538367,doi:huon.2005.49.1.0065,9f08f2e243.1,9,Of the 59 primary epithelial tumours 62.7% was...,RESULTS,0.916542
538368,doi:huon.2005.49.1.0065,9f08f2e243.1,10,The differential diagnosis and management are ...,RESULTS,0.558720
538369,doi:huon.2005.49.1.0065,9f08f2e243.1,11,The prognosis of pleomorphic adenomas depends ...,BACKGROUND,0.629027
538370,doi:huon.2005.49.1.0065,9f08f2e243.1,12,In cases of suspected malignant epithelial tum...,BACKGROUND,0.707331


In [110]:
ldb.session.rollback()

In [111]:
# Generate fragment sentences and add them as Notes
ldb.session.rollback()
for i, row in df.iterrows():
    f_q = ldb.session.query(SKF).filter(SKF.id == row.f_id).first()
    i_q = ldb.session.query(SKI).filter(SKI.id == row.f_id.split('.')[0]).first()
    o = i_q.content.find(row.text)
    l = len(row.text)
    sentence_fragment = ScientificKnowledgeFragment(id=f_q.id+'.'+str(row.s_id), \
                                                    content=row.text, \
                                                    offset=o, \
                                                    length=l, \
                                                    type='sentence')
    i_q.has_part.append(sentence_fragment)
    note_content = {'discourse_label': row.label, 'score': row.score}
    n = Note(id=f_q.id+'.'+str(row.s_id)+'.discourse_type',
             content=json.dumps(note_content, indent=4),
             format='json',
             type='NoteAboutFragment')
    sentence_fragment.has_notes.append(n)
    ldb.session.flush()
ldb.session.commit()


# Running DRSM Classifiers.

In [None]:
from transformers import pipeline, AutoModel, AutoTokenizer
import torch

model_path = '/Users/gully.burns/Documents/2024H1/models/drsm_classifier'
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-v1.1", 
                                          truncation=True, 
                                          max_length=512)
labels = ['BACKGROUND', 'OBJECTIVE', 'METHODS', 'RESULTS', 'CONCLUSIONS']
lookup = {'LABEL_%d'%(i):l for i, l in enumerate(labels)}
model = AutoModel.from_pretrained(model_path)
model.eval()

classifier = pipeline("text-classification", 
                      model = model_path, 
                      tokenizer=tokenizer, 
                      truncation=True,
                      batch_size=8,
                      device='mps')


# Topic Modeling over the corpus. 

What are the main topics being discussed in each paper?

In [None]:
from transformers import pipeline, AutoModel, AutoTokenizer
import torch

model_path = '/Users/gully.burns/Documents/2024H1/models/drsm_classifier'
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-v1.1", 
                                          truncation=True, 
                                          max_length=512)
labels = ['BACKGROUND', 'OBJECTIVE', 'METHODS', 'RESULTS', 'CONCLUSIONS']
lookup = {'LABEL_%d'%(i):l for i, l in enumerate(labels)}
model = AutoModel.from_pretrained(model_path)
model.eval()

classifier = pipeline("text-classification", 
                      model = model_path, 
                      tokenizer=tokenizer, 
                      truncation=True,
                      batch_size=8,
                      device='mps')

# Search for and download Full Text Papers.

Can we search for all Stiff Person Syndrome papers published in the last 10 years?



In [44]:
ldb.session.rollback()

ten_years_ago = (datetime.now() - timedelta(days=10*365))
print(ten_years_ago)

q = ldb.session.query(func.extract('year', SKE.publication_date.cast(Date)), func.count(SKE.id) ) \
    .filter(SKC.id==SKC_HM.ScientificKnowledgeCollection_id) \
    .filter(SKC_HM.has_members_id==SKE.id) \
    .filter(SKE.publication_date >= ten_years_ago) \
    .filter(SKC.name == 'The Stiff Person Syndrome' ) \
    .group_by(func.extract('year', SKE.publication_date.cast(Date))) \
    .order_by(func.extract('year', SKE.publication_date.cast(Date)))
sps_pubcount_df = pd.DataFrame(q.all(), columns=['doi', 'date'])
sps_pubcount_df

2014-01-25 10:10:34.434342


Unnamed: 0,doi,date
0,2014,25
1,2015,35
2,2016,34
3,2017,26
4,2018,24
5,2019,43
6,2020,40
7,2021,40
8,2022,35
9,2023,49
