## Pubtator mapping QA
This script checks the quality of Pubtator results

To do this, we create a subset of records that have both pubtator results and citation pmids. Then we pull the MeSH terms for each citation and check if the MeSH from the pubtator result is within the set.

Pubtator results have been applied to the following repositories:
* ImmPort
* Data Discovery Engine
* VEuPathDB
* Vivli

Note that Medline/PubMed stores the MH values as MeSH term names, without the IDs, so an additional search against MeSH might be needed to convert the names to IDs. When handling these values:
* A ',' inside a term is part of the term itself.
* A '/' marks a combination of terms; such values should be split on '/' before searching.

In [90]:
import requests
import pandas as pd
import json
import os
from Bio import Entrez
from Bio import Medline

# Paths are resolved against the kernel's current working directory;
# `data_path` is the 'data' sub-folder directly beneath it.
script_path = os.getcwd()
data_path = os.path.join(script_path,'data')

In [5]:
# Contact address configured on Biopython's Entrez module before any Entrez
# request (see the efetch call further down) is made.
Entrez.email = "gtsueng@scripps.edu"

In [39]:
def parse_pubtator_hit(hitlist):
    """Reduce a record's annotation entries to the three fields used for QA.

    Parameters
    ----------
    hitlist : list of dict, dict or None
        Annotation entries taken from one record (e.g. its 'healthCondition',
        'species' or 'infectiousAgent' value). A single entry given as a bare
        dict is treated as a one-element list; ``None`` is treated as empty.

    Returns
    -------
    list of dict
        One ``{'identifier', 'inDefinedTermSet', 'originalName'}`` dict per
        entry that carries all three keys. Entries missing any of them (or
        entries that are not mappings) are skipped.
    """
    if hitlist is None:
        return []
    if isinstance(hitlist, dict):
        # Iterating a dict would walk its keys and silently drop the annotation,
        # so wrap a lone entry before looping.
        hitlist = [hitlist]

    wanted_keys = ('identifier', 'inDefinedTermSet', 'originalName')
    pubtatorlist = []
    for eachhit in hitlist:
        try:
            pubtatorlist.append({key: eachhit[key] for key in wanted_keys})
        except (KeyError, TypeError):
            # Best effort: an incomplete or malformed entry is skipped, but
            # unrelated errors are no longer swallowed.
            continue
    return pubtatorlist

def parse_citation_links(a_hit):
    """Collect the PubMed IDs listed under a record's 'citation' field.

    Parameters
    ----------
    a_hit : dict
        A single record from the API response. It must contain a 'citation'
        key (a ``KeyError`` is raised otherwise, as before). The value may be
        a list of citation dicts, a single citation dict, or ``None``.

    Returns
    -------
    list of dict
        One ``{'citationPMID': <pmid>}`` dict per citation that has a 'pmid'
        key; citations without one are skipped.
    """
    hitlist = a_hit['citation']
    if hitlist is None:
        return []
    if isinstance(hitlist, dict):
        # A lone citation object would otherwise be iterated key by key and
        # its PMID silently lost.
        hitlist = [hitlist]

    pmidlist = []
    for eachhit in hitlist:
        try:
            pmidlist.append({'citationPMID': eachhit['pmid']})
        except (KeyError, TypeError):
            # Citation without a PMID (or not a mapping): nothing to link.
            continue
    return pmidlist

def parse_results(json_result):
    """Flatten one API response page into a list of per-record dicts.

    Parameters
    ----------
    json_result : dict
        Decoded API response; its 'hits' entry is the list of records.

    Returns
    -------
    list of dict
        One dict per record with the keys '_id', 'description', 'citation',
        'healthCondition', 'species' and 'infectiousAgent' (in that order).
        'description' is ``None`` when the record has none. Each annotation
        key holds the output of ``parse_pubtator_hit`` or ``[]`` when the
        record lacks that field.
    """
    annotation_fields = ('healthCondition', 'species', 'infectiousAgent')
    datasetlist = []
    for eachhit in json_result['hits']:
        # A record without a description should not abort the whole page.
        tmpdict = {'_id': eachhit['_id'], 'description': eachhit.get('description')}
        tmpdict['citation'] = parse_citation_links(eachhit)
        for field in annotation_fields:
            try:
                tmpdict[field] = parse_pubtator_hit(eachhit[field])
            except (KeyError, TypeError):
                # Field absent from the record, or its value is not iterable.
                tmpdict[field] = []
        datasetlist.append(tmpdict)
    return datasetlist

In [82]:
def fetch_data(sourcelist):
    """Download every record with a citation PMID for the given repositories.

    Parameters
    ----------
    sourcelist : list of str
        Repository names, already URL-encoded and quoted (e.g.
        ``"%22ImmPort%22"``), inserted verbatim into the query string.

    Returns
    -------
    pandas.DataFrame
        One row per record as produced by ``parse_results``.

    Raises
    ------
    requests.HTTPError
        If the initial request for a repository returns an error status.

    Notes
    -----
    With ``fetch_all=True`` the response carries a ``_scroll_id`` and only the
    first batch of hits; the remaining batches have to be requested with that
    scroll id. Previously only the first batch was read, so larger
    repositories were silently truncated.
    """
    api_url = "https://api-staging.data.niaid.nih.gov/v1/query"
    request_timeout = 60  # seconds; avoids hanging the notebook on a stalled connection
    pmid_results = []
    for eachsource in sourcelist:
        base_url = f"{api_url}?&q=_exists_:citation.pmid&extra_filter=(includedInDataCatalog.name:{eachsource})&fetch_all=True"
        r = requests.get(base_url, timeout=request_timeout)
        r.raise_for_status()
        results = r.json()
        source_hits = parse_results(results)

        total = results.get('total')
        if not isinstance(total, int):
            total = None
        scroll_id = results.get('_scroll_id')
        # Follow the scroll until all hits are collected or the API signals the end.
        while scroll_id and (total is None or len(source_hits) < total):
            r = requests.get(api_url, params={'scroll_id': scroll_id}, timeout=request_timeout)
            if not r.ok:
                # Treated as end-of-scroll; what was collected so far is kept.
                break
            results = r.json()
            if not results.get('hits'):
                break
            source_hits.extend(parse_results(results))
            scroll_id = results.get('_scroll_id', scroll_id)
        pmid_results.extend(source_hits)
    return pd.DataFrame(pmid_results)

def _explode_annotation(raw_df, column, id_name, set_name):
    """Return one row per annotation in `column` with its identifier and term set."""
    exploded = raw_df[['_id', column]].explode(column)
    # Records with an empty annotation list explode to NaN; drop them.
    exploded = exploded.loc[exploded[column].notna()]
    table = exploded[['_id']].copy()
    table[id_name] = [entry['identifier'] for entry in exploded[column]]
    table[set_name] = [entry['inDefinedTermSet'] for entry in exploded[column]]
    return table


def process_raw_df(pmidDF):
    """Split the raw record table into a citation table and three annotation tables.

    Parameters
    ----------
    pmidDF : pandas.DataFrame
        Table with the columns '_id', 'citation', 'healthCondition', 'species'
        and 'infectiousAgent'; the last four hold lists of dicts per record.
        The input frame is not modified.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(pmid, health, speciesdf, pathogen)``:

        * ``pmid``: one row per (record, citation) with '_id' followed by the
          citation keys (e.g. 'citationPMID').
        * ``health``: '_id', 'healthID', 'healthSet'
        * ``speciesdf``: '_id', 'speciesID', 'speciesSet'
        * ``pathogen``: '_id', 'pathogenID', 'pathogenSet'

    Notes
    -----
    Each annotation column is exploded on its own. Exploding all three on the
    same frame builds a per-record cross product, which repeated every
    annotation once for each combination of the other two columns.
    """
    # Build the citation rows in one pass instead of concatenating one tiny
    # DataFrame per record.
    citation_rows = [
        {'_id': record_id, **citation}
        for record_id, citations in zip(pmidDF['_id'], pmidDF['citation'])
        for citation in citations
    ]
    if citation_rows:
        pmid = pd.DataFrame(citation_rows)
    else:
        pmid = pd.DataFrame(columns=['_id', 'citationPMID'])

    health = _explode_annotation(pmidDF, 'healthCondition', 'healthID', 'healthSet')
    speciesdf = _explode_annotation(pmidDF, 'species', 'speciesID', 'speciesSet')
    pathogen = _explode_annotation(pmidDF, 'infectiousAgent', 'pathogenID', 'pathogenSet')
    return pmid, health, speciesdf, pathogen

In [16]:
# Probe a single source (Data Discovery Engine) to see which top-level keys
# the API response contains.
dde_probe_url = 'https://api-staging.data.niaid.nih.gov/v1/query?&q=_exists_:citation.pmid&extra_filter=(includedInDataCatalog.name:%22Data%20Discovery%20Engine%22)&fetch_all=True'
r = requests.get(dde_probe_url)
results = json.loads(r.text)
print(results.keys())

dict_keys(['_scroll_id', 'took', 'total', 'max_score', 'hits'])


In [83]:
%%time
## Fetch all sources with Pubtator annotations that also have citation links
sourcelist = ["%22ImmPort%22", "%22Data%20Discovery%20Engine%22", "%22VEuPathDB%22", "%22Vivli%22"]
pmidDF = fetch_data(sourcelist)
print(pmidDF.head(n=2))

                    _id                                        description  \
0  DDE_4d9350ca40032db3  Gut microbiota play a key role in maintaining ...   
1  DDE_524ab0dcc8dfaff4  The purpose is to obtain samples for mRNA, miR...   

                         citation healthCondition  \
0  [{'citationPMID': '28506317'}]              []   
1  [{'citationPMID': '34051754'}]              []   

                                             species  \
0  [{'identifier': '9606', 'inDefinedTermSet': 'U...   
1  [{'identifier': '9606', 'inDefinedTermSet': 'U...   

                                     infectiousAgent  
0                                                 []  
1  [{'identifier': '11320', 'inDefinedTermSet': '...  
Wall time: 3.08 s


In [131]:
# Split the fetched records into a citation table plus one table per
# annotation type (health condition, species, infectious agent), all keyed by `_id`.
pmid, health, speciesdf, pathogen = process_raw_df(pmidDF)
print(speciesdf.head(n=2))

                    _id speciesID speciesSet
0  DDE_4d9350ca40032db3      9606    UniProt
1  DDE_524ab0dcc8dfaff4      9606    UniProt


In [133]:
# Records that have both a citation PMID and a health-condition annotation.
pmid_health_inner = pmid.merge(health, on='_id', how='inner')
print(pmid_health_inner)

# Left join keeps every citation; healthID/healthSet are NaN where the record
# has no health-condition annotation.
# drop_duplicates returns a new frame rather than modifying in place, so its
# result has to be bound to the name for the de-duplication to take effect.
pmid_health = pmid.merge(health, on='_id', how='left').drop_duplicates(keep='first')
pmid_health

                     _id citationPMID healthID healthSet
0   DDE_ebeeda66c192b60c     28694998  D019142      MESH
1   DDE_ebeeda66c192b60c     26091036  D019142      MESH
2   DDE_ebeeda66c192b60c     26083749  D019142      MESH
3   DDE_ebeeda66c192b60c     25970247  D019142      MESH
4   DDE_ebeeda66c192b60c     26651942  D019142      MESH
..                   ...          ...      ...       ...
57  DDE_a929c6242865974e     27121861  D003967      MESH
58  DDE_f5fc793b695bd1e1     29116155  D003967      MESH
59  DDE_0101e7fb8f7fbf97     27602409  D003967      MESH
60  DDE_c35639fc074ae587     26556275  D003967      MESH
61  DDE_da51e1e4a852e31b     32348771  D014376      MESH

[62 rows x 4 columns]


Unnamed: 0,_id,citationPMID,healthID,healthSet
0,DDE_4d9350ca40032db3,28506317,,
1,DDE_524ab0dcc8dfaff4,34051754,,
2,DDE_9e400175601f51dc,28935729,,
3,DDE_ebeeda66c192b60c,28694998,D019142,MESH
4,DDE_ebeeda66c192b60c,26091036,D019142,MESH
...,...,...,...,...
1277,veupathdb_DS_25337f8fe9,35123054,,
1278,veupathdb_DS_25a96f8412,25903370,,
1279,veupathdb_DS_25ee308595,34042486,,
1280,veupathdb_DS_261ce7db76,31461898,,


In [119]:
%%time
## Fetch MeSH from citations
pmidlist = pmid_health['citationPMID'].unique().tolist()
testlist = pmidlist[0:5]
print(testlist)
handle = Entrez.efetch(db="pubmed", id=testlist, rettype="medline", retmode="text")
records = Medline.parse(handle)

tmplist = []
for record in records:
    pmid = record['PMID']
    try:
        mesh = record['MH']
    except:
        mesh = []
    tmplist.append({'PMID':pmid,'rawMeSH':mesh})

rawdf = pd.DataFrame(tmplist)

['28506317', '34051754', '28935729', '28694998', '26091036']
Wall time: 693 ms


In [129]:
# One row per raw MH value; records without any MH value get the placeholder 'None'.
rawdf = pd.DataFrame(tmplist)
explode_once = rawdf.explode('rawMeSH').fillna('None')

# Split combined values on '/' and give every piece its own row.
# NOTE(review): leading '*' characters (e.g. '*therapy') are kept as-is here;
# confirm whether they need stripping before looking the names up in MeSH.
explode_once['MeSH'] = explode_once['rawMeSH'].str.split('/').tolist()
explode_twice = explode_once.explode('MeSH').fillna('None')

print(explode_once.head(n=2))
print(len(explode_once))
print(explode_twice.head(n=2))

# Drop the placeholder rows and keep only the PMID / term pairs.
is_real_term = explode_twice['MeSH'] != 'None'
clean_df = explode_twice.loc[is_real_term, ['PMID', 'MeSH']].copy()
print(clean_df)

       PMID rawMeSH     MeSH
0  28506317   Adult  [Adult]
0  28506317    Aged   [Aged]
31
       PMID rawMeSH   MeSH
0  28506317   Adult  Adult
0  28506317    Aged   Aged
       PMID                              MeSH
0  28506317                             Adult
0  28506317                              Aged
0  28506317                 Aged, 80 and over
0  28506317                          Bacteria
0  28506317                   *classification
0  28506317                          genetics
0  28506317          Clostridioides difficile
0  28506317                        physiology
0  28506317            Clostridium Infections
0  28506317                     complications
0  28506317                      microbiology
0  28506317                          *therapy
0  28506317  Fecal Microbiota Transplantation
0  28506317                          *methods
0  28506317                             Feces
0  28506317                      microbiology
0  28506317                            Female
0