## Developing a training dataset for Topic Category classifications

MeSH to EDAM Topic mappings can be found here: https://bioportal.bioontology.org/mappings/EDAM?target=https%3A%2F%2Fdata.bioontology.org%2Fontologies%2FMESH

Unfortunately, there does not appear to be a clean way to pull this data programmatically, so we'll just manually copy/paste it from the website into a tab delimited file and go from there.

To develop the training dataset, 
1. Pull all MeSH terms associated with a dataset, via a dataset's citation PMID
2. Map MeSH terms to EDAM Topics

If the training dataset is not comprehensive enough, consider:
1. Pull the MeSH mappings of EDAM Topics
2. For each mapping, pull 500 titles and abstracts from PubMed and use that as the traininig data

In [43]:
from Bio import Entrez
from Bio import Medline
import requests
import pandas as pd
import text2term
from datetime import datetime
import time
import os

In [11]:
Entrez.email = "your email here"

In [2]:
citation_file = 'data/citation_df_clean.tsv'
citationdf = pd.read_csv(citation_file, delimiter='\t',header=0,index_col=0)
print(citationdf.head(n=2))

                   _id                                        description  \
0  OMICSDI_PRJNA775608  Alveolar epithelial glycocalyx degradation med...   
1   OMICSDI_PRJNA74531  Streptococcus agalactiae STIR-CD-17 Genome seq...   

                                                name      pmid  
0  Alveolar epithelial glycocalyx degradation med...  34874923  
1                Streptococcus agalactiae STIR-CD-17  23105075  


  citationdf = pd.read_csv(citation_file, delimiter='\t',header=0,index_col=0)


In [8]:
pmidfile = pd.read_csv('data/pmids.csv', delimiter='\t',header=0)
pmidlist = pmidfile['Abstract'].tolist()

In [4]:
#test_pmid = citationdf.iloc[0]['pmid']
test_pmid = "21406103"
print(test_pmid)

21406103


In [5]:
handle = Entrez.efetch(db="pubmed", id=test_pmid, rettype="medline", retmode="text")
records = Medline.parse(handle) ##parses pubmed entry for that ID and records the author
for record in records:
    MESHSet = record.get("MH","?") #writes the record to a list called MH
    print(MESHSet)
    

['Arabidopsis/*genetics/metabolism', 'Arabidopsis Proteins/*genetics', 'Gene Expression Profiling', 'Gene Expression Regulation, Plant', 'Gene Regulatory Networks', 'Hemoglobins/*genetics', 'Nitric Oxide/metabolism', 'Oligonucleotide Array Sequence Analysis', 'Oxidative Stress', 'Oxygen/metabolism', 'Plants, Genetically Modified/genetics/metabolism', 'Promoter Regions, Genetic', 'RNA, Plant/genetics', 'Seeds/genetics/*metabolism', 'Stress, Physiological']


In [17]:
meshlist = []
PMIDFails = []
for PMID in pmidlist: #iterates through the PMID list
    #print('fetching authors for: '+str(PMID))
    handle = Entrez.efetch(db="pubmed", id=PMID, rettype="medline", retmode="text")
    records = Medline.parse(handle) ##parses pubmed entry for that ID and records the author
    for record in records:
        meshset = record.get("MH","?") 
        tempmesh = [x.replace('/',',').replace("*","") for x in meshset]
        meshlist.append({'pmid':PMID, 'mesh':tempmesh})
        print(PMID)
    time.sleep(0.5)

NameError: name 'meshSet' is not defined

In [20]:
def retrieve_mesh_by_pmids(PMIDList):
    print(datetime.now().time())
    meshlist = []
    PMIDFails = []
    for PMID in PMIDList: #iterates through the PMID list
        try:
            #print('fetching authors for: '+str(PMID))
            handle = Entrez.efetch(db="pubmed", id=PMID, rettype="medline", retmode="text")
            records = Medline.parse(handle) ##parses pubmed entry for that ID and records the author
            for record in records:
                meshset = record.get("MH","?") 
                tempmesh = [x.replace('/',',').replace("*"," ") for x in meshset]
                for eachterm in tempmesh:
                    if ',' not in eachterm:
                        meshlist.append({'pmid':PMID, 'mesh':eachterm})
                    else:
                        termlist = eachterm.split(',')
                        for cleanterm in termlist:
                            meshlist.append({'pmid':PMID, 'mesh':cleanterm.strip()})               
        except:
            PMIDFails.append(PMID)
            print("pmid not found: ",PMID)
        time.sleep(0.5)
    print(datetime.now().time())
    return(meshlist)

meshlist = retrieve_mesh_by_pmids(pmidlist)

14:42:10.831540
14:43:50.066549


In [35]:
meshdf = pd.DataFrame(meshlist)
print(meshdf.head(n=2))
meshlist_4_t2t = meshdf['mesh'].unique().tolist()
print(len(meshlist_4_t2t))
df1 = text2term.map_terms(meshlist_4_t2t, "EDAM", use_cache=True)
df1.sort_values(['Source Term','Mapping Score'],ascending=[False,False],inplace=True)
df2 = df1.drop_duplicates('Source Term',keep='first')
print(df2.head(n=10))

       pmid                                               mesh
0  32483332  Clustered Regularly Interspaced Short Palindro...
1  32483332                                           genetics
796
2023-12-12 15:13:20 INFO [text2term.t2t]: Loading cached ontology from: cache\EDAM\EDAM-term-details.pickle
2023-12-12 15:13:20 INFO [text2term.t2t]: Filtered ontology terms to those of type: class
2023-12-12 15:13:20 INFO [text2term.t2t]: Mapping 796 source terms to EDAM
2023-12-12 15:13:20 INFO [text2term.t2t]: ...done (mapping time: 0.30s seconds)
                                  Source Term ID                  Source Term  \
746   http://ccb.hms.harvard.edu/t2t/RKBkQcZLJym                     virology   
87    http://ccb.hms.harvard.edu/t2t/R5H5PuHskrh               ultrastructure   
117   http://ccb.hms.harvard.edu/t2t/RHYGKwKez4t                     toxicity   
200   http://ccb.hms.harvard.edu/t2t/RDeMxMfqDkW                      therapy   
721   http://ccb.hms.harvard.edu/t2t/R3dS7P5wJji 

In [45]:
df3 = df2.loc[df2['Mapped Term CURIE'].astype(str).str.contains('TOPIC')]
#print(len(df3))
#print(df3.head(n=10))
df_thresh_70 = df3.loc[df3['Mapping Score']>=0.7]
print(df_thresh_70)
df3.to_csv(os.path.join('data','mesh_t2t_no_threshhold.tsv'),sep='\t',header=True)


                                  Source Term ID            Source Term  \
746   http://ccb.hms.harvard.edu/t2t/RKBkQcZLJym               virology   
721   http://ccb.hms.harvard.edu/t2t/R3dS7P5wJji                surgery   
99    http://ccb.hms.harvard.edu/t2t/RAXCEN7kXtQ             physiology   
313   http://ccb.hms.harvard.edu/t2t/RJvZaGZrSEx           pharmacology   
106   http://ccb.hms.harvard.edu/t2t/RB9oAjeSd8a              pathology   
...                                          ...                    ...   
1561  http://ccb.hms.harvard.edu/t2t/RFmN5rcdQkP        Gene Expression   
920   http://ccb.hms.harvard.edu/t2t/R7fyQiZy3Mn        DNA Replication   
1577  http://ccb.hms.harvard.edu/t2t/R5CTPtzqDfE        DNA Methylation   
472   http://ccb.hms.harvard.edu/t2t/R6HtqgnjfRT   Alternative Splicing   
405   http://ccb.hms.harvard.edu/t2t/RBSmDqmscW2            Agriculture   

                      Mapped Term Label Mapped Term CURIE  \
746                            Virolog

In [51]:
df4 = pd.read_csv(os.path.join('data','mesh_t2t_no_threshhold_checked.tsv'),delimiter='\t',header=0, index_col=0)
print(df4.head(n=2))
df5 = df4.loc[df4['match']!='bad']
print('Mean',df5.groupby('match')['Mapping Score'].mean())
print('Max',df5.groupby('match')['Mapping Score'].max())
print('Min',df5.groupby('match')['Mapping Score'].min())

                                 Source Term ID Source Term Mapped Term Label  \
746  http://ccb.hms.harvard.edu/t2t/RKBkQcZLJym    virology          Virology   
117  http://ccb.hms.harvard.edu/t2t/RHYGKwKez4t    toxicity        Toxicology   

    Mapped Term CURIE                     Mapped Term IRI  Mapping Score  \
746   EDAM.TOPIC:0781  http://edamontology.org/topic_0781          0.993   
117   EDAM.TOPIC:2840  http://edamontology.org/topic_2840          0.553   

     Tags match  
746   NaN  good  
117   NaN    ok  
Mean match
good    0.829730
ok      0.566939
Name: Mapping Score, dtype: float64
Max match
good    0.999
ok      0.893
Name: Mapping Score, dtype: float64
Min match
good    0.304
ok      0.335
Name: Mapping Score, dtype: float64


In [59]:
meshdf['Source Term'] = meshdf['mesh']
#print(meshdf.head(n=2))
meshmapped = meshdf.merge(df5,on=['Source Term'],how = 'inner')
print(len(meshmapped))
print(meshmapped.head(n=2))
unique_matches = meshmapped.groupby(['pmid','Mapped Term Label','Mapped Term IRI']).size().reset_index(name='counts')
print(len(unique_matches))
priority_matches = unique_matches.loc[unique_matches['counts']>1]
print(len(priority_matches))
print(priority_matches.head(n=10))

976
       pmid      mesh Source Term                              Source Term ID  \
0  32483332  genetics    genetics  http://ccb.hms.harvard.edu/t2t/RCfTRcoJghm   
1  32483332  genetics    genetics  http://ccb.hms.harvard.edu/t2t/RCfTRcoJghm   

  Mapped Term Label Mapped Term CURIE                     Mapped Term IRI  \
0          Genetics   EDAM.TOPIC:3053  http://edamontology.org/topic_3053   
1          Genetics   EDAM.TOPIC:3053  http://edamontology.org/topic_3053   

   Mapping Score  Tags match  
0          0.998   NaN  good  
1          0.998   NaN  good  
582
175
        pmid Mapped Term Label                     Mapped Term IRI  counts
3   11530848      Mice or rats  http://edamontology.org/topic_0213       2
5   11530848        Physiology  http://edamontology.org/topic_3300       2
11  12536215          Genetics  http://edamontology.org/topic_3053       4
13  15656970       Arabidopsis  http://edamontology.org/topic_0786       2
15  15656970          Genetics  http://edamo

In [31]:
text2term.cache_ontology("https://data.bioontology.org/ontologies/EDAM/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb&download_format=rdf", "EDAM")

2023-12-12 15:03:34 INFO [text2term.term_collector]: Loading ontology https://data.bioontology.org/ontologies/EDAM/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb&download_format=rdf...
2023-12-12 15:03:36 INFO [text2term.term_collector]: ...done (ontology loading time: 1.80s)
2023-12-12 15:03:36 INFO [text2term.term_collector]: Collecting ontology term details...
2023-12-12 15:03:37 INFO [text2term.term_collector]: ...done: collected 3588 ontology terms (collection time: 0.72s)
2023-12-12 15:03:37 INFO [text2term.t2t]: Filtered ontology terms to those of type: any
2023-12-12 15:03:37 INFO [text2term.t2t]: Caching ontology https://data.bioontology.org/ontologies/EDAM/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb&download_format=rdf to: cache\EDAM


<text2term.onto_cache.OntologyCache at 0x1bfaf012c50>

In [33]:
df2 = text2term.map_terms(["Virology", "metabolism"], "EDAM", use_cache=True)
print(df2)

2023-12-12 15:10:03 INFO [text2term.t2t]: Loading cached ontology from: cache\EDAM\EDAM-term-details.pickle
2023-12-12 15:10:03 INFO [text2term.t2t]: Filtered ontology terms to those of type: class
2023-12-12 15:10:03 INFO [text2term.t2t]: Mapping 2 source terms to EDAM
2023-12-12 15:10:03 INFO [text2term.t2t]: ...done (mapping time: 0.30s seconds)
                               Source Term ID Source Term  \
0  http://ccb.hms.harvard.edu/t2t/REhvV6wpwrf    Virology   
1  http://ccb.hms.harvard.edu/t2t/REhvV6wpwrf    Virology   
2  http://ccb.hms.harvard.edu/t2t/REhvV6wpwrf    Virology   
3  http://ccb.hms.harvard.edu/t2t/R7BTYkJ4dcF  metabolism   
4  http://ccb.hms.harvard.edu/t2t/R7BTYkJ4dcF  metabolism   
5  http://ccb.hms.harvard.edu/t2t/R7BTYkJ4dcF  metabolism   

              Mapped Term Label Mapped Term CURIE  \
0                      Virology   EDAM.TOPIC:0781   
1        Urology and nephrology   EDAM.TOPIC:3422   
2                       Biology   EDAM.TOPIC:3070   
3        

In [9]:
meshtest = [x.replace('/',',') for x in MESHSet]
dftest = text2term.map_terms(meshtest,"EDAMT", use_cache=True)
print(dftest)

2023-10-12 11:15:30 INFO [text2term.tfidf_mapper]: Mapping 17 source terms...
2023-10-12 11:15:30 INFO [text2term.tfidf_mapper]: ...against 226 ontology terms (363 labels/synonyms)
2023-10-12 11:15:30 INFO [text2term.tfidf_mapper]: ...done (mapping time: 0.06s seconds)
                                Source Term ID  \
0   http://ccb.hms.harvard.edu/t2t/R84GM5W24Ap   
1   http://ccb.hms.harvard.edu/t2t/R84GM5W24Ap   
2   http://ccb.hms.harvard.edu/t2t/R84GM5W24Ap   
3   http://ccb.hms.harvard.edu/t2t/R9gTWbnWbyz   
4   http://ccb.hms.harvard.edu/t2t/RF6HnvUi6Wa   
5   http://ccb.hms.harvard.edu/t2t/R5HU4YdZkbe   
6   http://ccb.hms.harvard.edu/t2t/R5HU4YdZkbe   
7   http://ccb.hms.harvard.edu/t2t/R5HU4YdZkbe   
8   http://ccb.hms.harvard.edu/t2t/R7dSudQvDDQ   
9   http://ccb.hms.harvard.edu/t2t/R7dSudQvDDQ   
10  http://ccb.hms.harvard.edu/t2t/R7dSudQvDDQ   
11  http://ccb.hms.harvard.edu/t2t/RAqRerDDpmi   
12  http://ccb.hms.harvard.edu/t2t/RAqRerDDpmi   
13  http://ccb.hms.harvard.edu

In [12]:
print(dftest.head(n=1))

                               Source Term ID  \
0  http://ccb.hms.harvard.edu/t2t/R84GM5W24Ap   

                                      Source Term Mapped Term Label  \
0  Alveolar Epithelial Cells,metabolism,pathology         Pathology   

  Mapped Term CURIE                     Mapped Term IRI  Mapping Score  Tags  
0   EDAM.TOPIC:0634  http://edamontology.org/topic_0634          0.447  None  


In [None]:
def retrieve_mesh_by_pmids(PMIDList):
    print(datetime.datetime.now().time())
    meshdf = pd.DataFrame(columns=['pmid','Source Term ID','Source Term','Mapped Term Label',
                                   'Mapped Term CURIE','Mapped Term IRI','Mapping Score','Tags']
    PMIDFails = []
    for PMID in PMIDList: #iterates through the PMID list
        try:
            #print('fetching authors for: '+str(PMID))
            handle = Entrez.efetch(db="pubmed", id=PMID, rettype="medline", retmode="text")
            records = Medline.parse(handle) ##parses pubmed entry for that ID and records the author
            for record in records:
                meshset = record.get("MH","?") 
                tempmesh = [x.replace('/',',').replace("*","") for x in meshSet]
                tempdf = text2term.map_terms(tempmesh,"EDAMT", use_cache=True)
                tempdf['pmid'] = PMID
                
        except:
            PMIDFails.append(PMID)
            print("pmid not found: ",PMID)

    print(datetime.datetime.now().time())
    return(PublicationDF,author_df,PMIDFails)