Notebook created by Forrest Hooton

In [2]:
import pandas as pd
import numpy as np
import importlib
import seaborn as sns
from matplotlib import pyplot
import time

In [None]:
import os
# Switch the working directory to the parent of the current one.
# NOTE(review): presumably this is what lets the `config` / `src` imports in the
# next cell resolve -- confirm. The path is relative, so re-running this cell
# moves up one more level each time; run it exactly once per kernel session.
os.chdir('..')

In [None]:
from config import mfp
from src.data_loader import load_ctd

In [14]:
# For Microsoft Academic Graph
import http.client, urllib.request, urllib.parse, urllib.error, base64
import json
import requests

# API key for Microsoft Academic Graph.
# It is read from the MSFT_ACADEMIC_API_KEY environment variable so the secret
# does not have to be written into (and saved with) the notebook. When the
# variable is unset, the original placeholder string is used as the fallback.
msft_apikey = os.environ.get('MSFT_ACADEMIC_API_KEY', 'Insert API key here as string')

def get_title(ID):
    """Look up the title of a Microsoft Academic Graph entity by its Id.

    Sends an 'evaluate' request with the expression ``Id=<ID>`` that asks for
    the 'Ti' attribute, then reads that attribute from the first entity in the
    response.

    Parameters
    ----------
    ID : value interpolated into the ``Id=...`` expression.

    Returns
    -------
    The 'Ti' value of the first returned entity, or 0 when it cannot be read
    from the response.
    """
    request_headers = {'Ocp-Apim-Subscription-Key': msft_apikey}

    request_params = urllib.parse.urlencode({
        'expr': f"Id={ID}",
        'count': '10',
        'model': 'latest',
        'attributes': 'Ti',
    })

    # The API mode can be 'calchistogram', 'evaluate', 'interpret', or 'similarity'
    response = query_API('evaluate', request_params, request_headers)

    try:
        first_entity = response['entities'][0]
        return first_entity['Ti']
    except:
        # Any problem reading the response is reported to the caller as 0
        return 0


def get_citations(paper):
    """Find a paper in Microsoft Academic Graph and return its Id and reference list.

    The paper string is first sent to the 'interpret' endpoint. The expression
    from the first rule of the first interpretation is then passed to
    ``__get_msft_attribute__`` to fetch the 'RId' attribute.

    Parameters
    ----------
    paper : str
        Text sent as the ``query`` parameter (the callers in this notebook pass
        a paper title).

    Returns
    -------
    (ID, citations)
        The entity Id and its 'RId' value. ``(np.nan, [])`` is returned when no
        interpretation is available or the attribute lookup fails.
    """
    # Code snippets at: https://dev.labs.cognitive.microsoft.com/docs/services/56332331778daf02acc0a50b/operations/56332331778daf06340c9666

    headers = {
        # Request headers
        'Ocp-Apim-Subscription-Key': msft_apikey,
    }

    params = urllib.parse.urlencode({
        # Request parameters
        'query': paper,
        'count': '10',
        'model': 'latest',
        'attributes': 'Ti',
    })

    # Mode can be 'calchistogram', 'evaluate', 'interpret', or 'similarity'
    loaded_interpret = query_API('interpret', params, headers)

    # If there are issues with retrieving info, like no interpretations returned,
    # report the paper as not found. `Exception` (rather than a bare except) keeps
    # KeyboardInterrupt usable while the calling loop is running.
    try:
        paper_query = loaded_interpret['interpretations'][0]['rules'][0]['output']['value']
        ID, citations = __get_msft_attribute__(paper_query, 'RId', msft_apikey)
    except Exception:
        return np.nan, []

    return ID, citations

# Gets an attribute of a paper from Microsoft Academic Graph
def __get_msft_attribute__(query, attr, key=msft_apikey):
    """Evaluate a query expression and return the first entity's Id and one attribute.

    Parameters
    ----------
    query : expression sent as the ``expr`` request parameter.
    attr : name of the attribute to request and return.
    key : subscription key sent in the request header (defaults to the
        module-level ``msft_apikey`` as it was when this function was defined).

    Returns
    -------
    (Id, value)
        'Id' and ``attr`` of the first entity in the response. A response
        without those keys raises (KeyError / IndexError) to the caller.
    """
    request_headers = {'Ocp-Apim-Subscription-Key': key}

    request_params = urllib.parse.urlencode({
        'expr': query,
        'count': '10',
        'model': 'latest',
        'attributes': attr,
    })

    # The API mode can be 'calchistogram', 'evaluate', 'interpret', or 'similarity'
    response = query_API('evaluate', request_params, request_headers)

    # See for all attributes: https://docs.microsoft.com/en-us/azure/cognitive-services/academic-knowledge/paperentityattributes
    first_entity = response['entities'][0]
    return first_entity['Id'], first_entity[attr]

def query_API(mode, params, headers):
    """Send a GET request to the Microsoft Academic API and return the decoded JSON.

    Parameters
    ----------
    mode : str
        Endpoint name appended to ``/academic/v1.0/``.
    params : str
        Already url-encoded query string.
    headers : dict
        Request headers (carries the subscription key).

    Returns
    -------
    The parsed JSON response. If the request fails or the body is not valid
    JSON, the error is printed and an empty dict is returned, so the callers'
    own lookups fall through to their "not found" values instead of crashing.
    """
    conn = None
    try:
        conn = http.client.HTTPSConnection('api.labs.cognitive.microsoft.com')
        # NOTE(review): "{body}" looks like a leftover template placeholder for the
        # request body; it is kept so the request sent is unchanged -- confirm.
        conn.request("GET", "/academic/v1.0/" + mode + "?%s" % params, "{body}", headers)
        data = conn.getresponse().read()
        return json.loads(data)
    except Exception as e:
        # Not every exception carries errno/strerror, so report type and message
        print("query_API failed ({0}): {1}".format(type(e).__name__, e))
        return {}
    finally:
        # Always release the connection, including on the error path
        if conn is not None:
            conn.close()

In [3]:
# Adds citations and paper ID to dataframe
# Target is the column that holds the paper titles
def add_citations(df, target):
    """Build a table of unique paper titles with their MAG id and citation list.

    Parameters
    ----------
    df : DataFrame containing the column ``target``.
    target : name of the column that holds the paper titles.

    Returns
    -------
    DataFrame with one row per distinct non-null title, a 'citations' column
    (NaN where the paper was not recognized) and, when at least one paper was
    recognized, a 'paper_id' column.
    """
    unique_titles = df.loc[df[target].notnull(), target].drop_duplicates().tolist()
    papers = pd.DataFrame({target: unique_titles})

    citations_list = []
    # `papers` has a default 0..n-1 index, so the enumerate position is the row label
    for idx, title in enumerate(unique_titles):
        ID, citations = get_citations(title)

        # A float ID is the NaN returned when the paper was not recognized
        if isinstance(ID, float):
            citations_list.append(np.nan)
        else:
            papers.at[idx, 'paper_id'] = ID
            citations_list.append(citations)

    papers['citations'] = citations_list

    return papers

Load FoodMine Data

In [6]:
# Load the garlic and cocoa food data (pickles) and their paper-scoring tables (CSV).
# NOTE: read_pickle can execute arbitrary code; only load pickles you created yourself.
g_food_data = pd.read_pickle(mfp('misc_save/garlic_food_data.pkl'))
g_food_info = pd.read_csv(mfp('data/garlic_scoring.csv'), encoding='latin1')

c_food_data = pd.read_pickle(mfp('misc_save/cocoa_food_data.pkl'))
# `encoding` is a read_csv option, so it belongs outside the mfp(...) call,
# matching the garlic line above.
c_food_info = pd.read_csv(mfp('data/cocoa_scoring.csv'), encoding='latin1')

In [11]:
# Select all unique PMIDs and store into df
g_PMIDs = pd.DataFrame({'PMID' : g_food_data.PMID.drop_duplicates().tolist()}).merge(g_food_info, how = 'left', on = 'PMID')[['PMID', 'paper']]
c_PMIDs = pd.DataFrame({'PMID' : c_food_data.PMID.drop_duplicates().tolist()}).merge(c_food_info, how = 'left', on = 'PMID')[['PMID', 'paper']]

In [12]:
# Retrieve citation ids from microsoft acedemic graph
g_papers = add_citations(g_PMIDs, 'paper')
c_papers = add_citations(c_PMIDs, 'paper')

#g_papers.to_pickle(mfp('misc_save/garlic_msft.pkl'))
#c_papers.to_pickle(mfp('misc_save/cocoa_msft.pkl'))

In [4]:
# Reload the saved per-paper citation tables
g_citations = pd.read_pickle(mfp('misc_save/garlic_msft.pkl'))
c_citations = pd.read_pickle(mfp('misc_save/cocoa_msft.pkl'))

# Flatten the per-paper citation lists (skipping papers without one) into a
# list of distinct cited ids
g_citation_ids = list({
    cited_id
    for cited in g_citations.citations.dropna().tolist()
    for cited_id in cited
})
c_citation_ids = list({
    cited_id
    for cited in c_citations.citations.dropna().tolist()
    for cited_id in cited
})

In [17]:
start = time.time()

# Retrieve the title of every garlic citation id from MAG
titles = []
c = 0
for p in g_citation_ids:
    titles.append(get_title(p))

    # Pause for 3 seconds on every third request
    if c % 3 == 0:
        time.sleep(3)

    # Report progress every 50 ids
    if c % 50 == 0:
        print(f'{c} at {(time.time()-start)/60} min')

    c += 1

#pd.DataFrame({'id' : g_citation_ids, 'title' : titles}).to_pickle(mfp('misc_save/garlic_citation_titles.pkl'))

In [27]:
start = time.time()

# Retrieve the title of every cocoa citation id from MAG
titles = []
c = 0
for p in c_citation_ids:
    titles.append(get_title(p))

    # Pause for 3 seconds on every third request
    if c % 3 == 0:
        time.sleep(3)

    # Report progress every 50 ids
    if c % 50 == 0:
        print(f'{c} at {(time.time()-start)/60} min')

    c += 1

#pd.DataFrame({'id' : c_citation_ids, 'title' : titles}).to_pickle(mfp('misc_save/cocoa_citation_titles.pkl'))

0 at 0.055836995442708336 min
50 at 1.2028738657633464 min
100 at 2.3527114470799764 min
150 at 3.50199187596639 min
200 at 4.597352143128713 min
250 at 5.795228282610576 min
300 at 6.960369209448497 min
350 at 8.074555778503418 min
400 at 9.23499865134557 min
450 at 10.397917222976684 min
500 at 11.508652718861898 min
550 at 12.667752373218537 min
600 at 13.829023762543995 min
650 at 14.948043950398763 min
700 at 16.120840458075204 min
750 at 17.280420589447022 min
800 at 18.44827857812246 min
850 at 19.62849095662435 min
900 at 20.791995211442313 min
950 at 21.903584138552347 min
1000 at 23.066848842302957 min
1050 at 24.29008613030116 min
1100 at 25.61460832754771 min
1150 at 26.776571385065715 min
1200 at 27.93707577387492 min
1250 at 29.04781185388565 min
1300 at 30.221344435214995 min


In [11]:
# Reload the id/title tables for the cited papers. These are the files the
# (commented-out) to_pickle lines at the end of the two title-retrieval cells write.
# NOTE: read_pickle can execute arbitrary code; only load pickles you created yourself.
g_ct = pd.read_pickle(mfp('misc_save/garlic_citation_titles.pkl'))
c_ct = pd.read_pickle(mfp('misc_save/cocoa_citation_titles.pkl'))

In [30]:
# Functions are generally the same as misc/pubmed_utils.py, but slightly tweaked

import urllib.request as request
from lxml import etree
import math

# Constructs appropriate url for pubmed api from search terms
def __construct_url__(url_input, query_type, num_results = 1000000):
    """Build an NCBI E-utilities URL for a PubMed search or document fetch.

    Parameters
    ----------
    url_input : str or iterable
        For ``'search'``: a single search string, or an iterable of terms that
        are combined with AND. For ``'document'``: an iterable of PubMed ids
        (strings or numbers).
    query_type : str
        ``'search'`` (esearch) or ``'document'`` (efetch, XML).
    num_results : int
        Unused; kept so existing callers that pass it keep working.

    Returns
    -------
    str
        The request URL.

    Raises
    ------
    ValueError
        If ``query_type`` is neither ``'search'`` nor ``'document'``.
    """
    from urllib.parse import quote

    # Constructs url for search query
    if query_type == 'search':
        base_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term='

        terms = [url_input] if isinstance(url_input, str) else url_input

        # Percent-encode every term in full (spaces become %20 as before) so that
        # characters such as '&', '#', '+' or non-ASCII letters in a title cannot
        # cut the query short or make the request fail.
        return base_url + '%20AND%20'.join(quote(str(term), safe='') for term in terms)

    if query_type == 'document':
        base_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id='

        return base_url + ','.join(str(doc_id) for doc_id in url_input) + '&retmode=xml'

    raise ValueError("query_type must be 'search' or 'document', got {!r}".format(query_type))


# Divides doc ids for larger paper queries in retrieve_doc_info()
def __divide_list__(ids, num_divisions):
    """Split ``ids`` into ``num_divisions`` consecutive chunks of near-equal size.

    Parameters
    ----------
    ids : sequence of ids.
    num_divisions : number of chunks to produce.

    Returns
    -------
    list of lists, in the original order (splitting is done by np.array_split).
    """
    chunks = np.array_split(np.asarray(ids), num_divisions)
    return [chunk.tolist() for chunk in chunks]

# Enters search terms into pubmed database to return document ID's
def search_pubmed(search_term):
    """Run a PubMed search and return the hit count and the first matching id.

    Parameters
    ----------
    search_term : str or iterable of str
        Passed to ``__construct_url__`` to build the search URL.

    Returns
    -------
    (count, ID)
        ``count`` is the text of the first ``Count`` element and ``ID`` the text
        of the first ``Id`` element in the response. ``(np.nan, np.nan)`` is
        returned when the response has no ``Count`` element or the count is '0';
        ``ID`` is ``np.nan`` when a count is present but no ``Id`` element is.
    """
    url = __construct_url__(search_term, 'search')

    with request.urlopen(url) as response:
        xml = response.read()

    root = etree.fromstring(xml)

    # find() returns None when the element is absent, so a missing Count is
    # handled here instead of failing on an empty findall() result
    count_element = root.find('.//Count')
    if count_element is None or count_element.text == '0':
        return np.nan, np.nan

    id_element = root.find('.//Id')
    if id_element is None:
        return count_element.text, np.nan

    return count_element.text, id_element.text

# Retrieves document (paper) info using pubmed paper ids
def retrieve_doc_info(ids):
    """Fetch PubMed records for ``ids`` and return them as a DataFrame.

    Parameters
    ----------
    ids : sequence
        PubMed ids (strings or numbers).

    Returns
    -------
    DataFrame with one row per returned ``PubmedArticle`` and the columns PMID,
    paper, journal, year, abstract, mesh_terms, mesh_UIds, qual_terms,
    qual_UIds and webpage. An empty frame with those columns is returned when
    ``ids`` is empty.
    """
    columns = ['PMID', 'paper', 'journal', 'year', 'abstract',
               'mesh_terms', 'mesh_UIds', 'qual_terms', 'qual_UIds', 'webpage']

    # Nothing to fetch: avoid splitting an empty list into zero chunks
    if len(ids) == 0:
        return pd.DataFrame(columns=columns).astype({'PMID': 'int32'})

    # Can't query too much in a single query, so divides larger id lists into separate queries
    num_loops = int(math.ceil(len(ids) / 100))

    # Have to split requests larger than 100 documents to keep it within url size
    id_chunks = __divide_list__(ids, num_loops)

    documents = []

    # Retrieves xml data from pubmed
    for chunk in id_chunks:
        # Module-level helper: this is a plain function, so there is no `self`
        url = __construct_url__(chunk, 'document')

        with request.urlopen(url) as response:
            xml = response.read()

        root = etree.fromstring(xml)

        documents.extend(root.findall('PubmedArticle'))

    # Collect plain dicts and build the frame once (DataFrame.append was removed
    # in pandas 2.0 and copied the whole frame on every call)
    records = []

    for document in documents:

        doc_id = int(document.find('.//PMID').text)

        abstract_element = document.find('.//AbstractText')
        abstract = abstract_element.text if abstract_element is not None else None

        mesh_terms = []
        mesh_UIds = []
        qual_terms = []
        qual_UIds = []

        for mesh_section in document.findall('.//MeshHeading'):
            descriptor = mesh_section.find('.//DescriptorName')
            mesh_terms.append(descriptor.text)
            mesh_UIds.append(descriptor.attrib['UI'])

            # Only the first qualifier of a heading is recorded; None keeps the
            # qualifier lists aligned with the descriptor lists
            qualifier = mesh_section.find('.//QualifierName')
            if qualifier is not None:
                qual_terms.append(qualifier.text)
                qual_UIds.append(qualifier.attrib['UI'])
            else:
                qual_terms.append(None)
                qual_UIds.append(None)

        records.append({
            'PMID' : doc_id,
            'paper' : document.find('.//ArticleTitle').text,
            'journal' : document.find('.//Title').text,
            'year' : document.find('.//Year').text,
            'abstract' : abstract,
            'mesh_terms' : mesh_terms,
            'mesh_UIds' : mesh_UIds,
            'qual_terms' : qual_terms,
            'qual_UIds' : qual_UIds,
            'webpage' : 'https://www.ncbi.nlm.nih.gov/pubmed/' + str(doc_id)
        })

    info = pd.DataFrame(records, columns=columns)

    info['PMID'] = info['PMID'].astype('int32')

    return info.reset_index(drop = True)

def greek_letter_converter(chem):
    """Replace a Greek letter followed by a space with its spelled-out name and a hyphen.

    Handles alpha, beta, gamma, rho and delta (e.g. 'α tocopherol' becomes
    'alpha-tocopherol'). A Greek letter that is not followed by a space is left
    as is.

    Parameters
    ----------
    chem : str
        Text to convert. Non-string values (for example the 0 that get_title
        returns for a failed lookup, or NaN) are returned unchanged.

    Returns
    -------
    The converted string, or the input itself when it is not a string.
    """
    if not isinstance(chem, str):
        return chem

    replacements = (
        ('α ', 'alpha-'),
        ('β ', 'beta-'),
        ('γ ', 'gamma-'),
        ('ρ ', 'rho-'),
        ('δ ', 'delta-'),
    )
    for greek, spelled_out in replacements:
        chem = chem.replace(greek, spelled_out)

    return chem


In [41]:
g_ct.title = g_ct.title.apply(greek_letter_converter)

# Query pubmed with titles to find PMID
start = time.time()
c=0
for idx, row in g_ct.iterrows():
    try:
        _, ID = search_pubmed(row['title'])
        g_ct.at[idx, 'pubmed_id'] = ID
        time.sleep(.5)
    except Exception:
        # Best-effort lookup: a failed search is recorded as 0 and the loop moves
        # on. `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt
        # working during this long-running loop.
        g_ct.at[idx, 'pubmed_id'] = 0

    if not c % 50:
        print(f'{c} at {(time.time()-start)/60} min')
    c+=1

#g_ct.to_pickle(mfp('misc_save/garlic_citation_PMIDs.pkl'))

In [46]:
c_ct.title = c_ct.title.apply(greek_letter_converter)

# Query pubmed with titles to find PMID
start = time.time()
c=0
for idx, row in c_ct.iterrows():
    try:
        _, ID = search_pubmed(row['title'])
        # Results for cocoa titles belong in c_ct (the frame being iterated)
        c_ct.at[idx, 'pubmed_id'] = ID
        time.sleep(.5)
    except Exception:
        # Best-effort lookup: a failed search is recorded as 0 and the loop moves
        # on. `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt
        # working during this long-running loop.
        c_ct.at[idx, 'pubmed_id'] = 0

    if not c % 50:
        print(f'{c} at {(time.time()-start)/60} min')
    c+=1

#c_ct.to_pickle(mfp('misc_save/cocoa_citation_PMIDs.pkl'))

In [47]:
# Reload the citation tables with their PubMed ids. These are the files the
# (commented-out) to_pickle lines at the end of the two PubMed lookup cells write.
# NOTE: read_pickle can execute arbitrary code; only load pickles you created yourself.
g_ct = pd.read_pickle(mfp('misc_save/garlic_citation_PMIDs.pkl'))
c_ct = pd.read_pickle(mfp('misc_save/cocoa_citation_PMIDs.pkl'))

Read in CTD

In [3]:
hdata = load_ctd()

# Keep rows that have both a pubchem id and PubMed ids, then count the rows for
# each (pubchem_id, PubMedIDs) pair; the count ends up in the ChemicalName column
de_health = (
    hdata.loc[
        hdata.pubchem_id.notnull() & hdata.PubMedIDs.notnull(),
        ['pubchem_id', 'PubMedIDs', 'ChemicalName'],
    ]
    .groupby(['pubchem_id', 'PubMedIDs'])
    .count()
    .reset_index()
)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Number of distinct values in the PubMedIDs column.
# NOTE(review): a later cell splits these values on '|', so one value may hold
# several PMIDs; this presumably counts distinct id strings rather than
# distinct papers -- confirm.
len(de_health.PubMedIDs.drop_duplicates())

65016

In [None]:
# Use papers for CTD compounds that appear in the food pilots:
# load each food table and keep only the rows whose pubchem_id is also in
# de_health (inner join)
g_fm = (
    pd.read_pickle(mfp('misc_save/garlic_food_data.pkl'))
    .merge(de_health, how='inner', on='pubchem_id')
)
c_fm = (
    pd.read_pickle(mfp('misc_save/cocoa_food_data.pkl'))
    .merge(de_health, how='inner', on='pubchem_id')
)

In [69]:
# Split the '|'-separated PubMedIDs strings and collect the distinct ids as integers
h_papers_g = pd.DataFrame({'PMID': list({
    int(pmid)
    for pmid_group in g_fm.PubMedIDs.dropna().str.split('|').tolist()
    for pmid in pmid_group
})})
h_papers_c = pd.DataFrame({'PMID': list({
    int(pmid)
    for pmid_group in c_fm.PubMedIDs.dropna().str.split('|').tolist()
    for pmid in pmid_group
})})

## Compare Paper Overlap

In [70]:
# Garlic papers that appear both in the CTD-derived list and in the food data.
# No `on=` is given, so the join uses the shared column, PMID.
output = h_papers_g.merge(
    g_food_data.PMID.drop_duplicates().reset_index(),
    how='inner',
)
output

Unnamed: 0,PMID,index


In [71]:
# Cocoa papers that appear both in the CTD-derived list and in the food data.
# No `on=` is given, so the join uses the shared column, PMID.
output = h_papers_c.merge(
    c_food_data.PMID.drop_duplicates().reset_index(),
    how='inner',
)
output

Unnamed: 0,PMID,index


## Compare citations

In [72]:
# PubMed ids found for the garlic citations, as integers
g_list = g_ct.loc[g_ct.pubmed_id.notnull(), 'pubmed_id'].apply(int).tolist()

g_cda = pd.DataFrame({'PMID': g_list})

# Cited papers that are also in the CTD-derived garlic paper list
g_overlap = h_papers_g.merge(g_cda, how='inner')
g_overlap

Unnamed: 0,PMID
0,23430952


In [73]:
# PubMed ids found for the cocoa citations, as integers
c_list = c_ct.loc[c_ct.pubmed_id.notnull(), 'pubmed_id'].apply(int).tolist()

c_cda = pd.DataFrame({'PMID': c_list})

# Cited papers that are also in the CTD-derived cocoa paper list
c_overlap = h_papers_c.merge(c_cda, how='inner')
c_overlap

Unnamed: 0,PMID
0,30336258
