Goal: make a map of the literature on a given topic in which each paper is a node and two nodes are connected when one paper cites the other. The most highly cited papers in the field will be prominent hubs, with satellite literature around them.

Data to scrape: 
- NIH PMID
- Title
- List of PMID of papers current paper cites
- Total number of citations of paper

Root of graph is the oldest or most highly cited seminal paper
Root connects to the 3 most highly cited papers that the root cites
The cycle continues

Website for getting PubMed links to the papers that the current paper cites, or to the papers that cite the current paper:
 https://www.ncbi.nlm.nih.gov/pmc/tools/cites-citedby/

#### Idea of how to do this

collect sample of pubmed IDs of a given topic (say 1000 papers)

find out which articles are cited by initial 1000 papers

create dataframe where the columns are: 'article name' 'article id' 'global citations' 'citations within sample'

arrange rows of data frame with the articles with the most citations in the sample at the top


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import time
from Bio import Entrez

In [None]:
# get PubMed data for up to 1000 articles (the retmax value used in search())

In [None]:
def search(query, retmax=1000):
    """Run a relevance-sorted PubMed search and return the parsed result.

    Parameters
    ----------
    query : str
        Search term, passed to Entrez.esearch as `term`.
    retmax : int, optional
        Maximum number of records to request in one call (default 1000).

    Returns
    -------
    The object produced by Entrez.read for the esearch response; the
    matching PMIDs are read from its 'IdList' entry.
    """
    Entrez.email = 'cepeders@ncsu.edu'
    handle = Entrez.esearch(db='pubmed',
                            sort='relevance',
                            retmax=retmax,
                            retmode='xml',
                            term=query)
    try:
        return Entrez.read(handle)
    finally:
        # release the network handle even if parsing raises
        handle.close()

def fetch_details(id_list):
    """Fetch PubMed records as XML and return the parsed result.

    Parameters
    ----------
    id_list
        Passed unchanged to Entrez.efetch as `id`. The loops in this
        notebook pass one PMID at a time.

    Returns
    -------
    The object produced by Entrez.read for the efetch response; the article
    records are read from its 'PubmedArticle' entry.
    """
    Entrez.email = 'cepeders@ncsu.edu'
    handle = Entrez.efetch(db='pubmed',
                           retmode='xml',
                           id=id_list)
    try:
        return Entrez.read(handle)
    finally:
        # release the network handle even if parsing raises
        handle.close()

In [None]:
# query PubMed for the topic and pull out the matching PMIDs

results = search('nucleus accumbens')
id_list = results['IdList']

# report the total PMID count, then the de-duplicated count
for pmid_count in (len(id_list), len(set(id_list))):
    print(pmid_count)

In [None]:
import requests
from bs4 import BeautifulSoup

In [None]:
# Build cite_dict: one key per PMID returned by the search; the value is the
# list of PMIDs (as strings) that the key paper cites, as reported by the
# NCBI E-utilities ELink endpoint.

ELINK_URL = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi'

cite_dict = {}

for ID in id_list:

    # int() rejects anything that is not a plain integer PMID before it is
    # placed in the query string
    citation_id = int(ID)

    # linkname 'pubmed_pubmed_refs'    -> articles cited by the current article
    # linkname 'pubmed_pubmed_citedin' -> articles that cite the current article
    response = requests.get(ELINK_URL,
                            params={'dbfrom': 'pubmed',
                                    'linkname': 'pubmed_pubmed_refs',
                                    'id': citation_id})

    time.sleep(1)  # throttle: wait 1 s between consecutive requests

    if not response.ok:
        # a failed request would otherwise be recorded as "cites nothing",
        # which cannot be told apart from a genuinely empty reference list
        print('elink request failed for PMID {0}: HTTP {1}'.format(
            ID, response.status_code))

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed. An HTML parser lower-cases
    # tag names, which is why the elements are matched as 'id'.
    soup = BeautifulSoup(response.text, 'html.parser')

    # first id is the queried paper itself, so it is excluded
    refs = [tag.get_text() for tag in soup.find_all('id')][1:]

    cite_dict[ID] = refs

In [None]:
# notebook check: display the type of cite_dict
type(cite_dict)

In [None]:
# save the cite_dict for later hierarchy construction
import json

#with open('NAc_CiteDict.json', 'w') as fp:
#    json.dump(cite_dict, fp)
    
#with open('NAc_CiteDict.json', 'r') as fp:
#    cite_dict = json.load(fp)

In [None]:
# number of PMIDs that have an entry in cite_dict
len(cite_dict)

In [None]:
# count how many times each article is cited by the papers in cite_dict

from collections import Counter

# Flatten every reference list and tally the PMIDs. Converting back to a
# plain dict keeps a missing key a KeyError, unlike Counter, which returns 0.
cite_cnt = dict(Counter(cited_pmid
                        for references in cite_dict.values()
                        for cited_pmid in references))

In [None]:
# keep only the PMIDs whose within-sample citation count exceeds 3

cited_papers = dict(pair for pair in cite_cnt.items() if pair[1] > 3)

# abc: the same entries re-ordered by ascending citation count
abc = {pmid: cited_papers[pmid]
       for pmid in sorted(cited_papers, key=cited_papers.get)}

In [None]:
# display the filtered citation counts, ordered from least to most cited
abc

In [None]:
# note: cite_cnt contains PMIDs for papers not in the 1000 paper call
# first number: distinct PMIDs cited at least once by the sampled papers
print(len(cite_cnt))
# second number: how many of those passed the citation-count filter
print(len(cited_papers))

In [None]:
# Build paper_df: one row per frequently cited paper holding its PMID, first
# author's last name, publication year, article title and journal name.

id_list2 = list(cited_papers.keys())

rows = []

for ID in id_list2:

    paper = fetch_details(ID)

    time.sleep(0.7)  # throttle after every request, including empty results

    if not paper['PubmedArticle']:
        # no PubmedArticle record came back for this PMID; keep a placeholder
        # row so the frame still has one row per entry of id_list2
        rows.append(['nan', 'nan', 'nan', 'book', 'nan'])
        continue

    citation = paper['PubmedArticle'][0]['MedlineCitation']
    article = citation['Article']

    # The publication year is stored under Journal/JournalIssue/PubDate.
    # DateCompleted is a record-processing date, so it is used only as a
    # fallback when no publication year is present.
    pub_date = article['Journal'].get('JournalIssue', {}).get('PubDate', {})
    yr = pub_date.get('Year')
    if yr is None and 'DateCompleted' in citation:
        yr = citation['DateCompleted']['Year']
    if yr is None:
        yr = 'None'

    ti = article.get('ArticleTitle', 'None')

    # guard against both a missing and an empty author list
    authors = article.get('AuthorList')
    au = authors[0].get('LastName', 'None') if authors else 'None'

    jo = article['Journal'].get('Title', 'None')

    rows.append([ID, au, yr, ti, jo])

# build the frame once instead of growing it one row at a time
paper_df = pd.DataFrame(rows, columns=['pmid', 'author', 'year', 'title', 'journal'])
    

In [None]:
# list the fields available on the Article element of `paper`
# (whichever record was fetched last)
paper['PubmedArticle'][0]['MedlineCitation']['Article'].keys()

In [None]:
# show the frame's dimensions, then preview its first 15 rows
print(paper_df.shape)
paper_df.head(15)

In [None]:
# list the distinct values in the year column; rows without a usable year
# carry the string placeholders 'None' or 'nan'
paper_df['year'].unique()

In [None]:
# add the within-sample citation count (from cite_cnt) for each pmid in the data frame

# Assign the whole column in one step. Element-wise chained assignment
# (paper_df[col][k] = value) writes through an intermediate Series, which
# pandas warns about and which leaves paper_df unchanged under copy-on-write.
paper_df['citedByNof1000'] = pd.Series(
    [cite_cnt.get(pmid, 0) for pmid in paper_df['pmid']],  # 0 when the pmid was never counted
    index=paper_df.index,
    dtype=float,  # counts are stored in a float column
)
                

In [None]:
# order the papers from most to least cited within the sample
paper1_df = paper_df.sort_values(by='citedByNof1000', ascending=False)

In [None]:
# dimensions of the sorted frame, then its 30 most-cited rows
print(paper1_df.shape)
paper1_df.head(30)

In [None]:
#paper1_df.to_csv('pubmed_NAc_top1000_Jan6.csv', sep=',')

In [None]:
# PMIDs of the frequently cited papers, i.e. the keys of cited_papers
id_list3 = list(cited_papers)

print(len(id_list3))

In [None]:
import Bio.Entrez as Entrez
#from Bio.Entrez import efetch

def get_abstract(pmid):
    """Return the abstract text for a PubMed ID as a single-line string.

    The plain-text abstract record is fetched with Entrez.efetch and split
    into sections on blank lines. The section immediately before the last
    one containing "Copyright" is taken as the abstract. When no section
    contains "Copyright", or only the very first one does, the longest
    section is returned instead.

    Parameters
    ----------
    pmid
        PubMed ID, passed to Entrez.efetch as `id`.

    Returns
    -------
    str
        The chosen section with its newlines replaced by spaces.
    """
    Entrez.email = 'cepeders@ncsu.edu'
    handle = Entrez.efetch(db='pubmed', id=pmid, retmode='text', rettype='abstract')
    try:
        txt = handle.read().strip().split('\n\n')
    finally:
        # release the network handle even if reading raises
        handle.close()

    # index of the longest section; max() keeps the first one on ties
    longest = max(range(len(txt)), key=lambda n: len(txt[n]))

    # index of the last section mentioning "Copyright". None (rather than
    # False, which compares equal to index 0) marks "not found".
    copyright_idx = None
    for n, sect in enumerate(txt):
        if "Copyright" in sect:
            copyright_idx = n

    if copyright_idx is not None and copyright_idx > 0:
        # the abstract is taken to be the section just before the copyright notice
        chosen = txt[copyright_idx - 1]
    else:
        # no usable copyright marker: fall back to the longest section
        chosen = txt[longest]

    return chosen.replace('\n', ' ')

In [None]:
# Build abz_dict: PMID -> abstract text for every frequently cited paper.
abz_dict = {}

for pmid in id_list3:

    abstract_text = get_abstract(pmid)
    abz_dict[pmid] = abstract_text

    time.sleep(0.8)

    # progress report after every 100 stored abstracts
    n_fetched = len(abz_dict)
    if n_fetched % 100 == 0:
        print(n_fetched)
    

In [None]:
# save the abz_dict for later NLP analysis
import json

#with open('NAcTopCited_AbzDict.json', 'w') as fp:
#    json.dump(abz_dict, fp)
    
#with open('NAc_AbzDict.json', 'r') as fp:
#    data = json.load(fp)

In [None]:
# look up, for every abstract in abz_dict, how often its paper was cited
# (cite_cnt); papers without an entry in cite_cnt get 0
abz_cited = {pmid: cite_cnt.get(pmid, 0) for pmid in abz_dict}


In [None]:
# display the PMID -> within-sample citation count mapping
abz_cited

In [None]:
# save the abz_cited for later NLP analysis
import json

#with open('NAcTopCited_AbzCited.json', 'w') as fp:
#    json.dump(abz_cited, fp)
    
#with open('NAc_AbzDict.json', 'r') as fp:
#    data = json.load(fp)

In [None]:
# Build keyword_dict: one key per PMID from the search; the value is the list
# of keywords found in the first KeywordList of that PMID's record.

keyword_dict = {}

for ID in id_list:

    keyword_dict[ID] = []

    paper = fetch_details(ID)

    time.sleep(0.5)

    if not paper['PubmedArticle']:
        # no PubmedArticle record came back for this PMID; indexing [0] would
        # raise IndexError, so the keyword list is left empty
        continue

    keyword_lists = paper['PubmedArticle'][0]['MedlineCitation'].get('KeywordList')

    if keyword_lists:
        # only the first keyword list is used; [:] takes a full-slice copy
        # of each keyword element
        keyword_dict[ID] = [word[:] for word in keyword_lists[0]]

In [None]:
# number of PMIDs that have an entry in keyword_dict
len(keyword_dict)

In [None]:
# count the frequency of each keyword, ignoring case

from collections import Counter

# Flatten every keyword list, lower-case each keyword and tally. Converting
# back to a plain dict keeps a missing key a KeyError, unlike Counter.
value_cnt = dict(Counter(keyword.lower()
                         for keywords in keyword_dict.values()
                         for keyword in keywords))
    

In [None]:
# keep only the keywords that occur more than once

repeat_words = dict(pair for pair in value_cnt.items() if pair[1] > 1)

# abc: the same entries re-ordered by ascending frequency
abc = {word: repeat_words[word]
       for word in sorted(repeat_words, key=repeat_words.get)}

In [None]:
# print the repeated keywords; abc is sorted by ascending frequency, so the
# most common keywords appear last
print(abc)