In [1]:
!pip install scholarly
!pip install tqdm
!pip install pandas
!pip install pymed



## PART 1: Fetch publications and abstracts from Google Scholar

In [2]:
from scholarly import scholarly #, ProxyGenerator
import pymed
from pymed import PubMed
from tqdm import tqdm
import requests
from xml.etree import ElementTree

In [3]:
AUTHOR = 'Marc Kirschner'
last_name = 'kirschner' # lower case is better

# Look up the author on Google Scholar and fill in the profile of the first hit
# (the filled profile is what provides author['publications'] below).
try:
    search_query = scholarly.search_author(AUTHOR)
    author = scholarly.fill(next(search_query))

except Exception:
    # Catch Exception rather than using a bare `except:` so that
    # KeyboardInterrupt / SystemExit still stop the cell.
    # The direct request failed (presumably blocked by Google Scholar — confirm),
    # so retry the same lookup through scholarly's free-proxy generator.
    from scholarly import ProxyGenerator

    pg = ProxyGenerator()
    pg.FreeProxies()
    scholarly.use_proxy(pg)
    search_query = scholarly.search_author(AUTHOR)
    author = scholarly.fill(next(search_query))

In [4]:

# Fetch the full record of every publication (one request each, filled in place).
# Entries already fetched are skipped so that re-running this cell after a
# failure (e.g. MaxTriesExceededException) resumes instead of starting over.
# NOTE(review): relies on scholarly marking fetched entries with a truthy
# 'filled' key — confirm against the installed scholarly version.
for pub in tqdm(author['publications']):
    if pub.get('filled'):
        continue
    scholarly.fill(pub)

 28%|██▊       | 182/655 [09:05<23:36,  3.00s/it]  


MaxTriesExceededException: Cannot Fetch from Google Scholar.

In [None]:
#df = pd.DataFrame(columns = ['title','year','authors','abstract'])
import pandas as pd

def fetch(pub):
    """Flatten one publication into a record for the publications table.

    Reads the publication's 'bib' mapping and returns a dict with the keys
    'title', 'year', 'authors' and 'abstract' (in that order). A field that
    is missing from 'bib' is returned as an empty string.
    """
    bib = pub['bib']
    # (output column, key inside bib)
    column_sources = (
        ('title', 'title'),
        ('year', 'pub_year'),
        ('authors', 'author'),
        ('abstract', 'abstract'),
    )
    return {column: bib.get(source, '') for column, source in column_sources}

def fetch_keywords(pubmed_id, print_title=False):
    """Return the MeSH descriptor names of one PubMed record.

    Downloads the record as XML from the NCBI E-utilities `efetch` endpoint
    and collects the text of the `DescriptorName` child of every
    `MeshHeading` element.

    Parameters
    ----------
    pubmed_id : str or int
        PubMed identifier of the article.
    print_title : bool, default False
        If True, print the text of every `ArticleTitle` element found.

    Returns
    -------
    list of str
        Descriptor names in document order; empty if the record has none.

    Raises
    ------
    requests.RequestException
        On network failure, timeout, or a non-2xx HTTP status.
    xml.etree.ElementTree.ParseError
        If the response body is not well-formed XML.
    """
    response = requests.get(
        'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi',
        params={
            'db': 'pubmed',
            'id': pubmed_id,
            'rettype': 'abstract',
            'retmode': 'xml',
        },
        timeout=30,  # never hang the notebook on a stalled connection
    )
    # Fail with a clear HTTP error instead of trying to parse an error page.
    response.raise_for_status()
    tree = ElementTree.fromstring(response.content)

    if print_title:
        for el in tree.iter('ArticleTitle'):
            print(el.text)

    keywords = []
    for mesh in tree.iter('MeshHeading'):
        descriptor = mesh.find('DescriptorName')
        # Skip headings without a (non-empty) descriptor instead of crashing.
        if descriptor is not None and descriptor.text:
            keywords.append(descriptor.text)
    return keywords

# Load the stop-word list used by write_query. Words are separated by
# whitespace and may carry a trailing/leading comma. The whole file is read
# (not only its first line), so multi-line and empty files are handled too.
with open('stopwords.txt') as f:
    raw_tokens = f.read().split()
stopwords = [token.strip(',').strip() for token in raw_tokens]
# A token that consisted only of commas would leave an empty entry; drop those.
stopwords = [word for word in stopwords if word]

def write_query(title):
    """Build a PubMed search query from a publication title.

    The title is split on whitespace and lower-cased; tokens that are stop
    words or are not purely alphanumeric (e.g. contain punctuation) are
    dropped. Every remaining token becomes a `token[Title]` clause, and a
    final `<last_name> [Author]` clause is appended. All clauses are joined
    with " AND ".

    Uses the module-level `stopwords` collection and `last_name` string.

    If no title token survives, only the author clause is returned (rather
    than a query beginning with a dangling " AND ").
    """
    clauses = []
    for token in title.split():
        word = token.lower()
        if word in stopwords or not word.isalnum():
            continue
        clauses.append(f"{word}[Title]")
    clauses.append(f"{last_name} [Author]")
    return " AND ".join(clauses)


In [None]:
# One flat record (title, year, authors, abstract) per publication,
# assembled into the publications table.
publication_records = list(map(fetch, author['publications']))
df = pd.DataFrame(publication_records)

In [None]:
# Keep only publications that have a year, list the target author, and
# actually carry an abstract; then persist the table.
has_year = df.year != ''
# Plain substring match (regex=False); missing author strings count as no match.
by_author = df.authors.str.lower().str.contains(last_name, regex=False, na=False)
# fetch() stores a missing abstract as '' (not NaN), so a notnull() check
# alone never removes anything; test for non-empty text instead.
has_abstract = df.abstract.fillna('').str.strip() != ''
df = df[has_year & by_author & has_abstract]
df.to_csv(f'{last_name}_publications.csv')

In [None]:
# Reload the saved publications table; the first CSV column holds the
# row index that was written by to_csv.
publications_csv = f'{last_name}_publications.csv'
df = pd.read_csv(publications_csv, index_col=0)

In [None]:
# Index the table by title so that rows can be addressed by title.
# Guarded (and not in place) so that re-running this cell does not raise a
# KeyError once 'title' has already been moved into the index.
if 'title' in df.columns:
    df = df.set_index('title')

In [None]:

# NOTE(review): "MyTool" / "my@email.address" look like placeholders —
# replace with a real tool name and contact address before heavy use.
pubmed = PubMed(tool="MyTool", email="my@email.address")

# For every publication: search PubMed by title + author, take the first hit,
# and flag each of its MeSH descriptors with a 1 in a column of that name.
for title in tqdm(df.index):
    try:
        results = pubmed.query(write_query(title), max_results=1)
        article = next(results, None)
        if article is None:
            continue  # no PubMed match for this title
        # The id field can hold several newline-separated ids; keep the first.
        pubmed_id = article.toDict()['pubmed_id'].split('\n')[0]
        keywords = fetch_keywords(pubmed_id)
    except (requests.RequestException, ElementTree.ParseError) as exc:
        # One failed request or malformed response should not abort the run.
        tqdm.write(f"Skipping {title!r}: {exc}")
        continue

    # pubmed_id is written before the keyword columns so that the metadata
    # columns stay contiguous ahead of all keyword columns.
    df.loc[title, 'pubmed_id'] = pubmed_id
    if keywords:
        df.loc[title, keywords] = 1
    
    
    



In [None]:
# Keyword columns are every column that is not publication metadata.
# Selecting by name (instead of by position) keeps 'pubmed_id' out of the
# keyword set and does not lose a keyword column when the number or order of
# metadata columns changes.
meta_cols = ['title', 'year', 'authors', 'abstract', 'pubmed_id']
kw_cols = df.columns[~df.columns.isin(meta_cols)]

# Convert the 1 / missing markers into booleans. Comparing with 1 (rather
# than testing for non-null) keeps this cell idempotent: on a re-run the
# existing True/False values are preserved instead of all becoming True.
df[kw_cols] = df[kw_cols].eq(1)

df[kw_cols]

In [None]:
# Share of publications whose keyword columns sum to zero.
df[kw_cols].sum(axis=1).eq(0).mean()

In [None]:
# Keep only the publications whose keyword columns have a non-zero sum.
keyword_total_nonzero = df[kw_cols].sum(axis=1).ne(0)
df_full = df[keyword_total_nonzero]

In [None]:
# Row-wise sum over the keyword columns for each retained publication.
df_full[kw_cols].sum(axis=1)