In [57]:
import scholarly
import pandas
import time
import datetime

In [35]:
# Read the roster of GIDI-affiliated faculty. The file is tab-separated and is
# read without a header row, so the single column is labelled 'name' by hand.
faculty_list_path = './data/gidi_faculty_list.tsv'
faculty = pandas.read_csv(faculty_list_path, header=None, sep='\t')
faculty.columns = ['name']
faculty

Unnamed: 0,name
0,Herve Agaisse
1,Huiwang Ai
2,Janis Antonovics
3,Lawrence Band
4,Laura Barnes
...,...
128,Ozlem Yildiz
129,Steven Zeichner
130,Jianhui Zhou
131,Jarrett Zigon


In [86]:
# identify which faculty have google scholar profiles
MAX_ITERS = 3  # maximum number of search hits inspected per faculty member
def query_profiles(name, existing_query=None, iteration=0):
    '''
    Find the Google Scholar profile that belongs to a faculty member.

    Walks through the author-search hits for `name` and returns the first one
    whose email text contains 'virginia', after calling `fill()` on it.

    Parameters
    ----------
    name : str
        Name to search for; also used in the "no hits" message.
    existing_query : iterator, optional
        An already-started iterator of search hits to continue from. When
        omitted (or falsy) a new `scholarly.search_author(name)` query is made.
    iteration : int, optional
        Number of hits already consumed from `existing_query`. Hits are
        inspected until `MAX_ITERS` is reached, but at least one hit is always
        inspected regardless of this value.

    Returns
    -------
    The matching hit object, or None when no inspected hit matches or the
    search runs out of results. The "no hits" message is printed exactly once
    per call in that case.
    '''
    if existing_query:
        query = existing_query
    else:
        query = scholarly.search_author(name)

    # A loop replaces the earlier recursion so that the failure message is
    # emitted once, not once per nested call on the way back up.
    attempts_left = max(1, MAX_ITERS - iteration)
    profile = None
    for _ in range(attempts_left):
        try:
            query_hit = next(query)
        except StopIteration:
            # the search has no further results
            break
        # A hit may lack an email (attribute absent or None); treat that as
        # "not a match" instead of letting it abort the whole scan.
        email_affiliation = getattr(query_hit, 'email', None) or ''
        # if the email address is a virginia address, this is probably the right profile
        if 'virginia' in email_affiliation:
            query_hit.fill()
            profile = query_hit
            break

    if not profile:
        print('No google scholar profile hits for ' + name)
    return profile

# Query every faculty member. Names without a matching profile are collected
# in `no_profile`; for the rest, `current_year_pubs` maps the name to copies of
# the bib records of publications dated in the current calendar year.
scan_year = datetime.datetime.now().year  # read once so the whole scan filters on one year
no_profile = []
current_year_pubs = {}
for name in faculty['name']:
    print(name)
    profile = query_profiles(name)
    if not profile:
        no_profile.append(name)
        continue
    # Some publications don't include the year/date; .get() lets them fall
    # through the filter. The comparison is done on text so that a year
    # delivered as '2019' matches as well as 2019.
    # NOTE(review): confirm which type the installed scholarly version uses.
    current_year_pubs[name] = [
        pub.bib.copy()
        for pub in profile.publications
        if str(pub.bib.get('year', '')).strip() == str(scan_year)
    ]

Herve Agaisse
No google scholar profile hits for Herve Agaisse
Huiwang Ai
Janis Antonovics
No google scholar profile hits for Janis Antonovics
Lawrence Band
Laura Barnes
Christopher L. Barrett
Ruth Gaare Bernheim
No google scholar profile hits for Ruth Gaare Bernheim
Sam Bodily
Phillip Bourne
Thomas Braciale
No google scholar profile hits for Thomas Braciale
Catherine Bradshaw
Michael G. Brown
Timothy Bullock
No google scholar profile hits for Timothy Bullock
Anselmo Canfora
No google scholar profile hits for Anselmo Canfora
James Casanova
No google scholar profile hits for James Casanova
Anna Cliffe
No google scholar profile hits for Anna Cliffe
Linda Columbus
Alison Criss
Cristian Danna
No google scholar profile hits for Cristian Danna
Ashley Deeks
No google scholar profile hits for Ashley Deeks
Isabelle Derre
No google scholar profile hits for Isabelle Derre
Zygmunt Derwenda
Rebecca Dillingham
No google scholar profile hits for Rebecca Dillingham
Marcel Durieux
Joshua Eby
No google 

In [124]:
# Flatten the per-author publication lists into one table with a row per
# (author, title) pair, stamped with the date of this scan.
#
# Rows are gathered in a list, not a dict keyed by title: with a dict a paper
# shared by several faculty keeps only the last author seen, so co-authored
# papers could never show up as repeated titles.
scan_date = datetime.datetime.now()  # one timestamp so every row carries the same date
pub_records = [
    {'Author': author,
     'year_recorded': scan_date.year,
     'month_recorded': scan_date.month,
     'day_recorded': scan_date.day,
     'Title': pub['title']}
    for author, pubs in current_year_pubs.items()
    for pub in pubs
]
pub_to_frame = pandas.DataFrame(
    pub_records,
    columns=['Author', 'year_recorded', 'month_recorded', 'day_recorded', 'Title'],
)
# Make the titles uppercase to avoid missing duplicates due to case differences
pub_to_frame['Title'] = pub_to_frame['Title'].str.upper()
# An author listing the same paper twice should still count as one row
pub_to_frame = (
    pub_to_frame
    .drop_duplicates(subset=['Author', 'Title'])
    .reset_index(drop=True)
)
pub_to_frame

Unnamed: 0,Author,year_recorded,month_recorded,day_recorded,Title
0,Huiwang Ai,2019,11,13,DEVELOPMENT AND APPLICATIONS OF BIOLUMINESCENT...
1,Huiwang Ai,2019,11,13,RED-SHIFTED LUCIFERASE-LUCIFERIN PAIRS FOR ENH...
2,Huiwang Ai,2019,11,13,IDENTIFICATION OF FACTORS COMPLICATING BIOLUMI...
3,Huiwang Ai,2019,11,13,"A GENETICALLY ENCODED, RATIOMETRIC FLUORESCENT..."
4,Huiwang Ai,2019,11,13,ATP-INDEPENDENT BIOLUMINESCENT REPORTER VARIAN...
...,...,...,...,...,...
427,Mark Yeager,2019,11,13,EDITORIAL OVERVIEW: BIOPHYSICAL AND COMPUTATIO...
428,Jarrett Zigon,2019,11,13,TRAVELING WITH SUGAR: CHRONICLES OF A GLOBAL E...
429,Jarrett Zigon,2019,11,13,"SACRIFICIAL LIMBS: MASCULINITY, DISABILITY, AN..."
430,Jochen Zimmer,2019,11,13,A LIPID GATING MECHANISM FOR THE CHANNEL-FORMI...


In [128]:
# Find titles that occur on more than one row of the publication table.
# A repeated title may be a paper shared between faculty, or the same paper
# recorded under titles that differed only in capitalisation.
#
# duplicated(keep=False) flags every member of a repeated group in a single
# pass, avoiding a list.count() call per row (quadratic in the table size).
repeated_rows = pub_to_frame['Title'].duplicated(keep=False)
set(pub_to_frame.loc[repeated_rows, 'Title'])

{'HETEROCELLULAR CONTACT CAN DICTATE ARTERIAL FUNCTION',
 'QUANTIFYING CEACAM TARGETED LIPOSOME DELIVERY USING IMAGING FLOW CYTOMETRY'}

In [136]:
# Work out which publications have appeared since the previous scan.

# Table written by the previous scan
old_frame = pandas.read_csv('./results/publication_table.tsv', sep='\t')

# Titles found in this scan that the stored table does not contain yet
current_titles = set(pub_to_frame['Title'])
old_titles = set(old_frame['Title'])
new_titles = current_titles.difference(old_titles)
new_titles

set()

In [138]:
# Add the newly seen publications to the stored table and re-save it.
#
# The tables are stacked with concat, not merge: DataFrame.merge defaults to
# an inner join on every shared column (scan date included), so it keeps only
# rows that are identical in both tables and drops both the history and the
# new papers.
is_new_row = ~pub_to_frame['Title'].isin(old_frame['Title'])
new_publication_data = pandas.concat(
    [old_frame, pub_to_frame[is_new_row]],
    ignore_index=True,
    sort=False,  # keep the stored table's column order
)
new_publication_data.to_csv('./results/publication_table.tsv',sep='\t',index=False)

In [140]:
# Write the names of faculty for whom no Google Scholar profile was found to a
# text file, one name per line.
missing_profile_lines = ["{}\n".format(faculty_name) for faculty_name in no_profile]
with open('./results/faculty_missing_profiles.txt', 'w') as file_handler:
    file_handler.writelines(missing_profile_lines)