In [1]:
import scholarly
import pandas
import time
import datetime

In [2]:
# Read the roster of GIDI-affiliated faculty. The TSV has no header row, so the
# single column is named explicitly after loading.
FACULTY_LIST_PATH = './data/gidi_faculty_list.tsv'
faculty = pandas.read_csv(FACULTY_LIST_PATH, sep='\t', header=None)
faculty.columns = ['name']
faculty

Unnamed: 0,name
0,Herve Agaisse
1,Huiwang Ai
2,Janis Antonovics
3,Lawrence Band
4,Laura Barnes
...,...
128,Ozlem Yildiz
129,Steven Zeichner
130,Jianhui Zhou
131,Jarrett Zigon


In [3]:
# identify which faculty have google scholar profiles
MAX_ITERS = 3
def query_profiles(name, existing_query=None, iteration=0):
    '''
    Search google scholar for an author and return the first hit whose email
    field contains 'virginia'.

    Parameters
    ----------
    name : str
        Author name; passed to scholarly.search_author when no query is supplied
        and used in the "no hits" message.
    existing_query : iterator, optional
        An already-started iterator of search hits to keep consuming. When None,
        a new search is started for `name`.
    iteration : int, optional
        Number of hits already examined. At most MAX_ITERS - iteration further
        hits are examined (always at least one).

    Returns
    -------
    The matching hit, after its fill() method has been called, or None when no
    examined hit matches or the search runs out of results. A single message is
    printed when None is returned.
    '''
    if existing_query is not None:
        query = existing_query
    else:
        query = scholarly.search_author(name)

    # A loop instead of recursion: same number of hits examined, but the
    # "no hits" message below is emitted once rather than once per level.
    hits_to_check = max(MAX_ITERS - iteration, 1)
    profile = None
    for _ in range(hits_to_check):
        try:
            query_hit = next(query)
        except StopIteration:
            # the search has no more results
            break
        # A hit may lack an email attribute (or carry None); treat that as "no match"
        # instead of crashing the whole scan.
        email_affiliation = getattr(query_hit, 'email', None) or ''
        # if the email address is a virginia address, this is probably the right profile
        if email_affiliation.find('virginia') > -1:
            query_hit.fill()
            profile = query_hit
            break

    if profile is None:
        print('No google scholar profile hits for ' + name)
    return profile

# Query every faculty member. Names with no matching profile go to `no_profile`;
# for the rest, `current_year_pubs` maps name -> list of copies of the bib entry
# of each publication dated in the current year.
no_profile = []
current_year_pubs = {}
# Read the clock once so every publication is compared against the same year,
# even if this long-running loop crosses a year boundary.
current_year = datetime.datetime.now().year
for name in faculty['name']:
    print(name)
    profile = query_profiles(name)
    if not profile:
        no_profile.append(name)
        continue
    # get all publications within the current year
    current_year_pubs[name] = [
        pub.bib.copy()
        for pub in profile.publications
        # some publications don't include the year/date
        if 'year' in pub.bib and pub.bib['year'] == current_year
    ]

Herve Agaisse
No google scholar profile hits for Herve Agaisse
Huiwang Ai
Janis Antonovics
No google scholar profile hits for Janis Antonovics
Lawrence Band
Laura Barnes
Christopher L. Barrett
Ruth Gaare Bernheim
No google scholar profile hits for Ruth Gaare Bernheim
Sam Bodily
Phillip Bourne
Thomas Braciale
No google scholar profile hits for Thomas Braciale
Catherine Bradshaw
Michael G. Brown
Timothy Bullock
No google scholar profile hits for Timothy Bullock
Anselmo Canfora
No google scholar profile hits for Anselmo Canfora
James Casanova
No google scholar profile hits for James Casanova
Anna Cliffe
No google scholar profile hits for Anna Cliffe
Linda Columbus
Alison Criss
Cristian Danna
No google scholar profile hits for Cristian Danna
Ashley Deeks
No google scholar profile hits for Ashley Deeks
Isabelle Derre
No google scholar profile hits for Isabelle Derre
Zygmunt Derwenda
Rebecca Dillingham
No google scholar profile hits for Rebecca Dillingham
Marcel Durieux
Joshua Eby
No google 

In [4]:
# Flatten {author: [bib, ...]} into a table with one row per (author, publication)
# pair, indexed 'pub1', 'pub2', ... in iteration order.
scan_date = datetime.datetime.now()  # read once so the three date columns always agree
frame_columns = ['Author', 'Title', 'year_recorded', 'month_recorded', 'day_recorded']
pub_records = [
    {'Author': author,
     'Title': pub['title'],
     'year_recorded': scan_date.year,
     'month_recorded': scan_date.month,
     'day_recorded': scan_date.day}
    for author, pubs in current_year_pubs.items()
    for pub in pubs
]
pub_count = len(pub_records)
pub_ids = ['pub' + str(number) for number in range(1, pub_count + 1)]
# Passing `columns` explicitly keeps the expected columns even when no publications
# were found, so the 'Title' lookup below cannot fail on an empty scan.
pub_to_frame = pandas.DataFrame(pub_records, index=pub_ids, columns=frame_columns)
# Make the titles uppercase to avoid missing duplicates due to case differences
pub_to_frame['Title'] = pub_to_frame['Title'].str.upper()
pub_to_frame

Unnamed: 0,Author,Title,year_recorded,month_recorded,day_recorded
pub1,Huiwang Ai,DEVELOPMENT AND APPLICATIONS OF BIOLUMINESCENT...,2019,12,9
pub2,Huiwang Ai,RED-SHIFTED LUCIFERASE-LUCIFERIN PAIRS FOR ENH...,2019,12,9
pub3,Huiwang Ai,IDENTIFICATION OF FACTORS COMPLICATING BIOLUMI...,2019,12,9
pub4,Huiwang Ai,"A GENETICALLY ENCODED, RATIOMETRIC FLUORESCENT...",2019,12,9
pub5,Huiwang Ai,ATP-INDEPENDENT BIOLUMINESCENT REPORTER VARIAN...,2019,12,9
...,...,...,...,...,...
pub533,Jarrett Zigon,TRAVELING WITH SUGAR: CHRONICLES OF A GLOBAL E...,2019,12,9
pub534,Jarrett Zigon,"SACRIFICIAL LIMBS: MASCULINITY, DISABILITY, AN...",2019,12,9
pub535,Jochen Zimmer,A LIPID GATING MECHANISM FOR THE CHANNEL-FORMI...,2019,12,9
pub536,Jochen Zimmer,STRUCTURAL FEATURES UNDERLYING RECOGNITION AND...,2019,12,9


In [5]:
# Find publications that appear more than once in the list; these involve multiple GIDI faculty
all_titles = pub_to_frame['Title']
# Only a cell's last expression is displayed, so print the row/unique counts explicitly.
print('{} publication rows, {} unique titles'.format(len(all_titles), len(set(all_titles))))
# duplicated(keep=False) flags every occurrence of a repeated title in a single pass
set(all_titles[all_titles.duplicated(keep=False)])

{'3165 DISEASED AND HEALTHY GASTROINTESTINAL TISSUE DATA MINING REQUIRES AN ENGAGED TRANSDISCIPLINARY TEAM',
 'A MOLECULAR MODEL OF THE ALPHA GLOBIN/ENOS COMPLEX',
 'A NOVEL RETROVIRAL VECTOR SYSTEM TO ANALYZE EXPRESSION FROM MRNA WITH RETAINED INTRONS USING FLUORESCENT PROTEINS AND FLOW CYTOMETRY',
 'A ONE HEALTH APPROACH TO PREVENTION, TREATMENT, AND CONTROL OF CAMPYLOBACTERIOSIS',
 'AN ASSAY TO MEASURE THE ACTIVITY OF RNA ELEMENTS AND PROTEINS THAT PROMOTE THE EXPRESSION OF MRNA WITH RETAINED INTRONS',
 'ARCHITECTURE OF THE CELLULOSE SYNTHASE OUTER MEMBRANE CHANNEL AND ITS ASSOCIATION WITH THE PERIPLASMIC TPR DOMAIN',
 'ARTIFICIAL INTELLIGENCE APPLIED TO GASTROINTESTINAL DIAGNOSTICS: A REVIEW',
 'ASSESSMENT OF MACHINE LEARNING DETECTION OF ENVIRONMENTAL ENTEROPATHY AND CELIAC DISEASE IN CHILDREN',
 'CHARACTERIZATION OF APOBEC3 VARIATION IN A POPULATION OF HIV-1 INFECTED INDIVIDUALS IN NORTHERN SOUTH AFRICA',
 'COMPARATIVE ANALYSES OF PARASITES WITH A COMPREHENSIVE DATABASE OF GENOME

In [6]:
# determine which publications are new since the last scan

# Load the previous scan. On the very first run the results table has not been
# written yet, so fall back to an empty table with the same columns; every
# current title then counts as new.
try:
    old_frame = pandas.read_csv('./results/publication_table.tsv',sep='\t')
except FileNotFoundError:
    old_frame = pandas.DataFrame(columns=pub_to_frame.columns)

# which titles are new?
old_titles = set(old_frame['Title'])
current_titles = set(pub_to_frame['Title'])
new_titles = current_titles - old_titles
new_titles

{'A FRAMEWORK FOR DISCOVERING HEALTH DISPARITIES AMONG COHORTS IN AN INFLUENZA EPIDEMIC',
 'CELIACNET: CELIAC DISEASE SEVERITY DIAGNOSIS ON DUODENAL HISTOPATHOLOGICAL IMAGES USING DEEP RESIDUAL NETWORKS',
 'FORECASTING DENGUE AND INFLUENZA INCIDENCES USING A SPARSE REPRESENTATION OF GOOGLE TRENDS, ELECTRONIC HEALTH RECORDS, AND TIME SERIES DATA.',
 'PERSISTENT POST-DISCHARGE OPIOID PRESCRIBING AFTER TRAUMATIC BRAIN INJURY REQUIRING INTENSIVE CARE UNIT ADMISSION: A CROSS-SECTIONAL STUDY WITH LONGITUDINAL OUTCOME',
 'RANK AGGREGATION VIA HETEROGENEOUS THURSTONE PREFERENCE MODELS',
 'THE AAA+ ATPASE TORSINA POLYMERIZES INTO HOLLOW HELICAL TUBES WITH 8.5 SUBUNITS PER TURN'}

In [7]:
# Append the rows whose titles were not in the previous scan to the old table,
# then overwrite the saved table with the combined result.
is_new_title = pub_to_frame['Title'].isin(new_titles)
newly_found_rows = pub_to_frame.loc[is_new_title]
new_publication_data = pandas.concat([old_frame, newly_found_rows])
new_publication_data.to_csv('./results/publication_table.tsv', sep='\t', index=False)

In [8]:
# Write the names of faculty without a google scholar profile, one name per line.
# The bare `no_profile` is not displayed (it is not the cell's last statement);
# it is evaluated before the output file is opened.
no_profile
with open('./results/faculty_missing_profiles.txt', 'w') as out_file:
    out_file.writelines("{}\n".format(missing_name) for missing_name in no_profile)