In [1]:
from datetime import datetime
from datetime import timedelta
import time
from Bio import Entrez
from Bio import Medline
import pandas as pd
from pandas import read_csv
import os
import re
import pickle
import lzma

Entrez.email = os.environ['USEREMAIL']



In [2]:
###############################################################################
## This module takes a list of entrez gene ids, looks up PMIDs associated with
## each gene, obtains the authors of each PMID along with their affiliations
## The returned dataframe can be processed for top authors and more
## Note that Entrez caps requests at 3/second without API key
## With API key, requests are capped at 10/second
## To be on the safe side, this module voluntarily throttles (sleeps) to
## 2 requests/second (sleep is half a second)
###############################################################################
def retrieve_detailed_pubs_by_gene(genelist):
    timestart = datetime.now().time()
    genefailures = []
    pmid_failures = []
    pmid_author_fail = []
    author_df = pd.DataFrame(columns = ["AU", "FullName","AuthorDetails","pmid","publish_date"])
    PublicationDetails = []
    print(timestart, 'obtaining publication details and authors for each gene.')
    for geneid in genelist:
        print ('fetching pmids for: '+str(geneid))
        try: 
            record = Entrez.read(Entrez.elink(dbfrom="gene", id=geneid))
            PMIDList=[] ##creates an empty list to store PMIDs   
            for link in record[0]["LinkSetDb"][0]["Link"] : ##retrieves each PMID stored in the record associated with the gene ID
                PMIDList.append(link["Id"]) ##stores PMID's linked to the gene into the list
            if len(PMIDList) > 30:
                for PMID in PMIDList: #iterates through the PMID list
                    try:
                        handle = Entrez.efetch(db="pubmed", id=int(PMID), rettype="medline", retmode="text")
                        records = Medline.parse(handle) ##parses pubmed entry for that ID and records the author
                        for record in records:
                            try:
                                PublicationDate = record.get("DP","?") #writes the publication date 
                                pubdate_type = "DP"
                            except:
                                PublicationDate = record.get("EDAT","?") #writes the initial Entrez record submission date 
                                pubdate_type = "EDAT"
                            PublicationDetails.append({'geneid':str(geneid),'pmid':PMID,'PublicationDate':PublicationDate,"PubDateType":pubdate_type})
                            try:
                                AuthorSet = record.get("AU","?") #writes the record to a list called AuthorSet                     
                                FullAuthorSet = record.get("FAU","?") #writes the record to a list called AuthorSet
                                AuthorDetails = record.get("AD","?") #writes the record to a list called AuthorDetails
                                tmp_df = pd.DataFrame({'AU':pd.Series(AuthorSet), "FullName":pd.Series(FullAuthorSet),'AuthorDetails': pd.Series(AuthorDetails)})
                                tmp_df['pmid']=PMID
                                tmp_df['publish_date']=PublicationDate
                                author_df = pd.concat((author_df,tmp_df),ignore_index=True)
                            except:
                                pmid_author_fail.append(PMID)
                        time.sleep(0.5)
                    except:
                        #print('bad pmid: ', PMID)
                        pmid_failures.append({'geneid':geneid,'pmid':PMID})
            else:
                genefailures.append(geneid)
        except:
            genefailures.append(geneid)
        
    PublicationDetailsDF = pd.DataFrame(PublicationDetails)
    PMIDfailsDF = pd.DataFrame(pmid_failures)
    timeend = datetime.now().time()
    print(timeend)
    return(PublicationDetailsDF, author_df, genefailures, pmid_author_fail, PMIDfailsDF)


In [3]:
################################################################################
## This uses a list of PMIDs to pull author information
################################################################################

#PMIDList = [23039619,29390967,31363486,30951672] ##Unit test

def retrieve_authors_by_pmids(PMIDList):
    print(datetime.datetime.now().time())
    author_df = pd.DataFrame(columns = ["AU", "FullName","AuthorDetails","pmid","publish_date"])
    PublicationDetails = []
    PMIDFails = []
    for PMID in PMIDList: #iterates through the PMID list
        try:
            #print('fetching authors for: '+str(PMID))
            handle = Entrez.efetch(db="pubmed", id=PMID, rettype="medline", retmode="text")
            records = Medline.parse(handle) ##parses pubmed entry for that ID and records the author
            for record in records:
                AuthorSet = record.get("AU","?") #writes the record to a list called AuthorSet
                FullAuthorSet = record.get("FAU","?") #writes the record to a list called AuthorSet
                try:
                    AuthorDetails = record.get("AD","?") #writes the record to a list called AuthorDetails
                except:
                    AuthorDetails = ['No details']
                try:
                    PublicationDate = record.get("DP","?") #writes the initial Entrez record submission date
                    PublicationDetails.append({'pmid':PMID, 'PubDateType':'PD','PubDate':PublicationDate})
                except:
                    PublicationDate = record.get("EDAT","?")
                    PublicationDetails.append({'pmid':PMID, 'PubDateType':'EDAT','PubDate':PublicationDate})
                #print(len(AuthorSet),len(AuthorDetails),AuthorSet[0],AuthorDetails[0])
                tmp_df = pd.DataFrame({'AU':pd.Series(AuthorSet), "FullName":pd.Series(FullAuthorSet),'AuthorDetails': pd.Series(AuthorDetails)})
                tmp_df['pmid']=PMID
                tmp_df['PubDate']=PublicationDate
                author_df = pd.concat((author_df,tmp_df))
        except:
            PMIDFails.append(PMID)
            print("pmid not found: ",PMID)

    PublicationDF = pd.DataFrame(PublicationDetails)
    print(datetime.datetime.now().time())
    return(PublicationDF,author_df,PMIDFails)



In [4]:
################################################################################
## This module parses email addresses from the AD/Affiliation field if available
## input is the author dataframe resulting from this module:
## retrieve_detailed_pubs_by_gene
## Note that Articles prior to: October 1, 2013, will have an affiliation ONLY 
## for the first author. Details on that here: 
## https://www.nlm.nih.gov/pubs/techbull/so13/brief/so13_author_affiliations.html
################################################################################

def parse_out_emails(author_df):
    author_df.reset_index(inplace=True)
    try:
        author_df.drop(['level_0'],axis=1,inplace=True)
    except Exception: 
        pass
    try:
        author_df.drop(['index'],axis=1,inplace=True)
    except Exception: 
        pass
    author_df['email'] = author_df['AuthorDetails'].str.extract(r'([^@|\s]+@[^@]+\.[^@|\s]+)')
    author_emails = author_df.loc[author_df['email'].notnull()]
    author_emails_dots = author_emails.loc[author_emails['email'].str[-1]=="."]
    author_emails_dots['email'] = author_emails_dots['email'].str[:-1]
    author_emails.update(author_emails_dots)
    author_df.update(author_emails)
    return(author_df)


In [5]:
###############################################################################
## This module takes the resulting dataframes from the previous module
## merges them and then gets a count of the number of publications an author
## has for each gene. Note that the optional parameter, method allows for the
## counts to be based on the author "AU" or full name of the author "FullName"
## The default is the FullName
###############################################################################
def get_top_authors_from_dfs(PublicationDetailsDF, author_df, method="FullName"):
    all_merged_df = author_df_deets.merge(PublicationDetailsDF, on='pmid',how='left')
    all_merged_df.drop_duplicates(keep='first',inplace=True)
    top_authors_per_gene = all_merged_df.groupby(['geneid',method,'FullName']).size().reset_index(name='pubcounts')
    top_authors_per_gene.sort_values(by=['geneid','pubcounts'],ascending=[True,False],inplace=True)
    return(top_authors_per_gene)

In [6]:
## Pull up all pmids per genes, get author details, pull out available email addresses
## Perform groupby counts to get top contributing authors per gene
        
def get_authors(genelist,datapath,test=False):
    if test==True:
        genelist = [439921,55768] ## for unit test     
    PublicationDetailsDF, author_df, genefailures, pmid_author_fail, PMIDfailsDF = retrieve_detailed_pubs_by_gene(genelist)
    author_df_deets = parse_out_emails(author_df)
    PublicationDetailsDF.to_csv(os.path.join(datapath,'PublicationDetailsDF.tsv'),sep='\t',header=True)
    #author_df.to_csv(os.path.join(datapath,'author_df.tsv'),sep='\t',header=True)
    author_pickle = pickle.dumps(author_df)
    with lzma.open(os.path.join(datapath,"author_df.xz"), "w") as f:
        f.write(author_pickle)
    #author_df_deets.to_csv(os.path.join(datapath,'author_df_deets.tsv'),sep='\t',header=True)
    PMIDfailsDF.to_csv(os.path.join(datapath,'PMIDfailsDF.tsv'),sep='\t',header=True)
    with open(datapath+'genefailures.txt','w') as outwrite:
        for eachgene in genefailures:
            outwrite.write(str(eachgene)+'\n')
        outwrite.close()

def deal_with_failures(datapath,hasfailures = True):
    oldPublicationDetailsDF = read_csv(os.path.join(datapath,'PublicationDetailsDF.tsv'),delimiter='\t',header=0,index_col=0)
    oldauthor_df = read_csv(os.path.join(datapath,'author_df.tsv'),delimiter='\t',header=0,index_col=0)
    oldauthor_df_deets = read_csv(os.path.join(datapath,'author_df_deets.tsv'),delimiter='\t',header=0,index_col=0)
    oldPMIDfailsDF = read_csv(os.path.join(datapath,'PMIDfailsDF.tsv'),delimiter='\t',header=0,index_col=0)
    i=0
    while hasfailures == True:
        with open(datapath+'genefailures.txt','r') as infile:
            tmpgenelist = []
            for line in infile:
                tmpgenelist.append(line.strip())
            infile.close()
        tmpPublicationDetailsDF, tmpauthor_df, tmpgenefailures, tmppmid_author_fail, tmpPMIDfailsDF = retrieve_detailed_pubs_by_gene(tmpgenelist)
        tmpauthor_df_deets = parse_out_emails(author_df)
        PublicationDetailsDF = pd.concat((oldPublicationDetailsDF,tmpPublicationDetailsDF),ignore_index=True)
        PublicationDetailsDF.to_csv(os.path.join(datapath,'PublicationDetailsDF.tsv'),sep='\t',header=True)
        author_df = pd.concat((oldauthor_df,tmpauthor_df),ignore_index=True)
        author_df.to_csv(os.path.join(datapath,'author_df.tsv'),sep='\t',header=True)
        author_df_deets = pd.concat((oldauthor_df_deets,tmpauthor_df_deets),ignore_index=True)
        author_df_deets.to_csv(os.path.join(datapath,'author_df_deets.tsv'),sep='\t',header=True)
        PMIDfailsDF = pd.concat((oldPMIDfailsDF,tmpPMIDfailsDF),ignore_index=True)
        PMIDfailsDF.to_csv(os.path.join(datapath,'PMIDfailsDF.tsv'),sep='\t',header=True)
        with open(datapath+'genefailures.txt','w') as outwrite:
            for eachgene in tmpgenefailures:
                outwrite.write(str(eachgene)+'\n')
            outwrite.close()
        if len(tmpgenefailures) < 2:
            hasfailures = False
        i=i+1
    return(i)

# Main Script

### Obtaining Publication and author details

In [7]:
## Pull top authors for high priority genes

datapath = 'data/'
resultpath = 'results/'
genefile = 'priority_by_size.tsv'

prioritylist = read_csv(os.path.join(resultpath,genefile),delimiter='\t',header=0,index_col=0)
genelist = prioritylist['geneID'].unique().tolist()


In [8]:
%%time
get_authors(genelist,datapath,test=False)

08:19:06.190562 obtaining publication details and authors for each gene.
fetching pmids for: 83986
fetching pmids for: 10578
fetching pmids for: 120425
fetching pmids for: 10858
fetching pmids for: 3001
fetching pmids for: 820
fetching pmids for: 57817
fetching pmids for: 4357
fetching pmids for: 10381
fetching pmids for: 51738
fetching pmids for: 4521
fetching pmids for: 54884
fetching pmids for: 1806
fetching pmids for: 100130274
fetching pmids for: 441478
fetching pmids for: 84901
fetching pmids for: 55163
fetching pmids for: 9477
fetching pmids for: 84851
fetching pmids for: 7551
fetching pmids for: 192666
fetching pmids for: 390792
fetching pmids for: 140807
fetching pmids for: 54221
fetching pmids for: 80235
fetching pmids for: 3886
fetching pmids for: 8687
fetching pmids for: 342574
fetching pmids for: 147183
fetching pmids for: 162605
fetching pmids for: 5152
fetching pmids for: 51409
fetching pmids for: 23251
fetching pmids for: 85449
fetching pmids for: 729447
fetching pmids 

fetching pmids for: 79750
fetching pmids for: 54955
fetching pmids for: 80816
fetching pmids for: 84502
fetching pmids for: 140886
fetching pmids for: 57118
fetching pmids for: 246184
fetching pmids for: 79697
fetching pmids for: 114971
fetching pmids for: 80149
fetching pmids for: 125704
fetching pmids for: 389840
fetching pmids for: 6039
fetching pmids for: 1439
fetching pmids for: 56649
fetching pmids for: 5452
fetching pmids for: 55247
fetching pmids for: 6433
fetching pmids for: 344838
fetching pmids for: 344167
fetching pmids for: 9751
fetching pmids for: 8748
fetching pmids for: 8034
fetching pmids for: 1212
fetching pmids for: 9214
fetching pmids for: 112937
fetching pmids for: 84920
fetching pmids for: 27115
fetching pmids for: 51166
fetching pmids for: 7107
fetching pmids for: 114799
fetching pmids for: 122809
fetching pmids for: 387032
fetching pmids for: 140691
fetching pmids for: 493869
fetching pmids for: 23015
fetching pmids for: 10079
fetching pmids for: 319101
fetching

fetching pmids for: 441639
fetching pmids for: 391190
fetching pmids for: 84969
fetching pmids for: 283849
fetching pmids for: 219429
fetching pmids for: 219960
fetching pmids for: 390142
fetching pmids for: 120796
fetching pmids for: 390093
fetching pmids for: 390433
fetching pmids for: 390083
fetching pmids for: 390038
fetching pmids for: 119679
fetching pmids for: 390075
fetching pmids for: 390072
fetching pmids for: 119695
fetching pmids for: 390037
fetching pmids for: 219965
fetching pmids for: 219870
fetching pmids for: 390036
fetching pmids for: 219869
fetching pmids for: 26683
fetching pmids for: 341276
fetching pmids for: 391211
fetching pmids for: 116931
fetching pmids for: 100381270
fetching pmids for: 10344
fetching pmids for: 64577
fetching pmids for: 8837
fetching pmids for: 103910
fetching pmids for: 441608
fetching pmids for: 401665
fetching pmids for: 56655
fetching pmids for: 9832
fetching pmids for: 57721
fetching pmids for: 57654
fetching pmids for: 125963
fetching 

fetching pmids for: 971
fetching pmids for: 55122
fetching pmids for: 641654
fetching pmids for: 56941
fetching pmids for: 1551
fetching pmids for: 54576
fetching pmids for: 124538
fetching pmids for: 390197
fetching pmids for: 57053
fetching pmids for: 146183
fetching pmids for: 23416
fetching pmids for: 387509
fetching pmids for: 119682
fetching pmids for: 56996
fetching pmids for: 390261
fetching pmids for: 10936
fetching pmids for: 54097
fetching pmids for: 64859
fetching pmids for: 127068
fetching pmids for: 399664
fetching pmids for: 347732
fetching pmids for: 10412
fetching pmids for: 138803
fetching pmids for: 138799
fetching pmids for: 255743
fetching pmids for: 79190
fetching pmids for: 442038
fetching pmids for: 10486
fetching pmids for: 388531
fetching pmids for: 286464
fetching pmids for: 83550
fetching pmids for: 64756
fetching pmids for: 26219
fetching pmids for: 387601
fetching pmids for: 343169
fetching pmids for: 255928
fetching pmids for: 166379
fetching pmids for: 2

fetching pmids for: 3742
fetching pmids for: 4849
fetching pmids for: 10795
fetching pmids for: 11018
fetching pmids for: 10641
fetching pmids for: 92597
fetching pmids for: 60676
fetching pmids for: 7484
fetching pmids for: 83903
fetching pmids for: 200959
fetching pmids for: 80778
fetching pmids for: 51032
fetching pmids for: 3443
fetching pmids for: 83692
fetching pmids for: 7554
fetching pmids for: 55520
fetching pmids for: 138804
fetching pmids for: 57560
fetching pmids for: 340485
fetching pmids for: 64062
fetching pmids for: 147841
fetching pmids for: 51117
fetching pmids for: 11248
fetching pmids for: 11264
fetching pmids for: 92906
fetching pmids for: 63970
fetching pmids for: 25890
fetching pmids for: 23387
fetching pmids for: 9899
fetching pmids for: 3743
fetching pmids for: 26476
fetching pmids for: 57176
fetching pmids for: 54825
fetching pmids for: 207063
fetching pmids for: 113263
fetching pmids for: 161357
fetching pmids for: 54780
fetching pmids for: 139135
fetching pm

fetching pmids for: 51808
fetching pmids for: 9736
fetching pmids for: 57121
fetching pmids for: 22873
fetching pmids for: 4958
fetching pmids for: 51078
fetching pmids for: 22874
fetching pmids for: 84076
fetching pmids for: 7108
fetching pmids for: 25821
fetching pmids for: 9694
fetching pmids for: 7582
fetching pmids for: 114885
fetching pmids for: 10440
fetching pmids for: 54510
fetching pmids for: 22862
fetching pmids for: 92521
fetching pmids for: 55798
fetching pmids for: 29114
fetching pmids for: 26716
fetching pmids for: 57512
fetching pmids for: 54461
fetching pmids for: 8872
fetching pmids for: 6847
fetching pmids for: 11222
fetching pmids for: 10613
fetching pmids for: 8689
fetching pmids for: 51091
fetching pmids for: 30815
fetching pmids for: 10990
fetching pmids for: 93643
fetching pmids for: 9465
fetching pmids for: 55139
fetching pmids for: 144455
fetching pmids for: 114769
fetching pmids for: 253980
fetching pmids for: 51507
fetching pmids for: 57465
fetching pmids fo

fetching pmids for: 166378
fetching pmids for: 1952
fetching pmids for: 84203
fetching pmids for: 10735
fetching pmids for: 134526
fetching pmids for: 10687
fetching pmids for: 64094
fetching pmids for: 23559
fetching pmids for: 8354
fetching pmids for: 63894
fetching pmids for: 112476
fetching pmids for: 8882
fetching pmids for: 10811
fetching pmids for: 28965
fetching pmids for: 7589
fetching pmids for: 25759
fetching pmids for: 80325
fetching pmids for: 100132074
fetching pmids for: 91663
fetching pmids for: 23567
fetching pmids for: 51309
fetching pmids for: 92737
fetching pmids for: 54795
fetching pmids for: 7180
fetching pmids for: 79668
fetching pmids for: 58190
fetching pmids for: 29993
fetching pmids for: 92359
fetching pmids for: 9130
fetching pmids for: 84063
fetching pmids for: 64800
fetching pmids for: 93979
fetching pmids for: 22998
fetching pmids for: 22880
fetching pmids for: 55589
fetching pmids for: 9175
fetching pmids for: 79140
fetching pmids for: 25957
fetching pmi

fetching pmids for: 25809
fetching pmids for: 3884
fetching pmids for: 57472
fetching pmids for: 112574
fetching pmids for: 9521
fetching pmids for: 8351
fetching pmids for: 8749
fetching pmids for: 55506
fetching pmids for: 7553
fetching pmids for: 51534
fetching pmids for: 64221
fetching pmids for: 27123
fetching pmids for: 126147
fetching pmids for: 113451
fetching pmids for: 3769
fetching pmids for: 117246
fetching pmids for: 23554
fetching pmids for: 10483
fetching pmids for: 8458
fetching pmids for: 28986
fetching pmids for: 55228
fetching pmids for: 55527
fetching pmids for: 60436
fetching pmids for: 3013
fetching pmids for: 121441
fetching pmids for: 1841
fetching pmids for: 9553
fetching pmids for: 23345
fetching pmids for: 84164
fetching pmids for: 10190
fetching pmids for: 2830
fetching pmids for: 4848
fetching pmids for: 54538
fetching pmids for: 2135
fetching pmids for: 9806
fetching pmids for: 23177
fetching pmids for: 152926
fetching pmids for: 8624
fetching pmids for: 5

fetching pmids for: 9355
fetching pmids for: 10493
fetching pmids for: 5271
fetching pmids for: 80168
fetching pmids for: 117155
fetching pmids for: 10437
fetching pmids for: 26211
fetching pmids for: 7681
fetching pmids for: 23091
fetching pmids for: 158471
fetching pmids for: 10422
fetching pmids for: 340719
fetching pmids for: 63917
fetching pmids for: 5412
fetching pmids for: 79571
fetching pmids for: 51399
fetching pmids for: 29907
fetching pmids for: 56124
fetching pmids for: 8727
fetching pmids for: 221472
fetching pmids for: 51096
fetching pmids for: 8643
fetching pmids for: 56105
fetching pmids for: 404672
fetching pmids for: 9750
fetching pmids for: 8609
fetching pmids for: 4528
fetching pmids for: 7091
fetching pmids for: 5582
fetching pmids for: 9462
fetching pmids for: 23580
fetching pmids for: 11178
fetching pmids for: 51118
fetching pmids for: 55558
fetching pmids for: 9695
fetching pmids for: 9247
fetching pmids for: 1350
fetching pmids for: 55656
fetching pmids for: 18

fetching pmids for: 124995
fetching pmids for: 29801
fetching pmids for: 54549
fetching pmids for: 54033
fetching pmids for: 9348
fetching pmids for: 54584
fetching pmids for: 79971
fetching pmids for: 8214
fetching pmids for: 9737
fetching pmids for: 140825
fetching pmids for: 2537
fetching pmids for: 4783
fetching pmids for: 51631
fetching pmids for: 5273
fetching pmids for: 3770
fetching pmids for: 10930
fetching pmids for: 116985
fetching pmids for: 26747
fetching pmids for: 9928
fetching pmids for: 5645
fetching pmids for: 55423
fetching pmids for: 4992
fetching pmids for: 84864
fetching pmids for: 56660
fetching pmids for: 84803
fetching pmids for: 221188
fetching pmids for: 55901
fetching pmids for: 51495
fetching pmids for: 29058
fetching pmids for: 7482
fetching pmids for: 9254
fetching pmids for: 27020
fetching pmids for: 55620
fetching pmids for: 10941
fetching pmids for: 9702
fetching pmids for: 115703
fetching pmids for: 64506
fetching pmids for: 2137
fetching pmids for: 5

fetching pmids for: 10849
fetching pmids for: 54148
fetching pmids for: 56259
fetching pmids for: 284086
fetching pmids for: 84446
fetching pmids for: 29093
fetching pmids for: 57553
fetching pmids for: 7586
fetching pmids for: 10605
fetching pmids for: 8076
fetching pmids for: 23641
fetching pmids for: 7029
fetching pmids for: 10421
fetching pmids for: 1233
fetching pmids for: 10840
fetching pmids for: 29121
fetching pmids for: 2863
fetching pmids for: 3234
fetching pmids for: 27127
fetching pmids for: 1990
fetching pmids for: 23043
fetching pmids for: 55917
fetching pmids for: 6936
fetching pmids for: 25875
fetching pmids for: 196743
fetching pmids for: 284366
fetching pmids for: 9032
fetching pmids for: 9955
fetching pmids for: 54329
fetching pmids for: 9829
fetching pmids for: 10557
fetching pmids for: 5051
fetching pmids for: 26167
fetching pmids for: 27109
fetching pmids for: 7581
fetching pmids for: 28952
fetching pmids for: 57338
fetching pmids for: 7809
fetching pmids for: 515

fetching pmids for: 23596
fetching pmids for: 23072
fetching pmids for: 54507
fetching pmids for: 9114
fetching pmids for: 7991
fetching pmids for: 158326
fetching pmids for: 54882
fetching pmids for: 1572
fetching pmids for: 6296
fetching pmids for: 84172
fetching pmids for: 10541
fetching pmids for: 5146
fetching pmids for: 8383
fetching pmids for: 123016
fetching pmids for: 346562
fetching pmids for: 26039
fetching pmids for: 8941
fetching pmids for: 5872
fetching pmids for: 55576
fetching pmids for: 51780
fetching pmids for: 3705
fetching pmids for: 7368
fetching pmids for: 57214
fetching pmids for: 80833
fetching pmids for: 9419
fetching pmids for: 8669
fetching pmids for: 260425
fetching pmids for: 1258
fetching pmids for: 3996
fetching pmids for: 54914
fetching pmids for: 92170
fetching pmids for: 55578
fetching pmids for: 5558
fetching pmids for: 25794
fetching pmids for: 4430
fetching pmids for: 51747
fetching pmids for: 56605
fetching pmids for: 10282
fetching pmids for: 8428

fetching pmids for: 219931
fetching pmids for: 55902
fetching pmids for: 3007
fetching pmids for: 326625
fetching pmids for: 5309
fetching pmids for: 7556
fetching pmids for: 9801
fetching pmids for: 2676
fetching pmids for: 3097
fetching pmids for: 5136
fetching pmids for: 10573
fetching pmids for: 9436
fetching pmids for: 5545
fetching pmids for: 4701
fetching pmids for: 8522
fetching pmids for: 10768
fetching pmids for: 9382
fetching pmids for: 26515
fetching pmids for: 22985
fetching pmids for: 6182
fetching pmids for: 221613
fetching pmids for: 5143
fetching pmids for: 4146
fetching pmids for: 8510
fetching pmids for: 1397
fetching pmids for: 7142
fetching pmids for: 523
fetching pmids for: 23235
fetching pmids for: 93587
fetching pmids for: 5210
fetching pmids for: 56052
fetching pmids for: 55145
fetching pmids for: 1840
fetching pmids for: 517
fetching pmids for: 79670
fetching pmids for: 4101
fetching pmids for: 5596
fetching pmids for: 54716
fetching pmids for: 9987
fetching p

fetching pmids for: 10554
fetching pmids for: 23517
fetching pmids for: 30968
fetching pmids for: 10294
fetching pmids for: 93659
fetching pmids for: 55272
fetching pmids for: 79918
fetching pmids for: 126393
fetching pmids for: 92815
fetching pmids for: 53632
fetching pmids for: 5050
fetching pmids for: 55643
fetching pmids for: 3892
fetching pmids for: 9551
fetching pmids for: 57111
fetching pmids for: 4703
fetching pmids for: 51266
fetching pmids for: 11045
fetching pmids for: 10238
fetching pmids for: 7153
fetching pmids for: 84159
fetching pmids for: 11016
fetching pmids for: 9352
fetching pmids for: 366
fetching pmids for: 125988
fetching pmids for: 9053
fetching pmids for: 81031
fetching pmids for: 55243
fetching pmids for: 112950
fetching pmids for: 3806
fetching pmids for: 91147
fetching pmids for: 8622
fetching pmids for: 51603
fetching pmids for: 79885
fetching pmids for: 5357
fetching pmids for: 84888
fetching pmids for: 10284
fetching pmids for: 5863
fetching pmids for: 65

fetching pmids for: 6276
fetching pmids for: 9074
fetching pmids for: 26256
fetching pmids for: 57717
fetching pmids for: 2272
fetching pmids for: 373
fetching pmids for: 56648
fetching pmids for: 5569
fetching pmids for: 3188
fetching pmids for: 51025
fetching pmids for: 393
fetching pmids for: 1175
fetching pmids for: 11035
fetching pmids for: 55437
fetching pmids for: 1819
fetching pmids for: 8778
fetching pmids for: 143503
fetching pmids for: 57502
fetching pmids for: 4706
fetching pmids for: 10690
fetching pmids for: 91584
fetching pmids for: 5348
fetching pmids for: 5678
fetching pmids for: 2931
fetching pmids for: 10110
fetching pmids for: 5784
fetching pmids for: 54862
fetching pmids for: 23221
fetching pmids for: 139818
fetching pmids for: 8698
fetching pmids for: 8875
fetching pmids for: 114335
fetching pmids for: 57509
fetching pmids for: 11043
fetching pmids for: 5936
fetching pmids for: 23590
fetching pmids for: 51043
fetching pmids for: 8224
fetching pmids for: 3957
fetch

fetching pmids for: 7940
fetching pmids for: 26469
fetching pmids for: 10393
fetching pmids for: 27297
fetching pmids for: 2894
fetching pmids for: 51460
fetching pmids for: 390212
fetching pmids for: 2350
fetching pmids for: 1960
fetching pmids for: 1553
fetching pmids for: 27332
fetching pmids for: 5089
fetching pmids for: 25870
fetching pmids for: 91949
fetching pmids for: 25898
fetching pmids for: 10058
fetching pmids for: 123264
fetching pmids for: 80339
fetching pmids for: 636
fetching pmids for: 54470
fetching pmids for: 1388
fetching pmids for: 26133
fetching pmids for: 26155
fetching pmids for: 7225
fetching pmids for: 134121
fetching pmids for: 10335
fetching pmids for: 8705
fetching pmids for: 85477
fetching pmids for: 4991
fetching pmids for: 51690
fetching pmids for: 83999
fetching pmids for: 27089
fetching pmids for: 317772
fetching pmids for: 55696
fetching pmids for: 27094
fetching pmids for: 51102
fetching pmids for: 9406
fetching pmids for: 219738
fetching pmids for: 

fetching pmids for: 3151
fetching pmids for: 9794
fetching pmids for: 94081
fetching pmids for: 6689
fetching pmids for: 3444
fetching pmids for: 5098
fetching pmids for: 1620
fetching pmids for: 84313
fetching pmids for: 65010
fetching pmids for: 10487
fetching pmids for: 56953
fetching pmids for: 10406
fetching pmids for: 3434
fetching pmids for: 8153
fetching pmids for: 54413
fetching pmids for: 6455
fetching pmids for: 9962
fetching pmids for: 9246
fetching pmids for: 9128
fetching pmids for: 22872
fetching pmids for: 4063
fetching pmids for: 5994
fetching pmids for: 246243
fetching pmids for: 51095
fetching pmids for: 9693
fetching pmids for: 8340
fetching pmids for: 1846
fetching pmids for: 81618
fetching pmids for: 6843
fetching pmids for: 136895
fetching pmids for: 8100
fetching pmids for: 8703
fetching pmids for: 11102
fetching pmids for: 4640
fetching pmids for: 1912
fetching pmids for: 340393
fetching pmids for: 2330
fetching pmids for: 2780
fetching pmids for: 195828
fetchi

fetching pmids for: 8801
fetching pmids for: 1983
fetching pmids for: 10128
fetching pmids for: 27129
fetching pmids for: 8910
fetching pmids for: 84807
fetching pmids for: 100288687
fetching pmids for: 2802
fetching pmids for: 79152
fetching pmids for: 27324
fetching pmids for: 167227
fetching pmids for: 10097
fetching pmids for: 23013
fetching pmids for: 8898
fetching pmids for: 2342
fetching pmids for: 50511
fetching pmids for: 8338
fetching pmids for: 8543
fetching pmids for: 11124
fetching pmids for: 2868
fetching pmids for: 53938
fetching pmids for: 51160
fetching pmids for: 9519
fetching pmids for: 34
fetching pmids for: 3489
fetching pmids for: 8853
fetching pmids for: 5256
fetching pmids for: 6430
fetching pmids for: 5521
fetching pmids for: 107
fetching pmids for: 2953
fetching pmids for: 65260
fetching pmids for: 9374
fetching pmids for: 10979
fetching pmids for: 25805
fetching pmids for: 57054
fetching pmids for: 10961
fetching pmids for: 5305
fetching pmids for: 56945
fetc

fetching pmids for: 23601
fetching pmids for: 5414
fetching pmids for: 64755
fetching pmids for: 135112
fetching pmids for: 6866
fetching pmids for: 9092
fetching pmids for: 4327
fetching pmids for: 6443
fetching pmids for: 3709
fetching pmids for: 5378
fetching pmids for: 8370
fetching pmids for: 6240
fetching pmids for: 2703
fetching pmids for: 64398
fetching pmids for: 79155
fetching pmids for: 7353
fetching pmids for: 9491
fetching pmids for: 7320
fetching pmids for: 4776
fetching pmids for: 4726
fetching pmids for: 9560
fetching pmids for: 51062
fetching pmids for: 5908
fetching pmids for: 60626
fetching pmids for: 81688
fetching pmids for: 10290
fetching pmids for: 93663
fetching pmids for: 84168
fetching pmids for: 90167
fetching pmids for: 3232
fetching pmids for: 126
fetching pmids for: 56998
fetching pmids for: 2018
fetching pmids for: 9530
fetching pmids for: 728642
fetching pmids for: 4014
fetching pmids for: 8608
fetching pmids for: 51643
fetching pmids for: 10044
fetching

fetching pmids for: 506
fetching pmids for: 26993
fetching pmids for: 116159
fetching pmids for: 57048
fetching pmids for: 5029
fetching pmids for: 5563
fetching pmids for: 9429
fetching pmids for: 23683
fetching pmids for: 9495
fetching pmids for: 55215
fetching pmids for: 8506
fetching pmids for: 2290
fetching pmids for: 7841
fetching pmids for: 7867
fetching pmids for: 7163
fetching pmids for: 11197
fetching pmids for: 6405
fetching pmids for: 7220
fetching pmids for: 65264
fetching pmids for: 22930
fetching pmids for: 9139
fetching pmids for: 9580
fetching pmids for: 109
fetching pmids for: 4250
fetching pmids for: 7916
fetching pmids for: 1141
fetching pmids for: 9403
fetching pmids for: 6602
fetching pmids for: 8676
fetching pmids for: 8220
fetching pmids for: 2517
fetching pmids for: 563
fetching pmids for: 10772
fetching pmids for: 11170
fetching pmids for: 24137
fetching pmids for: 6884
fetching pmids for: 10603
fetching pmids for: 5655
fetching pmids for: 26354
fetching pmids

fetching pmids for: 79132
fetching pmids for: 6002
fetching pmids for: 29992
fetching pmids for: 2036
fetching pmids for: 3592
fetching pmids for: 5890
fetching pmids for: 6045
fetching pmids for: 10749
fetching pmids for: 1180
fetching pmids for: 56673
fetching pmids for: 3799
fetching pmids for: 4610
fetching pmids for: 79020
fetching pmids for: 11011
fetching pmids for: 4218
fetching pmids for: 6710
fetching pmids for: 7385
fetching pmids for: 211
fetching pmids for: 863
fetching pmids for: 3036
fetching pmids for: 823
fetching pmids for: 5031
fetching pmids for: 9044
fetching pmids for: 54345
fetching pmids for: 3992
fetching pmids for: 3191
fetching pmids for: 10661
fetching pmids for: 5441
fetching pmids for: 3209
fetching pmids for: 5435
fetching pmids for: 10659
fetching pmids for: 2314
fetching pmids for: 9294
fetching pmids for: 3767
fetching pmids for: 2558
fetching pmids for: 54550
fetching pmids for: 57609
fetching pmids for: 27183
fetching pmids for: 2560
fetching pmids f

fetching pmids for: 8605
fetching pmids for: 6447
fetching pmids for: 284001
fetching pmids for: 57761
fetching pmids for: 1137
fetching pmids for: 9276
fetching pmids for: 8668
fetching pmids for: 171024
fetching pmids for: 5473
fetching pmids for: 6498
fetching pmids for: 9568
fetching pmids for: 5350
fetching pmids for: 130814
fetching pmids for: 774
fetching pmids for: 2069
fetching pmids for: 10152
fetching pmids for: 56288
fetching pmids for: 1404
fetching pmids for: 2488
fetching pmids for: 3547
fetching pmids for: 51138
fetching pmids for: 2941
fetching pmids for: 10267
fetching pmids for: 23181
fetching pmids for: 6833
fetching pmids for: 1174
fetching pmids for: 7021
fetching pmids for: 7903
fetching pmids for: 1478
fetching pmids for: 6448
fetching pmids for: 6690
fetching pmids for: 11057
fetching pmids for: 3646
fetching pmids for: 9328
fetching pmids for: 26277
fetching pmids for: 54332
fetching pmids for: 390
fetching pmids for: 4134
fetching pmids for: 60529
fetching pm

fetching pmids for: 1207
fetching pmids for: 3449
fetching pmids for: 5434
fetching pmids for: 55177
fetching pmids for: 54386
fetching pmids for: 1009
fetching pmids for: 1848
fetching pmids for: 80224
fetching pmids for: 23192
fetching pmids for: 2027
fetching pmids for: 160364
fetching pmids for: 6890
fetching pmids for: 63895
fetching pmids for: 6781
fetching pmids for: 9135
fetching pmids for: 3822
fetching pmids for: 81704
fetching pmids for: 10254
fetching pmids for: 490
fetching pmids for: 6785
fetching pmids for: 6414
fetching pmids for: 79742
fetching pmids for: 2782
fetching pmids for: 84433
fetching pmids for: 5947
fetching pmids for: 11267
fetching pmids for: 6837
fetching pmids for: 265
fetching pmids for: 2110
fetching pmids for: 10681
fetching pmids for: 1725
fetching pmids for: 7799
fetching pmids for: 3746
fetching pmids for: 25842
fetching pmids for: 4705
fetching pmids for: 1617
fetching pmids for: 114757
fetching pmids for: 2533
fetching pmids for: 6634
fetching pm

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  author_emails_dots['email'] = author_emails_dots['email'].str[:-1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = expressions.where(mask, this, that)


Wall time: 8d 5h 51min 12s


In [None]:
%%time
i = deal_with_failures(datapath,hasfailures = True)

In [None]:
## Try to pull the information for the PMID failures
print(len(PMIDfailsDF))
print(PMIDfailsDF.head(n=2))
PMIDFailList = PMIDfailsDF['pmid'].unique().tolist()

PublicationDF,authordf,PMIDFails = retrieve_authors_by_pmids(PMIDFailList)
print(len(PMIDFails))

In [None]:
print(PublicationDetailsDF.head(n=2))
#print(PublicationDF.head(n=2))

PublicationDetailsDF_leftovers = PublicationDF.merge(PMIDfailsDF,on='pmid',how='left')
PublicationDetailsDF_leftovers.drop('Unnamed: 0',axis=1,inplace=True)
print(PublicationDetailsDF_leftovers.head(n=2))

authordf_deets = parse_out_emails(authordf)

In [None]:
PublicationDetailsDF_leftovers.to_csv(exppath+'PublicationDF_run6.tsv',sep='\t',header=True)
authordf.to_csv(exppath+'author_df_run6.tsv',sep='\t',header=True)
authordf_deets.to_csv(exppath+'author_df_deets_run6.tsv',sep='\t',header=True)
PMIDfailsDF.to_csv(exppath+'PMIDfailsDF_run6.tsv',sep='\t',header=True)

In [None]:
## get author counts for each gene
top_authors_per_gene = get_top_authors_from_dfs(PublicationDetailsDF, author_df, "FullName")
print(top_authors_per_gene.head(n=3))

top_authors_per_gene.to_csv(exppath+'top_authors_per_gene_FullName.tsv',sep='\t',header=True)

top_authors_per_gene = get_top_authors_from_dfs(PublicationDetailsDF, author_df, "AU")
print(top_authors_per_gene.head(n=3))

top_authors_per_gene.to_csv(exppath+'top_authors_per_gene_AU.tsv',sep='\t',header=True)

## Summarizing and making sense of the results

In [None]:
author_df = read_csv(exppath+'author_df_run1.tsv', delimiter='\t', header=0)
PublicationDetailsDF = read_csv(exppath+'PublicationDetailsDF_run1.tsv', delimiter='\t', header=0)

i=2
while i < 7:
    tmpdf = read_csv(exppath+"author_df_run"+str(i)+".tsv", delimiter='\t', header=0)
    author_df = pd.concat((author_df,tmpdf), ignore_index=True)
    tmpdeets_df = read_csv(exppath+"PublicationDetailsDF_run"+str(i)+".tsv", delimiter='\t', header=0)
    PublicationDetailsDF = pd.concat((PublicationDetailsDF,tmpdeets_df), ignore_index=True)
    i=i+1

author_df.drop('Unnamed: 0',axis=1,inplace=True)
PublicationDetailsDF.drop('Unnamed: 0',axis=1,inplace=True)

In [None]:
print(len(PublicationDetailsDF['geneid'].unique()))

In [None]:
PublicationDetailsDF.rename(columns={'PublicationDate':'publish_date'},inplace=True)
PublicationDetailsDF['geneid'] = PublicationDetailsDF['geneid'].astype(str)
print(len(PublicationDetailsDF), PublicationDetailsDF.head(n=2))
print(len(author_df), author_df.head(n=2))

In [None]:
all_results = author_df.merge(PublicationDetailsDF,on=['pmid','PubDate','publish_date'],how='left')
print(len(all_results))
all_results.drop_duplicates(keep='first',inplace=True)
print(len(all_results))
print(all_results.head(n=2))

In [None]:
## sort the authors to get the top authors
author_numbers = all_results.groupby(['geneid','FullName','AU']).size().reset_index(name='author_count')
author_numbers.sort_values(['geneid','author_count'],ascending=[True,False],inplace=True)
print(author_numbers.head(n=2))

In [None]:
## Get the author email list (wherever email is not nan) and drop any immediate and obvious duplicate entries
author_emails = author_df.loc[author_df['email'].notnull()].sort_values('AU',ascending=True)
author_emails.drop_duplicates(keep='first',inplace=True)
author_emails.sort_values(['FullName','publish_date'],ascending=[True,True],inplace=True)

## fill in publication dates where missing and delete duplicate column
author_emails['publish_date'].loc[author_emails['publish_date'].isnull()] = author_emails['PubDate']
author_emails.drop('PubDate',axis=1,inplace=True)

## Drop duplicates again
author_emails.drop_duplicates(keep='first',inplace=True)

## sort by pmid (assuming pmid numbers only go up over time), de-duplicate to get most recent
author_emails.sort_values(['AU','FullName','pmid'],ascending=[True,True,True],inplace=True)

most_recent_email = author_emails.drop_duplicates(subset=['AU','FullName'],keep='last',inplace=False)
most_recent_email['pmid'] = most_recent_email['pmid'].astype(str)
print(most_recent_email.head(n=2))



In [None]:
## get most recent author details and merge with most recent author email list

author_no_emails = author_df.loc[author_df['email'].isnull()].sort_values(['AU','FullName','pmid'],ascending=[True,True,True])
author_no_emails.drop_duplicates(subset=['AU','FullName'],keep='last',inplace=True)

author_details = pd.concat((author_no_emails,most_recent_email))                                                                                                           

In [None]:
top_authors = author_numbers.merge(author_details,on=['AU','FullName'],how='left').fillna('Not Available')
print(len(top_authors))
top_authors.drop_duplicates(keep='first',inplace=True)
print(len(top_authors))
print(top_authors.head(n=2))

In [None]:
top_authors.to_csv(exppath+'top_ad_author_details.tsv',sep='\t',header=True)

In [None]:
#### Export the top 20 authors for each gene since exporting ALL the authors is too many

top_authors = read_csv(exppath+'top_ad_author_details.tsv',delimiter='\t',header=0)
top_authors.drop('Unnamed: 0',axis=1,inplace=True)
top_authors.reset_index(inplace=True)
top_authors.rename(columns={'index':'rowid'},inplace=True)
print(top_authors.head(n=2))
top_20_authors = top_authors.groupby('geneid')['author_count'].nlargest(20).reset_index()
top_20_authors.rename(columns={'level_1':'rowid'},inplace=True)

top_20_author_deets = top_20_authors.merge(top_authors,on=['geneid','rowid','author_count'],how='left')
print(len(top_authors), len(top_20_authors), len(top_20_author_deets))
print(top_20_author_deets.head(n=11))
top_20_author_deets.to_csv(exppath+'top_20_author_deets.tsv',sep='\t',header=True)