In [1]:
import os
import pandas
from pandas import read_csv
import json
import pickle
from datetime import datetime
import sys
from io import StringIO
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
import requests

## Parse PDFs for initial dump

In [2]:
#### Create curatedBy Object
def generate_curator():
    """Build the ``curatedBy`` record for the COVID-19 Literature Surveillance Team.

    Returns:
        dict: an ``Organization`` record whose ``curationDate`` is today's
        date formatted as ``YYYY-MM-DD``.
    """
    curation_day = datetime.now().strftime("%Y-%m-%d")
    curator = {}
    curator["@type"] = "Organization"
    curator["identifier"] = "covid19LST"
    curator["url"] = "https://www.covid19lst.org/"
    curator["name"] = "COVID-19 Literature Surveillance Team"
    curator["affiliation"] = []
    curator["curationDate"] = curation_day
    return curator

In [3]:
def generate_author():
    """Build the ``author`` record: the curator organization plus its members.

    Starts from ``generate_curator()`` without ``curationDate`` and adds a
    ``members`` list read from the tab-separated file ``data/LST members.txt``
    (first row is the header). Each member's ``affiliation`` cell is split on
    ``;`` and stored as a list of ``{"name": ...}`` dicts.

    Returns:
        dict: the organization record with a ``members`` key.
    """
    author_record = generate_curator()
    author_record.pop('curationDate')

    members = read_csv('data/LST members.txt', delimiter='\t', header=0, encoding='UTF-8')
    # Keep the raw text under a temporary name so 'affiliation' can hold the parsed list.
    members = members.rename(columns={'affiliation': 'affiliation list'})
    members['affiliation'] = 'blank'
    for row_number in range(len(members)):
        raw_affiliations = members.iloc[row_number]['affiliation list']
        members.at[row_number, 'affiliation'] = [
            {"name": one_affiliation} for one_affiliation in raw_affiliations.split(';')
        ]
    members = members.drop(columns='affiliation list')

    author_record['members'] = members.to_dict('records')
    return author_record

In [4]:
def generate_abstract(publist):
    """Compose the report abstract from the ids of the publications it covers.

    Args:
        publist: iterable of publication identifiers; each is converted with ``str``.

    Returns:
        str: the fixed introductory sentence followed by the ids joined with ``", "``.
    """
    intro = "Analytical reviews on the level of evidence presented in publications. This report specifically covers the following publications: "
    return intro + ', '.join(map(str, publist))

In [5]:
### Batch convert DOIs
def convert_dois(doilist):
    """Look up outbreak.info records for a batch of DOIs.

    Args:
        doilist: list of DOI strings.

    Returns:
        tuple: ``(cleanresult, missing)`` where ``cleanresult`` is a DataFrame
        with the columns ``_id``, ``name`` and ``url`` (one row per hit whose
        ``doi`` equals the queried term) and ``missing`` is the list of DOIs
        without such a hit. When the request does not return HTTP 200, an
        empty DataFrame and an empty ``missing`` list are returned.
    """
    hit_columns = ['_id', 'name', 'url', 'doi']
    out_columns = ['_id', 'name', 'url']
    doistring = '"' + '","'.join(doilist) + '"'
    r = requests.post("https://api.outbreak.info/resources/query/", params = {'q': doistring, 'scopes': 'doi', 'fields': '_id,name,url,doi'})
    if r.status_code != 200:
        # Always hand back a DataFrame: callers concatenate and index the result.
        return (pandas.DataFrame(columns=out_columns), [])

    # read_json expects a path or file-like object; wrap the response text.
    rawresult = pandas.read_json(StringIO(r.text))
    if '_id' not in rawresult.columns or 'doi' not in rawresult.columns:
        # No hit at all: the response only carries 'query'/'notfound' entries.
        return (pandas.DataFrame(columns=out_columns), doilist)

    # Keep only the hits whose DOI is exactly the queried term.
    no_dups = rawresult[rawresult['query']==rawresult['doi']]
    if 'notfound' in rawresult.columns:
        # 'notfound' may be parsed as a bool or as a float, so test for both.
        check = rawresult.loc[(rawresult['notfound']==1.0)|(rawresult['notfound']==True)]
        if len(check)==len(doilist):
            cleanresult = pandas.DataFrame(columns=hit_columns)
            missing = doilist
        else:
            cleanresult = no_dups[hit_columns].loc[~no_dups['_id'].isin(check['_id'].tolist())].copy()
            found = set(cleanresult['doi'].tolist())
            missing = [x for x in doilist if x not in found]
    else:
        cleanresult = no_dups[hit_columns]
        missing = []
    # Non in-place drop: cleanresult may be a slice of rawresult.
    cleanresult = cleanresult.drop(columns='doi')
    return (cleanresult, missing)

### Convert a single doi
##"https://api.outbreak.info/resources/query?q=doi:"+doi

### Batch fetch pmid meta
def get_pmid_meta(pmidlist):
    """Fetch outbreak.info metadata for a batch of record ids.

    Args:
        pmidlist: list of id strings sent as the query with scope ``_id``.

    Returns:
        tuple: ``(cleanresult, missing)`` where ``cleanresult`` is a DataFrame
        with the columns ``_id``, ``name`` and ``url`` (one row per hit whose
        ``_id`` equals the queried term) and ``missing`` is the list of ids
        without such a hit. When the request does not return HTTP 200, an
        empty DataFrame and an empty ``missing`` list are returned.
    """
    columns = ['_id', 'name', 'url']
    pmidstring = '"' + '","'.join(pmidlist) + '"'
    r = requests.post("https://api.outbreak.info/resources/query/", params = {'q': pmidstring, 'scopes': '_id', 'fields': '_id,name,url'})
    if r.status_code != 200:
        # Always hand back a DataFrame: callers concatenate and index the result.
        return (pandas.DataFrame(columns=columns), [])

    # read_json expects a path or file-like object; wrap the response text.
    rawresult = pandas.read_json(StringIO(r.text))
    if '_id' not in rawresult.columns:
        # Nothing matched, so there is no '_id' column to compare against.
        return (pandas.DataFrame(columns=columns), pmidlist)

    # Keep only the hits whose id is exactly the queried term.
    no_dups = rawresult[rawresult['query']==rawresult['_id']]
    if 'notfound' in rawresult.columns:
        # 'notfound' may be parsed as a bool or as a float, so test for both.
        check = rawresult.loc[(rawresult['notfound']==1.0)|(rawresult['notfound']==True)]
        if len(check)==len(pmidlist):
            cleanresult = pandas.DataFrame(columns=columns)
            missing = pmidlist
        else:
            cleanresult = no_dups[columns].loc[~no_dups['_id'].isin(check['_id'].tolist())].copy()
            found = set(cleanresult['_id'].tolist())
            missing = [x for x in pmidlist if x not in found]
    else:
        cleanresult = no_dups[columns]
        missing = []
    return (cleanresult, missing)


In [6]:
def strip_ids_from_text(output_text):
    """Pull DOIs out of the plain text extracted from a report PDF.

    The COVID-19 LST reports do not list PMIDs in a parsable fashion in the
    text, so only DOIs are extracted and the PMID list is always empty.

    For every line containing "doi" (any letter case), the DOI is the text
    between the first separator after the label and the next "." followed by
    that separator. The separator is a tab if the line contains one, otherwise
    a space. When no terminator follows, the rest of the line is used.

    Args:
        output_text: report text with lines separated by ``\\n``.

    Returns:
        tuple: ``(pmidlist, doilist)``; ``pmidlist`` is always ``[]``.
    """
    import re

    pmidlist = []
    doilist = []
    for doiline in output_text.split('\n'):
        # Locate the label case-insensitively so "DOI:" and "Doi:" are handled too.
        label = re.search('doi', doiline, flags=re.IGNORECASE)
        if label is None:
            continue
        doistart = doiline[label.start():]
        separator = '\t' if '\t' in doiline else ' '
        begin = doistart.find(separator)
        if begin == -1:
            # Nothing follows the label on this line.
            continue
        end = doistart.find('.' + separator, begin)
        if end == -1:
            # No terminator: take the rest of the line instead of slicing with -1,
            # which would silently drop the DOI's last character.
            doi = doistart[begin:].strip().rstrip('.')
        else:
            doi = doistart[begin:end].strip()
        if doi:
            doilist.append(doi)
    return(pmidlist,doilist)

In [7]:
def parse_urls(eachurl,pmidlist,doilist):
    """Classify one hyperlink as a PubMed or DOI link and record its identifier.

    PubMed links without a query string contribute ``'pmid' + <id>``. Links
    containing "doi" contribute everything from the first "10." onwards. Any
    other link is ignored.

    Args:
        eachurl: the URL string to inspect.
        pmidlist: list that PubMed ids are appended to (modified in place).
        doilist: list that DOIs are appended to (modified in place).

    Returns:
        tuple: the same ``(pmidlist, doilist)`` objects that were passed in.
    """
    if 'pubmed' in eachurl and '?' not in eachurl:
        pmid = eachurl.replace("https://www.ncbi.nlm.nih.gov/pubmed/","").replace("https://pubmed.ncbi.nlm.nih.gov/","")
        # Drop any "#..." fragment (e.g. "#affiliation-1"), not just one specific anchor.
        pmid, has_fragment, _fragment = pmid.partition('#')
        pmid = pmid.rstrip("/")
        if has_fragment:
            pmid = pmid.split("/")[0]
        if pmid:
            pmidlist.append('pmid'+pmid)
    elif 'doi' in eachurl:
        tenplace = eachurl.find('10.')
        # find() returns -1 when there is no DOI prefix; slicing from -1 would
        # record the URL's last character as a DOI.
        if tenplace != -1:
            doilist.append(eachurl[tenplace:])
    return(pmidlist,doilist)

In [8]:
def _page_link_uris(page):
    """Return the URI of every link annotation on ``page`` that can be read."""
    uris = []
    try:
        annotations = list(getattr(page, 'annots', None) or [])
    except TypeError:
        # The annotation entry is not an iterable list on this page.
        return uris
    for annotation in annotations:
        # Handle each annotation on its own so one without a readable URI
        # does not discard the remaining links on the page.
        try:
            annotationDict = annotation.resolve()
            if "A" in annotationDict:
                uris.append(annotationDict["A"]["URI"].decode('UTF-8').replace(" ", "%20"))
        except Exception:
            continue
    return uris


def parse_pdf(eachfile):
    """Extract the PubMed ids and DOIs referenced by one report PDF.

    Link annotations are read first and classified with ``parse_urls``. If the
    PDF has no link annotations, the page text is extracted and searched with
    ``strip_ids_from_text`` instead.

    Args:
        eachfile: file name of the PDF inside ``data/reports/``.

    Returns:
        tuple: ``(pmidlist, doilist)``, each de-duplicated (order not preserved).
    """
    allurls = []
    pmidlist = []
    doilist = []
    # The context manager closes the PDF even if parsing fails part-way.
    with open('data/reports/'+eachfile,'rb') as pdffile:
        parser = PDFParser(pdffile)
        doc = PDFDocument(parser)
        for page in PDFPage.create_pages(doc):
            allurls.extend(_page_link_uris(page))
        if len(allurls)>0:
            for eachurl in allurls:
                pmidlist,doilist = parse_urls(eachurl,pmidlist,doilist)
        else:
            # No hyperlinks: fall back to scanning the extracted text.
            output_string = StringIO()
            rsrcmgr = PDFResourceManager()
            device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.create_pages(doc):
                interpreter.process_page(page)
            output_text = output_string.getvalue()
            pmidlist, doilist = strip_ids_from_text(output_text)

    pmidlist = list(set(pmidlist))
    doilist = list(set(doilist))

    return(pmidlist,doilist)

                           
def merge_meta(pmidlist,doilist):
    """Fetch metadata for PubMed ids and DOIs and merge the two result sets.

    Args:
        pmidlist: ids passed to ``get_pmid_meta`` (skipped when empty).
        doilist: DOIs passed to ``convert_dois`` (skipped when empty).

    Returns:
        tuple: ``(basedOndf, missing)``. ``basedOndf`` is a DataFrame with the
        PubMed rows followed by the DOI rows; it is empty when neither lookup
        ran. ``missing`` is the list of ids not found, or ``None`` when
        neither lookup ran.
    """
    def _as_frame(result):
        # A lookup may hand back a non-DataFrame placeholder after a failed
        # request; normalize it so concatenation and column access still work.
        if isinstance(result, pandas.DataFrame):
            return result
        return pandas.DataFrame(columns=['_id','name','url'])

    doianns = None
    missing_dois = None
    pmidanns = None
    missing_pmids = None
    if len(doilist)>0:
        doianns,missing_dois = convert_dois(doilist)
        doianns = _as_frame(doianns)
    if len(pmidlist)>0:
        pmidanns,missing_pmids = get_pmid_meta(pmidlist)
        pmidanns = _as_frame(pmidanns)

    if doianns is not None and pmidanns is not None:
        basedOndf = pandas.concat((pmidanns,doianns),ignore_index=True)
    elif doianns is not None:
        basedOndf = doianns
    elif pmidanns is not None:
        basedOndf = pmidanns
    else:
        # Nothing to look up: return an empty frame instead of leaving the name unbound.
        basedOndf = pandas.DataFrame(columns=['_id','name','url'])

    if missing_pmids is not None and missing_dois is not None:
        missing = list(set(missing_pmids).union(set(missing_dois)))
    elif missing_dois is not None:
        missing = missing_dois
    elif missing_pmids is not None:
        missing = missing_pmids
    else:
        missing = None
    return(basedOndf,missing)

In [9]:
def save_missing(missing):
    """Merge ``missing`` into the pickled list of ids not yet in outbreak.info.

    The list is stored (pickled, despite the ``.txt`` suffix) in
    ``results/pubs_not_yet_in_outbreak.txt``. If the file exists and can be
    read, the stored ids and ``missing`` are merged without duplicates;
    otherwise the file is (re)created from ``missing`` alone.

    Args:
        missing: list of ids to record, or ``None`` to do nothing.
    """
    if missing is None:
        return
    dump_path = 'results/pubs_not_yet_in_outbreak.txt'
    try:
        # NOTE: pickle.load can execute arbitrary code; only use on files this notebook wrote.
        with open(dump_path,'rb') as loadfile:
            missing_list = pickle.load(loadfile)
        total_missing = list(set([*missing_list, *missing]))
    except Exception:
        # No readable previous list (first run or corrupt file): start from `missing`.
        total_missing = missing
    with open(dump_path,'wb') as dmpfile:
        pickle.dump(total_missing,dmpfile)

In [10]:
## The report URL uses the day of the month without a leading zero.
## strftime("%d") zero-pads, and the codes that suppress the padding differ per
## platform ("%#d" on Windows, "%-d" on Linux), so the day is read from `.day` instead.
def generate_report_url(datePublished):
    """Build the covid19lst.org post URL for the report published on a given date.

    Args:
        datePublished: ``datetime`` of the report.

    Returns:
        str: ``https://www.covid19lst.org/post/<month>-<day>-daily-covid-19-lst-report``
        with the lower-cased month name from ``strftime("%B")`` and the day
        without a leading zero.
    """
    urlbase = "https://www.covid19lst.org/post/"
    urlend = "daily-covid-19-lst-report"
    month = datePublished.strftime("%B").lower()
    day = str(datePublished.day)
    reporturl = urlbase+month+"-"+day+"-"+urlend
    return(reporturl)

In [11]:
def generate_report_meta(filelist):
    """Yield one ``Publication`` metadata record per parsable report PDF.

    For each file name (expected to start with ``YYYYMMDD``) the PDF is parsed
    for PubMed ids / DOIs, their metadata is fetched, the report-to-publication
    links are appended to ``data/report_pmid_df.txt`` and the ids not found
    are recorded with ``save_missing``.

    Files without any identifier are skipped. If processing a file raises, the
    report id is recorded with ``save_missing`` and the generator moves on to
    the next file.

    Args:
        filelist: iterable of PDF file names located in ``data/reports/``.

    Yields:
        dict: the metadata record of one report.
    """
    report_pmid_df = pandas.DataFrame(columns=['_id','name','identifier','url'])
    curatedByObject = generate_curator()
    author = generate_author()
    badpdfs = []  # files without any identifier; collected but not returned
    for eachfile in filelist:
        reportdate = eachfile[0:4]+'.'+eachfile[4:6]+'.'+eachfile[6:8]
        name = "Covid-19 LST Report "+reportdate
        report_id = 'lst'+reportdate
        try:
            datePublished = datetime.fromisoformat(eachfile[0:4]+'-'+eachfile[4:6]+'-'+eachfile[6:8])
            reporturl = generate_report_url(datePublished)
            pmidlist,doilist = parse_pdf(eachfile)
            if len(pmidlist)+len(doilist)==0:
                badpdfs.append(eachfile)
                continue
            basedOndf,missing = merge_meta(pmidlist,doilist)
            # assign() returns a new frame, so a slice returned by the lookups is never written to.
            basedOndf = basedOndf.assign(**{'@type': 'Publication'})
            reportlinkdf = basedOndf[['_id','url']].copy()
            reportlinkdf['identifier']=report_id
            reportlinkdf['url']=reporturl
            reportlinkdf['name']=name
            report_pmid_df = pandas.concat(([report_pmid_df,reportlinkdf]),ignore_index=True)
            report_pmid_df = report_pmid_df.drop_duplicates(keep='first')
            report_pmid_df.to_csv('data/report_pmid_df.txt',sep='\t',header=True)
            save_missing(missing)
            abstract = generate_abstract(basedOndf['_id'].unique().tolist())
            metadict = {"@context": {"schema": "http://schema.org/", "outbreak": "https://discovery.biothings.io/view/outbreak/"},
                        "@type": "Publication", "journalName": "COVID-19 LST Daily Summary Reports", "journalNameAbbreviation": "covid19LST",
                        "publicationType": "Review", "license":"(CC BY-NC-SA 4.0) (http://creativecommons.org/licenses/by-nc-sa/4.0/)",
                        "_id":report_id,"curatedBy": curatedByObject,"abstract": abstract, "name": name,
                        "datePublished": datePublished.strftime("%Y-%m-%d"),"url": reporturl,"author":[author],
                        "isBasedOn":basedOndf.to_dict('records')}
        except Exception:
            # Record the report id as one item; list(report_id) would split it into characters.
            save_missing([report_id])
            continue
        # Yield outside the try block so errors raised by the consumer are not swallowed here.
        yield(metadict)

In [None]:
## Unit test

# Smoke test: build the metadata for two reports and print each record.
dumpdir = 'data/reports/'
# os.listdir returns names in arbitrary order; sort so the same two reports are picked on every run.
filelist = sorted(os.listdir(dumpdir))
testlist = filelist[0:2]
metadict = generate_report_meta(testlist)
for eachdict in metadict:
    print(eachdict)

## Download PDFs from google drive

https://drive.google.com/drive/folders/1603ahBNdt1SnSaYYBE-G8SA6qgRTQ6fF

Reports are downloaded from the folder above only if they were uploaded after a specific date. The reason is that prior to 2020.09.11 the file naming was inconsistent and file names were sometimes incorrect (e.g. 202080 instead of 202008). These names were manually corrected in the initial dump, but they are still incorrect in the Google Drive folder.

Note that this script uses the Google Drive API, which requires authentication even when accessing a public Google Drive folder. To fulfill this requirement without needing to log in manually, credentials from a service account are needed. The Google Drive API is only used to list the files in the drive so that the newest ones (by date) can be identified and their IDs collected.

Additionally, the pydrive2 library (used to access the Google Drive API) sometimes has trouble finding the client_secrets.json file, so you may need to point to it manually.

The downloader uses the GoogleDriveDownloader library which is based off of requests and should not require the google drive api.

In [None]:
## This function identifies files uploaded after 2020.09.11 that have NOT yet been downloaded
## Note that this is the function if a service account is not available. It requires a login

def check_google():
    """List the reports in the Google Drive folder that still need downloading.

    Variant that authenticates with ``client_secrets.json`` (no service
    account). A later cell defines another ``check_google``; whichever cell
    ran last is the one in effect.

    Returns:
        pandas.DataFrame: rows (``createdDate``, ``id``, ``title``, ``date``)
        for files created after 2020-09-11T01:53:29.639Z whose title is not
        already present in ``data/reports/``.
    """
    from pydrive2.auth import GoogleAuth
    from pydrive2.drive import GoogleDrive

    GoogleAuth.DEFAULT_SETTINGS['client_config_file'] = 'client_secrets.json' ##point to secrets file location
    gauth = GoogleAuth()
    #gauth.LocalWebserverAuth()

    drive = GoogleDrive(gauth)
    file_id = '1603ahBNdt1SnSaYYBE-G8SA6qgRTQ6fF'
    file_list = drive.ListFile({'q': "'%s' in parents and trashed=false" % file_id}).GetList()
    if not file_list:
        # An empty listing has no columns to select from.
        return pandas.DataFrame(columns=['createdDate','id','title','date'])

    df = pandas.DataFrame(file_list)
    dfclean = df[['createdDate','id','title']].copy()
    # createdDate is a full ISO-8601 timestamp; forcing format='%Y-%m-%d' does not
    # match it and, with errors='coerce', can turn every value into NaT.
    dfclean['date'] = pandas.to_datetime(dfclean['createdDate'], errors='coerce', utc=True)
    # Use the cutoff directly instead of looking up the one Drive file that carries it,
    # so the function keeps working if that file is removed.
    lastupdate = pandas.Timestamp('2020-09-11T01:53:29.639Z')
    dfnew = dfclean.loc[dfclean['date']>lastupdate]

    all_files = os.listdir('data/reports/')
    new_files = [item for item in dfnew['title'].unique().tolist() if item not in all_files]
    reportdf = dfnew.loc[dfnew['title'].isin(new_files)]
    return(reportdf)

In [14]:
## This function identifies files uploaded after 2020.09.11 that have NOT yet been downloaded
## Note that this is the function if a service account IS available. 
def check_google():
    """List the reports in the Google Drive folder that still need downloading.

    Variant that authenticates with the service-account key in
    ``credentials.json``. It has the same name as the definition in an
    earlier cell and replaces it once this cell has run.

    Returns:
        pandas.DataFrame: rows (``createdDate``, ``id``, ``title``, ``date``)
        for files created after 2020-09-11T01:53:29.639Z whose title is not
        already present in ``data/reports/``.
    """
    from pydrive2.auth import GoogleAuth
    from pydrive2.drive import GoogleDrive
    from pydrive2.auth import ServiceAccountCredentials

    gauth = GoogleAuth()
    scope = ['https://www.googleapis.com/auth/drive']
    gauth.credentials = ServiceAccountCredentials.from_json_keyfile_name('credentials.json', scope)
    drive = GoogleDrive(gauth)
    file_id = '1603ahBNdt1SnSaYYBE-G8SA6qgRTQ6fF'
    file_list = drive.ListFile({'q': "'%s' in parents and trashed=false" % file_id}).GetList()
    if not file_list:
        # An empty listing has no columns to select from.
        return pandas.DataFrame(columns=['createdDate','id','title','date'])

    df = pandas.DataFrame(file_list)
    dfclean = df[['createdDate','id','title']].copy()
    # createdDate is a full ISO-8601 timestamp; forcing format='%Y-%m-%d' does not
    # match it and, with errors='coerce', can turn every value into NaT.
    dfclean['date'] = pandas.to_datetime(dfclean['createdDate'], errors='coerce', utc=True)
    # Use the cutoff directly instead of looking up the one Drive file that carries it,
    # so the function keeps working if that file is removed.
    lastupdate = pandas.Timestamp('2020-09-11T01:53:29.639Z')
    dfnew = dfclean.loc[dfclean['date']>lastupdate]

    all_files = os.listdir('data/reports/')
    new_files = [item for item in dfnew['title'].unique().tolist() if item not in all_files]
    reportdf = dfnew.loc[dfnew['title'].isin(new_files)]
    return(reportdf)

In [12]:
#### This is the function to actually conduct the download
## Note that the report date in the title is used for generating the corresponding covid19 LST report url
## For this reason, special reports cannot be automated and will be exempt from downloading
def download_reports(reportdf):
    """Download the listed reports from Google Drive into ``data/reports/``.

    A file is downloaded only if the first six characters of its title parse
    as an integer (the date prefix). Other titles, such as special reports,
    are skipped.

    Args:
        reportdf: DataFrame with a ``title`` and an ``id`` column, one row per file.

    Returns:
        int: number of files that were skipped or whose download failed.
    """
    from google_drive_downloader import GoogleDriveDownloader as gdd
    notdownloaded = 0
    for i in range(len(reportdf)):
        title = reportdf.iloc[i]['title']
        eachid = reportdf.iloc[i]['id']
        try:
            # Validation only: the value itself is not needed.
            int(title[0:6])
        except (TypeError, ValueError):
            notdownloaded = notdownloaded+1
            continue
        try:
            gdd.download_file_from_google_drive(file_id=eachid,
                                                dest_path='data/reports/'+title,
                                                unzip=False)
        except Exception:
            # Best effort: a failed download is counted, the remaining files are still tried.
            notdownloaded = notdownloaded+1
    return notdownloaded

In [13]:
def load_annotations():
    """Download any new reports, then yield the metadata of every local report.

    Yields:
        dict: the records produced by ``generate_report_meta`` for all files
        in ``data/reports/``.
    """
    download_reports(check_google())
    yield from generate_report_meta(os.listdir('data/reports/'))

In [15]:
##### Unit test

#reportdf = check_google()
#download_reports(reportdf)
dumpdir = 'data/reports/'
# os.listdir returns names in arbitrary order; sort so positional lookups
# such as filelist[0] or filelist[30] refer to the same report on every run.
filelist = sorted(os.listdir(dumpdir))
#metadict = generate_report_meta(filelist)
#for eachmeta in metadict:
#    print(eachmeta)

In [None]:
# Show the first entry of the report file listing built in an earlier cell.
print(filelist[0])

In [16]:
##### Unit test
##https://www.covid19lst.org/post/september-11-daily-covid-19-lst-report
#dt.strftime("%A, %d. %B %Y %I:%M%p")
#'Tuesday, 21. November 2006 04:30PM'

# Batch run: write one JSON metadata file per report into results/pdfminer2/.
report_pmid_df = pandas.DataFrame(columns=['_id','name','identifier','url'])
curatedByObject = generate_curator()
# NOTE(review): `author` is built here but never added to metadict below,
# unlike generate_report_meta -- confirm whether that is intended.
author = curatedByObject.copy()
author.pop("curationDate")
#curatedByObject.pop('members')
print('start time: ',datetime.now())
badpdfs = []
# Create the output folder up front so the cell also works on a fresh checkout.
os.makedirs("results/pdfminer2", exist_ok=True)

for eachfile in filelist:
    reportdate = eachfile[0:4]+'.'+eachfile[4:6]+'.'+eachfile[6:8]
    datePublished = datetime.fromisoformat(eachfile[0:4]+'-'+eachfile[4:6]+'-'+eachfile[6:8])
    name = "Covid-19 LST Report "+reportdate
    reporturl = generate_report_url(datePublished)
    report_id = 'lst'+reportdate
    pmidlist,doilist = parse_pdf(eachfile)
    if len(pmidlist)+len(doilist)==0:
        # No identifier could be extracted from this PDF.
        badpdfs.append(eachfile)
    else:
        basedOndf,missing = merge_meta(pmidlist,doilist)
        basedOndf['@type']='Publication'
        reportlinkdf = basedOndf[['_id','url']].copy()
        reportlinkdf['identifier']=report_id
        reportlinkdf['url']=reporturl
        reportlinkdf['name']=name
        report_pmid_df = pandas.concat(([report_pmid_df,reportlinkdf]),ignore_index=True)
        report_pmid_df = report_pmid_df.drop_duplicates(keep='first')
        report_pmid_df.to_csv('data/report_pmid_df.txt',sep='\t',header=True)
        save_missing(missing)
        abstract = generate_abstract(basedOndf['_id'].unique().tolist())
        metadict = {"@context": {"schema": "http://schema.org/", "outbreak": "https://discovery.biothings.io/view/outbreak/"},
                    "@type": "Publication", "journalName": "COVID-19 LST Daily Summary Reports", "journalNameAbbreviation": "covid19LST",
                    "publicationType": "Review", "license":"(CC BY-NC-SA 4.0) (http://creativecommons.org/licenses/by-nc-sa/4.0/)",
                    "_id":report_id,"curatedBy": curatedByObject,"abstract": abstract, "name": name,
                    "datePublished": datePublished.strftime("%Y-%m-%d"),"url": reporturl,
                    "isBasedOn":basedOndf.to_dict('records')}
        with open("results/pdfminer2/"+name+".json", "w") as outfile:
            json.dump(metadict, outfile)

print(len(badpdfs))
print('end time: ',datetime.now())

start time:  2021-02-26 13:33:58.419988


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


0
end time:  2021-02-26 13:40:51.126061


In [None]:
# badpdfs is empty when every report yielded identifiers, so guard the lookup
# instead of raising IndexError.
print(badpdfs[0] if badpdfs else None)

In [None]:
# Show the entry at position 30 of the report listing (needs at least 31 files in the listing).
print(filelist[30])

In [None]:
from io import StringIO
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams


# Debug cell: repeat the link/text extraction of parse_pdf on a single report
# and print the extracted text when the PDF has no link annotations.
with open(dumpdir+filelist[30], 'rb') as pdffile:
    parser = PDFParser(pdffile)
    doc = PDFDocument(parser)
    allurls = []
    pmidlist = []
    doilist = []
    for page in PDFPage.create_pages(doc):
        try:
            for annotation in page.annots:
                annotationDict = annotation.resolve()
                if "A" in annotationDict:
                    uri = annotationDict["A"]["URI"].decode('UTF-8').replace(" ", "%20")
                    allurls.append(uri)
        except Exception:
            # Skip pages whose annotations cannot be read. Catching Exception rather
            # than everything keeps kernel interrupts working.
            continue
    if len(allurls)>0:
        for eachurl in allurls:
            pmidlist,doilist = parse_urls(eachurl,pmidlist,doilist)
    if len(allurls)==0:
        output_string = StringIO()
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
        output_text = output_string.getvalue()
        print(output_text)
        #pmidlist, doilist = strip_ids_from_text(output_text)

#print(doilist)                             

In [None]:
# Reuse the notebook's text parser instead of keeping a pasted copy of its body,
# so this check cannot drift from what the pipeline actually runs.
# `output_text` is only set by the previous cell when the PDF had no link annotations.
pmidlist, doilist = strip_ids_from_text(output_text)

print(doilist)
#print(doilist)
#doistring = '"' + '","'.join(doilist) + '"'


In [None]:
# NOTE(review): `result` is only assigned in the last cell of this notebook --
# confirm which text this cell is meant to inspect.
check = result.split('\n')
dois = [x for x in check if 'doi' in x.lower()]
# `('pmid' or 'pubmed') in s` evaluates to `'pmid' in s`; test both keywords explicitly.
pmids = [x for x in check if 'pmid' in x.lower() or 'pubmed' in x.lower()]
doiline = dois[4]
doistart = doiline[doiline.find('doi'):]
doi = doistart[doistart.find('\t'):doistart.find('.\t')]
print(doi.strip())

In [None]:

# Scratch cell: send the DOI batch query by hand and print the HTTP status code.
doistring = '"' + '","'.join(doilist) + '"'
r = requests.post("https://api.outbreak.info/resources/query/", params = {'q': doistring, 'scopes': 'doi', 'fields': '_id,name,url,doi'})
print(r.status_code)

# The string literal below is never assigned or executed; it holds a disabled
# copy of the response handling found in convert_dois.
"""if r.status_code == 200:
    rawresult = pandas.read_json(r.text)
    if 'notfound' in rawresult.columns:
        check = rawresult.loc[(rawresult['notfound']==1.0)|(rawresult['notfound']==True)]
        if len(check)==len(doilist):
            cleanresult = pandas.DataFrame(columns=['_id','name','url','doi'])
            missing = doilist            
        else:
            no_dups = rawresult[rawresult['query']==rawresult['doi']]
            cleanresult = no_dups[['_id','name','url','doi']].loc[~no_dups['_id'].isin(check['_id'].tolist())].copy()
            missing = [x for x in doilist if x not in cleanresult['doi'].unique().tolist()]        
    else:
        no_dups = rawresult[rawresult['query']==rawresult['doi']]
        cleanresult = no_dups[['_id','name','url','doi']]
        missing = []
    cleanresult.drop('doi',axis=1,inplace=True)

else:
    cleanresult=[]
    missing=[]"""

# Print the quoted, comma-separated query string that was sent.
print(doistring)

In [None]:
# Scratch check: strings written to a StringIO buffer are concatenated without separators.
a = 'Howdy!'
b = 'How ya doing?'
c = "what's happening man?"
x = "Whatchu up to?"
melist = [a, b, c, x]
blank = StringIO()
blank.writelines(melist)
result = blank.getvalue()
print(result)