In [17]:
# Import packages
from pprintjson import pprintjson as ppjson 
import pandas as pd 
from habanero import Crossref
import os 
import earthpy as et 

In [18]:
# Upload USGS spreadsheet as csv to have publication DOIs readily available as well as to incororate new information to add 
# obtain pathway to spreadsheet 
os.chdir(os.path.join(et.io.HOME, "documents", "work"))
os.getcwd()

'C:\\Users\\Taylor Hunt\\documents\\work'

In [19]:
# upload file and then present as dataframe 
fname = os.path.join("doi_dr_pubs.csv")
dr_doi_pubs = pd.read_csv(fname)
dr_doi_pubs

Unnamed: 0,ID,dr_doi,primary,rel_pub_url
0,0,doi:10.5066/P90QU56J,True,https://doi.org/10.1021/acs.est.8b07227
1,1,doi:10.5066/F73R0R24,True,https://doi.org/10.1007/s13157-017-0895-3
2,2,doi:10.5066/F7VQ30RM,True,https://doi.org/10.1002/etc.3391
3,3,doi:10.5066/F71G0JF6,True,https://doi.org/10.3133/sim3378
4,4,doi:10.5066/F7571931,True,https://doi.org/10.3133/sir20155164
...,...,...,...,...
1349,1349,doi:10.5066/P9D5IP0G,True,https://doi.org/10.1186/s40462-019-0178-0
1350,1350,doi:10.5066/F7JH3KBD,True,https://doi.org/10.3133/sir20175135
1351,1351,doi:10.5066/P9V9AORH,True,https://doi.org/10.1371/journal.pone.0197584
1352,1352,doi:10.5066/P9BS882S,True,https://doi.org/10.3133/sim3423


## Column defintions:
* **dr_doi:** Data release digital object identifier from USGS (prefix is 10.5066) that is directly related to the related publication
* **rel_pub_url:** Related publication URL, the doi that is direclty related USGS data release 

In [20]:
# Code below adapted from:
# https://habanero.readthedocs.io/en/latest/modules/crossref.html#habanero.Crossref.works
cr = Crossref()

In [21]:
# Example publication: https://api.crossref.org/works/10.1021/acs.est.8b07227 
# To show the return of infortion from Crossref 
result = cr.works(ids = '10.1021/acs.est.8b07227')

In [22]:
# prettyprint is used to see a more human readable result 
ppjson(result)

{
    "status": "ok",
    "message-type": "work",
    "message-version": "1.0.0",
    "message": {
        "indexed": {
            "date-parts": [
                [
                    2020,
                    4,
                    24
                ]
            ],
            "date-time": "2020-04-24T19:23:29Z",
            "timestamp": 1587756209612
        },
        "reference-count": 67,
        "publisher": "American Chemical Society (ACS)",
        "issue": "9",
        "funder": [
            {
                "DOI": "10.13039/100007149",
                "name": "U.S. Bureau of Land Management",
                "doi-asserted-by": "publisher",
                "award": []
            },
            {
                "DOI": "10.13039/100000203",
                "name": "U.S. Geological Survey",
                "doi-asserted-by": "publisher",
                "award": []
            }
        ],
        "content-domain": {
            "domain": [],
            "crossmark-restri

In [23]:
# Most relevant information contained within "message"
ppjson(result["message"])

{
    "indexed": {
        "date-parts": [
            [
                2020,
                4,
                24
            ]
        ],
        "date-time": "2020-04-24T19:23:29Z",
        "timestamp": 1587756209612
    },
    "reference-count": 67,
    "publisher": "American Chemical Society (ACS)",
    "issue": "9",
    "funder": [
        {
            "DOI": "10.13039/100007149",
            "name": "U.S. Bureau of Land Management",
            "doi-asserted-by": "publisher",
            "award": []
        },
        {
            "DOI": "10.13039/100000203",
            "name": "U.S. Geological Survey",
            "doi-asserted-by": "publisher",
            "award": []
        }
    ],
    "content-domain": {
        "domain": [],
        "crossmark-restriction": false
    },
    "short-container-title": [
        "Environ. Sci. Technol."
    ],
    "published-print": {
        "date-parts": [
            [
                2019,
                5,
                7
       

In [24]:
# Get publication DOI from within the message 
result["message"]["DOI"]

'10.1021/acs.est.8b07227'

In [25]:
# Get publication date 
result["message"]["published-online"]["date-parts"][0]

[2019, 3, 29]

In [26]:
# To get just the year of the publication date 
result["message"]["published-online"]["date-parts"][0][0]

2019

In [27]:
# Get title of publication
result["message"]["title"][0]

'Mercury Exposure and Altered Parental Nesting Behavior in a Wild Songbird'

In [28]:
# Get publisher
result["message"]["publisher"]

'American Chemical Society (ACS)'

In [29]:
# This example contains reference count but not list of references  
result["message"]["references-count"]

67

In [30]:
# Code does not execute because this info is not available for this publication
#result["message"]["reference"]

## Example of publication with full reference list 

In [31]:
result = cr.works(ids = 'https://doi.org/10.1007/s13157-017-0895-3')

In [32]:
# This example contains the actual list of references
# Note key for last entry in reference: "key": "895_CR89"
ppjson(result["message"]["reference"])

[
    {
        "key": "895_CR1",
        "unstructured": "Baldwin BG (2002) The Jepson Desert manual: vascular plants of southeastern California. Univ of California Press, Oakland CA"
    },
    {
        "key": "895_CR2",
        "doi-asserted-by": "publisher",
        "first-page": "1533",
        "DOI": "10.1111/j.0030-1299.2008.16776.x",
        "volume": "117",
        "author": "M Bernhardt-R\u00f6mermann",
        "year": "2008",
        "unstructured": "Bernhardt-R\u00f6mermann M, R\u00f6mermann C, Nuske R, Parth A, Klotz S, Schmidt W, Stadler J (2008) On the identification of the most suitable traits for plant functional trait analyses. Oikos 117:1533\u20131541",
        "journal-title": "Oikos"
    },
    {
        "key": "895_CR3",
        "doi-asserted-by": "publisher",
        "first-page": "287",
        "DOI": "10.1007/BF00540195",
        "volume": "45",
        "author": "TW Boutton",
        "year": "1980",
        "unstructured": "Boutton TW, Harrison AT, Smith BN (

In [37]:
# In order to pull out DOIs from reference count a loop will be implemented
x = 0
for i in result["message"]["reference"]:
    try:
        print(result["message"]["reference"][x]["DOI"])
    except: 
        print("no DOI")
    x+=1

no DOI
10.1111/j.0030-1299.2008.16776.x
10.1007/BF00540195
no DOI
10.1093/treephys/24.8.891
10.1111/j.1461-0248.2009.01285.x
10.1002/ecy.1453
10.1890/07-1134.1
10.1890/0012-9658(2006)87[1465:ATTFHF]2.0.CO;2
10.1126/science.199.4335.1302
10.1038/nature16489
no DOI
10.1007/BF00347821
10.1029/2011GL050762
no DOI
10.1007/s11258-016-0628-3
no DOI
10.1086/283244
no DOI
10.1111/j.1469-185X.1977.tb01347.x
10.1890/0012-9658(2000)081[2149:COSSOP]2.0.CO;2
10.1007/s004420100628
10.1111/j.0022-0477.2004.00918.x
10.1073/pnas.0404500101
10.3133/ofr20061243
10.1371/journal.pone.0056033
10.1890/ES15-00064.1
10.1086/628592
no DOI
10.1007/s00442-008-0965-6
no DOI
10.1111/jvs.12066
10.2307/2259725
10.1111/j.1442-9993.2009.01988.x
10.1890/13-1965.1
10.1046/j.1365-2435.2002.00664.x
10.2307/2261604
10.1104/pp.16.00829
10.1016/j.tree.2003.10.002
no DOI
10.1111/j.1365-3040.2005.01433.x
10.5066/F73R0R24
no DOI
10.1890/12-0303.1
10.1111/j.1365-2427.2009.02206.x
10.1111/j.1461-0248.2010.01476.x
10.1111/j.1469-813

In [34]:
# Note that we are looking for the DOI with the prefix 10.5066, this will have to be incorporated into the loop eventually.
# First we may want to know what related publications have references attached to them and then added to the new spreadsheet

In [35]:
# This will be done via funtion and list comprehension 
# in this function it is asking each related publication url if there is a reference/citations list attached in Crossref api  
# the try is asking to query doi information from crossref
# if there publication DOI isnt able to be queried then the return will be none  
# If  the result can be queried then it will give a true or false statement on whether there is a reference list attahced    
def crossref_has_citations(doi):
    try: 
        result = cr.works(ids = doi)
    except:
        return None
    return "reference" in result["message"]

In [36]:
crossref_has_citations("https://doi.org/10.1021/acs.est.8b07227")

False

In [38]:
# list comprehension examples
[i for i in range(10)]

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [39]:
# list comprehension examples 
[i*2 for i in range(10)]

[0, 2, 4, 6, 8, 10, 12, 14, 16, 18]

In [41]:
# Now recalling the USGS spread sheet dr_doi_pubs to implement function and list comphension for the column rel_pub_url
[doi for doi in dr_doi_pubs["rel_pub_url"]]

['https://doi.org/10.1021/acs.est.8b07227',
 'https://doi.org/10.1007/s13157-017-0895-3',
 'https://doi.org/10.1002/etc.3391',
 'https://doi.org/10.3133/sim3378',
 'https://doi.org/10.3133/sir20155164',
 'https://doi.org/10.1186/s40317-018-0145-3',
 'https://doi.org/10.5066/P9MR4XN4',
 'https://doi.org/10.1002/ece3.5146',
 'https://doi.org/10.1016/j.envpol.2017.08.070',
 'https://doi.org/10.1002/eap.1912',
 'https://doi.org/10.1002/ecs2.2095',
 'https://doi.org/10.1021/acs.est.7b04076',
 'https://doi.org/10.3996/042016-JFWM-029',
 'https://doi.org/10.3133/sir20175064',
 'https://doi.org/10.1177/0361198118822821',
 'https://doi.org/10.3133/ds1048',
 'https://doi.org/10.1002/ecs2.1582',
 'https://doi.org/10.3133/ofr20191054',
 'https://doi.org/10.1111/1365-2745.12901',
 'https://doi.org/10.3133/ofr20181123',
 'https://doi.org/10.1111/cobi.12569',
 'https://doi.org/10.1002/nafm.10373',
 'https://doi.org/10.1016/j.ecss.2018.01.006',
 'https://doi.org/10.1111/1752-1688.12603',
 'https://doi

In [42]:
# now calling the function to the list comprehension 
[crossref_has_citations(doi) for doi in dr_doi_pubs["rel_pub_url"]]

[False,
 True,
 True,
 False,
 False,
 True,
 None,
 True,
 False,
 True,
 True,
 False,
 False,
 False,
 True,
 False,
 True,
 False,
 True,
 False,
 True,
 True,
 False,
 True,
 True,
 True,
 False,
 True,
 False,
 False,
 False,
 False,
 True,
 True,
 False,
 True,
 True,
 False,
 True,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 True,
 True,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 True,
 True,
 None,
 True,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 None,
 False,
 False,
 False,
 False,
 False,
 True,
 True,
 True,
 False,
 False,
 True,
 False,
 True,
 True,
 False,
 False,
 False,
 False,
 False,
 True,
 True,
 False,
 False,
 True,
 False,
 False,
 False,
 True,
 True,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 True,
 False,
 False,
 True,
 True,
 True,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 True,
 True,
 False,
 False,
 True,
 True,
 True,
 True,
 False,
 False,
 True,
 True,


In [43]:
# with this new information, it now needs to be added to the spread sheet 
dr_doi_pubs["rel_pub_has_ref"] = [crossref_has_citations(doi) for doi in dr_doi_pubs["rel_pub_url"]]

In [44]:
# updated spreadsheet
dr_doi_pubs

Unnamed: 0,ID,dr_doi,primary,rel_pub_url,rel_pub_has_ref
0,0,doi:10.5066/P90QU56J,True,https://doi.org/10.1021/acs.est.8b07227,False
1,1,doi:10.5066/F73R0R24,True,https://doi.org/10.1007/s13157-017-0895-3,True
2,2,doi:10.5066/F7VQ30RM,True,https://doi.org/10.1002/etc.3391,True
3,3,doi:10.5066/F71G0JF6,True,https://doi.org/10.3133/sim3378,False
4,4,doi:10.5066/F7571931,True,https://doi.org/10.3133/sir20155164,False
...,...,...,...,...,...
1349,1349,doi:10.5066/P9D5IP0G,True,https://doi.org/10.1186/s40462-019-0178-0,True
1350,1350,doi:10.5066/F7JH3KBD,True,https://doi.org/10.3133/sir20175135,False
1351,1351,doi:10.5066/P9V9AORH,True,https://doi.org/10.1371/journal.pone.0197584,True
1352,1352,doi:10.5066/P9BS882S,True,https://doi.org/10.3133/sim3423,False
