In [1]:
from pprintjson import pprintjson as ppjson
import pandas as pd 
from habanero import Crossref
import numpy as np 
import os

In [2]:
fname = os.path.join("doi_dr_pubs.csv")

In [3]:
dr_doi_pubs = pd.read_csv(fname)

In [4]:
# This is the spreadsheet with the information that is being drawn from 
dr_doi_pubs

Unnamed: 0,ID,dr_doi,primary,rel_pub_url
0,0,doi:10.5066/P90QU56J,True,https://doi.org/10.1021/acs.est.8b07227
1,1,doi:10.5066/F73R0R24,True,https://doi.org/10.1007/s13157-017-0895-3
2,2,doi:10.5066/F7VQ30RM,True,https://doi.org/10.1002/etc.3391
3,3,doi:10.5066/F71G0JF6,True,https://doi.org/10.3133/sim3378
4,4,doi:10.5066/F7571931,True,https://doi.org/10.3133/sir20155164
...,...,...,...,...
1349,1349,doi:10.5066/P9D5IP0G,True,https://doi.org/10.1186/s40462-019-0178-0
1350,1350,doi:10.5066/F7JH3KBD,True,https://doi.org/10.3133/sir20175135
1351,1351,doi:10.5066/P9V9AORH,True,https://doi.org/10.1371/journal.pone.0197584
1352,1352,doi:10.5066/P9BS882S,True,https://doi.org/10.3133/sim3423


In [5]:
cr = Crossref()

In [6]:
def get_crossref_data(row):
    """ Get data from crossref in order to get information on data 
    citation practices. This is done by taking the related publicaton url 
    (column rel_pub_url) and searching the for the data released DOI (column dr_doi) within the references 
    of the related publication url by row. 
    
    Parameters
    ----------
    Args: one row from a data frame
   
    Returns: a Series containing data from crossref
    """
    try:
        result = cr.works(ids = row["rel_pub_url"])
    except:
        return None
    try:
        date_parts = result["message"].get("published-online")["date-parts"][0][0]
    except:
        date_parts = None
# First line takes out doi: in order to run just numeric DOI from dr_doi column 
# Second line is pulling DOIs from references within the message of the rel_pub_url DOIs 
# Third line is taking the data release DOIs and searching for them within the set of DOIs pullled from the rel_pub_url 
    if "reference" in result["message"]:
        dr_doi = row["dr_doi"].replace("doi:", "") 
        dois = [r.get("DOI") for r in result["message"]["reference"]]
        dr_doi_cited = dr_doi in set(dois)
    else: 
        dr_doi_cited = None 
        
    data_to_return = {
        "has_refs": "reference" in result["message"],
        "has_title": "title" in result["message"],
        "title": result["message"].get("title")[0],
        "publisher": result["message"].get("publisher"),
        "pub_year": date_parts,
        "dr_doi_cited": dr_doi_cited,
        "rel_pub_url": row["rel_pub_url"],
        "dr_doi": row["dr_doi"]
       
    }
    return pd.Series(data_to_return) 

In [7]:
dr_doi_pubs.apply(get_crossref_data, axis=1)

Unnamed: 0,has_refs,has_title,title,publisher,pub_year,dr_doi_cited,rel_pub_url,dr_doi
0,False,True,Mercury Exposure and Altered Parental Nesting ...,American Chemical Society (ACS),2019.0,,https://doi.org/10.1021/acs.est.8b07227,doi:10.5066/P90QU56J
1,True,True,Changes in Community-Level Riparian Plant Trai...,Springer Science and Business Media LLC,2017.0,True,https://doi.org/10.1007/s13157-017-0895-3,doi:10.5066/F73R0R24
2,True,True,Spatial and temporal variation in microcystin ...,Wiley,2016.0,False,https://doi.org/10.1002/etc.3391,doi:10.5066/F7VQ30RM
3,False,True,Hydrogeologic characteristics and geospatial a...,US Geological Survey,2017.0,,https://doi.org/10.3133/sim3378,doi:10.5066/F71G0JF6
4,False,True,"Volcanic aquifers of Hawai‘i—Hydrogeology, wat...",US Geological Survey,2018.0,,https://doi.org/10.3133/sir20155164,doi:10.5066/F7571931
...,...,...,...,...,...,...,...,...
1349,True,True,Tropical cyclones alter short-term activity pa...,Springer Science and Business Media LLC,2019.0,False,https://doi.org/10.1186/s40462-019-0178-0,doi:10.5066/P9D5IP0G
1350,False,True,"Hydrogeology of, simulation of groundwater flo...",US Geological Survey,2018.0,,https://doi.org/10.3133/sir20175135,doi:10.5066/F7JH3KBD
1351,True,True,Isotope niche dimension and trophic overlap be...,Public Library of Science (PLoS),2018.0,False,https://doi.org/10.1371/journal.pone.0197584,doi:10.5066/P9V9AORH
1352,False,True,Delineation of selected lithologic units using...,US Geological Survey,2019.0,,https://doi.org/10.3133/sim3423,doi:10.5066/P9BS882S


In [8]:
# Add data to the spreadsheet 
dr_doi_pubs = dr_doi_pubs.apply(get_crossref_data, axis=1)

In [9]:
# This line converts the pub_year column from a float numerical value to integer value 
dr_doi_pubs["pub_year"] = dr_doi_pubs["pub_year"].astype('Int64')
dr_doi_pubs

Unnamed: 0,has_refs,has_title,title,publisher,pub_year,dr_doi_cited,rel_pub_url,dr_doi
0,False,True,Mercury Exposure and Altered Parental Nesting ...,American Chemical Society (ACS),2019,,https://doi.org/10.1021/acs.est.8b07227,doi:10.5066/P90QU56J
1,True,True,Changes in Community-Level Riparian Plant Trai...,Springer Science and Business Media LLC,2017,True,https://doi.org/10.1007/s13157-017-0895-3,doi:10.5066/F73R0R24
2,True,True,Spatial and temporal variation in microcystin ...,Wiley,2016,False,https://doi.org/10.1002/etc.3391,doi:10.5066/F7VQ30RM
3,False,True,Hydrogeologic characteristics and geospatial a...,US Geological Survey,2017,,https://doi.org/10.3133/sim3378,doi:10.5066/F71G0JF6
4,False,True,"Volcanic aquifers of Hawai‘i—Hydrogeology, wat...",US Geological Survey,2018,,https://doi.org/10.3133/sir20155164,doi:10.5066/F7571931
...,...,...,...,...,...,...,...,...
1349,True,True,Tropical cyclones alter short-term activity pa...,Springer Science and Business Media LLC,2019,False,https://doi.org/10.1186/s40462-019-0178-0,doi:10.5066/P9D5IP0G
1350,False,True,"Hydrogeology of, simulation of groundwater flo...",US Geological Survey,2018,,https://doi.org/10.3133/sir20175135,doi:10.5066/F7JH3KBD
1351,True,True,Isotope niche dimension and trophic overlap be...,Public Library of Science (PLoS),2018,False,https://doi.org/10.1371/journal.pone.0197584,doi:10.5066/P9V9AORH
1352,False,True,Delineation of selected lithologic units using...,US Geological Survey,2019,,https://doi.org/10.3133/sim3423,doi:10.5066/P9BS882S
