
# Green OA Dataset Policy Finder Data

Pulls in CSV created through API harvest so you can goof around with it. Needs **raw** urls to CSV file of data hosted in [repository](https://github.com/elibtronic/green_oa_compiler/) to start.

Uses [Policy Finder](https://openpolicyfinder.jisc.ac.uk/sherpa_legacy_api.pdf) to create a dataset enriched with policy finder info and writes out to csv file.

In [1]:
# @title URL of CSV file
csv_url = "https://raw.githubusercontent.com/elibtronic/green_oa_compiler/refs/heads/main/Brock_ORCID_Harvest_Start_Date_2025-01-01_to_2025-07-21.csv" # @param {"type":"string","placeholder":"https://raw.githubusercontent.com/elibtronic/green_oa_compiler/refs/heads/main/Brock_ORCID_Harvest_Start_Date_2025-01-01_to_2025-07-21.csv"}
pf_key = "" # @param {"type":"string"}

import pandas as pd
import requests
import json
import pprint

from google.colab import files
from urllib.parse import urlparse
doi_df = pd.read_csv(csv_url)

#need to write it out, ignoring index
doi_df = doi_df.drop('Unnamed: 0', axis=1)


#Normalize the LICENSE URL column?
standard_url = []
for key, value in doi_df.iterrows():

  try:
    standard_url.append(urlparse(value['LICENSE_URL']).netloc)
  except:
    standard_url.append("No URL retrieved")

doi_df['standard_url'] = standard_url


# Policy Finder Augementation

Next couple of cells will attempt to get all of the policy finder information via API and join to publications

In [107]:
def pf_fetch(pf_key,issn):
  '''Returns JSON from Policy Finder given an ISSN and API Key'''
  paip_options = {
      "item-type" : "publication",
      "api-key" : pf_key,
      "format" : "Json",
  }

  paip_options['identifier'] = issn

  requests.packages.urllib3.disable_warnings()
  api_blob = requests.get("https://v2.sherpa.ac.uk/cgi/retrieve_by_id",params=paip_options,verify=False).json()

  if api_blob['items'] == []:
    return None
  else:
    return api_blob['items'][0]

In [None]:
#through the dataframe

list_uri = []
list_submitted_oa_policy = []
list_accepted_oa_policy = []
list_published_oa_policy = []
count = 0

for key, value in doi_df.iterrows():

  count += 1
  if count % 50 == 0:
    print(count)

  if value['issn'] == "No ISSN found":
    print("no issn")
    list_uri.append("None")
    list_submitted_oa_policy.append("None")
    list_accepted_oa_policy.append("None")
    list_published_oa_policy.append("None")
  else:
    #print(value['issn'])
    par = pf_fetch(pf_key,value['issn'])

    #No policy found in otherwords
    if par == None:
      list_uri.append("None")
      list_submitted_oa_policy.append("None")
      list_accepted_oa_policy.append("None")
      list_published_oa_policy.append("None")
    else:
      policy_uri = par['system_metadata']['uri']
      accepted_oa_policy = ""
      published_oa_policy = ""
      submitted_oa_policy = ""
      for pub_policy in par['publisher_policy']:
        for per_oa in pub_policy['permitted_oa']:
          try:
            #Only keep ones that have IR as option, for first pass
            if 'institutional_repository' in per_oa['location']['location']:
              #accepted policy first
              if per_oa['article_version'][0] == 'accepted':
                try:
                  conditions_text = " ".join(per_oa['conditions'])
                except:
                  conditions_text = "no conditions"
                if per_oa['additional_oa_fee'] == "no":
                  fee_text = "fee_no"
                else:
                  fee_text = "fee_yes"
                per_id = str(per_oa['id'])
                accepted_oa_policy += per_id+":"+fee_text+":"+conditions_text+";"

              #published
              elif per_oa['article_version'][0] == 'published':
                try:
                  conditions_text = " ".join(per_oa['conditions'])
                except:
                  conditions_text = "no conditions"
                if per_oa['additional_oa_fee'] == "no":
                  fee_text = "fee_no"
                else:
                  fee_text = "fee_yes"
                per_id = str(per_oa['id']).strip()
                published_oa_policy += per_id+":"+fee_text+":"+conditions_text+";"

              #submitted
              elif per_oa['article_version'][0] == 'submitted':
                try:
                  conditions_text = " ".join(per_oa['conditions'])
                except:
                  conditions_text = "no conditions"
                if per_oa['additional_oa_fee'] == "no":
                  fee_text = "fee_no"
                else:
                  fee_text = "fee_yes"
                per_id = str(per_oa['id']).strip()
                submitted_oa_policy += per_id+":"+fee_text+":"+conditions_text+";"
          except:
            print("error")
            policy_uri = "error"
            accepted_oa_policy = "error"
            published_oa_policy = "error"
            submitted_oa_policy = "error"

        if submitted_oa_policy == "":
          submitted_oa_policy = "None"
        if accepted_oa_policy == "":
          accepted_oa_policy = "None"
        if published_oa_policy == "":
          published_oa_policy = "None"

      #print("\n")
      #print(policy_uri)
      list_uri.append(policy_uri)
      #print(submitted_oa_policy)
      list_submitted_oa_policy.append(submitted_oa_policy)
      #print(accepted_oa_policy)
      list_accepted_oa_policy.append(accepted_oa_policy)
      #print(published_oa_policy)
      list_published_oa_policy.append(published_oa_policy)


In [136]:
# add these new columns to the df
doi_df['policy_uri'] = list_uri
doi_df['submitted_oa_policy'] = list_submitted_oa_policy
doi_df['accepted_oa_policy'] = list_accepted_oa_policy
doi_df['published_oa_policy'] = list_published_oa_policy

#DOI data as CSV
stump = csv_url.split("/")[-1][0:-4]
csv_file_name = stump + "_policy_data.csv"

print("Saving to file: "+csv_file_name)
doi_df.to_csv(csv_file_name,index=False)
files.download(csv_file_name)

In [137]:
doi_df

Unnamed: 0,ORCID,DOI,issn,LICENSE_URL,standard_url,policy_uri,submitted_oa_policy,accepted_oa_policy,published_oa_policy
0,0000-0002-2899-0048,10.1139/cjps-2024-0160,0008-4220,https://creativecommons.org/licenses/by/4.0/de...,creativecommons.org,https://v2.sherpa.ac.uk/id/publication/4106,8195:fee_no:no conditions;,3875:fee_no:Publisher copyright and source mus...,
1,0000-0001-6297-4298,10.1108/JHOM-02-2023-0036,1477-7266,https://www.emerald.com/insight/site-policies,www.emerald.com,https://v2.sherpa.ac.uk/id/publication/2842,53:fee_no:Academic Social Networks / Scholarly...,54:fee_no:Academic Social Network/ Scholarly C...,
2,0009-0001-0969-055X,10.55982/openpraxis.17.2.832,2304-070X,No URL retrieved,,https://v2.sherpa.ac.uk/id/publication/25564,,,
3,0000-0002-3772-5561,10.1123/jpah.2024-0225,1543-3080,No URL retrieved,,https://v2.sherpa.ac.uk/id/publication/12252,,3849:fee_no:Publisher copyright and source mus...,
4,0009-0009-2327-7890,10.1002/fer3.48,2835-9402,http://creativecommons.org/licenses/by/4.0/,creativecommons.org,https://v2.sherpa.ac.uk/id/publication/45752,8051:fee_no:Publisher source must be acknowled...,,
...,...,...,...,...,...,...,...,...,...
845,0000-0002-0048-491X,10.1080/030144600419305,0301-4460,No URL retrieved,,https://v2.sherpa.ac.uk/id/publication/763,,,
846,0000-0002-0048-491X,10.1002/(SICI)1520-6300(200005/06)12:3<395::AI...,1042-0533,http://doi.wiley.com/10.1002/tdm_license_1.1,doi.wiley.com,https://v2.sherpa.ac.uk/id/publication/4671,11204:fee_no:Must acknowledge acceptance for p...,,
847,0000-0002-0048-491X,10.1123/PES.11.3.208,0899-8493,No URL retrieved,,https://v2.sherpa.ac.uk/id/publication/12259,,3849:fee_no:Publisher copyright and source mus...,
848,0000-0002-0048-491X,10.1093/AJCN/69.6.1123,0002-9165,https://www.elsevier.com/tdm/userlicense/1.0/,www.elsevier.com,https://v2.sherpa.ac.uk/id/publication/6233,,,10952:fee_yes:Published source must be acknowl...



# Stop here!

In [117]:
# @title Through a random list
publications = ["2304-070X", "0002-9165", "0301-5548"]

list_uri = []
list_submitted_oa_policy = []
list_accepted_oa_policy = []
list_published_oa_policy = []

for pub in publications:
  #print("\n",pub)
  par = pf_fetch(pf_key,pub)

  #No policy found in otherwords
  if par == None:
    list_uri.append("None")
    list_submitted_oa_policy.append("None")
    list_accepted_oa_policy.append("None")
    list_published_oa_policy.append("None")
  else:
    policy_uri = par['system_metadata']['uri']
    accepted_oa_policy = ""
    published_oa_policy = ""
    submitted_oa_policy = ""
    for pub_policy in par['publisher_policy']:
      for per_oa in pub_policy['permitted_oa']:

        #Only keep ones that have IR as option, for first pass
        if 'institutional_repository' in per_oa['location']['location']:
          #accepted policy first
          if per_oa['article_version'][0] == 'accepted':
            conditions_text = " ".join(per_oa['conditions'])
            if per_oa['additional_oa_fee'] == "no":
              fee_text = "fee_no"
            else:
              fee_text = "fee_yes"
            per_id = str(per_oa['id'])
            accepted_oa_policy += per_id+":"+fee_text+":"+conditions_text+";"

          #published
          elif per_oa['article_version'][0] == 'published':
            conditions_text = " ".join(per_oa['conditions'])
            if per_oa['additional_oa_fee'] == "no":
              fee_text = "fee_no"
            else:
              fee_text = "fee_yes"
            per_id = str(per_oa['id']).strip()
            published_oa_policy += per_id+":"+fee_text+":"+conditions_text+";"

          #submitted
          elif per_oa['article_version'][0] == 'submitted':
            conditions_text = " ".join(per_oa['conditions'])
            if per_oa['additional_oa_fee'] == "no":
              fee_text = "fee_no"
            else:
              fee_text = "fee_yes"
            per_id = str(per_oa['id']).strip()
            submitted_oa_policy += per_id+":"+fee_text+":"+conditions_text+";"

      if submitted_oa_policy == "":
        submitted_oa_policy = "None"
      if accepted_oa_policy == "":
        accepted_oa_policy = "None"
      if published_oa_policy == "":
        published_oa_policy = "None"

    #print("\n")
    #print(policy_uri)
    list_uri.append(policy_uri)
    #print(submitted_oa_policy)
    list_submitted_oa_policy.append(submitted_oa_policy)
    #print(accepted_oa_policy)
    list_accepted_oa_policy.append(accepted_oa_policy)
    #print(published_oa_policy)
    list_published_oa_policy.append(published_oa_policy)




In [None]:
# @title Single ISSN workflow

par = pf_fetch(pf_key,"0002-9165")
policy_uri = par['system_metadata']['uri']
accepted_oa_policy = ""
published_oa_policy = ""
submitted_oa_policy = ""
for pub_policy in par['publisher_policy']:
  for per_oa in pub_policy['permitted_oa']:

    #Only keep ones that have IR as option, for first pass
    if 'institutional_repository' in per_oa['location']['location']:
      #accepted policy first
      if per_oa['article_version'][0] == 'accepted':
        conditions_text = " ".join(per_oa['conditions'])
        if per_oa['additional_oa_fee'] == "no":
          fee_text = "fee_no"
        else:
          fee_text = "fee_yes"
        per_id = str(per_oa['id'])
        accepted_oa_policy += per_id+":"+fee_text+":"+conditions_text+";"

      #published
      elif per_oa['article_version'][0] == 'published':
        conditions_text = " ".join(per_oa['conditions'])
        if per_oa['additional_oa_fee'] == "no":
          fee_text = "fee_no"
        else:
          fee_text = "fee_yes"
        per_id = str(per_oa['id']).strip()
        published_oa_policy += per_id+":"+fee_text+":"+conditions_text+";"

      #submitted
      elif per_oa['article_version'][0] == 'submitted':
        conditions_text = " ".join(per_oa['conditions'])
        if per_oa['additional_oa_fee'] == "no":
          fee_text = "fee_no"
        else:
          fee_text = "fee_yes"
        per_id = str(per_oa['id']).strip()
        submitted_oa_policy += per_id+":"+fee_text+":"+conditions_text+";"

  if submitted_oa_policy == "":
    submitted_oa_policy = "None"
  if accepted_oa_policy == "":
    accepted_oa_policy = "None"
  if published_oa_policy == "":
    published_oa_policy = "None"

print("\n")
print(policy_uri)
print(submitted_oa_policy)
print(accepted_oa_policy)
print(published_oa_policy)
