# Green OA Compiler

Builds a dataframe of ORCID iDs and DOIs for journal articles from a chosen start date onward (the filter uses each ORCID work's last-modified date). Requires an ORCID API client ID and secret. The results are saved as CSV files that should be added to the GitHub [repository](https://github.com/elibtronic/green_oa_compiler/). Any further analysis happens in the [Analysis](https://colab.research.google.com/drive/1zXmJZbwI8q5Ob5Pwcsx-3pIyhhSwpXNT#scrollTo=lXJDNI-cej1E) notebook.

This notebook makes no modifications to the data; it only pulls the data down and exports it.

In [None]:
!pip install orcid
!pip install crossrefapi

import requests
import json
import orcid
import pprint
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

from crossref.restful import Works
from google.colab import files
from datetime import datetime

# Show every row when a dataframe is displayed instead of truncating it.
pd.options.display.max_rows = None

# Confirmation that the setup cell ran to the end.
print("\nReady to proceed!")


Ready to proceed!


In [None]:
# @title ORCID List Creation Parameters {"vertical-output":true,"display-mode":"form"}
Client_ID = "" # @param {"type":"string"}
Client_Secret = "" # @param {"type":"string"}

# @markdown Affiliation search string

s_query = "current-institution-affiliation-name:(\"Brock University\")"#@param {type:"string"}


# @markdown Date to start harvest from?
Start_Date = "2025-01-01" # @param {"type":"date"}
# Parsed into a naive datetime (midnight of Start_Date); the harvest cell
# compares each work's timestamp against this value.
s_date = datetime.strptime(Start_Date, '%Y-%m-%d')


# @markdown Refresh list of ORCIDs?
refresh_orcid = False # @param {"type":"boolean","placeholder":"True"}

# Fail early with a readable message: blank credentials would otherwise only
# surface as an error raised from inside the token request below.
if not Client_ID.strip() or not Client_Secret.strip():
  raise ValueError("Client_ID and Client_Secret must both be filled in before running this cell.")

# Public (read-only) ORCID API client against the production registry.
api = orcid.PublicAPI(Client_ID, Client_Secret, sandbox=False)
# Token reused by the later cells for both searching and reading records.
search_token = api.get_search_token_from_orcid()

print("Options set and API connections completed!")

Options set and API connections completed!


In [None]:
# Collect the ORCID iDs to harvest: either run a fresh affiliation search
# against the ORCID API, or reuse the list previously saved to the GitHub
# repository.
# TODO (carried over from the original notes): also check for a verified
# brocku.ca email address; this is not implemented here.
if refresh_orcid:
  b_members = [
    hit['orcid-identifier']['path']
    for hit in api.search_generator(s_query, access_token=search_token)
  ]
else:
  b_members_df = pd.read_csv("https://github.com/elibtronic/green_oa_compiler/raw/refs/heads/main/ORCID_List.csv")
  b_members = b_members_df['ORCID'].tolist()

print("ORCID API list constructed")

ORCID API list constructed


In [None]:
# Harvest journal-article DOIs for every ORCID iD in b_members.
#
# For each ORCID record: read its works, keep the work groups modified on or
# after s_date, and for every JOURNAL_ARTICLE summary with a DOI look the DOI
# up in Crossref to pull the first license URL and the first ISSN.
# Result: harvested_j_works, a list of [orcid, doi, issn, license_url] rows.
harvested_j_works = []
works = Works()

# Crossref responses keyed by DOI, so a paper listed on several ORCID records
# (co-authors) is only fetched once.
crossref_cache = {}

print("Retrieving works from ",len(b_members)," ORCID records")
prog = 0
for b_orcid in b_members:

  # Progress ticker: a counter every 50 records, a line break every 500.
  if prog % 50 == 0:
    print(str(prog)+"..", end = "")
  if prog % 500 == 0:
    print("\n")
  prog += 1

  works_list = api.read_record_public(b_orcid, 'works',search_token)
  for w in works_list['group']:

    # NOTE(review): this filters on the ORCID group's last-modified date
    # (milliseconds since the epoch), not on the publication date; confirm
    # that this is the intended meaning of Start_Date.
    date_work = datetime.fromtimestamp(w['last-modified-date']['value'] / 1000)
    if date_work < s_date:
      continue

    # `or []` / `or {}` guard against keys that are present but null.
    for ws in w.get("work-summary") or []:
      if ws.get('type') != 'JOURNAL_ARTICLE':
        continue

      eids = ws.get('external-ids') or {}
      for e in eids.get('external-id') or []:
        if e.get('external-id-type') != 'doi':
          continue

        doi = e.get('external-id-value')
        if doi not in crossref_cache:
          crossref_cache[doi] = works.doi(doi)
        item = crossref_cache[doi]

        # TypeError covers the case where Crossref returned no record (None);
        # KeyError/IndexError cover a record without that field.
        try:
          license_url = item['license'][0]['URL']
        except (KeyError, IndexError, TypeError):
          license_url = "No URL retrieved"

        try:
          issn = item['ISSN'][0]
        except (KeyError, IndexError, TypeError):
          issn = "No ISSN found"

        harvested_j_works.append([b_orcid,doi,issn,license_url])

# Printed once, after the whole list has been processed.
print("Done harvesting.")


Retriving works from  785  ORCID records
0..10..20..30..40..50..60..70..80..90..100..110..120..130..140..150..160..170..180..190..200..210..220..230..240..250..260..270..280..290..300..310..320..330..340..350..360..370..380..390..400..410..420..430..440..450..460..470..480..490..500..510..520..530..540..550..560..570..580..590..600..610..620..630..640..650..660..670..680..690..700..710..720..730..740..750..760..770..780..

In [None]:
# Turn the harvested rows into a dataframe, keep one row per DOI, and preview it.
#
# Column names are passed to the constructor so that an empty harvest still
# yields a frame with the expected columns instead of failing on a column
# count mismatch. drop_duplicates keeps the first row seen for each DOI, so
# only the first ORCID iD harvested for a shared paper is retained.
df = (
  pd.DataFrame(harvested_j_works, columns=['ORCID',"DOI","issn","LICENSE_URL"])
  .drop_duplicates(subset=['DOI'])
  .reset_index(drop=True)
)

# sample() raises when asked for more rows than exist, so cap the preview size.
df.sample(min(10, len(df)))


# Export data

In [None]:
# Export the harvested DOI data as a CSV named after the harvest window
# (Start_Date through today's date) and hand it to the browser for download.
harvest_end = datetime.today().strftime('%Y-%m-%d')
csv_file_name = f"Brock_ORCID_Harvest_Start_Date_{Start_Date}_to_{harvest_end}.csv"
print(f"Saving to file: {csv_file_name}")
df.to_csv(csv_file_name)
files.download(csv_file_name)

# Export the ORCID iD list that was used, as a single-column CSV without the index.
orcid_list_file = "ORCID_List.csv"
b_members_df = pd.DataFrame(b_members)
b_members_df.columns = ['ORCID']
b_members_df.to_csv(orcid_list_file, index=False)
files.download(orcid_list_file)


Saving to file: Brock_ORCID_Harvest_Start_Date_2025-01-01_to_2025-07-21.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>