<a href="https://colab.research.google.com/github/elibtronic/2025_state_of_scholarly_communication/blob/main/YEAR_Green_OA_Machine_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 2025 Green OA Machine Dataset creation

This notebook opens the dataset created in the **SofSC** notebook and
identifies:

- `total_eligible_authors` - all authors with a free pathway
- `authors_to_contact` - all authors with a free pathway AND (listed as first OR corresponding author)

Uses the ORCID API to retrieve each author's name. Each dataset can optionally be saved as a CSV file.


In [None]:
# @title Dataset Details

!pip install orcid
import orcid
import pandas as pd
from google.colab import files

# @markdown YEAR to tag this dataset with
YEAR = "2025" # @param {"type":"string"}

# @markdown Dataset completed by the SofSC process
url_of_dataset = "https://raw.githubusercontent.com/elibtronic/2025_state_of_scholarly_communication/refs/heads/main/Brock_OpenAlex_Harvest_Start_Date_2024-07-01_to_2025-06-30_retrieved_2025-10-03.csv" # @param {"type":"string"}

# @markdown URL 'stub' of the Share Your Papers form

url_syp = "https://researchguides.library.brocku.ca/getting-your-research-out/green-open-access"  # @param {"type":"string"}


# @markdown ORCID API details

Client_ID = "" # @param {"type":"string"}
Client_Secret = "" # @param {"type":"string"}
search_orcid = "" # @param {"type":"string"}

api = orcid.PublicAPI(Client_ID, Client_Secret, sandbox=False)
search_token = api.get_search_token_from_orcid()

print("\nDataset Details...")
df = pd.read_csv(url_of_dataset)
#Gah!
df = df.fillna("None")
print("Total items: ",len(df))

total_eligible_authors = df[df.eval('oa_status == "closed"')]
total_eligible_authors = total_eligible_authors[total_eligible_authors['submitted_oa_policy'].str.contains('fee_no', case=False) | total_eligible_authors['accepted_oa_policy'].str.contains('fee_no', case=False) | total_eligible_authors['published_oa_policy'].str.contains('fee_no', case=False)]
print("Total Eligible Authors:",len(total_eligible_authors))


authors_to_contact = df[df.eval('(corresponding_author == "yes") or (author_position == "first")')]
authors_to_contact = authors_to_contact[authors_to_contact.eval('already_oa == "no"')]
authors_to_contact = authors_to_contact[authors_to_contact['submitted_oa_policy'].str.contains('fee_no', case=False) | authors_to_contact['accepted_oa_policy'].str.contains('fee_no', case=False) | authors_to_contact['published_oa_policy'].str.contains('fee_no', case=False)]
print("Total items we can contact: ",len(authors_to_contact))

In [None]:
# Rows whose work is not already OA and that have at least one policy column
# (submitted / accepted / published) mentioning 'fee_no', case-insensitively.
not_already_oa = df[df['already_oa'] == "no"]
has_free_pathway = (
    not_already_oa['submitted_oa_policy'].str.contains('fee_no', case=False)
    | not_already_oa['accepted_oa_policy'].str.contains('fee_no', case=False)
    | not_already_oa['published_oa_policy'].str.contains('fee_no', case=False)
)

# .copy() makes this an independent frame, so adding the 'name' and
# 'form_url' columns later does not raise pandas' SettingWithCopyWarning.
total_eligible_authors = not_already_oa[has_free_pathway].copy()

total_eligible_authors.head(10)


In [None]:
# Random five-row preview of the complete harvested dataset
df.sample(n=5)

In [None]:
# Random preview of the items that are safe to contact authors about.
# The sample size is capped at the number of rows: DataFrame.sample raises
# ValueError when asked for more rows than exist (replace=False).
authors_to_contact.sample(min(5, len(authors_to_contact)))

## Augment Authors to Contact

In [None]:
# Add two columns to authors_to_contact:
#   name     - "family , given" fetched from the public ORCID API,
#              or "No ORCID" when the lookup does not succeed
#   form_url - Share Your Papers form link pre-populated with the DOI

name_cache = {}  # ORCID URL -> name, so each author is fetched only once
name_list = []

for orcid_url in authors_to_contact["ORCID"]:
  if orcid_url not in name_cache:
    try:
      # "https://orcid.org/0000-0000-0000-0000" -> "0000-0000-0000-0000".
      # Named orcid_id so the imported `orcid` module is not shadowed.
      orcid_id = orcid_url.split("/")[3]
      o_data = api.read_record_public(orcid_id, '', search_token)
      person_name = o_data['person']['name']
      name_cache[orcid_url] = person_name['family-name']['value'] + " , " + person_name['given-names']['value']
    except Exception:
      # Best effort: a malformed/missing ORCID, a record without a public
      # name, or an API error all fall back to the placeholder.
      # `except Exception` (not a bare except) still lets a manual kernel
      # interrupt stop this long-running loop.
      name_cache[orcid_url] = "No ORCID"

  name_list.append(name_cache[orcid_url])

#full URL for pre-popped form
pre_pop_list = []

for doi in authors_to_contact["DOI"]:
  # Keep everything after the last "doi.org/"; a value without that prefix
  # is used unchanged instead of raising IndexError.
  doi_trimmed = str(doi).split("doi.org/")[-1]
  if doi_trimmed and doi_trimmed != "None":
    pre_pop_list.append(url_syp + "?doi=" + doi_trimmed)
  else:
    # "None" is the placeholder fillna() wrote for a missing DOI: link to
    # the bare form rather than pre-populating it with a bogus value.
    pre_pop_list.append(url_syp)

# .assign returns a new frame, which avoids SettingWithCopyWarning when
# authors_to_contact is a filtered slice of df.
authors_to_contact = authors_to_contact.assign(name=name_list, form_url=pre_pop_list)
print("done")

In [None]:
# Random preview of authors_to_contact.
# The sample size is capped at the number of rows: DataFrame.sample raises
# ValueError when asked for more rows than exist (replace=False).
authors_to_contact.sample(min(5, len(authors_to_contact)))

In [None]:
# Write authors_to_contact to a YEAR-tagged CSV (without the index column)
# and pass the file to Colab's files.download
csv_file_name = f"{YEAR}_Author_To_Contact.csv"
print(f"Saving to file: {csv_file_name}")
authors_to_contact.to_csv(csv_file_name, index=False)
files.download(csv_file_name)

## Augment Total Eligible Authors



In [None]:
# Add two columns to total_eligible_authors:
#   name     - "family , given" fetched from the public ORCID API,
#              or "No ORCID" when the lookup does not succeed
#   form_url - Share Your Papers form link pre-populated with the DOI

name_cache = {}  # ORCID URL -> name, so each author is fetched only once
name_list = []

for orcid_url in total_eligible_authors["ORCID"]:
  if orcid_url not in name_cache:
    try:
      # "https://orcid.org/0000-0000-0000-0000" -> "0000-0000-0000-0000".
      # Named orcid_id so the imported `orcid` module is not shadowed.
      orcid_id = orcid_url.split("/")[3]
      o_data = api.read_record_public(orcid_id, '', search_token)
      person_name = o_data['person']['name']
      name_cache[orcid_url] = person_name['family-name']['value'] + " , " + person_name['given-names']['value']
    except Exception:
      # Best effort: a malformed/missing ORCID, a record without a public
      # name, or an API error all fall back to the placeholder.
      # `except Exception` (not a bare except) still lets a manual kernel
      # interrupt stop this long-running loop.
      name_cache[orcid_url] = "No ORCID"

  name_list.append(name_cache[orcid_url])

#full URL for pre-popped form
pre_pop_list = []

for doi in total_eligible_authors["DOI"]:
  # Keep everything after the last "doi.org/"; a value without that prefix
  # is used unchanged instead of raising IndexError.
  doi_trimmed = str(doi).split("doi.org/")[-1]
  if doi_trimmed and doi_trimmed != "None":
    pre_pop_list.append(url_syp + "?doi=" + doi_trimmed)
  else:
    # "None" is the placeholder fillna() wrote for a missing DOI: link to
    # the bare form rather than pre-populating it with a bogus value.
    pre_pop_list.append(url_syp)

# .assign returns a new frame, which avoids SettingWithCopyWarning when
# total_eligible_authors is a filtered slice of df.
total_eligible_authors = total_eligible_authors.assign(name=name_list, form_url=pre_pop_list)
print("done")

In [None]:
# Random preview of total_eligible_authors.
# The sample size is capped at the number of rows: DataFrame.sample raises
# ValueError when asked for more rows than exist (replace=False).
total_eligible_authors.sample(min(5, len(total_eligible_authors)))

In [None]:
# Write total_eligible_authors to a YEAR-tagged CSV (without the index
# column) and pass the file to Colab's files.download
csv_file_name = f"{YEAR}_Total_Eligible_Authors.csv"
print(f"Saving to file: {csv_file_name}")
total_eligible_authors.to_csv(csv_file_name, index=False)
files.download(csv_file_name)