# Notebook containing our code for searching and analyzing platforms where posters are shared

## Imports

In [1]:
import json
import os
import time
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from googlesearch import search

In [2]:
sns.set(
    font="Franklin Gothic Book",
    rc={
        "axes.axisbelow": False,
        "axes.edgecolor": "lightgrey",
        "axes.facecolor": "None",
        "axes.grid": False,
        "axes.labelcolor": "dimgrey",
        "axes.spines.right": False,
        "axes.spines.top": False,
        "figure.facecolor": "white",
        "lines.solid_capstyle": "round",
        "patch.edgecolor": "w",
        "patch.force_edgecolor": True,
        "text.color": "dimgrey",
        "xtick.bottom": False,
        "xtick.color": "dimgrey",
        "xtick.direction": "out",
        "xtick.top": False,
        "ytick.color": "dimgrey",
        "ytick.direction": "out",
        "ytick.left": False,
        "ytick.right": False,
    },
)
sns.set_context(
    "notebook", rc={"font.size": 16, "axes.titlesize": 20, "axes.labelsize": 18}
)

colors = ["#073b4c", "#ffd166", "#06d6a0", "#118ab2", "#ef476f", "#fb5607"]
patterns = ["/", "\\", "-", "+", "x", "o", "O", ".", "*", "|"]
markers = ["o", "s", "D", "^", "v", "*", "X", "+", "p", "h"]

## Finding platform where posters are shared

Note: The combined results from all methods are in the "poster-platforms-review.xlsx" file included in the dataset associated witht his work (see README for details)

### Method 1: Google search

In [36]:
rows_list = []
count = 0
for result in search("poster sharing publication", num_results=100):
    count += 1
    rows_list.append([count, result])
df = pd.DataFrame(rows_list, columns=["index", "link"])
df.to_csv("outputs/find-posters-google.csv", index=False)

### Method 2: Query LLMs

Done manually on https://openai.com/index/chatgpt/ and https://gemini.google.com/

### Method 3: Look at Datacite's poster related DOI

There is no specific category (or resourceType) for posters in the DataCite schema. However, it seems that platforms like Zenodo that publish posters enter "Poster" as a free text for the "resourceType" when generating a DOI for posters. So we queried the Datacite API for all resourceType set to "Poster" to find their related platforms.

#### Get all posters metadata from Datacite

In [None]:
# Note: the results of this code are already in the datacite.json file included in the output folder
# That file is from a run of this code on January 26th 2025
dict_results = {}
count = 0
next_page = True
get_link = "https://api.datacite.org/dois?query=types.resourceType:Poster&page[cursor]=1&page[size]=1000"

while next_page:
    r = json.loads(requests.get(get_link).content)
    for result in r["data"]:
        dict_results[count] = result
        count += 1
    if "next" in r["links"].keys():
        get_link = r["links"]["next"]
    else:
        next_page = False

# Save results in a json file so we don't have to run this multiple time for post-processing and analysis
with open("outputs/datacite.json", "w", encoding="utf-8") as f:
    json.dump(dict_results, f, ensure_ascii=False, indent=4)

#### Get publisher information from the posters

In [3]:
# load data
with open("outputs/datacite.json", "r", encoding="utf-8") as file:
    results = json.load(file)

In [4]:
# Get platform info and one poster doi from each publisher to have an example poster from each
rows_list = []
cliend_id_list = []
for result in results.values():
    client_id = result["relationships"]["client"]["data"]["id"]
    if client_id not in cliend_id_list:
        cliend_id_list.append(client_id)
        get_client_link = "https://api.test.datacite.org/clients/" + client_id
        r = json.loads(requests.get(get_client_link).content)
        if "data" in r.keys():
            result_client = r["data"]
            client_name = result_client["attributes"]["name"]
            client_domains = result_client["attributes"]["domains"]
        else:
            client_name = "NA"
            client_domains = "NA"

        poster_doi = result["attributes"]["doi"]
        poster_doi_link = "https://doi.org/" + poster_doi

        rows_list.append(
            [client_id, client_name, client_domains, poster_doi, poster_doi_link]
        )
# save
df = pd.DataFrame(
    rows_list,
    columns=[
        "Platform id on Datacite",
        "Platform name",
        "Platform domains",
        "Example poster DOI",
        "Example poster DOI link",
    ],
)
df.to_csv("outputs/find-posters/find-posters-datacite.csv", index=False)

In [8]:
# Remove institution specific figshare instances as the related posters are also found in the main figshare site
main_figshare_id = "figshare.ars"
for index, row in df.iterrows():
    client_id = row["Platform id on Datacite"]
    client_domains = row["Platform domains"]
    if "figshare" in client_id or "figsh" in client_domains:
        if client_id != main_figshare_id:
            df = df.drop(index)
df.to_csv("outputs/find-posters/find-posters-datacite.csv", index=False)

## Getting the total number of posters available on each platform (as of December 2024) and their count by year

### Method 1: Using DataCite

In [2]:
# load data
with open("outputs/datacite.json", "r", encoding="utf-8") as file:
    results = json.load(file)

In [3]:
df = pd.read_csv("outputs/find-posters/find-posters-datacite.csv")

In [6]:
# Count number of poster for a given client
cliend_id_interest = "cern.zenodo"
count = 0
doi_list = []
for result in results.values():
    client_id = result["relationships"]["client"]["data"]["id"]
    if client_id == cliend_id_interest:
        created_date = result["attributes"]["created"]
        create_year = datetime.fromisoformat(created_date).year
        if versionOfCount == 1:
            if create_year <2025:
                count+=1
                result_zenodo = result
                doi_list.append(result["id"])
print(count)

49476


In [7]:
# Count number of poster for a given client
cliend_id_interest = "cern.zenodo"
count = 0
doi_list = []
for result in results.values():
    client_id = result["relationships"]["client"]["data"]["id"]
    if client_id == cliend_id_interest:
        versionOfCount = result["attributes"]["versionOfCount"]
        versionCount = result["attributes"]["versionCount"]
        created_date = result["attributes"]["created"]
        create_year = datetime.fromisoformat(created_date).year
        if versionOfCount == 0:
            if create_year <2025:
                count+=1
                result_zenodo = result
                doi_list.append(result["id"])
print(count)

34737


In [8]:
result

{'id': '10.13140/rg.2.2.15894.25921',
 'type': 'dois',
 'attributes': {'doi': '10.13140/rg.2.2.15894.25921',
  'identifiers': [],
  'creators': [{'name': 'Golani, Ruchika',
    'nameType': 'Personal',
    'givenName': 'Ruchika',
    'familyName': 'Golani',
    'affiliation': [],
    'nameIdentifiers': []}],
  'titles': [{'title': 'Influence of Living Arrangements on Self-Reported Dietary Changes, Drinking, and Smoking Habits Among Undergraduate Students in India'}],
  'publisher': 'Unpublished',
  'container': {},
  'publicationYear': 2024,
  'subjects': [],
  'contributors': [],
  'dates': [{'date': '2024', 'dateType': 'Issued'}],
  'language': None,
  'types': {'ris': 'RPRT',
   'bibtex': 'article',
   'citeproc': 'article-journal',
   'schemaOrg': 'ScholarlyArticle',
   'resourceType': 'Poster',
   'resourceTypeGeneral': 'Text'},
  'relatedIdentifiers': [],
  'relatedItems': [],
  'sizes': [],
  'formats': [],
  'version': None,
  'rightsList': [],
  'descriptions': [],
  'geoLocati

In [30]:
df_dois = pd.DataFrame({"doi":doi_list})

In [31]:
display(df_dois)

Unnamed: 0,doi
0,10.5281/zenodo.6937
1,10.5281/zenodo.6999
2,10.5281/zenodo.7111
3,10.5281/zenodo.7118
4,10.5281/zenodo.7121
...,...
34732,10.5281/zenodo.14582073
34733,10.5281/zenodo.14580418
34734,10.5281/zenodo.14580064
34735,10.5281/zenodo.14569243


In [27]:
display(result_zenodo)

{'id': '10.5281/zenodo.14583330',
 'type': 'dois',
 'attributes': {'doi': '10.5281/zenodo.14583330',
  'identifiers': [],
  'creators': [{'name': 'Lagos Trigo, Denisse',
    'nameType': 'Personal',
    'givenName': 'Denisse',
    'familyName': 'Lagos Trigo',
    'nameIdentifiers': [{'nameIdentifier': '0009-0004-7408-4128',
      'nameIdentifierScheme': 'ORCID'}],
    'affiliation': []}],
  'titles': [{'title': 'Searching for Superhump-like variations on Cataclysmic Variables using VVV and VVVx data'}],
  'publisher': 'Zenodo',
  'container': {},
  'publicationYear': 2024,
  'subjects': [],
  'contributors': [],
  'dates': [{'date': '2024-12-31', 'dateType': 'Issued'}],
  'language': None,
  'types': {'ris': 'RPRT',
   'bibtex': 'article',
   'citeproc': 'article-journal',
   'schemaOrg': 'ScholarlyArticle',
   'resourceType': 'Poster',
   'resourceTypeGeneral': 'Text'},
  'relatedIdentifiers': [{'relationType': 'HasVersion',
    'relatedIdentifier': '10.5281/zenodo.14583331',
    'rela

In [22]:
result_zenodo["attributes"]["created"]

'2025-01-27T05:54:27Z'

In [23]:
datetime.fromisoformat(result_zenodo["attributes"]["created"]).year

2025

## Getting the total number of posters available on each platform (as of December 2024)

Note: The combined results from all methods are in the "poster-platforms-review.xlsx" file included in the dataset associated witht his work (see README for details)

### Method 1: Manually using the platform's search/filter

When possible, we obtained the total number of posters available on a platform using the platform's search/filter. 

#### Zenodo

##### Get all posters metadata

In [6]:
# token
zenodo_access_token = os.getenv("ZENODO_ACCESS_TOKEN")

In [None]:
# Get metadata of all the posters from Zenodo
# Zenodo doesn't return more than 10k results per request so need to break down into date ranges with less than 10k posters created each
date_range_list = [
    "{* TO 2020-12-31]",
    "[2021-01-01 TO 2022-12-31]",
    "[2023-01-01 TO 2024-12-31]",
]

dict_results = {}
count = 0
for date_range in date_range_list:

    increment_page = True
    page = 0

    while increment_page:
        page += 1
        params = {
            "resource_type": "poster",
            "q": "created:" + date_range,
            "status": "published",
            "sort": "mostrecent",
            "all_versions": "false",
            "size": 300,
            "page": page,
            "access_token": zenodo_access_token,
        }
        response = requests.get(
            "https://zenodo.org/api/records",
            params=params,
        )

        if response.status_code == 200:
            r = response.json()
            if r["hits"]["hits"]:

                results = r["hits"]["hits"]

                for result in results:
                    dict_results[count] = result
                    count += 1
            else:
                print("No more pages", response.status_code, str(page))
                increment_page = False

        else:
            print("Error", response.status_code, response.content, str(page))
            increment_page = False

with open("outputs/zenodo.json", "w", encoding="utf-8") as f:
    json.dump(dict_results, f, ensure_ascii=False, indent=4)

##### Get total number of posters as of December 2024

In [3]:
# load data
with open("outputs/zenodo.json", "r", encoding="utf-8") as file:
    results = json.load(file)

In [4]:
print("There are", len(results), "posters shared on Zenodo as of December 31st, 2024")

There are 24836 posters shared on Zenodo as of December 31st, 2024


#### Figshare

##### Get all posters metadata

In [8]:
# token
figshare_access_token = os.getenv("FIGSHARE_ACCESS_TOKEN")

In [None]:
# Inspired from the example available here https://help.figshare.com/article/how-to-use-the-figshare-api#search-ids
# get info of all the posters (unlike Zenodo, this doesn't return all the metadata for each poster
# so we do that only to get all the ids of the posters)
BASE_URL = "https://api.figshare.com/v2"
results = []

posted_after_list = ["2012-02-26", "2021-01-01"]
posted_before_list = ["2020-12-31", "2024-12-31"]

for posted_after, posted_before in zip(posted_after_list, posted_before_list):
    search_logic = (
        ":item_type:poster AND "
        + ":posted_after:"
        + posted_after
        + " AND :posted_before:"
        + posted_before
    )
    query = '{"search_for": "' + search_logic + '"}'
    y = json.loads(query)

    for j in range(1, 11):
        r = json.loads(
            requests.post(
                BASE_URL + "/articles/search?page_size=1000&page={}".format(j), params=y
            ).content
        )
        if r:
            results.extend(r)
        else:
            break

In [38]:
# use figshare ids to get full metadata of each poster
dict_results = {}
count = 0
for result in results:
    if isinstance(result, dict):
        figshare_id = result["id"]
        api_call_headers = {"Authorization": "token " + str(figshare_access_token)}
        r = requests.get(
            BASE_URL + "/articles/" + str(figshare_id), headers=api_call_headers
        )
        metadata = json.loads(r.text)
        dict_results[count] = metadata
        count += 1

with open("outputs/figshare.json", "w", encoding="utf-8") as f:
    json.dump(dict_results, f, ensure_ascii=False, indent=4)

NameError: name 'figshare_access_token' is not defined

##### Get total number of posters as of December 2024

In [6]:
# load data
with open("outputs/figshare.json", "r", encoding="utf-8") as file:
    results = json.load(file)

In [7]:
print("There are", len(results), "posters shared on Figshare as of December 31st, 2024")

There are 14382 posters shared on Figshare as of December 31st, 2024


#### NASA Technical Reports Server (NTRS)

Documentation of the api and examples: https://ntrs.nasa.gov/api/openapi/#/default/SearchController_postSearch
https://sti.nasa.gov/harvesting-data-from-ntrs/

##### Get all posters metadata

In [28]:
base_url = "https://ntrs.nasa.gov/api/citations/search"
json_data = {
    "stiTypeDetails": "Poster",
    "created": {"lte": "2024-12-31"},
    "page": {"size": 3600, "from": 0},
}

In [34]:
response = requests.post(base_url, json=json_data)
r = response.json()
with open("outputs/ntrs.json", "w", encoding="utf-8") as f:
    json.dump(r["results"], f, ensure_ascii=False, indent=4)

##### Get total number of posters as of December 2024

In [35]:
# load data
with open("outputs/ntrs.json", "r", encoding="utf-8") as file:
    results = json.load(file)

In [36]:
print("There are", len(results), "posters shared on NASA's NTSR as of December 31st, 2024")

There are 3322 posters shared on NASA's NTSR as of December 31st, 2024


### Method 2: Using DataCite

When manual search was not possible on a platform issuing DOIs for posters, we used the DataCite metadata to count the number of posters available on that platform

## Getting the year wise number of posters published per platform (up to 2024)

Note: The combined results from all methods are in the "poster-platforms-review.xlsx" file included in the dataset associated witht his work (see README for details)

### Method 1: Using the platform's API

When possible, we used the API of a platform to get its yearwise poster sharing data

### Zenodo

In [51]:
# load data
with open("outputs/zenodo.json", "r", encoding="utf-8") as file:
    results = json.load(file)

In [52]:
# yearwise count
year_list = []
for result in results.values():
    year_list.append(datetime.fromisoformat(result["created"]).year)

rows_list = []
for year in range(min(year_list), max(year_list) + 1):
    count = year_list.count(year)
    rows_list.append([year, count])

df = pd.DataFrame(rows_list, columns=["year", "numberPoster"])
df.to_csv("outputs/find-posters/yearwise-count-zenodo.csv", index=False)
display(df)

Unnamed: 0,year,numberPoster
0,2014,103
1,2015,215
2,2016,462
3,2017,757
4,2018,1417
5,2019,1877
6,2020,2484
7,2021,3299
8,2022,4441
9,2023,4268


### Figshare

In [49]:
# load data
with open("outputs/figshare.json", "r", encoding="utf-8") as file:
    results = json.load(file)

In [50]:
# yearwise count
year_list = []
for result in results.values():
    year_list.append(datetime.fromisoformat(result["published_date"]).year)

rows_list = []
for year in range(min(year_list), max(year_list) + 1):
    count = year_list.count(year)
    rows_list.append([year, count])

df = pd.DataFrame(rows_list, columns=["year", "numberPoster"])
df.to_csv("outputs/find-posters/yearwise-count-figshare.csv", index=False)
display(df)

Unnamed: 0,year,numberPoster
0,2012,178
1,2013,476
2,2014,885
3,2015,692
4,2016,763
5,2017,1050
6,2018,871
7,2019,1009
8,2020,2145
9,2021,1274


#### NASA STI Repository (NTRS)

In [45]:
# load data
with open("outputs/ntrs.json", "r", encoding="utf-8") as file:
    results = json.load(file)

In [48]:
# yearwise count
year_list = []
for result in results:
    year_list.append(datetime.fromisoformat(result["created"]).year)

rows_list = []
for year in range(min(year_list), max(year_list) + 1):
    count = year_list.count(year)
    rows_list.append([year, count])

df = pd.DataFrame(rows_list, columns=["year", "numberPoster"])
df.to_csv("outputs/find-posters/yearwise-count-ntrs.csv", index=False)
display(df)

Unnamed: 0,year,numberPoster
0,2013,3
1,2014,1
2,2015,1
3,2016,1
4,2017,3
5,2018,8
6,2019,327
7,2020,519
8,2021,332
9,2022,584


### Method 2: Manually by filtering by year on the platform

When no API was available to query posters programatically, we tried filtering by year on the platform's search feature

### Method 3: Manually counting on the platform

When there was no option to filter posters by publication year, we manually counted posters yearwise if it was reasonable to do so

### Method 4: Using the DataCite metadata

When none of the previous methods were possible for a DOI issuing platform, we used the DataCite metadata to get a yearwise count of posters shared by that platform