## Imports

In [2]:
import json
import os
import time
from datetime import datetime

import pandas as pd
import requests

## Common elements

In [2]:
# Column names shared by the Zenodo and Figshare exports, in the order the
# harvesting cells fill them in. Boolean "is the field filled in" columns use
# the "has_" prefix; "*_count" columns hold integer counts.
headers = [
    "doi",
    "doi_url",
    "repository_url",
    "is_open_access",
    "authors_count",
    "authors_affiliation_count",
    "authors_familyname_count",
    "authors_givennames_count",
    "authors_gnd_count",
    "authors_name_count",
    "authors_orcid_count",
    "has_funding_info",
    # The harvesting cells write this column as "has_journal", not "journal".
    "has_journal",
    "keywords_freetext_count",
    "keywords_controlled_vocabularies_count",
    "has_language",
    "has_license",
    "license_name",
    "has_conference_acronym",
    "has_conference_dates",
    "has_conference_place",
    "has_conference_session",
    "has_conference_session_part",
    "has_conference_title",
    "has_conference_website",
    "has_creation_date",
    "creation_date",
    "has_other_dates",
    "other_dates",
    "references_no_identifiers_count",
    "references_with_identifiers_count",
    "has_title",
    "has_version",
    "files",
]

## Get posters metadata from Zenodo

In [3]:
# Zenodo API token, read from the environment (None when the variable is unset)
zenodo_access_token = os.environ.get("ZENODO_ACCESS_TOKEN")

In [4]:
# Zenodo doesn't return more than 10k results per query, so the harvest is
# broken down into date ranges with fewer than 10k posters created in each.
date_range_list = [
    "{* TO 2020-12-31]",
    "[2021-01-01 TO 2022-12-31]",
    "[2023-01-01 TO 2024-11-30]",
]

ZENODO_RECORDS_URL = "https://zenodo.org/api/records"
ZENODO_PAGE_SIZE = 500
ZENODO_TIMEOUT_SECONDS = 60


def _has_text(value):
    """Return True when `value` is a string with at least one non-whitespace character.

    None and non-string values count as "not filled in" instead of raising.
    """
    return isinstance(value, str) and len(value.strip()) > 0


def _zenodo_record_to_row(result):
    """Flatten one Zenodo search hit into a dict of metadata-completeness indicators.

    Zenodo record schema:
    https://github.com/zenodo/zenodo/blob/master/zenodo/modules/records/jsonschemas/records/record-v1.0.0.json

    Parameters
    ----------
    result : dict
        One element of ``response["hits"]["hits"]``.

    Returns
    -------
    dict
        One row; keys are inserted in the column order of the exported CSV.
        Missing fields fall back to False, 0, "NA" or "" depending on the column.
    """
    metadata = result.get("metadata") or {}
    row_dict = {}

    # id and links
    row_dict["doi"] = result.get("doi", False)
    row_dict["doi_url"] = result.get("doi_url", False)
    row_dict["repository_url"] = (result.get("links") or {}).get("self_html", False)

    # access right
    row_dict["is_open_access"] = metadata.get("access_right") == "open"

    # authors: how many creators there are, and how many have each sub-field filled in
    authors_list = metadata.get("creators") or []
    row_dict["authors_count"] = len(authors_list)
    for field in ("affiliation", "familyname", "givennames", "gnd", "name", "orcid"):
        row_dict["authors_" + field + "_count"] = sum(
            _has_text(author.get(field)) for author in authors_list
        )

    # grants
    row_dict["has_funding_info"] = bool(metadata.get("grants"))

    # journal
    row_dict["has_journal"] = bool(metadata.get("journal"))

    # keywords free text ("keywords" field, not "references")
    row_dict["keywords_freetext_count"] = len(metadata.get("keywords") or [])

    # keywords controlled vocabularies ("subjects" field, not "related_identifiers")
    row_dict["keywords_controlled_vocabularies_count"] = len(
        metadata.get("subjects") or []
    )

    # language
    row_dict["has_language"] = bool(metadata.get("language"))

    # license
    license_info = metadata.get("license")
    row_dict["has_license"] = license_info is not None
    row_dict["license_name"] = license_info["id"] if license_info is not None else "NA"

    # conference
    conference_metadata = metadata.get("meeting") or {}
    conference_columns = {
        "acronym": "has_conference_acronym",
        "dates": "has_conference_dates",
        "place": "has_conference_place",
        "session": "has_conference_session",
        "session_part": "has_conference_session_part",
        "title": "has_conference_title",
        "url": "has_conference_website",
    }
    for source_key, column in conference_columns.items():
        row_dict[column] = _has_text(conference_metadata.get(source_key))

    # creation_date
    created = result.get("created")
    row_dict["has_creation_date"] = _has_text(created)
    row_dict["creation_date"] = created if _has_text(created) else "NA"

    # other dates: all entries joined with "; "
    other_dates = metadata.get("dates") or []
    row_dict["has_other_dates"] = len(other_dates) > 0
    row_dict["other_dates"] = (
        "; ".join(str(date) for date in other_dates) if other_dates else "NA"
    )

    # references
    row_dict["references_no_identifiers_count"] = len(metadata.get("references") or [])

    # related identifiers count
    row_dict["references_with_identifiers_count"] = len(
        metadata.get("related_identifiers") or []
    )

    # title
    row_dict["has_title"] = _has_text(metadata.get("title"))

    # version
    row_dict["has_version"] = _has_text(metadata.get("version"))

    # files: names joined with "; " (empty string when the hit exposes no files)
    row_dict["files"] = "; ".join(
        str(file["key"]) for file in (result.get("files") or [])
    )

    return row_dict


rows_list = []

for date_range in date_range_list:

    page = 0

    # Walk the result pages until Zenodo returns an empty page or an error.
    while True:
        page += 1
        params = {
            "resource_type": "poster",
            "q": "created:" + date_range,
            "status": "published",
            "sort": "mostrecent",
            "all_versions": "false",
            "size": ZENODO_PAGE_SIZE,
            "page": page,
            "access_token": zenodo_access_token,
        }
        # The timeout keeps a stalled connection from hanging the harvest forever.
        response = requests.get(
            ZENODO_RECORDS_URL,
            params=params,
            timeout=ZENODO_TIMEOUT_SECONDS,
        )

        if response.status_code != 200:
            print("Error", response.status_code, response.content, str(page))
            break

        results = response.json()["hits"]["hits"]
        if not results:
            print("No more pages", response.status_code, str(page))
            break

        rows_list.extend(_zenodo_record_to_row(result) for result in results)

# save
df = pd.DataFrame(rows_list)
df.to_csv("zenodo.csv", index=False)

No more pages 200 16
No more pages 200 17
No more pages 200 20


### Notes:  
- The role of each researcher is not shown on the online record or in the API record, so why is it asked for?
- Additional dates: the dates are shown in the online record but are not included in the API record. Why?
- publication_date: provided by user
- creation_date: date when the record was created (since we are only looking at the latest version, this is the creation date of the latest version)

## Get posters metadata from Figshare

In [3]:
# Figshare API token, read from the environment (None when the variable is unset)
figshare_access_token = os.environ.get("FIGSHARE_ACCESS_TOKEN")

In [6]:
# Inspired by the example available here https://help.figshare.com/article/how-to-use-the-figshare-api#search-ids
# Get the search hits for all the posters. Unlike Zenodo, this doesn't return all
# the metadata for each poster, so it is only used to collect the poster ids.
BASE_URL = "https://api.figshare.com/v2"
results = []

# Paired lower/upper bounds of the posting-date windows searched one after the other
posted_after_list = ["2012-02-26", "2021-01-01"]
posted_before_list = ["2020-12-31", "2024-11-30"]


for posted_after, posted_before in zip(posted_after_list, posted_before_list):
    search_logic = (
        ":item_type:poster AND "
        + ":posted_after:"
        + posted_after
        + " AND :posted_before:"
        + posted_before
    )
    # Build the search payload directly instead of formatting and re-parsing a JSON string
    y = {"search_for": search_logic}

    # At most 10 pages of 1000 hits are requested per date window
    for j in range(1, 11):
        response = requests.post(
            BASE_URL + "/articles/search?page_size=1000&page={}".format(j),
            params=y,
            timeout=60,
        )
        if response.status_code != 200:
            print("Error", response.status_code, response.content, str(j))
            break

        r = response.json()
        # Only a non-empty list is a page of hits. Extending with an error
        # payload (a dict) would add its string keys to `results`.
        if not isinstance(r, list) or not r:
            break
        results.extend(r)

In [7]:
# Total number of entries gathered in `results` by the Figshare search above
len(results)

14261

In [14]:
# Number of rows currently held in `rows_list`
# NOTE(review): `rows_list` is filled by the Zenodo cell and rebound later by the
# Figshare metadata cell, so this value depends on execution order.
len(rows_list)

8069

In [25]:
# Peek at a single entry of `results` (the second one) to inspect its fields
results[1:2]

[{'project_id': None,
  'id': 9676241,
  'title': 'Thermal sensitivity and heat hardening capacity of Drosophila melanogaster vary during ontogenyUntitled Item',
  'doi': '10.6084/m9.figshare.9676241.v1',
  'handle': '',
  'url': 'https://api.figshare.com/v2/articles/9676241',
  'published_date': '2019-08-19T13:35:43Z',
  'thumb': 'https://s3-eu-west-1.amazonaws.com/pfigshare-u-previews/17338601/thumb.png',
  'defined_type': 5,
  'defined_type_name': 'poster',
  'group_id': None,
  'url_private_api': 'https://api.figshare.com/v2/account/articles/9676241',
  'url_public_api': 'https://api.figshare.com/v2/articles/9676241',
  'url_private_html': 'https://figshare.com/account/articles/9676241',
  'url_public_html': 'https://figshare.com/articles/poster/Thermal_sensitivity_and_heat_hardening_capacity_of_Drosophila_melanogaster_vary_during_ontogenyUntitled_Item/9676241',
  'timeline': {'posted': '2019-08-19T13:35:43',
   'firstOnline': '2019-08-19T13:35:43'},
  'resource_title': '',
  'reso

In [23]:
# Spot check: is the entry at index 8069 of `results` a dict?
entry_is_record = isinstance(results[8069], dict)
if entry_is_record:
    print("yes")

In [33]:
# use figshare ids to get full metadata of each poster


def _figshare_has_text(value):
    """Return True when `value` is a string with at least one non-whitespace character."""
    return isinstance(value, str) and len(value.strip()) > 0


def _figshare_record_to_row(metadata):
    """Flatten one Figshare article record into a dict of metadata-completeness indicators.

    Columns that this harvest does not read from Figshare are filled with
    "Not supported" (boolean columns) or -1 (count columns) so they can be told
    apart from genuinely empty fields.

    Parameters
    ----------
    metadata : dict
        Parsed JSON body of ``GET /articles/{id}``.

    Returns
    -------
    dict
        One row; keys are inserted in the column order of the exported CSV.
    """
    row_dict = {}

    # id and links
    row_dict["doi"] = False
    row_dict["doi_url"] = False
    if "doi" in metadata:
        row_dict["doi"] = metadata["doi"]
        row_dict["doi_url"] = "https://doi.org/" + metadata["doi"]
    row_dict["repository_url"] = metadata.get("url_public_html", False)

    # access right
    row_dict["is_open_access"] = bool(metadata.get("is_public"))

    # authors: only the full name and ORCID are read from each author entry
    authors_list = metadata.get("authors") or []
    row_dict["authors_count"] = len(authors_list)
    row_dict["authors_affiliation_count"] = -1
    row_dict["authors_familyname_count"] = -1
    row_dict["authors_givennames_count"] = -1
    row_dict["authors_gnd_count"] = -1
    row_dict["authors_name_count"] = sum(
        _figshare_has_text(author.get("full_name")) for author in authors_list
    )
    row_dict["authors_orcid_count"] = sum(
        _figshare_has_text(author.get("orcid_id")) for author in authors_list
    )

    # grants
    row_dict["has_funding_info"] = bool(metadata.get("funding_list"))

    # journal
    row_dict["has_journal"] = "Not supported"

    # keywords free text
    row_dict["keywords_freetext_count"] = len(metadata.get("tags") or [])

    # keywords controlled vocabularies
    row_dict["keywords_controlled_vocabularies_count"] = -1

    # language
    row_dict["has_language"] = "Not supported"

    # license
    license_info = metadata.get("license")
    row_dict["has_license"] = license_info is not None
    row_dict["license_name"] = (
        license_info["name"] if license_info is not None else "NA"
    )

    # conference
    row_dict["has_conference_acronym"] = "Not supported"
    row_dict["has_conference_dates"] = "Not supported"
    row_dict["has_conference_place"] = "Not supported"
    row_dict["has_conference_session"] = "Not supported"
    row_dict["has_conference_session_part"] = "Not supported"
    row_dict["has_conference_title"] = "Not supported"
    row_dict["has_conference_website"] = "Not supported"

    # creation date
    published_date = metadata.get("published_date")
    row_dict["has_creation_date"] = _figshare_has_text(published_date)
    row_dict["creation_date"] = (
        published_date if _figshare_has_text(published_date) else "NA"
    )

    # other dates
    row_dict["has_other_dates"] = "Not supported"
    row_dict["other_dates"] = "NA"

    # references with no identifiers count
    row_dict["references_no_identifiers_count"] = -1

    # references with identifiers count
    row_dict["references_with_identifiers_count"] = len(
        metadata.get("related_materials") or []
    )

    # title
    row_dict["has_title"] = _figshare_has_text(metadata.get("title"))

    # version: only the presence of the field is checked
    row_dict["has_version"] = "version" in metadata

    # files: names joined with "; " (empty string when the record lists no files)
    row_dict["files"] = "; ".join(
        str(file["name"]) for file in (metadata.get("files") or [])
    )

    return row_dict


# Only send the Authorization header when a token is configured; otherwise the
# literal string "token None" would be sent with every request.
api_call_headers = {}
if figshare_access_token:
    api_call_headers["Authorization"] = "token " + str(figshare_access_token)

rows_list = []
# One session for all the requests so the HTTP connection is reused.
with requests.Session() as session:
    session.headers.update(api_call_headers)
    for result in results:
        # `results` may contain non-dict entries; only dicts carry an "id"
        if not isinstance(result, dict):
            continue

        figshare_id = result["id"]
        r = session.get(BASE_URL + "/articles/" + str(figshare_id), timeout=60)
        if r.status_code != 200:
            # Skip failed lookups: an error payload would otherwise be turned
            # into a row in which every field looks empty.
            print("Error", r.status_code, str(figshare_id))
            continue

        # save
        rows_list.append(_figshare_record_to_row(r.json()))

In [34]:
# Turn the collected Figshare rows into a table and write it to disk without the index column
df = pd.DataFrame(data=rows_list)
df.to_csv(path_or_buf="figshare.csv", index=False)

## Analysis

### Import data

In [14]:
# Load both harvested datasets, tag every row with its source repository,
# then stack them into a single frame with a fresh 0..n-1 index.
df1 = pd.read_csv("inputs/dataset/primary/zenodo.csv").assign(repository="Zenodo")
df2 = pd.read_csv("inputs/dataset/primary/figshare.csv").assign(repository="Figshare")
df = pd.concat(objs=[df1, df2], sort=False, ignore_index=True)

### Total number of posters

### Number of posters per year