# Preamble

In [1]:
# For querying the API (need to install python3-requests on your system first)
import requests

# For loading the dataset
import os
import gzip
import json

# Python utility libraries for handling lists
import itertools
import collections

# For splitting URLs into their components
from urllib.parse import urlsplit

# Imports the "pprint" function, which pretty-prints large objects
from pprint import pprint

## Getting the dataset

First, we need to download the data from DataCite's API. This will take some time. We will also save it locally so we don't need to re-download it next time.

In [2]:
# Path of the local gzip-compressed JSON copy of the dataset; the loading
# cell reads it when the file exists and creates it after a fresh download.
DATASET_LOCATION = './datacite_software.json.gz'


In [3]:
# Load the dataset into `data` (a list of DataCite records), either from
# the local cache or, when there is no cache yet, from the DataCite API.
if os.path.isfile(DATASET_LOCATION):
    # We already have a cached copy of the dataset on the filesystem,
    # let's use it
    with gzip.open(DATASET_LOCATION, 'rt') as fd:
        data = json.load(fd)
else:
    # We don't already have a cached copy of the dataset, let's download
    # it from DataCite

    # Initial URL; it will get all objects on DataCite whose type is "Software"
    url = 'https://api.datacite.org/dois?query=types.resourceTypeGeneral:Software&page[size]=1000&page[cursor]=1'

    # We'll put all the results in this list
    data = []

    # The same headers are sent with every page request
    headers = {'accept': 'application/vnd.api+json'}

    while True:
        # Send the request. The timeout keeps a stalled connection from
        # hanging the notebook forever.
        print('Downloading {}'.format(url))
        response = requests.get(url, headers=headers, timeout=120)

        # Stop right away on an HTTP error (rate limiting, server error...)
        # instead of failing later on a confusing KeyError / JSON error.
        response.raise_for_status()

        # Parse results and add them to the list
        j = response.json()
        data.extend(j['data'])

        # Finished? (no next page, or the next page is the current one)
        next_url = j.get('links', {}).get('next')
        if not next_url or next_url == url:
            break

        # Continue with next URL
        url = next_url
    print('Finished downloading dataset.')

    # Write the dataset to the filesystem so we can re-use it later.
    # It is written to a temporary file first and only renamed once complete,
    # so an interrupted write never leaves a truncated cache that the next
    # run would try to load.
    tmp_location = DATASET_LOCATION + '.tmp'
    with gzip.open(tmp_location, 'wt') as fd:
        json.dump(data, fd)
    os.replace(tmp_location, DATASET_LOCATION)

# Set of the DOIs of all the objects in the dataset
all_dois = {x['id'] for x in data}

In [4]:
# Total number of records in `data` (one per object returned by the query)
print('Number of objects in dataset: {}'.format(len(data)))

Number of objects in dataset: 107229


In [5]:
# Print the first object in the dataset, to see what a record looks like
# (the cells below read its 'id', 'attributes' and 'relationships' keys)
pprint(data[0])

{'attributes': {'container': {},
                'contentUrl': None,
                'contributors': [{'affiliation': ['Leibniz Institute of Plant '
                                                  'Genetics and Crop Plant '
                                                  'Research (IPK), Seeland OT '
                                                  'Gatersleben, Corrensstraße '
                                                  '3, 06466, Germany'],
                                  'contributorType': 'ProjectMember',
                                  'familyName': 'Friedrich',
                                  'givenName': 'Christian',
                                  'name': 'Friedrich, Christian',
                                  'nameType': 'Personal'},
                                 {'affiliation': ['Leibniz Institute of Plant '
                                                  'Genetics and Crop Plant '
                                                  'Research (IPK), Se

# Relationship between software and versions

DataCite considers as "software" both the software itself and each of its versions/releases. Fortunately, there are references between the two.

In [6]:
# Build the set of every identifier that some object points to through an
# "IsVersionOf" relation (objects without any relation are skipped):
target_of_IsVersionOf = {
    rel['relatedIdentifier']
    for record in data
    for rel in (record['attributes']['relatedIdentifiers'] or ())
    if rel['relationType'] == 'IsVersionOf'
}

In [7]:
# Count them (number of distinct targets of an "IsVersionOf" relation):
print(len(target_of_IsVersionOf))

22962


In [8]:
# Conversely, build the set of the DOIs of all objects that declare at
# least one "HasVersion" relation:
has_version = {
    record['id']
    for record in data
    if any(
        rel['relationType'] == 'HasVersion'
        for rel in (record['attributes']['relatedIdentifiers'] or ())
    )
}

In [9]:
# There are slightly fewer of them:
print(len(has_version))

22788


In [10]:
# One DOI declares a "HasVersion" relation but is never the target of an
# "IsVersionOf" (because its version is not publicly available):
print(has_version - target_of_IsVersionOf)

{'10.5281/zenodo.1312404'}


In [11]:
# The other way around, there are 175: targets of an "IsVersionOf" that
# are not in the set of objects declaring a "HasVersion".
missing = target_of_IsVersionOf.difference(has_version)
print(len(missing))

175


In [12]:
# Some of them are in the dataset but use a "HasPart" property instead of
# "HasVersion"; show the relations of every such object:
pprint([
    record['attributes']['relatedIdentifiers']
    for record in data
    if record['id'] in missing
])

[[{'relatedIdentifier': 'https://github.com/opencobra/cobratoolbox/tree/v2.0.0',
   'relatedIdentifierType': 'URL',
   'relationType': 'IsSupplementTo'},
  {'relatedIdentifier': '10.5281/zenodo.268399',
   'relatedIdentifierType': 'DOI',
   'relationType': 'HasPart'}],
 [{'relatedIdentifier': '10.1016/j.jprocont.2007.07.006',
   'relatedIdentifierType': 'DOI',
   'relationType': 'Cites'},
  {'relatedIdentifier': '10.5281/zenodo.888135',
   'relatedIdentifierType': 'DOI',
   'relationType': 'HasPart'}],
 [{'relatedIdentifier': 'https://github.com/sbmlteam/sbml-test-suite/tree/3.3.0',
   'relatedIdentifierType': 'URL',
   'relationType': 'IsSupplementTo'},
  {'relatedIdentifier': '10.5281/zenodo.1112521',
   'relatedIdentifierType': 'DOI',
   'relationType': 'HasPart'}],
 [{'relatedIdentifier': 'https://github.com/ancolli/concLaminarTurbulentFoam/tree/v1.0.2',
   'relatedIdentifierType': 'URL',
   'relationType': 'IsSupplementTo'},
  {'relatedIdentifier': '10.5281/zenodo.1145874',
   're

In [13]:
# The other ones are missing from the dataset, because they are
# not software (e.g. regular datasets).
# They are referenced because some of their versions are
# (probably erroneously) marked as software.

# Let's pick one and analyze it. A set has no stable iteration order, so
# taking "the first element" of one would select a different DOI from one
# kernel run to the next; sorting first makes the chosen example
# reproducible.
example_missing_doi = sorted(missing - all_dois)[0]
print(example_missing_doi)

10.5281/zenodo.2532876


In [14]:
# Fetch the record of the example DOI from the DataCite API.
# The timeout avoids hanging forever on a stalled connection, and
# raise_for_status() reports an HTTP error (e.g. unknown DOI) as such
# instead of as a KeyError on the missing 'data' key.
response = requests.get('https://api.datacite.org/dois/' + example_missing_doi, timeout=60)
response.raise_for_status()
example_missing = response.json()['data']

# Here are its relationships:
pprint(example_missing['attributes']['relatedIdentifiers'])

[{'relatedIdentifier': 'https://github.com/jlaaser/pogil-polymers/tree/v0.1.0',
  'relatedIdentifierType': 'URL',
  'relationType': 'IsSupplementTo'},
 {'relatedIdentifier': '10.5281/zenodo.2532877',
  'relatedIdentifierType': 'DOI',
  'relationType': 'HasVersion'},
 {'relatedIdentifier': '10.5281/zenodo.2538840',
  'relatedIdentifierType': 'DOI',
  'relationType': 'HasVersion'},
 {'relatedIdentifier': '10.5281/zenodo.3333941',
  'relatedIdentifierType': 'DOI',
  'relationType': 'HasVersion'},
 {'relatedIdentifier': '10.5281/zenodo.3333942',
  'relatedIdentifierType': 'DOI',
  'relationType': 'HasVersion'},
 {'relatedIdentifier': '10.5281/zenodo.3344564',
  'relatedIdentifierType': 'DOI',
  'relationType': 'HasVersion'},
 {'relatedIdentifier': '10.5281/zenodo.3344565',
  'relatedIdentifierType': 'DOI',
  'relationType': 'HasVersion'}]


In [15]:
# Let's fetch the resource type of each object it is related to.
# Only relations whose target is a DOI can be looked up on the DataCite API.
for relation in example_missing['attributes']['relatedIdentifiers']:
    if relation['relatedIdentifierType'] == 'DOI':
        # Bounded wait, and an explicit error on a failed lookup rather
        # than a KeyError on the missing 'data' key
        response = requests.get('https://api.datacite.org/dois/' + relation['relatedIdentifier'], timeout=60)
        response.raise_for_status()
        obj = response.json()['data']
        print(obj['attributes']['types']['resourceTypeGeneral'])

InteractiveResource
Software
Software
Software
Software
InteractiveResource


# Analyzing DOI sources

In [16]:
# Count the objects per client id (e.g. 'cern.zenodo'), to see which
# sources the DOIs of the dataset come from
pprint(collections.Counter(obj['relationships']['client']['data']['id'] for obj in data))

Counter({'cern.zenodo': 89175,
         'figshare.ars': 7009,
         'ncicbiit.nciphub': 3156,
         'purdue.hubzero': 2669,
         'osti.doe': 1126,
         'ocean.ocean': 839,
         'kim.openkim': 529,
         'brainl.iu': 239,
         'ands.centre-8': 213,
         'crui.infncnaf': 196,
         'tind.caltech': 177,
         'ethz.da-rd': 172,
         'ands.centre41': 128,
         'cdl.uci': 120,
         'tib.dagst': 100,
         'usgs.prod': 94,
         'ands.centre13': 82,
         'bl.stfc': 76,
         'bl.shef': 71,
         'tib.gfz': 51,
         'cul.columbia': 49,
         'doinz.nzau': 47,
         'umich.rsclid': 46,
         'bl.ed': 38,
         'ands.centre-3': 34,
         'iu.bl': 32,
         'umd.lib': 30,
         'dartlib.crawdad': 30,
         'comses.cml': 24,
         'tib.hzdr': 24,
         'bl.lboro': 24,
         'bl.strath': 24,
         'ucar.ucar': 24,
         'crui.unict': 23,
         'bl.cam': 22,
         'figshare.epa': 22,
    

# Analyzing titles

In [17]:
# First title of every object (each record carries a list of titles)
list_of_titles = [
    record['attributes']['titles'][0]['title']
    for record in data
]
print(len(list_of_titles))  # Same number as the dataset size

107229


In [18]:
# Number of unique titles (identical titles collapse into one set element):
print(len(set(list_of_titles)))

73197


In [19]:
# Most "popular" titles, i.e. the ten titles shared by the most objects:
counter = collections.Counter()
counter.update(list_of_titles)
pprint(counter.most_common(10))

[('buddhi1980/mandelbulber2: Continuous build', 121),
 ('Modules for Experiments in Stellar Astrophysics (MESA)', 86),
 ('Test', 77),
 ('Berkeley Computational Nanoscience Class Tools', 52),
 ('Sugaraid', 51),
 ('Polymer Modeler', 43),
 ('DCM - A Software Platform for Advanced 3D Materials Modelling, '
  'Characterisation and Visualization',
  42),
 ('First Release', 37),
 ('Band Structure Lab', 35),
 ('samapriya/porder: porder: Simple CLI for Planet ordersV2 API', 34)]


## Project uniqueness

As we saw, several "software" objects can be different versions of the same software, alongside the object for the software itself. It is also possible for multiple unrelated objects to correspond to the same software. Those are not easy to find, but we can get a coarse approximation by deduplicating by project name on the most popular software sources.

In [20]:
# Gather the target of every relation whose identifier is a URL
# (one entry per relation, so the same URL can appear several times)
urls = [
    rel['relatedIdentifier']
    for record in data
    for rel in (record['attributes']['relatedIdentifiers'] or ())
    if rel['relatedIdentifierType'] == 'URL'
]

In [21]:
# Total number of URL relations vs. number of distinct URLs among them
print('Number of URLs: {}'.format(len(urls)))
print('Number of unique URLs: {}'.format(len(set(urls))))

Number of URLs: 86754
Number of unique URLs: 61302


In [22]:
def _url_domain(url):
    """Return the network location (domain) of `url`, or '' if it has none.

    Identifiers that are not well-formed URLs (no "//host" part, or
    rejected by the parser) yield '' instead of raising.
    """
    try:
        return urlsplit(url).netloc
    except ValueError:
        # urlsplit() rejects some malformed URLs; treat them as domain-less
        return ''


# Most popular domains. The domain is taken from the parsed URL rather than
# from a positional split on "/", which raised IndexError on identifiers with
# fewer than two slashes and mistook a path segment for the domain when the
# scheme was missing. Identifiers without a domain are left out.
counter = collections.Counter(domain for domain in map(_url_domain, urls) if domain)
pprint(counter.most_common(10))

[('github.com', 81461),
 ('zenodo.org', 1723),
 ('maven.research-infrastructures.eu', 1461),
 ('dx.doi.org', 337),
 ('arxiv.org', 102),
 ('gitlab.com', 98),
 ('bitbucket.org', 67),
 ('research.csiro.au', 65),
 ('fmriprep.org', 54),
 ('pypi.org', 50)]


In [23]:
# Distinct URLs pointing to GitHub and to Zenodo (matched on their exact
# "https://<domain>/" prefix)
unique_github_urls = {url for url in urls if url.startswith('https://github.com/')}
unique_zenodo_urls = {url for url in urls if url.startswith('https://zenodo.org/')}
print('Unique URLs on Github: {}'.format(len(unique_github_urls)))
print('Unique URLs on Zenodo: {}'.format(len(unique_zenodo_urls)))

Unique URLs on Github: 59013
Unique URLs on Zenodo: 277


In [24]:
# Unique projects on GitHub: "https://github.com/Organization/Project/blahblah"
# is reduced to ("Organization", "Project"), i.e. the 4th and 5th
# "/"-separated components of the URL.
unique_github_projects = {
    tuple(parts[3:5])
    for parts in (url.split('/') for url in unique_github_urls)
}
# Unique projects on Zenodo: "https://zenodo.org/communities/project/blahblah"
# is reduced to "project", i.e. the 5th "/"-separated component of the URL.
# NOTE(review): this indexing assumes every Zenodo URL has at least five
# components; a shorter URL would raise IndexError.
unique_zenodo_projects = {
    parts[4]
    for parts in (url.split('/') for url in unique_zenodo_urls)
}
print('Unique projects on Github: {}'.format(len(unique_github_projects)))
print('Unique projects on Zenodo: {}'.format(len(unique_zenodo_projects)))

Unique projects on Github: 22811
Unique projects on Zenodo: 277


Note that 22811 + 277 = 23088 approximately matches the ~23k "non-version" software objects we found earlier (22962 targets of "IsVersionOf", 22788 objects with a "HasVersion" relation).