Priem, J., Piwowar, H., & Orr, R. (2022). OpenAlex: A fully-open index of scholarly works, authors, venues, institutions, and concepts. ArXiv. https://arxiv.org/abs/2205.01833

In [1]:
!pip install -U kaleido

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting kaleido
  Downloading kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl (79.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.9/79.9 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: kaleido
Successfully installed kaleido-0.2.1


In [2]:
!pip install pyalex

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyalex
  Downloading pyalex-0.9-py3-none-any.whl (9.3 kB)
Installing collected packages: pyalex
Successfully installed pyalex-0.9


In [33]:
import datetime
import kaleido
import math
import numpy as np
import pandas as pd
import xarray as xr
import plotly.graph_objects as go
import random
import requests
from numpy.random import default_rng
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from collections import OrderedDict, defaultdict
from pyalex import Works, Authors, Sources, Institutions, Concepts, Publishers
import pyalex
import pickle

In [4]:
# The polite pool has much faster and more consistent response times.
# To get into the polite pool, set a contact email:
# NOTE(review): a personal address is hardcoded here; consider making it configurable before sharing.
pyalex.config.email = "ben@epochai.org"

In [5]:
# Location to save results (e.g. plots) - should be modified appropriately.
# The cached works pickle is also read from this directory later in the notebook.
# NOTE(review): absolute, user-specific path; the string is concatenated directly with
# file names, so the trailing '/' is required.
result_file_location = '/content/drive/MyDrive/Epoch (employees)/Personal workspaces/Keith\'s Folder/Researcher impact project/results/'

Create a random number generator, with a fixed random seed for reproducibility

In [6]:
# Shared module-level generator; `regression_bootstrap` draws its resamples and noise from it.
rng = default_rng(seed=20230105)

In [7]:
#@title Useful constants
# Time conversions; a year is taken as exactly 365 days.
SECONDS_PER_HOUR = 60 * 60
SECONDS_PER_YEAR = 365 * 24 * SECONDS_PER_HOUR

# From https://colab.research.google.com/drive/1O99z9b1I5O66bT78r9ScslE_nOj5irN9?usp=sharing
# Central, low and high estimates.
# NOTE(review): units are presumably FLOP - confirm against the linked notebook.
GPT4_TRAINING_COMPUTE = 2.1e25
GPT4_TRAINING_COMPUTE_LOW = 8.4e24
GPT4_TRAINING_COMPUTE_HIGH = 4.7e25

In [8]:
#@title General utilities
def printe(x):
  """Print `x` in scientific notation with two digits after the decimal point."""
  print(format(x, ".2e"))

def log_mean(*x):
  """Return the geometric mean of the arguments (exp of the mean of their natural logs)."""
  log_values = np.log(np.array(x))
  return np.exp(log_values.mean())

def _rounded_percentiles(samples, percentiles=(1, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 99)):
  """
  Return a dict mapping each requested percentile to its value in `samples`,
  rounded to a whole number.

  Replaces the former `sq.get_percentiles(samples, digits=0)` calls: `sq` is not
  imported in this notebook, so those calls raised NameError.
  NOTE(review): rounding to integers is how the old `digits=0` argument was
  understood; confirm this matches the intended library behaviour.
  """
  values = np.percentile(np.asarray(samples), percentiles)
  return {p: int(np.round(v)) for p, v in zip(percentiles, values)}

def print_squiggle_stats(samples):
  """Print the mean and standard deviation of `samples`, then a table of rounded percentiles."""
  from pprint import pprint  # local import: pprint is not imported at notebook level

  print('Mean: {}, SD: {}'.format(round(np.mean(samples), 2),
                                  round(np.std(samples), 2)))
  pprint(_rounded_percentiles(samples))

def ci_ratio(samples, ci_lo=5, ci_hi=95):
  """
  Return (log10(lower / median), log10(upper / median)) for `samples`.

  `ci_lo` and `ci_hi` are the percentiles used as the interval bounds.
  Percentiles are rounded to whole numbers first, so a median that rounds to 0
  leads to a division by zero.
  """
  percentiles = _rounded_percentiles(samples, (ci_lo, 50, ci_hi))
  ratio_lo = np.log10(percentiles[ci_lo] / percentiles[50])
  ratio_hi = np.log10(percentiles[ci_hi] / percentiles[50])
  return ratio_lo, ratio_hi

def get_relative_dist(dist, samples):
  """Return `dist` divided by the (rounded) median of `samples`."""
  median = _rounded_percentiles(samples, (50,))[50]
  return dist / median

def get_id_from_url(id_url):
  """Return the part of `id_url` after its last '/' (the whole string if there is no '/')."""
  _, _, entity_id = id_url.rpartition('/')
  return entity_id

In [9]:
#@title Linear regression functions

def fit_linear_regression(x, y):
  """Fit an OLS model of `y` on `x` (passed through `sm.add_constant`) and return the fitted results."""
  design_matrix = sm.add_constant(x)
  return sm.OLS(y, design_matrix).fit()

def stochastic_regression(x, y, dist, bootstrap_size=1000):
  """
  Returns a sample of log-linear regression models, accounting for uncertainty
  in the y-values.
  The relative uncertainty in the y-values is expressed by the distribution
  `dist`.

  For each of `bootstrap_size` samples, the y-values are multiplied element-wise
  by an equal-length vector of values sampled from `dist`.
  Then, a log-linear regression is fitted to the resulting (x, y) values.

  Args:
    x: regressor values, passed unchanged to `fit_linear_regression`.
    y: positive response values; their log10 is what gets regressed.
    dist: distribution object accepted by `sq.sample`.
    bootstrap_size: number of noisy regressions to fit (default 1000).

  Returns a list of regression models.

  NOTE(review): `sq` is not imported in this notebook's import cell, so this
  function raises NameError until the library providing `sq.sample` is
  imported under that name.
  """
  regression_size = len(y)
  logy = np.log10(y)

  models = []
  for _ in range(bootstrap_size):
    # Multiply by a random factor to account for uncertainty
    random_multiplier = sq.sample(dist, n=regression_size, lclip=1e-3)  # lclip to avoid non-positive multipliers
    noisy_logy = logy + np.log10(random_multiplier)
    # Fit log-linear regression
    models.append(fit_linear_regression(x, noisy_logy))

  return models

def regression_bootstrap(x, y, data_ci_ratio=(0, 0), bootstrap_size=1000):
  """
  Performs a bootstrap of the log-linear regression, while accounting for
  uncertainty in the y-values.

  `data_ci_ratio` represents a relative, uniform uncertainty about each data
  point. Each y value is multiplied by a random factor sampled uniformly from
  this range.
  The range is expressed in log10 space.
  For example, if `data_ci_ratio == [-1, 1]`, then in each bootstrap sample,
  each y value is multiplied by a number between 0.1 and 10.

  Args:
    x: 1-D sequence of regressor values.
    y: sequence of positive response values, same length as `x`.
    data_ci_ratio: (low, high) bounds of the log10 multiplier.
    bootstrap_size: number of resampled regressions to fit (default 1000).

  Returns:
    A list with one fitted regression result per bootstrap sample.

  Raises:
    ValueError: if `x` has fewer than two distinct values, since no resample
      could then be regressed on.

  Randomness comes from the module-level `rng`.
  """
  # Arrays allow index-array selection below even when lists are passed in.
  x = np.asarray(x)
  y = np.asarray(y)
  regression_size = len(y)
  if np.unique(x).size < 2:
    # Without this guard the rejection loop below would never terminate.
    raise ValueError("regression_bootstrap needs at least two distinct x values")

  models = []
  for _ in range(bootstrap_size):
    # Resample the data
    resampled_idxs = rng.choice(np.arange(regression_size), size=regression_size, replace=True)
    x_resample = x[resampled_idxs]
    # A resample whose x values are all identical cannot be regressed on, so redraw it.
    # (Element-wise comparison: summing ratios can equal the length for non-uniform
    # samples and divides by zero when the first value is 0.)
    while np.all(x_resample == x_resample[0]):
      resampled_idxs = rng.choice(np.arange(regression_size), size=regression_size, replace=True)
      x_resample = x[resampled_idxs]
    # Multiply by a random factor to account for uncertainty in the y-values.
    # Not done in place, so integer-typed y does not trigger a casting error.
    random_multiplier = 10 ** rng.uniform(data_ci_ratio[0], data_ci_ratio[1], size=regression_size)
    y_resample = y[resampled_idxs] * random_multiplier
    # Fit log-linear regression
    models.append(fit_linear_regression(x_resample, np.log10(y_resample)))

  return models

def regression_results_bootstrap(models):
  """
  Summarise the slope (`params[1]`) across a list of fitted models.

  Prints the mean, median and 5th-95th percentile range of the slopes and
  returns them as a dict with keys 'mean', 'median' and 'ci'.
  """
  slopes = np.fromiter((model.params[1] for model in models), dtype=float, count=len(models))
  mean, median = np.mean(slopes), np.median(slopes)
  ci = np.percentile(slopes, [5,95])  # 90% CI
  print(f"""Bootstrapped regression result for slope:
    Mean: {mean:.2f}
    Median: {median:.2f}
    90% CI: {ci[0]:.2f} to {ci[1]:.2f}"""
  )
  return {'mean': mean, 'median': median, 'ci': ci}

def predict(model, x_start, x_end, num_predictions=100):
  """
  Evaluate a fitted regression on an evenly spaced grid of x values.

  Args:
    model: fitted results object exposing `get_prediction`.
    x_start, x_end: first and last x value of the grid (both included).
    num_predictions: number of grid points.

  Returns:
    A DataFrame with an 'x' column followed by the columns of the
    prediction's `summary_frame()`.
  """
  x_pred = np.linspace(x_start, x_end, num_predictions)
  # has_constant='add': by default add_constant skips the intercept column when the
  # input is itself constant (a single point, or x_start == x_end), which would
  # leave X one column short of what the model expects.
  X = sm.add_constant(x_pred, has_constant='add')
  log_preds = model.get_prediction(X).summary_frame()
  return pd.concat(
      [pd.DataFrame({'x': x_pred}), log_preds], axis=1
  )

def predict_bootstrap(models, x_start, x_end, num_predictions=100):
  """
  Aggregate the predictions of many models over a shared x grid.

  Each model is evaluated with `predict`; the 'mean' column of every result is
  stacked into one row per model. Returns a dict holding the point-wise 'mean',
  'median' and 'ci' (5th and 95th percentiles) taken across models.
  """
  per_model_means = np.empty((len(models), num_predictions))
  for row, model in zip(per_model_means, models):
    # `row` is a view into the matrix, so this fills it in place.
    row[:] = predict(model, x_start, x_end, num_predictions)['mean']

  return {
    'mean': np.mean(per_model_means, axis=0),
    'median': np.median(per_model_means, axis=0),
    'ci': np.percentile(per_model_means, [5, 95], axis=0),
  }

def end_date_predictions(preds_bootstrapped):
  """Print the last-point mean, median and CI bounds, each raised as a power of 10."""
  mean_end = 10**preds_bootstrapped['mean'][-1]
  median_end = 10**preds_bootstrapped['median'][-1]
  ci_lower_end = 10**preds_bootstrapped['ci'][0][-1]
  ci_upper_end = 10**preds_bootstrapped['ci'][1][-1]
  print(f"""End date predictions:
    mean: {mean_end:.0e}
    median: {median_end:.0e}
    90% CI: [{ci_lower_end:.0e}, {ci_upper_end:.0e}]"""
  )

def scale_range_of_predictions(preds_bootstrapped):
  """Print the first and last mean predictions, each raised as a power of 10."""
  mean_log_preds = preds_bootstrapped['mean']
  first_pred = 10**mean_log_preds[0]
  last_pred = 10**mean_log_preds[-1]
  print(f"Scale range of predictions: {first_pred:.0e} to {last_pred:.0e}")

In [10]:
#@title Plotting functions

def set_default_fig_layout(fig, xtickvals, xticktext, ytickvals, yticktext):
  """
  Apply the shared figure styling to `fig` and return it.

  Adds the "CC BY Epoch" credit annotation, sets explicit tick values and labels
  on both axes, lays the legend out horizontally, and fixes the figure size and
  margins. `fig` is modified through its `add_annotation` / `update_layout`
  methods.
  """
  credit_font = dict(size=12, color="#999999")
  fig.add_annotation(
    text="CC BY Epoch",
    xref="paper",
    yref="paper",
    x=1.0,
    y=-0.14,
    showarrow=False,
    font=credit_font,
  )

  x_axis = dict(tickmode='array', tickvals=xtickvals, ticktext=xticktext)
  y_axis = dict(tickmode='array', tickvals=ytickvals, ticktext=yticktext)
  fig.update_layout(xaxis=x_axis, yaxis=y_axis)

  legend_layout = dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1)
  fig.update_layout(legend=legend_layout)

  figure_margin = dict(l=100, r=30, t=80, b=80)
  fig.update_layout(autosize=False, width=800, height=600, title_x=0.5, margin=figure_margin)
  return fig

# Implementation plan

MVP goal: plot of the number of AI researchers at leading institutions in AI research (dummy example below)


In [11]:
# Dummy data only: one random cumulative series per made-up institution.
dates = np.arange(2010, 2022 + 1)
num_institutions = 5

fig = go.Figure()
for i in range(num_institutions):
  fig.add_trace(
    go.Scatter(
      x=dates,
      # Drawn from the seeded notebook-level `rng` so the demo is reproducible;
      # one value per year rather than a hard-coded count.
      y=np.cumsum(rng.integers(-5, 20, size=len(dates))),
      name=f"Institution{i}",
    ),
  )

## Plot layout

cost_ticks = np.arange(0, 101, 10)
cost_tick_text = cost_ticks.tolist()  # labels are the tick values themselves

set_default_fig_layout(fig, dates, dates, cost_ticks, cost_tick_text)

fig.update_layout(
  title='[DEMO ONLY] Number of AI researchers at leading institutions in AI research',
  xaxis_title='Year',
  yaxis_title='Number of researchers',
)

## Save plot
# fig.write_image(result_file_location + 'demo.png', scale=2)

## Show plot
fig.show()

Assumed data to start with:

- A set of paper objects
- Each paper object has a list of author objects
- Each author is associated with zero or more institution objects

Intermediate result: dictionary
- Key: institution object
- Value: dictionary
  - Key: year
  - Value: list of author objects. Each author was affiliated with this institution in this year.

To get from start to intermediate result:
- Initialise dictionary `institution_author_data`
- For each paper
  - Store the year of publication `pub_year`
  - For each author `a`
    - For each institution `institution` associated with author `a`
      - `institution_author_data[institution][pub_year].append(a.name)`

Final result: dictionary
- Key: institution object
  - At minimum: the string name of the institution
- Value: xarray <year, num_researchers>

# Pipeline for affiliation data

In [12]:
# Maps the first character of an OpenAlex ID to the pyalex class used to look it up.
TYPE_TO_PYALEX_CLASS = {
  'A': Authors,
  'C': Concepts,
  'I': Institutions,
  'P': Publishers,
  'S': Sources,
  'W': Works,
}

def get_entity_name(url):
  """
  Return the `display_name` of the OpenAlex entity identified by `url`.

  The entity type is taken from the first character of the ID (the part of
  `url` after the last '/'). Each call performs a lookup through pyalex.

  Raises:
    KeyError: if the ID's first character is not in TYPE_TO_PYALEX_CLASS.
  """
  entity_id = get_id_from_url(url)  # local name avoids shadowing the builtin `id`
  openalex_class = TYPE_TO_PYALEX_CLASS[entity_id[0]]
  return openalex_class()[entity_id]['display_name']

In [13]:
class OpenAlexProcessor:
  """Turns a collection of work records into per-institution, per-year author sets."""

  def __init__(self):
    pass

  def gather_institution_author_data(self, works):
    """
    Group the authors of `works` by institution and publication year.

    Args:
      works: iterable of dicts with 'publication_year' and 'authorships' keys.
        Each authorship has an 'author' dict ('id', 'display_name') and an
        'institutions' list of dicts ('id', 'display_name').

    Returns:
      A pair of nested mappings, institution -> year -> set of authors:
      the first is keyed by institution ID and holds author IDs, the second is
      keyed by institution display name and holds author display names.
      Institutions without an ID are skipped, as are authorships that list no
      institutions.
    """
    from functools import partial  # local import: functools is not imported at notebook level

    # partial(defaultdict, set) rather than a lambda keeps the results picklable.
    institution_author_data = defaultdict(partial(defaultdict, set))
    named_institution_author_data = defaultdict(partial(defaultdict, set))
    for work in works:
      pub_year = work['publication_year']
      for authorship in work['authorships']:
        if len(authorship['institutions']) == 0:
          continue
        author_id = authorship['author']['id']
        author_name = authorship['author']['display_name']
        for ins in authorship['institutions']:
          if ins.get('id') is None:
            continue
          institution_author_data[ins['id']][pub_year].add(author_id)
          named_institution_author_data[ins['display_name']][pub_year].add(author_name)
    return institution_author_data, named_institution_author_data

In [14]:
def calculate_institution_author_count(institution_author_data):
  """
  Convert institution -> year -> authors into institution -> yearly author counts.

  For every institution, the years are sorted ascending and the number of
  authors in each year is stored in an `xr.DataArray` with a single "year"
  dimension whose coordinates are those years.
  """
  institution_author_count = {}
  for institution, author_series in institution_author_data.items():
    years = sorted(author_series)
    counts = [len(author_series[year]) for year in years]
    institution_author_count[institution] = xr.DataArray(
      counts,
      dims=("year",),
      coords={"year": years},
    )
  return institution_author_count

In [15]:
def name_institution_author_data(institution_author_data):
  """
  Replace the IDs in institution -> year -> authors with display names.

  Every institution and author ID is resolved through `get_entity_name`.
  Names are memoised for the duration of the call, so an entity appearing in
  several years or institutions is looked up only once.

  Returns:
    A nested mapping: institution name -> year -> set of author names.
    Institutions that share a display name have their author sets merged.
  """
  name_cache = {}

  def cached_name(entity_url):
    # One lookup per distinct entity instead of one per occurrence.
    if entity_url not in name_cache:
      name_cache[entity_url] = get_entity_name(entity_url)
    return name_cache[entity_url]

  named_institution_author_data = defaultdict(lambda: defaultdict(set))
  for ins, author_data in institution_author_data.items():
    ins_name = cached_name(ins)
    for year, authors in author_data.items():
      # Union rather than assignment, so a second institution with the same
      # display name does not overwrite the first one's authors.
      named_institution_author_data[ins_name][year] |= {cached_name(a) for a in authors}
  return named_institution_author_data

In [16]:
# Small hand-written fixture: institution -> year -> list of author names.
test_institution_author_data = {
  'Google': OrderedDict({
    2014: ['Alice'],
    2015: ['Alice', 'Bob'],
    2016: ['Bob', 'Xin'],
  }),
  'OpenAI': OrderedDict({
    2014: [],
    2015: ['Lakeith'],
    2016: ['Lakeith', 'Anita', 'Wenjie'],
  }),
}

calculate_institution_author_count(test_institution_author_data)

{'Google': <xarray.DataArray (year: 3)>
 array([1, 2, 2])
 Coordinates:
   * year     (year) int64 2014 2015 2016,
 'OpenAI': <xarray.DataArray (year: 3)>
 array([0, 1, 3])
 Coordinates:
   * year     (year) int64 2014 2015 2016}

In [17]:
# Spot check: resolve one institution ID to its display name (performs a lookup through pyalex).
get_entity_name('https://openalex.org/I4210114444')

'Meta (United States)'

# Gather raw data

In [18]:
# I'm not sure how to get more than 200 items at once other than to create a pager and then merge the pages
def merge_pages(pager):
  """Flatten an iterable of pages (each itself an iterable of items) into one list."""
  return [item for page in pager for item in page]

In [54]:
# Search concepts for "deep learning", take the first result and keep its short ID
# (the part of the ID URL after the last '/').
dl_concept_results = Concepts().search("deep learning").get()
dl_concept = dl_concept_results[0]  # IndexError if the search returned nothing
dl_concept_id = get_id_from_url(dl_concept['id'])
dl_concept_id

'C108583219'

In [53]:
# Search concepts for "machine learning", take the first result and keep its short ID;
# this ID is used below to filter the Google works query.
ml_concept_results = Concepts().search("machine learning").get()
ml_concept = ml_concept_results[0]  # IndexError if the search returned nothing
ml_concept_id = get_id_from_url(ml_concept['id'])
ml_concept_id

'C119857082'

In [20]:
# This takes a few minutes for 10,000 entries
# most_cited_dl_works = merge_pages(
#   Works() \
#     .filter(concepts={"id": dl_concept_id}) \
#     .sort(cited_by_count="desc") \
#     .paginate(n_max=100)
# )

In [76]:
# Institution ID used to select Google-affiliated works.
# NOTE(review): presumably resolves to 'Google (United States)' - confirm with get_entity_name.
google_us_id = "https://openalex.org/I1291425158"

In [55]:
# This takes ~1 minute on Macbook Pro 2019
# Fetch works filtered by the Google institution ID and by the machine learning
# concept ID, then merge all result pages into one list.
google_us_ml_works = merge_pages(
  Works() \
    .filter(authorships={"institutions": {"id": google_us_id}}) \
    .filter(concepts={"id": ml_concept_id}) \
    .paginate(n_max=100000)
)

In [56]:
len(google_us_ml_works)

5171

In [60]:
Institutions().search("deepmind").get()

[{'id': 'https://openalex.org/I4210090411',
  'ror': 'https://ror.org/00971b260',
  'display_name': 'DeepMind (United Kingdom)',
  'relevance_score': 17836.32,
  'country_code': 'GB',
  'type': 'company',
  'homepage_url': 'https://deepmind.com/',
  'image_url': 'https://upload.wikimedia.org/wikipedia/commons/8/8c/DeepMind_headquarters_S2_Handyside.jpg',
  'image_thumbnail_url': 'https://upload.wikimedia.org/wikipedia/commons/thumb/8/8c/DeepMind_headquarters_S2_Handyside.jpg/100px-DeepMind_headquarters_S2_Handyside.jpg',
  'display_name_acronyms': [],
  'display_name_alternatives': [],
  'repositories': [],
  'works_count': 1007,
  'cited_by_count': 100586,
  'summary_stats': {'2yr_mean_citedness': 23.681818181818183,
   'h_index': 91,
   'i10_index': 325},
  'ids': {'openalex': 'https://openalex.org/I4210090411',
   'ror': 'https://ror.org/00971b260',
   'grid': 'grid.498210.6',
   'wikipedia': 'https://en.wikipedia.org/wiki/Google%20DeepMind'},
  'geo': {'city': 'London',
   'geoname

In [74]:
# Institution ID used for the OpenAI works query.
# NOTE(review): confirm it resolves to OpenAI with get_entity_name.
openai_id = "https://openalex.org/I4210161460"

In [65]:
# Fetch works filtered by the OpenAI institution ID and merge all result pages.
# Unlike the Google query, no concept filter is applied here.
openai_works = merge_pages(
  Works() \
    .filter(authorships={"institutions": {"id": openai_id}}) \
    .paginate(n_max=100000)
)

In [66]:
len(openai_works)

83

In [75]:
# ID of the 'DeepMind (United Kingdom)' record returned by the institution search above.
deepmind_id = "https://openalex.org/I4210090411"

In [63]:

# Fetch works filtered by the DeepMind institution ID and merge all result pages.
# Unlike the Google query, no concept filter is applied here.
deepmind_works = merge_pages(
  Works() \
    .filter(authorships={"institutions": {"id": deepmind_id}}) \
    .paginate(n_max=100000)
)

In [64]:
len(deepmind_works)

1031

In [38]:
# Save to avoid fetching every time
# with open(result_file_location + "google_us_works_openalex", "wb") as f:
#   pickle.dump(google_us_works, f)

# Load the cached works from the results directory.
# NOTE(review): pickle.load can execute arbitrary code - only load files you created yourself.
# NOTE(review): `google_us_works` is not used by the later cells shown here
# (the pipeline runs on `google_us_ml_works`); confirm this load is still needed.
with open(result_file_location + "google_us_works_openalex", "rb") as f:
  google_us_works = pickle.load(f)

In [67]:
# Combine the three result lists into the pipeline input.
# NOTE(review): only the Google list was fetched with the machine learning concept
# filter; the OpenAI and DeepMind lists contain all of their works.
works = google_us_ml_works + openai_works + deepmind_works

In [68]:
len(works)

6285

Sanity checking against listed [Google publications](https://web.archive.org/web/20230504213235/https://research.google/pubs/)

In [44]:
# All Google works published in 2019 (no concept filter), for the sanity check
# against the listed Google publications.
# Uses the shared `google_us_id` rather than repeating the URL literal.
google_us_works_2019 = merge_pages(
  Works() \
    .filter(authorships={"institutions": {"id": google_us_id}}) \
    .filter(publication_year=2019) \
    .paginate(n_max=10000)
)
len(google_us_works_2019)

2836

In [46]:
# Print the first ten titles for a manual comparison with the listed Google publications.
for w in google_us_works_2019[:10]:
  print(w['display_name'])

SciPy 1.0: fundamental algorithms for scientific computing in Python
Reproducible, interactive, scalable and extensible microbiome data science using QIIME 2
Quantum supremacy using a programmable superconducting processor
XLNet: Generalized Autoregressive Pretraining for Language Understanding
Searching for MobileNetV3
SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
MnasNet: Platform-Aware Neural Architecture Search for Mobile
A guide to deep learning in healthcare
EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks
Regularized Evolution for Image Classifier Architecture Search


# Pipeline execution

In [69]:
# Build both institution -> year -> authors mappings (keyed by ID and by display name)
# from the combined works list.
institution_author_data, named_institution_author_data = OpenAlexProcessor().gather_institution_author_data(works)

In [70]:
institution_author_data

defaultdict(<function __main__.OpenAlexProcessor.gather_institution_author_data.<locals>.<lambda>()>,
            {'https://openalex.org/I1291425158': defaultdict(set,
                         {2015: {'https://openalex.org/A1111049960',
                           'https://openalex.org/A1124557692',
                           'https://openalex.org/A1171453863',
                           'https://openalex.org/A1431897541',
                           'https://openalex.org/A158011298',
                           'https://openalex.org/A1969031609',
                           'https://openalex.org/A1974776862',
                           'https://openalex.org/A1976350085',
                           'https://openalex.org/A1985611198',
                           'https://openalex.org/A1986159270',
                           'https://openalex.org/A1987172238',
                           'https://openalex.org/A1994677000',
                           'https://openalex.org/A2007821246',
        

In [71]:
named_institution_author_data['Google (United States)'][2019]

{'Francesco Visin',
 'Laurent Orseau',
 'Ankur Bapna',
 'Josip Djolonga',
 'Vincent Dumoulin',
 'Olivier Pauly',
 'Chenjie Gu',
 'Christopher P. Burgess',
 'Heiga Zen',
 'Patricia Suriana',
 'Oscar Täckström',
 'R. Reeve Ingle',
 'Mathieu Blondel',
 'Sébastien Lahaie',
 'Dong Yin',
 'Daniel Z. Freedman',
 'Jonathan Huang',
 'Sagi Perel',
 'Sean Augenstein',
 'Manfred K. Warmuth',
 'Brendan McMahan',
 'Thomas Anthony',
 'Shengyu Feng',
 'Blaise Aguera y Arcas',
 'Syrine Krichene',
 'Bradley A. Green',
 'Ian S. Fischer',
 'Wenhan Lu',
 'Marco Tagliasacchi',
 'Hartwig Adam',
 'Abhijit Ogale',
 'Sai Srivatsa Ravindranath',
 'James Wexler',
 'Joan Puigcerver',
 'Yinlam Chow',
 'Julian Martin Eisenschlos',
 'Olivier Siohan',
 'Noah Constant',
 'Marvin Ritter',
 'Cheng Zhi Huang',
 'Satinder Singh',
 'Weilong Yang',
 'George Tucker',
 'Dale A. Webster',
 'Samy Bengio',
 'Satrajit Chatterjee',
 'Sam Greydanus',
 'Rodolphe Jenatton',
 'Lucy J. Colwell',
 'Ting Yu',
 'Joshua S. Dillon',
 'Rory S

In [72]:
# Reduce each institution's yearly author sets to yearly unique-author counts.
institution_author_count = calculate_institution_author_count(institution_author_data)

In [73]:
institution_author_count

{'https://openalex.org/I1291425158': <xarray.DataArray (year: 22)>
 array([   2,    2,    3,   12,   14,   34,   33,   59,   70,   78,  107,
         149,  148,  240,  329,  504,  810, 1214, 1500, 1429,  503,  104])
 Coordinates:
   * year     (year) int64 1999 2002 2004 2005 2006 ... 2019 2020 2021 2022 2023,
 'https://openalex.org/I114027177': <xarray.DataArray (year: 6)>
 array([1, 1, 1, 1, 7, 1])
 Coordinates:
   * year     (year) int64 2014 2015 2018 2019 2021 2022,
 'https://openalex.org/I27837315': <xarray.DataArray (year: 9)>
 array([ 4,  6,  5,  4, 11, 20, 27, 17,  4])
 Coordinates:
   * year     (year) int64 2013 2014 2015 2017 2018 2019 2020 2021 2022,
 'https://openalex.org/I4210148872': <xarray.DataArray (year: 2)>
 array([1, 1])
 Coordinates:
   * year     (year) int64 2014 2015,
 'https://openalex.org/I4210161460': <xarray.DataArray (year: 8)>
 array([ 2, 10, 15, 14, 41, 22, 11,  5])
 Coordinates:
   * year     (year) int64 2016 2017 2018 2019 2020 2021 2022 2023,
 'http

In [29]:
# Narrow down to the biggest institutions, ranked by their peak yearly author count
institution_max_author_count = {
  ins: np.max(author_counts)
  for ins, author_counts in institution_author_count.items()
}

num_institutions = 1
ranked_by_max_author_count = sorted(
  institution_max_author_count.items(), key=lambda item: item[1], reverse=True
)
# Keep only the first `num_institutions` entries of the ranking.
biggest_institution_author_count = {
  ins: institution_author_count[ins]
  for ins, _ in ranked_by_max_author_count[:num_institutions]
}

biggest_institution_author_count

{'https://openalex.org/I1291425158': <xarray.DataArray (year: 27)>
 array([   2,    1,    5,    8,   10,   12,   29,   46,   75,  178,  283,
         397,  466,  555,  622,  692,  878,  834, 1084, 1467, 1863, 2152,
        3044, 3334, 3578, 2134,  685])
 Coordinates:
   * year     (year) int64 1994 1995 1999 2000 2001 ... 2019 2020 2021 2022 2023}

In [77]:
# Choose specific institutions
# Raises KeyError if any of these IDs is missing from institution_author_count.
handpicked_ins = [google_us_id, openai_id, deepmind_id]
handpicked_institution_author_count = {ins: institution_author_count[ins] for ins in handpicked_ins}

In [83]:
# One line per hand-picked institution: unique authors per publication year.
fig = go.Figure()
for ins, author_counts in handpicked_institution_author_count.items():
  fig.add_trace(
    go.Scatter(
      x=author_counts['year'],
      y=author_counts,
      name=get_entity_name(ins),  # one lookup through pyalex per institution
      mode='lines+markers',
    ),
  )

## Plot layout
# NOTE(review): only the Google works were fetched with the machine learning concept
# filter, so "ML papers" in the title does not describe the OpenAI and DeepMind series.
fig.update_layout(
  title='Number of unique authors on ML papers',
  xaxis_title='Year',
  yaxis_title='Number of unique authors',
  legend=dict(
    title="Affiliation with:"
  ),
)

## Save plot
# fig.write_image(result_file_location + 'num_authors_google_oai_dm.png', scale=2)

## Show plot
fig.show()

# [END]