In [1]:
import datetime
import kaleido
import numpy as np
from numpy.random import default_rng
import os
import plotly.graph_objects as go
import pyalex
from pyalex import Authors, Concepts, Institutions, Works
import pickle

from researcher_impact.citations import *
from researcher_impact.plotting import *
from researcher_impact.processors import OpenAlexProcessor
from researcher_impact.pyalex_utils import *
from researcher_impact.regression import *
from researcher_impact.utils import *

In [2]:
# The polite pool has much faster and more consistent response times. To get into the
# polite pool, set a contact email on the pyalex config before making any requests.
# NOTE(review): a personal address is hardcoded here — consider reading it from an
# environment variable before sharing this notebook.
pyalex.config.email = "ben@epochai.org"

In [3]:
# Location to save results e.g. plots - should be modified appropriately.
# The trailing slash matters: later cells build paths by string concatenation.
result_file_location = 'results/'
# Create the directory up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(result_file_location, exist_ok=True)

Create a random number generator with a fixed seed, for reproducibility.

In [4]:
# Fixed seed so that any sampling done through `rng` is repeatable across runs.
SEED = 20230105
rng = default_rng(seed=SEED)

# Implementation plan

MVP goal: plot of the number of AI researchers at leading institutions in AI research (dummy example below)


In [5]:
# Demo-only figure: one synthetic cumulative series per placeholder institution.
dates = np.arange(2010, 2022 + 1)
num_institutions = 5

fig = go.Figure()
for i in range(num_institutions):
    fig.add_trace(
        go.Scatter(
            x=dates,
            # Draw from the seeded generator (not NumPy's global state) so the demo is
            # reproducible, and size the series from `dates` so x and y always match.
            y=np.cumsum(rng.integers(-5, 20, size=len(dates))),
            name=f"Institution{i}",
        ),
    )

## Plot layout

cost_ticks = np.arange(0, 101, 10)
# Tick labels mirror the tick positions; derive them instead of repeating the list.
cost_tick_text = cost_ticks.tolist()

set_default_fig_layout(fig, dates, dates, cost_ticks, cost_tick_text)

fig.update_layout(
    title='[DEMO ONLY] Number of AI researchers at leading institutions in AI research',
    xaxis_title='Year',
    yaxis_title='Number of researchers',
)

## Save plot
# fig.write_image(result_file_location + 'demo.png', scale=2)

## Show plot
fig.show()

Assumed data to start with:

- A set of paper objects
- Each paper object has a list of author objects
- Each author is associated with zero or more institution objects

Intermediate result: dictionary
- Key: institution object
- Value: dictionary
  - Key: year
  - Value: list of author objects. Each author was affiliated with this institution in this year.

To get from start to intermediate result:
- Initialise dictionary `institution_author_data`
- For each paper
  - Store the year of publication `pub_year`
  - For each author `a`
    - For each institution `institution` associated with author `a`
      - `institution_author_data[institution][pub_year].append(a.name)`

Final result: dictionary
- Key: institution object
  - At minimum: the institution's name as a string
- Value: xarray <year, num_researchers>

# Field-Weighted Citation Impact experiments

In [6]:
# NOTE(review): this binds the OpenAlexProcessor class itself (no parentheses), not an
# instance. Later cells call methods directly on it — confirm those methods are
# static/class methods.
processor = OpenAlexProcessor

In [7]:
# Build the query for 2018 journal articles, then sample 1000 of them with a fixed seed.
journal_articles_2018 = (
    Works()
    .filter(publication_year=2018)
    .filter(type='journal-article')
)
sampled_works = merge_sample(journal_articles_2018, sample_size=1000, seed=535)
len(sampled_works)

1000

In [8]:
# Request a single page (page 2, 200 per page) of a seeded 10000-work sample.
# Note that this rebinds `sampled_works` from the previous cell.
sampled_works = (
    Works()
    .filter(publication_year=2018)
    .filter(type='journal-article')
    .sample(10000, seed=535)
    .get(per_page=200, page=2)
)
len(sampled_works)

200

In [9]:
# OpenAlex institution ID used for the experiments below.
# See https://openalex.org/I4210114444
meta_id = 'I4210114444'

In [11]:
# Works returned by the processor for the institution ID defined above.
meta_works = processor.get_institution_works(meta_id)

In [12]:
# How many works were returned for that institution.
len(meta_works)

2198

In [14]:
# TODO use SEED constant once done experimenting
# Run `n` FWCI trials over the first 10 works, seeding each trial with its index,
# then report the mean and standard deviation across trials.
n = 1
test_fwcis = np.array(
    [fwci(meta_works[:10], processor, seed=trial) for trial in range(n)],
    dtype=float,
)
np.mean(test_fwcis), np.std(test_fwcis)

Work: Deep learning (1 of 10)
Citation count: 9046
Concept: Artificial intelligence (1 of 5)
Expected citations from 1000 works: 5.094
Concept: Speech recognition (2 of 5)
Expected citations from 1000 works: 12.191
Concept: Organic chemistry (3 of 5)
Expected citations from 1000 works: 7.208
Concept: Epistemology (4 of 5)
Expected citations from 1000 works: 1.704
Concept: Law (5 of 5)
Expected citations from 1000 works: 2.143
Overall expected citation count: 3.4000609672207593
Work: Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks (2 of 10)
Citation count: 6019
Concept: Artificial intelligence (1 of 5)
Expected citations from 1000 works: 6.03
Concept: Algorithm (2 of 5)
Expected citations from 1000 works: 4.819
Concept: Information retrieval (3 of 5)
Expected citations from 1000 works: 2.579
Concept: Programming language (4 of 5)
Expected citations from 1000 works: 4.462
Concept: Embedded system (5 of 5)
Expected citations from 1000 works: 7.951
Overall ex

(892.3371781764876, 0.0)

In [22]:
# `works_count` of the first concept returned when searching for "artificial intelligence".
Concepts().search('artificial intelligence').get()[0]['works_count']

13947420

In [None]:
# Concepts attached to the first work returned when searching for this title.
Works().search("attention is all you need").get()[0]['concepts']

In [15]:
# First result among 2018 works when sorted by `cited_by_count`, descending.
top_work = Works().filter(publication_year=2018).sort(cited_by_count="desc").get()[0]

In [16]:
# Display the full record. The output is large; consider showing selected fields instead.
top_work

{'id': 'https://openalex.org/W2889646458',
 'doi': 'https://doi.org/10.3322/caac.21492',
 'title': 'Global cancer statistics 2018: GLOBOCAN estimates of incidence and mortality worldwide for 36 cancers in 185 countries',
 'display_name': 'Global cancer statistics 2018: GLOBOCAN estimates of incidence and mortality worldwide for 36 cancers in 185 countries',
 'publication_year': 2018,
 'publication_date': '2018-11-01',
 'ids': {'openalex': 'https://openalex.org/W2889646458',
  'doi': 'https://doi.org/10.3322/caac.21492',
  'mag': '2889646458',
  'pmid': 'https://pubmed.ncbi.nlm.nih.gov/30207593'},
 'language': 'en',
 'primary_location': {'is_oa': True,
  'landing_page_url': 'https://doi.org/10.3322/caac.21492',
  'pdf_url': 'https://acsjournals.onlinelibrary.wiley.com/doi/pdfdirect/10.3322/caac.21492',
  'source': {'id': 'https://openalex.org/S126094547',
   'display_name': 'CA: A Cancer Journal for Clinicians',
   'issn_l': '0007-9235',
   'issn': ['1542-4863', '0007-9235'],
   'host_o

In [None]:
# Inspect the record's `counts_by_year` field.
top_work['counts_by_year']

In [None]:
# Helper is not defined in this notebook; presumably it comes from one of the
# researcher_impact star imports and counts citations in the first years after
# publication — TODO confirm the exact window.
get_citation_count_in_first_years(top_work)

In [None]:
# Look up one work by its fixed ID (despite the name, nothing is drawn at random here).
random_work = Works()["W2127841864"]

In [None]:
# Inspect the record's `counts_by_year` field.
random_work['counts_by_year']

In [None]:
# Publication year of the same record, for reading the yearly counts in context.
random_work['publication_year']

In [None]:
# Same helper as used for `top_work`; presumably an early-years citation count — TODO confirm.
get_citation_count_in_first_years(random_work)

In [None]:
# Concepts attached to this work.
random_work['concepts']

# Gather raw data

In [None]:
# Search concepts for "deep learning" and take the first hit.
# Raises IndexError if the search returns no results.
dl_concept_results = Concepts().search("deep learning").get()
dl_concept = dl_concept_results[0]
# Presumably strips the URL prefix from the concept's `id` — TODO confirm against get_id_from_url.
dl_concept_id = get_id_from_url(dl_concept['id'])
dl_concept_id

In [None]:
# Search concepts for "machine learning" and take the first hit.
# Raises IndexError if the search returns no results.
ml_concept_results = Concepts().search("machine learning").get()
ml_concept = ml_concept_results[0]
# Presumably strips the URL prefix from the concept's `id` — TODO confirm against get_id_from_url.
ml_concept_id = get_id_from_url(ml_concept['id'])
ml_concept_id

In [None]:
# This takes a few minutes for 10,000 entries
# most_cited_dl_works = merge_pages(
#   Works() \
#     .filter(concepts={"id": dl_concept_id}) \
#     .sort(cited_by_count="desc") \
#     .paginate(n_max=100)
# )

In [None]:
# Institution ID, in full-URL form, used for the Google (US) queries below.
google_us_id = "https://openalex.org/I1291425158"

In [None]:
# This takes ~1 minute on Macbook Pro 2019
# Works with an authorship at the Google (US) institution ID that also carry the
# machine-learning concept ID; pagination is capped at 100000 records.
google_us_ml_query = (
  Works()
  .filter(authorships={"institutions": {"id": google_us_id}})
  .filter(concepts={"id": ml_concept_id})
)
google_us_ml_works = merge_pages(google_us_ml_query.paginate(n_max=100000))

In [None]:
# Number of Google (US) machine-learning works fetched.
len(google_us_ml_works)

In [None]:
# Search institutions for "deepmind"; presumably how the DeepMind ID used later was found.
Institutions().search("deepmind").get()

In [None]:
# Institution ID, in full-URL form, used for the OpenAI query below.
openai_id = "https://openalex.org/I4210161460"

In [None]:
# Works with an authorship at the OpenAI institution ID (no concept filter);
# pagination is capped at 100000 records.
openai_query = Works().filter(authorships={"institutions": {"id": openai_id}})
openai_works = merge_pages(openai_query.paginate(n_max=100000))

In [None]:
# Number of OpenAI works fetched.
len(openai_works)

In [None]:
# Institution ID, in full-URL form, used for the DeepMind query below.
deepmind_id = "https://openalex.org/I4210090411"

In [None]:

# Works with an authorship at the DeepMind institution ID (no concept filter);
# pagination is capped at 100000 records.
deepmind_query = Works().filter(authorships={"institutions": {"id": deepmind_id}})
deepmind_works = merge_pages(deepmind_query.paginate(n_max=100000))

In [None]:
# Number of DeepMind works fetched.
len(deepmind_works)

In [None]:
# Save to avoid fetching every time
# NOTE(review): `google_us_works` is not assigned in any visible cell above, so the
# commented-out dump below cannot run as written — confirm which variable was meant.
# with open(result_file_location + "google_us_works_openalex", "wb") as f:
#   pickle.dump(google_us_works, f)

# Load the cached works. This raises FileNotFoundError unless the cache file already
# exists, so the notebook cannot be run top-to-bottom from a clean checkout.
# NOTE(review): pickle.load can execute arbitrary code; only load files you created yourself.
with open(result_file_location + "google_us_works_openalex", "rb") as f:
  google_us_works = pickle.load(f)

In [None]:
# Combine the three fetched lists into one input for the pipeline.
# NOTE(review): this uses `google_us_ml_works`, not the `google_us_works` loaded from the
# pickle cache. Works appearing in more than one list are not de-duplicated here.
works = google_us_ml_works + openai_works + deepmind_works

In [None]:
# Total number of works in the combined list (sum of the three list lengths).
len(works)

Sanity checking against listed [Google publications](https://web.archive.org/web/20230504213235/https://research.google/pubs/)

In [None]:
# Fetch Google (US) works published in 2019 to compare against Google's own listing.
# Reuses `google_us_id` rather than repeating the institution URL, so the sanity check
# always targets the same institution as the main query.
google_us_works_2019 = merge_pages(
  Works() \
    .filter(authorships={"institutions": {"id": google_us_id}}) \
    .filter(publication_year=2019) \
    .paginate(n_max=10000)
)
len(google_us_works_2019)

In [None]:
# Print the display names of the first 10 works for a manual spot check.
for w in google_us_works_2019[:10]:
  print(w['display_name'])

# Pipeline execution

In [None]:
# Returns two results; judging by later use, the second is keyed by institution display
# name and then year — TODO confirm both structures against the processor.
# NOTE(review): called on the class directly rather than via `processor` as in earlier cells.
institution_author_data, named_institution_author_data = OpenAlexProcessor.gather_institution_author_data(works)

In [None]:
# Display the whole structure. This may be very large; consider inspecting a subset.
institution_author_data

In [None]:
# Spot check: the entry stored for this institution display name in 2019.
named_institution_author_data['Google (United States)'][2019]

In [None]:
# Presumably reduces each institution's per-year author lists to per-year counts — TODO
# confirm. Later cells read a 'year' coordinate from each value.
institution_author_count = calculate_institution_author_count(institution_author_data)

In [None]:
# Display the per-institution counts.
institution_author_count

In [None]:
# Narrow down to the biggest institutions
# Peak author count per institution, used as the ranking key.
institution_max_author_count = {
  ins: np.max(author_counts)
  for ins, author_counts in institution_author_count.items()
}

# Keep the top `num_institutions` by peak count, in descending order.
num_institutions = 1
ranked_by_peak = sorted(
  institution_max_author_count.items(),
  key=lambda item: item[1],
  reverse=True,
)
biggest_institution_author_count = {
  ins: institution_author_count[ins]
  for ins, _ in ranked_by_peak[:num_institutions]
}

biggest_institution_author_count

In [None]:
# Choose specific institutions
handpicked_ins = [google_us_id, openai_id, deepmind_id]
# Look up each chosen ID in the counts; a missing key raises KeyError.
# Assumes the keys use the same full-URL form as the IDs above — TODO confirm.
handpicked_institution_author_count = {ins: institution_author_count[ins] for ins in handpicked_ins}

In [None]:
# One line-and-marker trace per handpicked institution, labelled with its entity name.
fig = go.Figure()
for institution_id, yearly_counts in handpicked_institution_author_count.items():
  trace = go.Scatter(
    x=yearly_counts['year'],
    y=yearly_counts,
    name=get_entity_name(institution_id),
    mode='lines+markers',
  )
  fig.add_trace(trace)

## Plot layout
# These tick settings only feed the default-layout call, which is currently disabled.
dates = np.arange(2010, 2022 + 1)
y_ticks = np.arange(0, 101, 10)
y_tick_text = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

# set_default_fig_layout(fig, dates, dates, y_ticks, y_tick_text)

# Titles and legend heading applied in a single layout update.
fig.update_layout(
  title='Number of unique authors on ML papers',
  xaxis_title='Year',
  yaxis_title='Number of unique authors',
  legend=dict(
    title="Affiliation with:"
  ),
)

## Save plot
# fig.write_image(result_file_location + 'num_authors_google_oai_dm.png', scale=2)

## Show plot
fig.show()

# [END]