In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import datetime
import kaleido
import numpy as np
from numpy.random import default_rng
import os
import plotly.graph_objects as go
import pyalex
from pyalex import Authors, Concepts, Institutions, Works
import pickle

from researcher_impact.authors import *
from researcher_impact.citations import *
from researcher_impact.plotting import *
from researcher_impact.processors import OpenAlexProcessor
from researcher_impact.pyalex_utils import *
from researcher_impact.regression import *
from researcher_impact.utils import *

In [3]:
# The polite pool has much faster and more consistent response times. To get into the polite pool, you set your email:
pyalex.config.email = "ben@epochai.org"

In [4]:
# Location to save data and results
data_file_location = 'data/'
os.makedirs(data_file_location, exist_ok=True)

result_file_location = 'results/'
os.makedirs(result_file_location, exist_ok=True)

Create a random number generator, with a fixed random seed for reproducibility

In [5]:
SEED = 20230105
rng = default_rng(seed=SEED)

In [6]:
processor = OpenAlexProcessor

# Implementation plan

MVP goal: plot of the number of AI researchers at leading institutions in AI research (dummy example below)


In [7]:
dates = np.arange(2010, 2022 + 1)
num_institutions = 5

fig = go.Figure()
for i in range(num_institutions):
    fig.add_trace(
        go.Scatter(
            x=dates,
              y=np.cumsum(np.random.randint(-5, 20, size=13)),
                name=f"Institution{i}",
        ),
    )

## Plot layout

cost_ticks = np.arange(0, 101, 10)
cost_tick_text = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

set_default_fig_layout(fig, dates, dates, cost_ticks, cost_tick_text)

fig.update_layout(
    title='[DEMO ONLY] Number of AI researchers at leading institutions in AI research',
    xaxis_title='Year',
    yaxis_title='Number of researchers',
)

## Save plot
# fig.write_image(result_file_location + 'demo.png', scale=2)

## Show plot
fig.show()

Assumed data to start with:

- A set of paper objects
- Each paper object has a list of author objects
- Each author is associated with zero or more institution objects

Intermediate result: dictionary
- Key: institution object
- Value: dictionary
  - Key: year
  - Value: list of author objects. Each author was affiliated with this institution in this year.

To get from start to intermediate result:
- Initialise dictionary `institution_author_data`
- For each paper
  - Store the year of publication `pub_year`
  - For each author `a`
    - For each institution `institution` associated with author `a`
    - `institution_author_data[institution][pub_year].append(a.name)`

Final result: dictionary
- Key: institution object
  - Minimum: string name of the institution
- Value: xarray <year, num_researchers>

# Field-Weighted Citation Impact experiments

In [7]:
sampled_works = merge_sample(
    Works() \
        .filter(publication_year=2018) \
        .filter(type='journal-article'),
    sample_size=1000,
    seed=535,
)
len(sampled_works)

1000

In [8]:
sampled_works = Works() \
    .filter(publication_year=2018) \
    .filter(type='journal-article') \
    .sample(10000, seed=535) \
    .get(per_page=200, page=2)
len(sampled_works)

200

In [9]:
# See https://openalex.org/I4210114444
meta_id = 'I4210114444'

In [10]:
meta_works = processor.get_institution_works(meta_id)

In [11]:
len(meta_works)

2214

In [14]:
# TODO use SEED constant once done experimenting
n = 1
test_fwcis = np.zeros(n)
for i in range(n):
    test_fwcis[i] = fwci(meta_works[:10], processor, seed=i)
np.mean(test_fwcis), np.std(test_fwcis)

Work: Deep learning (1 of 10)
Citation count: 9046
Concept: Artificial intelligence (1 of 5)
Expected citations from 1000 works: 5.094
Concept: Speech recognition (2 of 5)
Expected citations from 1000 works: 12.191
Concept: Organic chemistry (3 of 5)
Expected citations from 1000 works: 7.208
Concept: Epistemology (4 of 5)
Expected citations from 1000 works: 1.704
Concept: Law (5 of 5)
Expected citations from 1000 works: 2.143
Overall expected citation count: 3.4000609672207593
Work: Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks (2 of 10)
Citation count: 6019
Concept: Artificial intelligence (1 of 5)
Expected citations from 1000 works: 6.03
Concept: Algorithm (2 of 5)
Expected citations from 1000 works: 4.819
Concept: Information retrieval (3 of 5)
Expected citations from 1000 works: 2.579
Concept: Programming language (4 of 5)
Expected citations from 1000 works: 4.462
Concept: Embedded system (5 of 5)
Expected citations from 1000 works: 7.951
Overall ex

(892.3371781764876, 0.0)

In [12]:
Concepts().search('artificial intelligence').get()[0]

{'id': 'https://openalex.org/C154945302',
 'wikidata': 'https://www.wikidata.org/wiki/Q11660',
 'display_name': 'Artificial intelligence',
 'relevance_score': 509500.84,
 'level': 1,
 'description': 'field of computer science and engineering practices for intelligence demonstrated by machines and intelligent agents',
 'works_count': 13947420,
 'cited_by_count': 138732706,
 'summary_stats': {'2yr_mean_citedness': 1.6750187341614713,
  'h_index': 2191,
  'i10_index': 2210278},
 'ids': {'openalex': 'https://openalex.org/C154945302',
  'wikidata': 'https://www.wikidata.org/wiki/Q11660',
  'mag': '154945302',
  'wikipedia': 'https://en.wikipedia.org/wiki/Artificial%20intelligence',
  'umls_cui': ['C0003916']},
 'image_url': None,
 'image_thumbnail_url': None,
 'international': {'display_name': {'af': 'Kunsmatige intelligensie',
   'am': 'ሠው ሰራሽ ዕውቀት',
   'an': 'Intelichencia artificial',
   'ar': 'ذكاء اصطناعي',
   'arz': 'ذكاء صناعى',
   'as': 'কৃত্ৰিম বুদ্ধিমত্তা',
   'ast': 'intelixencia

In [None]:
Works().search("attention is all you need").get()[0]['concepts']

In [15]:
top_work = Works().filter(publication_year=2018).sort(cited_by_count="desc").get()[0]

In [16]:
top_work

{'id': 'https://openalex.org/W2889646458',
 'doi': 'https://doi.org/10.3322/caac.21492',
 'title': 'Global cancer statistics 2018: GLOBOCAN estimates of incidence and mortality worldwide for 36 cancers in 185 countries',
 'display_name': 'Global cancer statistics 2018: GLOBOCAN estimates of incidence and mortality worldwide for 36 cancers in 185 countries',
 'publication_year': 2018,
 'publication_date': '2018-11-01',
 'ids': {'openalex': 'https://openalex.org/W2889646458',
  'doi': 'https://doi.org/10.3322/caac.21492',
  'mag': '2889646458',
  'pmid': 'https://pubmed.ncbi.nlm.nih.gov/30207593'},
 'language': 'en',
 'primary_location': {'is_oa': True,
  'landing_page_url': 'https://doi.org/10.3322/caac.21492',
  'pdf_url': 'https://acsjournals.onlinelibrary.wiley.com/doi/pdfdirect/10.3322/caac.21492',
  'source': {'id': 'https://openalex.org/S126094547',
   'display_name': 'CA: A Cancer Journal for Clinicians',
   'issn_l': '0007-9235',
   'issn': ['1542-4863', '0007-9235'],
   'host_o

In [None]:
top_work['counts_by_year']

In [None]:
get_counts_in_first_years(top_work)

In [None]:
random_work = Works()["W2127841864"]

In [14]:
Works()["https://openalex.org/W2127841864"]

{'id': 'https://openalex.org/W2127841864',
 'doi': 'https://doi.org/10.1128/jcm.41.3.1339-1341.2003',
 'title': 'Septic Shock Caused by <i>Ochrobactrum anthropi</i> in an Otherwise Healthy Host',
 'display_name': 'Septic Shock Caused by <i>Ochrobactrum anthropi</i> in an Otherwise Healthy Host',
 'publication_year': 2003,
 'publication_date': '2003-03-01',
 'ids': {'openalex': 'https://openalex.org/W2127841864',
  'doi': 'https://doi.org/10.1128/jcm.41.3.1339-1341.2003',
  'mag': '2127841864',
  'pmid': 'https://pubmed.ncbi.nlm.nih.gov/12624082',
  'pmcid': 'https://www.ncbi.nlm.nih.gov/pmc/articles/150285'},
 'language': 'en',
 'primary_location': {'is_oa': False,
  'landing_page_url': 'https://doi.org/10.1128/jcm.41.3.1339-1341.2003',
  'pdf_url': None,
  'source': {'id': 'https://openalex.org/S53087977',
   'display_name': 'Journal of Clinical Microbiology',
   'issn_l': '0095-1137',
   'issn': ['1070-633X', '1098-660X', '0095-1137'],
   'host_organization': 'https://openalex.org/P4

In [None]:
random_work['counts_by_year']

In [None]:
random_work['publication_year']

In [None]:
get_counts_in_first_years(random_work)

In [None]:
random_work['concepts']

# Gather raw data

In [9]:
concept_ids = [
    'https://openalex.org/C154945302',  # Artificial intelligence
    'https://openalex.org/C119857082',  # Machine learning
]

In [1]:
selected_institution_ids = [
    "https://openalex.org/I1291425158",  # Google (US)
    "https://openalex.org/I4210090411",  # DeepMind
    "https://openalex.org/I4210161460",  # OpenAI
]

In [11]:
concept_query = "|".join(concept_ids)
concept_query

'https://openalex.org/C154945302|https://openalex.org/C119857082'

In [12]:
institution_query = "|".join(selected_institution_ids)
institution_query

'https://openalex.org/I1291425158|https://openalex.org/I4210090411|https://openalex.org/I4210161460'

In [30]:
# Filter to publications between 2010 and 2021 inclusive
publication_year_query = "|".join(str(y) for y in list(range(2010, 2019 + 1)))
publication_year_query

'2010|2011|2012|2013|2014|2015|2016|2017|2018|2019'

In [38]:
# Takes O(1 minute) just for Google (probably the biggest one) on Macbook Pro 2019
# works_instance = Works()
# n_max = 100000
# works = merge_pages(
#     works_instance \
#         .filter(authorships={"institutions": {"id": institution_query}}) \
#         .filter(concepts={"id": concept_query}) \
#         .filter(publication_year=publication_year_query) \
#         .paginate(per_page=200, n_max=n_max)
# )
# assert len(works) < n_max


In [43]:
# Save to avoid fetching every time
# with open(data_file_location + "selected_institution_works_openalex", "wb") as f:
#     obj = {
#         "params": works_instance.params,  # for reproducibility
#         "works": works,
#     }
#     pickle.dump(obj, f)

In [44]:
with open(data_file_location + "selected_institution_works_openalex", "rb") as f:
    works_obj = pickle.load(f)
    works = works_obj["works"]

# Author count

In [45]:
institution_author_data, named_institution_author_data = processor.get_institution_author_data(works, selected_institution_ids=selected_institution_ids)

In [46]:
institution_author_data

defaultdict(<function researcher_impact.processors.OpenAlexProcessor.get_institution_author_data.<locals>.<lambda>()>,
            {'https://openalex.org/I1291425158': defaultdict(set,
                         {2015: {'https://openalex.org/A102229238',
                           'https://openalex.org/A1111049960',
                           'https://openalex.org/A1124557692',
                           'https://openalex.org/A112736181',
                           'https://openalex.org/A114680377',
                           'https://openalex.org/A1171453863',
                           'https://openalex.org/A1193770850',
                           'https://openalex.org/A1431897541',
                           'https://openalex.org/A1509133722',
                           'https://openalex.org/A1570576689',
                           'https://openalex.org/A158011298',
                           'https://openalex.org/A168317965',
                           'https://openalex.org/A18989658

In [47]:
named_institution_author_data['Google (United States)'][2019]

{'Charbel El Kaed',
 'Jascha Sohl-Dickstein',
 'Marc Najork',
 'John W.B. Hershey',
 'Jean Pouget-Abadie',
 'Ashish V. Naik',
 'Tim Harley',
 'Vivek Rathod',
 'Greg S. Corrado',
 'Alexander Mordvintsev',
 'Dokook Choe',
 'Madhuri Duggirala',
 'P. Chang',
 'Orhan Firat',
 'Anton Geraschenko',
 'David Balduzzi',
 'Anthony Francis',
 'Jakob Uszkoreit',
 'Alex Beutel',
 'Zhe Zhao',
 'Siqi Liu',
 'Gregory Sizikov',
 'James Laudon',
 'Daniel M. Low',
 'Nikola Momchev',
 'Patrick O. Riley',
 'N. E. Shilin-Terentiev',
 'Arunachalam Narayanaswamy',
 'Effrosyni Kokiopoulou',
 'Guy Lever',
 'Hado van Hasselt',
 'Sadeep Jayasumana',
 'Bo Li',
 'Andy Ju An Wang',
 'Yi Wu',
 'Hao Zhang',
 'Tyler Zhu',
 'Sema Berkiten',
 'Matthew Henderson',
 'Nikolay Chirkov',
 'Daniel Holtmann-Rice',
 'Azin Ashkan',
 'Kaiyuan Wang',
 'Yiding Jiang',
 'Mark Rowland',
 'Naveen Ari',
 'Marius Pasca',
 'Chris Ying',
 'Zhifeng Chen',
 'Danila Sinopalnikov',
 'Hui Miao',
 'Chris Bregler',
 'Tibor Tihon',
 'Michael Terry'

In [48]:
institution_author_count = calculate_institution_author_count(institution_author_data)

In [49]:
institution_author_count

{'https://openalex.org/I1291425158': <xarray.DataArray (year: 10)>
 array([ 226,  295,  358,  379,  428,  577,  830, 1062, 1532, 2193])
 Coordinates:
   * year     (year) int64 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019,
 'https://openalex.org/I4210090411': <xarray.DataArray (year: 8)>
 array([  2,   4,   3,  38,  94, 133, 205, 247])
 Coordinates:
   * year     (year) int64 2011 2013 2014 2015 2016 2017 2018 2019,
 'https://openalex.org/I4210161460': <xarray.DataArray (year: 4)>
 array([ 1,  9, 15,  6])
 Coordinates:
   * year     (year) int64 2016 2017 2018 2019}

In [71]:
institution_cited_by_counts, institution_work_counts = processor.get_institution_counts(works, selected_institution_ids=selected_institution_ids)

In [72]:
institution_cited_by_counts

defaultdict(<function researcher_impact.processors.OpenAlexProcessor.get_institution_counts.<locals>.<lambda>()>,
            {'https://openalex.org/I1291425158': defaultdict(int,
                         {2015: 109504,
                          2017: 237352,
                          2016: 190391,
                          2014: 44306,
                          2018: 369410,
                          2013: 28640,
                          2011: 8330,
                          2012: 16380,
                          2019: 298609,
                          2010: 5381}),
             'https://openalex.org/I4210090411': defaultdict(int,
                         {2015: 64995,
                          2016: 95788,
                          2017: 90446,
                          2018: 65023,
                          2019: 78887,
                          2013: 194,
                          2014: 52,
                          2011: 6}),
             'https://openalex.org/I4210161460': defau

In [73]:
institution_work_counts

defaultdict(<function researcher_impact.processors.OpenAlexProcessor.get_institution_counts.<locals>.<lambda>()>,
            {'https://openalex.org/I1291425158': defaultdict(int,
                         {2015: 995,
                          2017: 2165,
                          2016: 1570,
                          2014: 675,
                          2018: 3873,
                          2013: 575,
                          2011: 394,
                          2012: 487,
                          2019: 5311,
                          2010: 335}),
             'https://openalex.org/I4210090411': defaultdict(int,
                         {2015: 42,
                          2016: 140,
                          2017: 245,
                          2018: 300,
                          2019: 404,
                          2013: 4,
                          2014: 3,
                          2011: 2}),
             'https://openalex.org/I4210161460': defaultdict(int,
                     

In [75]:
institution_avg_cited_by_counts = defaultdict(lambda: defaultdict(float))
for ins, cited_by_counts in institution_cited_by_counts.items():
    for year, cited_by_count in cited_by_counts.items():
        institution_avg_cited_by_counts[ins][year] = cited_by_count / institution_work_counts[ins][year]
institution_avg_cited_by_counts

defaultdict(<function __main__.<lambda>()>,
            {'https://openalex.org/I1291425158': defaultdict(float,
                         {2015: 110.05427135678391,
                          2017: 109.63140877598153,
                          2016: 121.26815286624203,
                          2014: 65.63851851851852,
                          2018: 95.38084172476117,
                          2013: 49.80869565217391,
                          2011: 21.14213197969543,
                          2012: 33.634496919917865,
                          2019: 56.22462813029561,
                          2010: 16.062686567164178}),
             'https://openalex.org/I4210090411': defaultdict(float,
                         {2015: 1547.5,
                          2016: 684.2,
                          2017: 369.1673469387755,
                          2018: 216.74333333333334,
                          2019: 195.2648514851485,
                          2013: 48.5,
                          2014: 

In [84]:
institution_cited_by_count_per_author = defaultdict(lambda: defaultdict(float))
for ins, cited_by_counts in institution_cited_by_counts.items():
    # print(f"{Institutions()[ins]['display_name']}")
    for year, cited_by_count in cited_by_counts.items():
        institution_cited_by_count_per_author[ins][year] = cited_by_count / institution_author_count[ins].sel(year=year)
institution_cited_by_count_per_author

defaultdict(<function __main__.<lambda>()>,
            {'https://openalex.org/I1291425158': defaultdict(float,
                         {2015: <xarray.DataArray ()>
                          array(189.78162912)
                          Coordinates:
                              year     int64 2015,
                          2017: <xarray.DataArray ()>
                          array(223.4952919)
                          Coordinates:
                              year     int64 2017,
                          2016: <xarray.DataArray ()>
                          array(229.38674699)
                          Coordinates:
                              year     int64 2016,
                          2014: <xarray.DataArray ()>
                          array(103.51869159)
                          Coordinates:
                              year     int64 2014,
                          2018: <xarray.DataArray ()>
                          array(241.12924282)
                          Coo

## Plots

In [50]:
fig = go.Figure()
for ins, author_counts in institution_author_count.items():
    fig.add_trace(
        go.Scatter(
            x=author_counts['year'],
            y=author_counts,
            name=get_entity_name(ins),
            mode='lines+markers',
        ),
    )

## Plot layout
dates = np.arange(2010, 2022 + 1)
y_ticks = np.arange(0, 101, 10)
y_tick_text = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

# set_default_fig_layout(fig, dates, dates, y_ticks, y_tick_text)

fig.update_layout(
    title='Number of unique authors on AI papers',
    xaxis_title='Year',
    yaxis_title='Number of unique authors',
)

fig.update_layout(
    legend=dict(
        title=">=1 author affiliated with:"
    )
)

fig.update_layout(
    autosize=False,
    width=800,
    height=600,
    title_x=0.5,
    margin=dict(l=100, r=30, t=80, b=80),
)

## Save plot
# fig.write_image(result_file_location + 'num_authors_google_oai_dm.png', scale=2)

## Show plot
fig.show()

In [70]:
fig = go.Figure()
for ins, cited_by_counts in institution_cited_by_counts.items():
    sorted_counts = sorted(cited_by_counts.items())
    years = list(zip(*sorted_counts))[0]
    counts = list(zip(*sorted_counts))[1]
    fig.add_trace(
        go.Scatter(
            x=years,
            y=counts,
            name=get_entity_name(ins),
            mode='lines+markers',
        ),
    )

## Plot layout
dates = np.arange(2010, 2022 + 1)
y_ticks = np.arange(0, 101, 10)
y_tick_text = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

# set_default_fig_layout(fig, dates, dates, y_ticks, y_tick_text)

fig.update_layout(
    title='Number of citations on AI papers',
    xaxis_title='Publication year',
    yaxis_title='Total citations 3 years after publication year',
)

fig.update_layout(
    legend=dict(
        title=">=1 author affiliated with:"
    )
)

fig.update_layout(
    autosize=False,
    width=800,
    height=600,
    title_x=0.5,
    margin=dict(l=100, r=30, t=80, b=80),
)

## Save plot
# fig.write_image(result_file_location + 'num_authors_google_oai_dm.png', scale=2)

## Show plot
fig.show()

In [77]:
fig = go.Figure()
for ins, cited_by_counts in institution_avg_cited_by_counts.items():
    sorted_counts = sorted(cited_by_counts.items())
    years = list(zip(*sorted_counts))[0]
    counts = list(zip(*sorted_counts))[1]
    fig.add_trace(
        go.Scatter(
            x=years,
            y=counts,
            name=get_entity_name(ins),
            mode='lines+markers',
        ),
    )

## Plot layout
dates = np.arange(2010, 2022 + 1)
y_ticks = np.arange(0, 101, 10)
y_tick_text = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

# set_default_fig_layout(fig, dates, dates, y_ticks, y_tick_text)

fig.update_layout(
    title='Number of citations on AI papers',
    xaxis_title='Publication year',
    yaxis_title='Average citations 3 years after publication year',
)

fig.update_layout(
    legend=dict(
        title=">=1 author affiliated with:"
    )
)

fig.update_layout(
    autosize=False,
    width=800,
    height=600,
    title_x=0.5,
    margin=dict(l=100, r=30, t=80, b=80),
)

## Save plot
# fig.write_image(result_file_location + 'num_authors_google_oai_dm.png', scale=2)

## Show plot
fig.show()

In [85]:
fig = go.Figure()
for ins, cited_by_counts in institution_cited_by_count_per_author.items():
    sorted_counts = sorted(cited_by_counts.items())
    years = list(zip(*sorted_counts))[0]
    counts = list(zip(*sorted_counts))[1]
    fig.add_trace(
        go.Scatter(
            x=years,
            y=counts,
            name=get_entity_name(ins),
            mode='lines+markers',
        ),
    )

## Plot layout
dates = np.arange(2010, 2022 + 1)
y_ticks = np.arange(0, 101, 10)
y_tick_text = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

# set_default_fig_layout(fig, dates, dates, y_ticks, y_tick_text)

fig.update_layout(
    title='Number of citations on AI papers',
    xaxis_title='Publication year',
    yaxis_title='Citations 3 years after publication year, per author',
)

fig.update_layout(
    legend=dict(
        title=">=1 author affiliated with:"
    )
)

fig.update_layout(
    autosize=False,
    width=800,
    height=600,
    title_x=0.5,
    margin=dict(l=100, r=30, t=80, b=80),
)

## Save plot
# fig.write_image(result_file_location + 'num_authors_google_oai_dm.png', scale=2)

## Show plot
fig.show()

# [END]