In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import datetime
import kaleido
import numpy as np
from numpy.random import default_rng
import os
import plotly.graph_objects as go
import pyalex
from pyalex import Authors, Concepts, Institutions, Works
import pickle

from researcher_impact.authors import *
from researcher_impact.citations import *
from researcher_impact.plotting import *
from researcher_impact.processors import OpenAlexProcessor
from researcher_impact.pyalex_utils import *
from researcher_impact.regression import *
from researcher_impact.utils import *

In [3]:
# The polite pool has much faster and more consistent response times. To get into the polite pool, you set your email:
pyalex.config.email = "ben@epochai.org"

In [4]:
# Location to save data and results
data_file_location = 'data/'
os.makedirs(data_file_location, exist_ok=True)

result_file_location = 'results/'
os.makedirs(result_file_location, exist_ok=True)

Create a random number generator, with a fixed random seed for reproducibility

In [5]:
SEED = 20230105
rng = default_rng(seed=SEED)

In [6]:
processor = OpenAlexProcessor

# Implementation plan

MVP goal: plot of the number of AI researchers at leading institutions in AI research (dummy example below)


In [12]:
dates = np.arange(2010, 2022 + 1)
num_institutions = 5

fig = go.Figure()
for i in range(num_institutions):
    fig.add_trace(
        go.Scatter(
            x=dates,
              y=np.cumsum(np.random.randint(-5, 20, size=13)),
                name=f"Institution{i}",
        ),
    )

## Plot layout

cost_ticks = np.arange(0, 101, 10)
cost_tick_text = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

set_default_fig_layout(fig, dates, dates, cost_ticks, cost_tick_text)

fig.update_layout(
    title='[DEMO ONLY] Number of AI researchers at leading institutions in AI research',
    xaxis_title='Year',
    yaxis_title='Number of researchers',
)

## Save plot
# fig.write_image(result_file_location + 'demo.png', scale=2)

## Show plot
fig.show()

Assumed data to start with:

- A set of paper objects
- Each paper object has a list of author objects
- Each author is associated with zero or more institution objects

Intermediate result: dictionary
- Key: institution object
- Value: dictionary
  - Key: year
  - Value: list of author objects. Each author was affiliated with this institution in this year.

To get from start to intermediate result:
- Initialise dictionary `institution_author_data`
- For each paper
  - Store the year of publication `pub_year`
  - For each author `a`
    - For each institution `institution` associated with author `a`
    - `institution_author_data[institution][pub_year].append(a.name)`

Final result: dictionary
- Key: institution object
  - Minimum: string name of the institution
- Value: xarray <year, num_researchers>

# Field-Weighted Citation Impact experiments

In [7]:
sampled_works = merge_sample(
    Works() \
        .filter(publication_year=2018) \
        .filter(type='journal-article'),
    sample_size=1000,
    seed=535,
)
len(sampled_works)

1000

In [8]:
sampled_works = Works() \
    .filter(publication_year=2018) \
    .filter(type='journal-article') \
    .sample(10000, seed=535) \
    .get(per_page=200, page=2)
len(sampled_works)

200

In [9]:
# See https://openalex.org/I4210114444
meta_id = 'I4210114444'

In [10]:
meta_works = processor.get_institution_works(meta_id)

In [11]:
len(meta_works)

2214

In [14]:
# TODO use SEED constant once done experimenting
n = 1
test_fwcis = np.zeros(n)
for i in range(n):
    test_fwcis[i] = fwci(meta_works[:10], processor, seed=i)
np.mean(test_fwcis), np.std(test_fwcis)

Work: Deep learning (1 of 10)
Citation count: 9046
Concept: Artificial intelligence (1 of 5)
Expected citations from 1000 works: 5.094
Concept: Speech recognition (2 of 5)
Expected citations from 1000 works: 12.191
Concept: Organic chemistry (3 of 5)
Expected citations from 1000 works: 7.208
Concept: Epistemology (4 of 5)
Expected citations from 1000 works: 1.704
Concept: Law (5 of 5)
Expected citations from 1000 works: 2.143
Overall expected citation count: 3.4000609672207593
Work: Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks (2 of 10)
Citation count: 6019
Concept: Artificial intelligence (1 of 5)
Expected citations from 1000 works: 6.03
Concept: Algorithm (2 of 5)
Expected citations from 1000 works: 4.819
Concept: Information retrieval (3 of 5)
Expected citations from 1000 works: 2.579
Concept: Programming language (4 of 5)
Expected citations from 1000 works: 4.462
Concept: Embedded system (5 of 5)
Expected citations from 1000 works: 7.951
Overall ex

(892.3371781764876, 0.0)

In [12]:
Concepts().search('artificial intelligence').get()[0]

{'id': 'https://openalex.org/C154945302',
 'wikidata': 'https://www.wikidata.org/wiki/Q11660',
 'display_name': 'Artificial intelligence',
 'relevance_score': 509500.84,
 'level': 1,
 'description': 'field of computer science and engineering practices for intelligence demonstrated by machines and intelligent agents',
 'works_count': 13947420,
 'cited_by_count': 138732706,
 'summary_stats': {'2yr_mean_citedness': 1.6750187341614713,
  'h_index': 2191,
  'i10_index': 2210278},
 'ids': {'openalex': 'https://openalex.org/C154945302',
  'wikidata': 'https://www.wikidata.org/wiki/Q11660',
  'mag': '154945302',
  'wikipedia': 'https://en.wikipedia.org/wiki/Artificial%20intelligence',
  'umls_cui': ['C0003916']},
 'image_url': None,
 'image_thumbnail_url': None,
 'international': {'display_name': {'af': 'Kunsmatige intelligensie',
   'am': 'ሠው ሰራሽ ዕውቀት',
   'an': 'Intelichencia artificial',
   'ar': 'ذكاء اصطناعي',
   'arz': 'ذكاء صناعى',
   'as': 'কৃত্ৰিম বুদ্ধিমত্তা',
   'ast': 'intelixencia

In [None]:
Works().search("attention is all you need").get()[0]['concepts']

In [15]:
top_work = Works().filter(publication_year=2018).sort(cited_by_count="desc").get()[0]

In [16]:
top_work

{'id': 'https://openalex.org/W2889646458',
 'doi': 'https://doi.org/10.3322/caac.21492',
 'title': 'Global cancer statistics 2018: GLOBOCAN estimates of incidence and mortality worldwide for 36 cancers in 185 countries',
 'display_name': 'Global cancer statistics 2018: GLOBOCAN estimates of incidence and mortality worldwide for 36 cancers in 185 countries',
 'publication_year': 2018,
 'publication_date': '2018-11-01',
 'ids': {'openalex': 'https://openalex.org/W2889646458',
  'doi': 'https://doi.org/10.3322/caac.21492',
  'mag': '2889646458',
  'pmid': 'https://pubmed.ncbi.nlm.nih.gov/30207593'},
 'language': 'en',
 'primary_location': {'is_oa': True,
  'landing_page_url': 'https://doi.org/10.3322/caac.21492',
  'pdf_url': 'https://acsjournals.onlinelibrary.wiley.com/doi/pdfdirect/10.3322/caac.21492',
  'source': {'id': 'https://openalex.org/S126094547',
   'display_name': 'CA: A Cancer Journal for Clinicians',
   'issn_l': '0007-9235',
   'issn': ['1542-4863', '0007-9235'],
   'host_o

In [None]:
top_work['counts_by_year']

In [None]:
get_counts_in_first_years(top_work)

In [None]:
random_work = Works()["W2127841864"]

In [14]:
Works()["https://openalex.org/W2127841864"]

{'id': 'https://openalex.org/W2127841864',
 'doi': 'https://doi.org/10.1128/jcm.41.3.1339-1341.2003',
 'title': 'Septic Shock Caused by <i>Ochrobactrum anthropi</i> in an Otherwise Healthy Host',
 'display_name': 'Septic Shock Caused by <i>Ochrobactrum anthropi</i> in an Otherwise Healthy Host',
 'publication_year': 2003,
 'publication_date': '2003-03-01',
 'ids': {'openalex': 'https://openalex.org/W2127841864',
  'doi': 'https://doi.org/10.1128/jcm.41.3.1339-1341.2003',
  'mag': '2127841864',
  'pmid': 'https://pubmed.ncbi.nlm.nih.gov/12624082',
  'pmcid': 'https://www.ncbi.nlm.nih.gov/pmc/articles/150285'},
 'language': 'en',
 'primary_location': {'is_oa': False,
  'landing_page_url': 'https://doi.org/10.1128/jcm.41.3.1339-1341.2003',
  'pdf_url': None,
  'source': {'id': 'https://openalex.org/S53087977',
   'display_name': 'Journal of Clinical Microbiology',
   'issn_l': '0095-1137',
   'issn': ['1070-633X', '1098-660X', '0095-1137'],
   'host_organization': 'https://openalex.org/P4

In [None]:
random_work['counts_by_year']

In [None]:
random_work['publication_year']

In [None]:
get_counts_in_first_years(random_work)

In [None]:
random_work['concepts']

# Gather raw data

In [7]:
concept_ids = [
    'https://openalex.org/C154945302',  # Artificial intelligence
    'https://openalex.org/C119857082',  # Machine learning
]

In [8]:
selected_institution_ids = [
    "https://openalex.org/I1291425158",  # Google (United States)
    "https://openalex.org/I4210090411",  # DeepMind
    "https://openalex.org/I4210161460",  # OpenAI
    "https://openalex.org/I1290206253",  # Microsoft (United States)
    # "https://openalex.org/I4210164937",  # Microsoft Research (United Kingdom)
    # "https://openalex.org/I2252078561",  # Meta (Israel)
    "https://openalex.org/I4210114444",  # Meta (United States)
    "https://openalex.org/I63966007",  # Massachusetts Institute of Technology
    # "https://openalex.org/I74973139",  # Carnegie Mellon University
    # "https://openalex.org/I20231570",  # Peking University
    "https://openalex.org/I19820366",  # Chinese Academy of Sciences
]

In [9]:
ins_results = Institutions().search("Baidu").get(per_page=200)
print(len(ins_results))
for ins in ins_results:
    print(f"{ins['id']} {ins['display_name']}: {ins['cited_by_count']} citations; {ins['works_count']} works")

1
https://openalex.org/I98301712 Baidu (China): 62429 citations; 3229 works


In [10]:
concept_query = "|".join(concept_ids)
concept_query

'https://openalex.org/C154945302|https://openalex.org/C119857082'

In [11]:
institution_query = "|".join(selected_institution_ids)
institution_query

'https://openalex.org/I1291425158|https://openalex.org/I4210090411|https://openalex.org/I4210161460|https://openalex.org/I1290206253|https://openalex.org/I4210114444|https://openalex.org/I63966007|https://openalex.org/I19820366'

In [12]:
# Filter to publications between 2010 and 2022 inclusive
publication_years = list(range(2010, 2022 + 1))
publication_year_query = "|".join(str(y) for y in publication_years)
publication_year_query

'2010|2011|2012|2013|2014|2015|2016|2017|2018|2019|2020|2021|2022'

In [13]:
# Took ~15 minutes for ~100K works on Macbook Pro 2019
# TODO 
# works_instance = Works()
# n_max = int(1e6)
# works = merge_pages(
#     works_instance \
#         .filter(authorships={"institutions": {"id": institution_query}}) \
#         .filter(concepts={"id": concept_query}) \
#         .filter(publication_year=publication_year_query) \
#         .paginate(per_page=200, n_max=n_max)
# )
# assert len(works) < n_max
# len(works)


In [14]:
# Save to avoid fetching every time
# timestamp = datetime.datetime.now()
# with open(data_file_location + f"selected_institution_works_openalex_{timestamp.strftime('%Y-%m-%d_%H-%M-%S')}", "wb") as f:
#     obj = {
#         "params": works_instance.params,  # for reproducibility
#         "works": works,
#     }
#     pickle.dump(obj, f)

In [15]:
with open(data_file_location + "selected_institution_works_openalex_2023-05-29_15-29-57", "rb") as f:
    works_obj = pickle.load(f)
    works = works_obj["works"]
len(works)

134178

# Data analysis

In [16]:
ai_concept = Concepts()["https://openalex.org/C154945302"]
ml_concept = Concepts()["https://openalex.org/C119857082"]
total_ai_citations = 0
for concept in [ai_concept]:
    for year_counts in concept['counts_by_year']:
        if year_counts['year'] in publication_years:
            print(year_counts['year'])
            total_ai_citations += year_counts['cited_by_count']
total_ai_citations

2022
2021
2020
2019
2018
2017
2016
2015
2014
2013
2012


92571279

In [17]:
ai_concept['cited_by_count']

139958704

In [18]:
len(works)

134178

In [19]:
ai_concept['works_count']

14013953

In [45]:
ai_concept_cited_by_count = xr.DataArray(np.zeros(len(publication_years)), dims=['year'], coords={'year': publication_years})
for year_counts in ai_concept['counts_by_year']:
    year = year_counts['year']
    if year in publication_years:
        ai_concept_cited_by_count.loc[year] = year_counts['cited_by_count']
ai_concept_cited_by_count

In [21]:
total_citations = defaultdict(int)
for work in works:
    total_citations[work['publication_year']] += work['cited_by_count']
total_citations

defaultdict(int,
            {2016: 482522,
             2015: 480272,
             2017: 524526,
             2014: 289259,
             2010: 200079,
             2018: 421530,
             2021: 153249,
             2013: 267281,
             2011: 220045,
             2012: 205949,
             2020: 277913,
             2019: 374526,
             2022: 29978})

In [22]:
institution_author_data, named_institution_author_data = processor.get_institution_author_data(works, selected_institution_ids=selected_institution_ids)

In [23]:
institution_author_data

defaultdict(<function researcher_impact.processors.OpenAlexProcessor.get_institution_author_data.<locals>.<lambda>()>,
            {'https://openalex.org/I4210114444': defaultdict(set,
                         {2015: {'https://openalex.org/A1152290301',
                           'https://openalex.org/A130200899',
                           'https://openalex.org/A1748740921',
                           'https://openalex.org/A2023078892',
                           'https://openalex.org/A2054002173',
                           'https://openalex.org/A2160372455',
                           'https://openalex.org/A2161837407',
                           'https://openalex.org/A2223933823',
                           'https://openalex.org/A2230757216',
                           'https://openalex.org/A2253370142',
                           'https://openalex.org/A2428339827',
                           'https://openalex.org/A2512114774',
                           'https://openalex.org/A2582

In [24]:
named_institution_author_data['Google (United States)'][2019]

{'Kaiwen Guo',
 'M. Zając',
 'Amelia Archer',
 'Alex Irpan',
 'Michael Krainin',
 'Peter Thomas Denny',
 'Wu Yonghui',
 'Yuhua Chen',
 'David F. Steiner',
 'Angeliki Lazaridou',
 'David S.P. Tan',
 'Hanxiao Liu',
 'Kenton Lee',
 'Luciano Sbaiz',
 'Petros Maniatis',
 'Ziping Zheng',
 'Amir Najmi',
 'Pierre-Antoine Manzagol',
 'Ryan Poplin',
 'Marc Lanctot',
 'Lala Li',
 'Ruofei Du',
 'Eva Schlinger',
 'Aaron D. Wilson',
 'Hauke Heibel',
 'Natasha Noy',
 'GollapudiSreenivas',
 'Junfeng He',
 'Yukun Zhu',
 'Lasse Espeholt',
 'Madhuri Duggirala',
 'Greg Wayne',
 'Anand Rajagopalan',
 'Felicia S. C. Lim',
 'Kyle Gorman',
 'Ben Laurie',
 'Keqin Gu',
 'Ameya Velingker',
 'Om Thakkar',
 'Pei-Yu Peggy Chi',
 'Travis Wolfe',
 'Siqi Liu',
 'Jaeyoung Kim',
 'Han Zhang',
 'Pranjal Awasthi',
 'Nicolas Carlini',
 'Adarsh Kowdle',
 'Tom Schaul',
 'Basilio Garcia',
 'Itay Laish',
 'Vidhya Navalpakkam',
 'Philip L. Quinn',
 'Ron Weiss',
 'Hugo Larochelle',
 'Yian Zhou',
 'Daniel S. Park',
 'Ming Jack Po

In [25]:
institution_author_count = calculate_institution_author_count(institution_author_data)

In [26]:
institution_author_count

{'https://openalex.org/I4210114444': <xarray.DataArray (year: 13)>
 array([  6,  24,  21,  17,  46,  43,  27,  80, 136, 217, 432, 509, 217])
 Coordinates:
   * year     (year) int64 2010 2011 2012 2013 2014 ... 2018 2019 2020 2021 2022,
 'https://openalex.org/I1291425158': <xarray.DataArray (year: 13)>
 array([ 227,  296,  359,  379,  428,  580,  831, 1062, 1532, 2193, 2428,
        2480, 1575])
 Coordinates:
   * year     (year) int64 2010 2011 2012 2013 2014 ... 2018 2019 2020 2021 2022,
 'https://openalex.org/I63966007': <xarray.DataArray (year: 13)>
 array([1498, 1501, 1508, 1526, 1781, 1848, 1887, 2159, 2517, 2751, 3024,
        3144, 2674])
 Coordinates:
   * year     (year) int64 2010 2011 2012 2013 2014 ... 2018 2019 2020 2021 2022,
 'https://openalex.org/I4210090411': <xarray.DataArray (year: 11)>
 array([  2,   4,   3,  38,  94, 133, 205, 247, 314, 345, 281])
 Coordinates:
   * year     (year) int64 2011 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022,
 'https://openalex.or

In [27]:
institution_cited_by_counts, institution_work_counts = processor.get_institution_counts(works, selected_institution_ids=selected_institution_ids)

In [28]:
institution_cited_by_counts

defaultdict(<function researcher_impact.processors.OpenAlexProcessor.get_institution_counts.<locals>.<lambda>()>,
            {'https://openalex.org/I4210114444': defaultdict(int,
                         {2015: 15284,
                          2017: 22203,
                          2013: 804,
                          2014: 6104,
                          2020: 37134,
                          2021: 13207,
                          2011: 2657,
                          2019: 8329,
                          2018: 10086,
                          2012: 585,
                          2016: 1150,
                          2010: 17,
                          2022: 578}),
             'https://openalex.org/I1291425158': defaultdict(int,
                         {2015: 109533,
                          2017: 237194,
                          2016: 190377,
                          2014: 44304,
                          2018: 369442,
                          2013: 28639,
                    

In [29]:
institution_work_counts

defaultdict(<function researcher_impact.processors.OpenAlexProcessor.get_institution_counts.<locals>.<lambda>()>,
            {'https://openalex.org/I4210114444': defaultdict(int,
                         {2015: 70,
                          2017: 108,
                          2013: 20,
                          2014: 53,
                          2020: 605,
                          2021: 677,
                          2011: 29,
                          2019: 278,
                          2018: 202,
                          2012: 26,
                          2016: 32,
                          2010: 7,
                          2022: 230}),
             'https://openalex.org/I1291425158': defaultdict(int,
                         {2015: 999,
                          2017: 2164,
                          2016: 1571,
                          2014: 675,
                          2018: 3872,
                          2013: 575,
                          2011: 395,
                 

In [30]:
institution_works_per_author = defaultdict(lambda: defaultdict(float))
for ins, author_counts in institution_author_count.items():
    for year, count in zip(author_counts.coords['year'].values, author_counts.values):
        institution_works_per_author[ins][year] = institution_work_counts[ins][year] / count
institution_works_per_author

defaultdict(<function __main__.<lambda>()>,
            {'https://openalex.org/I4210114444': defaultdict(float,
                         {2010: 1.1666666666666667,
                          2011: 1.2083333333333333,
                          2012: 1.2380952380952381,
                          2013: 1.1764705882352942,
                          2014: 1.1521739130434783,
                          2015: 1.627906976744186,
                          2016: 1.1851851851851851,
                          2017: 1.35,
                          2018: 1.4852941176470589,
                          2019: 1.2811059907834101,
                          2020: 1.400462962962963,
                          2021: 1.3300589390962672,
                          2022: 1.0599078341013826}),
             'https://openalex.org/I1291425158': defaultdict(float,
                         {2010: 1.4801762114537445,
                          2011: 1.3344594594594594,
                          2012: 1.3593314763231197,
  

In [31]:
institution_avg_cited_by_counts = defaultdict(lambda: defaultdict(float))
for ins, cited_by_counts in institution_cited_by_counts.items():
    for year, cited_by_count in cited_by_counts.items():
        institution_avg_cited_by_counts[ins][year] = cited_by_count / institution_work_counts[ins][year]
institution_avg_cited_by_counts

defaultdict(<function __main__.<lambda>()>,
            {'https://openalex.org/I4210114444': defaultdict(float,
                         {2015: 218.34285714285716,
                          2017: 205.58333333333334,
                          2013: 40.2,
                          2014: 115.16981132075472,
                          2020: 61.37851239669421,
                          2021: 19.50812407680945,
                          2011: 91.62068965517241,
                          2019: 29.96043165467626,
                          2018: 49.93069306930693,
                          2012: 22.5,
                          2016: 35.9375,
                          2010: 2.4285714285714284,
                          2022: 2.5130434782608697}),
             'https://openalex.org/I1291425158': defaultdict(float,
                         {2015: 109.64264264264264,
                          2017: 109.6090573012939,
                          2016: 121.18204964990451,
                          2014:

In [32]:
institution_cited_by_count_per_author = defaultdict(lambda: defaultdict(float))
for ins, cited_by_counts in institution_cited_by_counts.items():
    # print(f"{Institutions()[ins]['display_name']}")
    for year, cited_by_count in cited_by_counts.items():
        institution_cited_by_count_per_author[ins][year] = cited_by_count / institution_author_count[ins].sel(year=year)
institution_cited_by_count_per_author

defaultdict(<function __main__.<lambda>()>,
            {'https://openalex.org/I4210114444': defaultdict(float,
                         {2015: <xarray.DataArray ()>
                          array(355.44186047)
                          Coordinates:
                              year     int64 2015,
                          2017: <xarray.DataArray ()>
                          array(277.5375)
                          Coordinates:
                              year     int64 2017,
                          2013: <xarray.DataArray ()>
                          array(47.29411765)
                          Coordinates:
                              year     int64 2013,
                          2014: <xarray.DataArray ()>
                          array(132.69565217)
                          Coordinates:
                              year     int64 2014,
                          2020: <xarray.DataArray ()>
                          array(85.95833333)
                          Coordina

In [33]:
institution_avg_cited_by_count_per_author = defaultdict(lambda: defaultdict(float))
for ins, cited_by_counts in institution_avg_cited_by_counts.items():
    for year, cited_by_count in cited_by_counts.items():
        institution_avg_cited_by_count_per_author[ins][year] = cited_by_count / institution_author_count[ins].sel(year=year)
institution_avg_cited_by_count_per_author

defaultdict(<function __main__.<lambda>()>,
            {'https://openalex.org/I4210114444': defaultdict(float,
                         {2015: <xarray.DataArray ()>
                          array(5.07774086)
                          Coordinates:
                              year     int64 2015,
                          2017: <xarray.DataArray ()>
                          array(2.56979167)
                          Coordinates:
                              year     int64 2017,
                          2013: <xarray.DataArray ()>
                          array(2.36470588)
                          Coordinates:
                              year     int64 2013,
                          2014: <xarray.DataArray ()>
                          array(2.50369155)
                          Coordinates:
                              year     int64 2014,
                          2020: <xarray.DataArray ()>
                          array(0.14207989)
                          Coordinates:

## Regression

## Citation-weighted publications in a year, as a function of cumulative citation-weighted publications in the whole field, and the number of researchers in major corporations

I will define the "number of researchers in major corporations" as the total number of authors across the selected institutions, in each year.
So if you were affiliated with one of the selected institutions on a work that was published in a given year, then you are counted in the "number of researchers in major corporations" for that year.

In [35]:
total_author_counts = xr.DataArray(np.zeros(len(publication_years)), dims=['year'], coords={'year': publication_years})
for author_counts in institution_author_count.values():
    print(author_counts)
    for count in author_counts:
        total_author_counts.loc[count.year] += count
        # print(year, count)
        # total_author_counts += author_counts
    print(total_author_counts)
    print("====")

<xarray.DataArray (year: 13)>
array([  6,  24,  21,  17,  46,  43,  27,  80, 136, 217, 432, 509, 217])
Coordinates:
  * year     (year) int64 2010 2011 2012 2013 2014 ... 2018 2019 2020 2021 2022
<xarray.DataArray (year: 13)>
array([  6.,  24.,  21.,  17.,  46.,  43.,  27.,  80., 136., 217., 432.,
       509., 217.])
Coordinates:
  * year     (year) int64 2010 2011 2012 2013 2014 ... 2018 2019 2020 2021 2022
====
<xarray.DataArray (year: 13)>
array([ 227,  296,  359,  379,  428,  580,  831, 1062, 1532, 2193, 2428,
       2480, 1575])
Coordinates:
  * year     (year) int64 2010 2011 2012 2013 2014 ... 2018 2019 2020 2021 2022
<xarray.DataArray (year: 13)>
array([ 233.,  320.,  380.,  396.,  474.,  623.,  858., 1142., 1668.,
       2410., 2860., 2989., 1792.])
Coordinates:
  * year     (year) int64 2010 2011 2012 2013 2014 ... 2018 2019 2020 2021 2022
====
<xarray.DataArray (year: 13)>
array([1498, 1501, 1508, 1526, 1781, 1848, 1887, 2159, 2517, 2751, 3024,
       3144, 2674])
Coordinate

In [80]:
input_start_year = publication_years[2]
input_end_year = publication_years[-2]
output_start_year = publication_years[3]
output_end_year = publication_years[-1]

In [89]:
num_researchers = total_author_counts.loc[input_start_year:input_end_year]
num_researchers = num_researchers.expand_dims(dim={'feature': ['num_researchers']}, axis=1)
num_researchers

Defining the "whole field" as just the set of works affiliated with the selected institutions.
Setting the citation weighting to just be the number of citations.

In [91]:
field_citation_weighted_works = ai_concept_cited_by_count.loc[input_start_year:input_end_year]
field_citation_weighted_works = np.cumsum(field_citation_weighted_works)
field_citation_weighted_works = field_citation_weighted_works.expand_dims(dim={'feature': ['field_citation_weighted_works']}, axis=1)
field_citation_weighted_works

In [99]:
design_matrix = xr.concat([field_citation_weighted_works, num_researchers], dim='feature')
design_matrix

In [100]:
citation_weighted_works = ai_concept_cited_by_count.loc[output_start_year:output_end_year]
citation_weighted_works

In [101]:
est = fit_linear_regression(np.log10(design_matrix.data), np.log10(citation_weighted_works.data))
est.summary()


kurtosistest only valid for n>=20 ... continuing anyway, n=10



0,1,2,3
Dep. Variable:,y,R-squared:,0.966
Model:,OLS,Adj. R-squared:,0.956
Method:,Least Squares,F-statistic:,98.13
Date:,"Wed, 31 May 2023",Prob (F-statistic):,7.58e-06
Time:,15:07:38,Log-Likelihood:,23.595
No. Observations:,10,AIC:,-41.19
Df Residuals:,7,BIC:,-40.28
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.0563,0.205,19.769,0.000,3.571,4.541
x1,-0.0012,0.060,-0.020,0.985,-0.144,0.142
x2,0.7216,0.127,5.680,0.001,0.421,1.022

0,1,2,3
Omnibus:,0.231,Durbin-Watson:,1.687
Prob(Omnibus):,0.891,Jarque-Bera (JB):,0.276
Skew:,-0.264,Prob(JB):,0.871
Kurtosis:,2.381,Cond. No.,214.0


In [126]:
citation_weighted_works_preds = predict(est, np.log10(design_matrix))
citation_weighted_works_preds

Unnamed: 0,mean,mean_se,mean_ci_lower,mean_ci_upper,obs_ci_lower,obs_ci_upper
0,6.769797,0.02437,6.71217,6.827424,6.683227,6.856367
1,6.784015,0.013437,6.752242,6.815787,6.712022,6.856008
2,6.828334,0.011192,6.801869,6.8548,6.758521,6.898148
3,6.842162,0.013346,6.810605,6.87372,6.770264,6.914061
4,6.871581,0.013824,6.838893,6.904269,6.799179,6.943983
5,6.916887,0.011965,6.888595,6.945179,6.846361,6.987413
6,6.970121,0.010423,6.945475,6.994768,6.900977,7.039266
7,7.033712,0.011697,7.006054,7.06137,6.963438,7.103986
8,7.077217,0.014201,7.043637,7.110798,7.004409,7.150026
9,7.140393,0.019494,7.094296,7.186489,7.061031,7.219755


In [138]:
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=num_researchers['year'],
        y=np.log10(num_researchers.squeeze()),
        name='log R: number of researchers',
        mode='lines+markers',
    ),
)
fig.add_trace(
    go.Scatter(
        x=field_citation_weighted_works['year'],
        y=np.log10(field_citation_weighted_works.squeeze()),
        name='log A: citation-weighted works (cumulative)',
        mode='lines+markers',
    ),
)
fig.add_trace(
    go.Scatter(
        x=citation_weighted_works['year'],
        y=np.log10(citation_weighted_works.squeeze()),
        name='log \dot A: citation-weighted works (this year)',
        mode='lines+markers',
    ),
)
fig.add_trace(
    go.Scatter(
        x=citation_weighted_works['year'],
        y=citation_weighted_works_preds['mean'],
        name=f'Predicted: log \dot A = {est.params[0]:.2f} + {est.params[1]:.2f} log A + {est.params[2]:.2f} log R',
        mode='lines+markers',
    ),
)

## Plot layout
dates = np.arange(2010, 2022 + 1)
y_ticks = np.arange(0, 101, 10)
y_tick_text = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

# set_default_fig_layout(fig, dates, dates, y_ticks, y_tick_text)

fig.update_layout(
    # title='Number of unique authors on AI papers',
    xaxis_title='Year',
    # yaxis_title='Number of unique authors',
)
fig.update_layout(
    autosize=False,
    width=800,
    height=600,
    title_x=0.5,
    margin=dict(l=100, r=30, t=80, b=80),
)

## Save plot
# fig.write_image(result_file_location + 'num_authors_google_oai_dm.png', scale=2)

## Show plot
fig.show()

## Plots

### Field authors

In [38]:
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=total_author_counts['year'],
        y=np.log10(total_author_counts),
        name=get_entity_name(ins),
        mode='lines+markers',
    ),
)

## Plot layout
dates = np.arange(2010, 2022 + 1)
y_ticks = np.arange(0, 101, 10)
y_tick_text = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

# set_default_fig_layout(fig, dates, dates, y_ticks, y_tick_text)

fig.update_layout(
    title='Number of unique authors on AI papers',
    xaxis_title='Year',
    yaxis_title='Number of unique authors',
)

fig.update_layout(
    legend=dict(
        title=">=1 author affiliated with:"
    )
)

fig.update_layout(
    autosize=False,
    width=800,
    height=600,
    title_x=0.5,
    margin=dict(l=100, r=30, t=80, b=80),
)

## Save plot
# fig.write_image(result_file_location + 'num_authors_google_oai_dm.png', scale=2)

## Show plot
fig.show()

### Field authors

In [46]:
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=ai_concept_cited_by_count.loc[ai_concept_cited_by_count != 0]['year'],
        y=np.log10(np.cumsum(ai_concept_cited_by_count.loc[ai_concept_cited_by_count != 0])),
        mode='lines+markers',
    ),
)

## Plot layout

fig.update_layout(
    title='Cumulative citations for AI works',
    xaxis_title='Year',
    yaxis_title='Cumulative citations (log10)',
)

fig.update_layout(
    autosize=False,
    width=800,
    height=600,
    title_x=0.5,
    margin=dict(l=100, r=30, t=80, b=80),
)

## Save plot
fig.write_image(result_file_location + 'ai_citations.png', scale=2)

## Show plot
fig.show()

### Authors

In [228]:
fig = go.Figure()
for ins, author_counts in institution_author_count.items():
    fig.add_trace(
        go.Scatter(
            x=author_counts['year'],
            y=np.log10(author_counts),
            name=get_entity_name(ins),
            mode='lines+markers',
        ),
    )

## Plot layout
dates = np.arange(2010, 2022 + 1)
y_ticks = np.arange(0, 101, 10)
y_tick_text = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

# set_default_fig_layout(fig, dates, dates, y_ticks, y_tick_text)

fig.update_layout(
    title='Number of unique authors on AI/ML works',
    xaxis_title='Year',
    yaxis_title='Number of unique authors (log10)',
)

fig.update_layout(
    legend=dict(
        title="At least one author affiliated with:"
    )
)

fig.update_layout(
    autosize=False,
    width=800,
    height=600,
    title_x=0.5,
    margin=dict(l=100, r=30, t=80, b=80),
)

## Save plot
fig.write_image(result_file_location + 'num_authors_log10.png', scale=2)

## Show plot
fig.show()

In [229]:
total_author_counts = xr.DataArray(np.zeros(13), dims=['year'], coords={'year': np.arange(2010, 2022 + 1)})
for author_counts in institution_author_count.values():
    print(author_counts)
    for count in author_counts:
        total_author_counts.loc[count.year] += count
        # print(year, count)
        # total_author_counts += author_counts
    print(total_author_counts)
    print("====")

<xarray.DataArray (year: 13)>
array([  6,  24,  21,  17,  46,  43,  27,  80, 136, 217, 432, 509, 217])
Coordinates:
  * year     (year) int64 2010 2011 2012 2013 2014 ... 2018 2019 2020 2021 2022
<xarray.DataArray (year: 13)>
array([  6.,  24.,  21.,  17.,  46.,  43.,  27.,  80., 136., 217., 432.,
       509., 217.])
Coordinates:
  * year     (year) int64 2010 2011 2012 2013 2014 ... 2018 2019 2020 2021 2022
====
<xarray.DataArray (year: 13)>
array([ 227,  296,  359,  379,  428,  580,  831, 1062, 1532, 2193, 2428,
       2480, 1575])
Coordinates:
  * year     (year) int64 2010 2011 2012 2013 2014 ... 2018 2019 2020 2021 2022
<xarray.DataArray (year: 13)>
array([ 233.,  320.,  380.,  396.,  474.,  623.,  858., 1142., 1668.,
       2410., 2860., 2989., 1792.])
Coordinates:
  * year     (year) int64 2010 2011 2012 2013 2014 ... 2018 2019 2020 2021 2022
====
<xarray.DataArray (year: 13)>
array([1498, 1501, 1508, 1526, 1781, 1848, 1887, 2159, 2517, 2751, 3024,
       3144, 2674])
Coordinate

In [231]:
fig = go.Figure()
for ins, cited_by_counts in institution_works_per_author.items():
    sorted_counts = sorted(cited_by_counts.items())
    years = list(zip(*sorted_counts))[0]
    counts = list(zip(*sorted_counts))[1]
    fig.add_trace(
        go.Scatter(
            x=years,
            y=counts,
            name=get_entity_name(ins),
            mode='lines+markers',
        ),
    )

## Plot layout
dates = np.arange(2010, 2022 + 1)
y_ticks = np.arange(0, 101, 10)
y_tick_text = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

# set_default_fig_layout(fig, dates, dates, y_ticks, y_tick_text)

fig.update_layout(
    title='Number of AI/ML works per author',
    xaxis_title='Publication year',
    yaxis_title='Average works per affiliated author',
)

fig.update_layout(
    legend=dict(
        title=">=1 author affiliated with:"
    )
)

fig.update_layout(
    autosize=False,
    width=800,
    height=600,
    title_x=0.5,
    margin=dict(l=100, r=30, t=80, b=80),
)

## Save plot
fig.write_image(result_file_location + 'works_per_author.png', scale=2)

## Show plot
fig.show()

### Citations

In [232]:
fig = go.Figure()
for ins, cited_by_counts in institution_cited_by_counts.items():
    sorted_counts = sorted(cited_by_counts.items())
    years = list(zip(*sorted_counts))[0]
    counts = list(zip(*sorted_counts))[1]
    fig.add_trace(
        go.Scatter(
            x=years,
            y=np.log10(counts),
            name=get_entity_name(ins),
            mode='lines+markers',
        ),
    )

## Plot layout
dates = np.arange(2010, 2022 + 1)
y_ticks = np.arange(0, 101, 10)
y_tick_text = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

# set_default_fig_layout(fig, dates, dates, y_ticks, y_tick_text)

fig.update_layout(
    title='Number of citations on AI/ML works',
    xaxis_title='Publication year',
    yaxis_title='Total citations in next 3 years (log10)',
)

fig.update_layout(
    legend=dict(
        title=">=1 author affiliated with:"
    )
)

fig.update_layout(
    autosize=False,
    width=800,
    height=600,
    title_x=0.5,
    margin=dict(l=100, r=30, t=80, b=80),
)

## Save plot
fig.write_image(result_file_location + 'citations_log10.png', scale=2)

## Show plot
fig.show()

### Citations per work

In [233]:
fig = go.Figure()
for ins, cited_by_counts in institution_avg_cited_by_counts.items():
    sorted_counts = sorted(cited_by_counts.items())
    years = list(zip(*sorted_counts))[0]
    counts = list(zip(*sorted_counts))[1]
    fig.add_trace(
        go.Scatter(
            x=years,
            y=np.log10(counts),
            name=get_entity_name(ins),
            mode='lines+markers',
        ),
    )

## Plot layout
dates = np.arange(2010, 2022 + 1)
y_ticks = np.arange(0, 101, 10)
y_tick_text = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

# set_default_fig_layout(fig, dates, dates, y_ticks, y_tick_text)

fig.update_layout(
    title='Average number of citations on AI/ML works',
    xaxis_title='Publication year',
    yaxis_title='Average citations in next 3 years (log10)',
)

fig.update_layout(
    legend=dict(
        title=">=1 author affiliated with:"
    )
)

fig.update_layout(
    autosize=False,
    width=800,
    height=600,
    title_x=0.5,
    margin=dict(l=100, r=30, t=80, b=80),
)

## Save plot
fig.write_image(result_file_location + 'avg_citations_log10.png', scale=2)

## Show plot
fig.show()

### Citations per author

In [234]:
fig = go.Figure()
for ins, cited_by_counts in institution_cited_by_count_per_author.items():
    sorted_counts = sorted(cited_by_counts.items())
    years = list(zip(*sorted_counts))[0]
    counts = list(zip(*sorted_counts))[1]
    fig.add_trace(
        go.Scatter(
            x=years,
            y=np.log10(counts),
            name=get_entity_name(ins),
            mode='lines+markers',
        ),
    )

## Plot layout
dates = np.arange(2010, 2022 + 1)
y_ticks = np.arange(0, 101, 10)
y_tick_text = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

# set_default_fig_layout(fig, dates, dates, y_ticks, y_tick_text)

fig.update_layout(
    title='Citations per author on AI/ML works',
    xaxis_title='Publication year',
    yaxis_title='Citations in next 3 years, per affiliated author (log10)',
)

fig.update_layout(
    legend=dict(
        title=">=1 author affiliated with:"
    )
)

fig.update_layout(
    autosize=False,
    width=800,
    height=600,
    title_x=0.5,
    margin=dict(l=100, r=30, t=80, b=80),
)

## Save plot
fig.write_image(result_file_location + 'citations_per_author_log10.png', scale=2)

## Show plot
fig.show()

### Citations per work per author

Note: this is the average number of citations per work, divided by the TOTAL number of authors affiliated with the given organization on all works in the given year.

It is NOT dividing by the number of authors on _each_ work.

In [235]:
fig = go.Figure()
for ins, cited_by_counts in institution_avg_cited_by_count_per_author.items():
    sorted_counts = sorted(cited_by_counts.items())
    years = list(zip(*sorted_counts))[0]
    counts = list(zip(*sorted_counts))[1]
    fig.add_trace(
        go.Scatter(
            x=years,
            y=np.log10(counts),
            name=get_entity_name(ins),
            mode='lines+markers',
        ),
    )

## Plot layout
dates = np.arange(2010, 2022 + 1)
y_ticks = np.arange(0, 101, 10)
y_tick_text = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

# set_default_fig_layout(fig, dates, dates, y_ticks, y_tick_text)

fig.update_layout(
    title='Average citations per author on AI/ML works',
    xaxis_title='Publication year',
    yaxis_title='Average citations in next 3 years, per affiliated author (log10)',
)

fig.update_layout(
    legend=dict(
        title="At least one author affiliated with:"
    )
)

fig.update_layout(
    autosize=False,
    width=800,
    height=600,
    title_x=0.5,
    margin=dict(l=100, r=30, t=80, b=80),
)

## Save plot
fig.write_image(result_file_location + 'avg_citations_per_author_log10.png', scale=2)

## Show plot
fig.show()

# [END]