In [1]:
%load_ext autoreload
%autoreload 2

In [110]:
from collections import defaultdict
import datetime
import numpy as np
import os
import pandas as pd
import pickle
import plotly
import plotly.graph_objects as go
import plotly.io as pio
import pyalex
from pyalex import Authors, Concepts, Institutions, Works
from tqdm.notebook import tqdm

from researcher_impact.citations import *
from researcher_impact.plotting import *
from researcher_impact.processors import OpenAlexProcessor
from researcher_impact.pyalex_utils import *
from researcher_impact.regression import *
from researcher_impact.utils import *

In [3]:
# The polite pool has much faster and more consistent response times. To get into the polite pool, you set your email.
# The address is read from the OPENALEX_EMAIL environment variable so that each user can supply
# their own; when the variable is unset, the original default address is used.
pyalex.config.email = os.environ.get("OPENALEX_EMAIL", "ben@epochai.org")

In [4]:
pio.templates.default = "plotly_white"

In [5]:
# Where downloaded/cached data and generated results are written (created if missing)
data_file_location = 'data/'
result_file_location = 'results/'
for output_dir in (data_file_location, result_file_location):
    os.makedirs(output_dir, exist_ok=True)

# PCD database

In [62]:
# Download dataset from the Parameters, Compute and Data Trends in ML sheet
# NOTE(review): this reads the live Google Sheet, so `notable_df` (its rows and its length)
# can differ between runs — consider saving a dated snapshot for reproducibility.
sheet_id = '1AAIebjNsnJj_uKALHbXNfn3_YsT6sHXtCU0q7OIPuc4'
# CSV export URL for the sheet; the (URL-encoded) tab name is appended below
data_url = f'https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet='
notable_df = pd.read_csv(data_url + 'NOTABLE%20ML%20SYSTEMS')

In [63]:
notable_df

Unnamed: 0,System,Domain,Task,Organization,Organization Categorization,Authors,Publication date,Reference,Link,Citations,...,Training time notes,Training hardware,Approach,Training compute cost (2020 USD),Compute cost notes,Self-supervised training,Compute Sponsor Categorization,Epistemic status,Abstract,Last Modified
0,Gen-2,Text-to-Video,Video generation,Runway,Industry,,2023-12-31,,https://research.runwayml.com/gen2,,...,,,,,,,,Unverified,,2023-06-15 19:55:47
1,ERNIE 3.5,Language,Language modelling,Baidu,Industry,,2023-06-27,Introducing ERNIE 3.5: Baidu’s Knowledge-Enhan...,http://research.baidu.com/Blog/index-view?id=185,,...,,,,,,,,Unverified,,2023-07-05 16:08:00
2,Inflection-1,Language,Language modelling,Inflection AI,Industry,,2023-06-23,Inflection-1 technical memo,https://inflection.ai/assets/Inflection-1.pdf,,...,,NVIDIA H100 SXM5,,,,,Industry,Speculative,Large language models (LLMs) based on the Tran...,2023-06-27 15:14:23
3,RoboCat,Robotics,,"Google DeepMind,Google",Industry,"Konstantinos Bousmalis, Giulia Vezzani, Dushya...",2023-06-20,RoboCat: A Self-Improving Foundation Agent for...,https://arxiv.org/abs/2306.11706,,...,,,,,,,Industry,Speculative,The ability to leverage heterogeneous robotic ...,2023-08-10 15:22:11
4,PaLM 2,Language,Language modelling,Google,Industry,"Andrew M. Dai, David R. So, Dmitry Lepikhin, J...",2023-05-10,PaLM 2 Technical Report,https://ai.google/static/documents/palm2techre...,,...,,,,,PaLM 2 was trained on TPU v4 according to the ...,,Industry,,"We introduce PaLM 2, a new state-of-the-art la...",2023-08-10 15:21:27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
384,,Language,Speech recognition,Google,Industry,"Daniel S. Park, William Chan, Yu Zhang, Chung...",2019-04-18,SpecAugment: A Simple Data Augmentation Method...,https://arxiv.org/abs/1904.08779,1409.0,...,,,,,,,Industry,,,2023-08-10 15:24:25
385,,Recommendation,,Google,Industry,"J Davidson, B Liebald, J Liu, P Nandy",2010-09-26,The YouTube Video Recommendation System,https://dl.acm.org/doi/10.1145/1864708.1864770,1074.0,...,,,,,,,Industry,,,2023-08-10 15:25:05
386,,Vision,,"UT Austin,UC Berkeley,Google",Industry - Academia Collaboration,"V Gulshan, L Peng, M Coram, MC Stumpe, D Wu",2016-12-13,Development and Validation of a Deep Learning ...,https://jamanetwork.com/journals/jama/article-...,3540.0,...,,,,,,,Industry,,,2023-08-10 15:28:21
387,,Vision,Video,"University of Maryland,University of Texas,Google",Industry - Academia Collaboration,"Joe Yue-Hei Ng, Matthew Hausknecht, Sudheendra...",2015-05-01,Beyond Short Snippets: Deep Networks for Video...,https://www.cv-foundation.org/openaccess/conte...,2260.0,...,,,,,,,Industry,,,2023-08-10 15:27:53


In [64]:
# Count the number of notable ML systems for each Organization since 2010.
# Keys are the lower-cased, whitespace-stripped names found in the comma-separated
# 'Organization' column; a system with several organizations counts once for each.
min_publication_year = 2010
organization_system_count = defaultdict(int)
for pub_date, orgs in zip(notable_df['Publication date'], notable_df['Organization']):
    # Missing cells are not strings (empty CSV fields load as NaN), so skip those rows.
    if not isinstance(pub_date, str) or not isinstance(orgs, str):
        continue
    # The date string starts with the four-digit year, e.g. '2023-06-27'.
    if int(pub_date[:4]) < min_publication_year:
        continue
    for org in orgs.split(','):
        org = org.strip().lower()
        # Stray separators ('A,,B' or a trailing comma) yield empty names; do not count them.
        if not org:
            continue
        organization_system_count[org] += 1

In [65]:
# Print organization and its system count, in descending order of count
orgs_by_count = sorted(organization_system_count, key=organization_system_count.get, reverse=True)
for org in orgs_by_count:
    print(f"{org}: {organization_system_count[org]} systems")

google: 63 systems
deepmind: 19 systems
university of toronto: 16 systems
openai: 13 systems
stanford: 13 systems
microsoft research: 10 systems
facebook ai research: 9 systems
uc berkeley: 9 systems
google deepmind: 8 systems
university of oxford: 7 systems
johns hopkins university: 7 systems
university of montreal: 7 systems
tsinghua university: 5 systems
carnegie mellon university: 5 systems
nyu: 5 systems
meta ai: 4 systems
metaai: 4 systems
facebook: 4 systems
chinese university of hong kong: 4 systems
university of washington: 4 systems
microsoft: 4 systems
university of amsterdam: 4 systems
brno university of technology: 4 systems
runway: 3 systems
baidu: 3 systems
chinese academy of sciences: 3 systems
nvidia: 3 systems
stanford university: 3 systems
xi’an jiaotong university: 3 systems
ut austin: 3 systems
university of michigan: 3 systems
the chinese university of hong kong: 3 systems
university of maryland: 3 systems
alibaba group: 2 systems
facebook ai: 2 systems
mit: 2 sys

1. Google: 67
  1. Google: 43
  1. Johns Hopkins University,Stanford,Google: 2
  1. Toyota Technological Institute at Chicago,Google: 1 systems
  2. Google, TTIC: 1 systems
  3. UC Berkeley,Google: 1 systems
  4. Carnegie Mellon University,Google: 1 systems
  5. Google Brain: 1
  6. DeepMind,University of Oxford,Carnegie Mellon University,,Google: 1 systems
  7. CMU,Google: 1 systems
  8. Google Inc., University King College, Johns Hopkins University: 1 systems
  9. University King College,Johns Hopkins University,Google: 1 systems
  10. UT Austin, Google Inc, UC Berkeley: 1 systems
  11. Google, University of Montreal: 1 systems
  12. Google, University College London: 1 systems
  13. Google, Carnegie Mellon University: 1 systems
  14. Google,University of Michigan,University of North Carolina: 1 systems
  15. University of Maryland, University of Texas, Google Inc.: 1 systems
  16. Google, University of California Los Angeles: 1 systems
  17. Stanford,Google: 1 systems
  18. CNRS,Google: 1 systems
  19. Univeristy of California Berkley, Technion- Israel Institute of Technology, Google: 1 systems
  20. Google Inc: 1 systems
  21. UT Austin,UC Berkeley,Google: 1 systems
  22. University of Maryland,University of Texas,Google: 1 systems
2. DeepMind: 27
   1. DeepMind: 16
   2. Google DeepMind: 7
   3. Google DeepMind,Google: 1
   4. DeepMind,University of Oxford,Carnegie Mellon University,,Google: 1 systems
   5. Heriot-Watt University,DeepMind: 1 systems
   6. DeepMind, University of Oxford: 1 systems
3. Meta: 23
   1. Meta AI: 4
   2. MetaAI: 4
   3. Facebook AI Research: 4
   4. Facebook AI research: 4
   5. Facebook: 2
   6. Facebook AI: 2
   7. UC San Diego, Facebook: 1 systems
   8. Tsinghua University,Cornell,Facebook AI research: 1 systems
   9. Tel Aviv University, Facebook: 1 systems
4. University of Toronto: 18
   1. University of Toronto: 11
   2. Univeristy of Toronto: 2
   3. University of Toronto, Twitter: 1 systems
   4. NYU, University of Toronto, MIT: 1 systems
   5. University of Amsterdam, OpenAI, University of Toronto: 1 systems
   6. University of Toronto, University of Sherbrooke, Harvard University: 1 systems
   7. University of Montreal, University of Toronto: 1 systems
5. Stanford: 16
   1. Stanford: 7
   2. Johns Hopkins University,Stanford,Google: 2
   3. Stanford University: 1 systems
   4. DeepScale, UC Berkeley, Stanford: 1 systems
   5. Stanford, MetaMind Inc: 1 systems
   6. University of Oxford, Stanford University, Baidu: 1 systems
   7. Stanford,Google: 1 systems
   8. University of Michigan, Stanford University: 1 systems
   9. Stanford,Microsoft Research,NVIDIA: 1 systems
6. Microsoft: 15
   1. Microsoft Research: 5
   2. Microsoft: 3
   3. Microsoft Research,Peking University: 1
   4. Microsoft Bing: 1
   5. Microsoft research, Tsinghua university: 1 systems
   6. University of California,Microsoft Research: 1 systems
   7. Microsoft,Xi’an Jiaotong University,University of Science and Technology of China: 1 systems
   8. Microsoft Research,Brno University of Technology: 1 systems
   9. Stanford,Microsoft Research,NVIDIA: 1 systems
7. OpenAI: 14
   1. OpenAI: 12
   2. Open AI: 1
   3. University of Amsterdam, OpenAI, University of Toronto: 1 systems
8. UC Berkeley: 11
   1. UC Berkeley: 4
   2. UC Berkeley,Google: 1 systems
   3. UT Austin, Google Inc, UC Berkeley: 1 systems
   4. DeepScale, UC Berkeley, Stanford: 1 systems
   5. University of California, Berkeley: 1 systems
   6. UT Austin, UMass Lowell, UC Berkeley: 1 systems
   7. Univeristy of California Berkley, Technion- Israel Institute of Technology, Google: 1 systems
   8. UT Austin,UC Berkeley,Google: 1 systems
9. University of Montreal: 9
    1.  University of Montreal: 3
    2.  Google, University of Montreal: 1 systems
    3.  Universite de Montréal,Jacobs University Bremen: 1 systems
    4.  Universite de Montréal: 1 systems
    5.  University of Montreal, Jacobs University, University du Maine: 1 systems
    6.  University of Montreal,University of Illinois at Urbana- Champaigne: 1 systems
    7.  University of Montreal, University of Toronto: 1 systems
10.  University of Oxford: 8
    1.  University of Oxford: 3
    2.  DeepMind,University of Oxford,Carnegie Mellon University,,Google: 1 systems
    3.  Chinese Academy of Sciences ; University of Oxford: 1 systems
    4.  DeepMind, University of Oxford: 1 systems
    5.  Graz University of Technology, University of Oxford: 1 systems
    6.  University of Oxford, Stanford University, Baidu: 1 systems
11. The Chinese University of Hong Kong: 7
    1.  The Chinese University of Hong Kong: 2
    2.  Chinese University of Hong Kong, Chinese Academy of Sciences, Nanyang Technological University: 1 systems
    3.  Chinese University of Hong Kong: 1 systems
    4.  ETH Zurich, The Chinese University of Hong Kong, Shenzhen Institute of Advanced Technology: 1 systems
    5.  Chinese Academy of Sciences, Chinese University of Hong Kong: 1 systems
    6.  Chinese University of Hong Kong, Chinese Academy of Sciences: 1 systems
11. Johns Hopkins University: 7
    1.  Brno University of Technology, Johns Hopkins University: 3 systems
    2.  Johns Hopkins University,Stanford,Google: 2
    3.  Google Inc., University King College, Johns Hopkins University: 1 systems
    4.  University King College,Johns Hopkins University,Google: 1 systems
12. CMU: 5
    1. CMU,Google: 1 systems
    2. Carnegie Mellon University,Google: 1 systems
    3. DeepMind,University of Oxford,Carnegie Mellon University,,Google: 1 systems
    4. Carnegie Mellon University: 1 systems
    5. The Robotics Institute, Carnegie Mellon University: 1 systems
    6. Google, Carnegie Mellon University: 1 systems
12. NYU: 5
    1.  NYU: 2
    2.  NYU, University of Toronto, MIT: 1 systems
    3.  INRIA, Ecole, NYU: 1 systems
    4.  INRIA,Ecole,NYU: 1 systems
    5.  New York University: 1 systems
12. Tsinghua University: 5
    1.  Tsinghua University,BAAI: 1
    2.  Tsinghua University, Megvii Inc: 1 systems
    3.  Tsinghua University,University of Technology Sydney,University of Texas at San Antonio: 1 systems
    4.  Tsinghua University,Cornell,Facebook AI research: 1 systems
    5.  Microsoft research, Tsinghua university: 1 systems
12. University of Amsterdam: 5
    1.  university of amsterdam: 4 systems
    2.  univeristy of amsterdam: 1 systems
13. Brno University of Technology: 4
    1.  Brno University of Technology, Johns Hopkins University: 3 systems
    2.  Microsoft Research,Brno University of Technology: 1 systems
13. Baidu: 4
    1.  Baidu: 2
    2.  Baidu Research- Silicon Valley AI Lab: 1 systems
    3.  University of Oxford, Stanford University, Baidu: 1 systems
13. University of Washington: 4
    1.  University of Washington: 2
    2.  AllenAI, University of Washington: 1 systems
    3.  University of Washington, Allen Institute for AI: 1 systems
13. chinese academy of sciences: 4
    1.  chinese academy of sciences: 3
    2.  chinese academy of sciences ; university of oxford: 1 systems
14. IDSIA: 3
    1.  IDSIA: 2
    2.  IDSIA ; University of Lugano & SUPSI: 1 systems
14. NVIDIA: 3
   1. Nvidia: 1 systems
   2. NVIDIA: 1 systems
   3. Stanford,Microsoft Research,NVIDIA: 1 systems
14. Runway: 3
    1.  Runway: 2
    2.  Stability AI, Runway: 1
14. xi’an jiaotong university: 3
14. ut austin: 3
14. university of michigan: 3
14. university of maryland: 3
15. Alibaba Group: 2
15. MIT: 2
    1. MIT: 1
    2. NYU, University of Toronto, MIT: 1 systems
15. Seoul National University: 2
15. TTIC: 2
    1. Toyota Technological Institute at Chicago,Google: 1 systems
    2. Google, TTIC: 1 systems
15. University of Technology Sydney: 2
    1.  Tsinghua University,University of Technology Sydney,University of Texas at San Antonio: 1 systems
    2.  University of Technology Sydney: 1 systems
19. Inflection AI: 1
19. IDEA CCNL: 1
19. Amazon: 1
19. BAAI: 1
19. Stability AI: 1
    1.  Stability AI, Runway: 1
19. EleutherAI: 1
19. Peking University: 1
    1.  Microsoft Research,Peking University: 1
19. Beijing University of Posts and Telecommunications: 1
19. Meituan Inc.: 1
19. TU Darmstadt: 1
19. Jane Street: 1

In [83]:
# Hand-curated number of notable ML systems (published since 2010) per institution,
# transcribed from the manual aggregation listed above.
# Keys use the same spelling as the OpenAlex-derived institution names so that the two
# rankings can be joined by key.
# NOTE(review): these counts are frozen, while `notable_df` is re-downloaded from the live
# sheet. Some single-system entries from the list above (e.g. Inflection AI, EleutherAI)
# are not included here — confirm that this is intended.
pcd_rankings = {
    "Google": 67,
    "DeepMind": 27,
    "Meta": 23,
    "University of Toronto": 18,
    "Stanford University": 16,
    "Microsoft": 15,
    "OpenAI": 14,
    "University of California, Berkeley": 11,
    "Université de Montréal": 9,
    "University of Oxford": 8,
    "Chinese University of Hong Kong": 7,
    "Johns Hopkins University": 7,
    "Carnegie Mellon University": 5,
    "New York University": 5,
    "Tsinghua University": 5,
    "University of Amsterdam": 5,
    "Brno University of Technology": 4,
    "Baidu": 4,
    "University of Washington": 4,
    "Chinese Academy of Sciences": 4,
    "Dalle Molle Institute for Artificial Intelligence Research": 3,
    "NVIDIA": 3,
    "Runway": 3,
    "Xi'an Jiaotong University": 3,
    "The University of Texas at Austin": 3,
    "University of Michigan–Ann Arbor": 3,
    "University of Maryland, College Park": 3,
    "Alibaba": 2,
    "Massachusetts Institute of Technology": 2,
    "Seoul National University": 2,
    "Toyota Technological Institute at Chicago": 2,
    "University of Technology Sydney": 2,
    "Amazon": 1,
    "Beijing Academy of Artificial Intelligence": 1,
    "Peking University": 1,
    "Beijing University of Posts and Telecommunications": 1,
    "Technical University of Darmstadt": 1,
}

In [84]:
len(pcd_rankings)

37

# OpenAlex

In [6]:
concept_ids = [
    # "https://openalex.org/C41008148",  # Computer science
    "https://openalex.org/C154945302",  # Artificial intelligence
    "https://openalex.org/C119857082",  # Machine learning
]

In [7]:
# Join the concept IDs with "|" into a single filter value
# (presumably OpenAlex treats "|" as OR within a filter — confirm against the API docs).
concept_query = "|".join(concept_ids)
concept_query

'https://openalex.org/C154945302|https://openalex.org/C119857082'

In [10]:
# Took ~15 minutes for ~100K works on Macbook Pro 2019
# Fetch up to n_max of the most-cited works tagged with the selected concepts,
# published between 2010-01-01 and 2023-06-15.
n_max = int(1e5)
works_instance = Works()
works_query = (
    works_instance
    .filter(concepts={"id": concept_query})
    .filter(from_publication_date="2010-01-01")
    .filter(to_publication_date="2023-06-15")
    .sort(cited_by_count="desc")
)
works_pages = works_query.paginate(per_page=200, n_max=n_max)
works = merge_pages(works_pages)
len(works)

500page [15:23,  1.85s/page]


100000

In [11]:
# Save to avoid fetching every time
timestamp = datetime.datetime.now()
with open(data_file_location + f"top_cited_ai_works_openalex_{timestamp.strftime('%Y-%m-%d_%H-%M-%S')}", "wb") as f:
    obj = {
        "params": works_instance.params,  # for reproducibility
        "works": works,
    }
    pickle.dump(obj, f)

In [12]:
# Took 43s for ~100K works on Macbook Pro 2019
# Loads one specific saved snapshot (the file name is pinned to a past timestamp) and
# rebinds `works` to the cached copy.
# NOTE: pickle.load can execute arbitrary code — only load snapshot files you created yourself.
with open(data_file_location + "top_cited_ai_works_openalex_2023-08-11_16-26-48", "rb") as f:
    works_obj = pickle.load(f)
    works = works_obj["works"]
len(works)

100000

In [21]:
works[98000]

{'id': 'https://openalex.org/W1988695007',
 'doi': 'https://doi.org/10.1371/journal.pone.0092197',
 'title': 'De Novo Structure Prediction of Globular Proteins Aided by Sequence Variation-Derived Contacts',
 'display_name': 'De Novo Structure Prediction of Globular Proteins Aided by Sequence Variation-Derived Contacts',
 'publication_year': 2014,
 'publication_date': '2014-03-17',
 'ids': {'openalex': 'https://openalex.org/W1988695007',
  'doi': 'https://doi.org/10.1371/journal.pone.0092197',
  'mag': '1988695007',
  'pmid': 'https://pubmed.ncbi.nlm.nih.gov/24637808',
  'pmcid': 'https://www.ncbi.nlm.nih.gov/pmc/articles/3956894'},
 'language': 'en',
 'primary_location': {'is_oa': True,
  'landing_page_url': 'https://doi.org/10.1371/journal.pone.0092197',
  'pdf_url': None,
  'source': {'id': 'https://openalex.org/S202381698',
   'display_name': 'PLOS ONE',
   'issn_l': '1932-6203',
   'issn': ['1932-6203'],
   'is_oa': True,
   'is_in_doaj': True,
   'host_organization': 'https://open

## Data processing

In [22]:
CITATION_YEAR_BOUND = 3

In [42]:
# Build the processor over the loaded works, passing CITATION_YEAR_BOUND as `citation_year_bound`
# (presumably the number of years after publication over which citations are counted — confirm).
# NOTE(review): the two positional `None` arguments are not explained here; see OpenAlexProcessor.
processor = OpenAlexProcessor(works, None, None, citation_year_bound=CITATION_YEAR_BOUND)

In [43]:
processor.process_works()

In [44]:
processor.institution_id_to_name

{'https://openalex.org/I4210164937': 'Microsoft Research (United Kingdom)',
 'https://openalex.org/I4210114444': 'Meta (United States)',
 'https://openalex.org/I57206974': 'New York University',
 'https://openalex.org/I70931966': 'Université de Montréal',
 'https://openalex.org/I1291425158': 'Google (United States)',
 'https://openalex.org/I185261750': 'University of Toronto',
 'https://openalex.org/I4210117453': 'Dana-Farber Cancer Institute',
 'https://openalex.org/I111088046': 'Boston University',
 'https://openalex.org/I4210138560': 'European Molecular Biology Laboratory',
 'https://openalex.org/I4210142260': 'Max Planck Institute for Molecular Genetics',
 'https://openalex.org/I69740276': 'Tokyo Metropolitan University',
 'https://openalex.org/I55732556': 'Arizona State University',
 'https://openalex.org/I130769515': 'Pennsylvania State University',
 'https://openalex.org/I185163786': 'King Abdulaziz University',
 'https://openalex.org/I16733864': 'National Taiwan University',
 '

In [45]:
institution_individual_bounded_citations = processor.get_individual_bounded_citations()

In [46]:
len(institution_individual_bounded_citations)

16704

In [50]:
# Total of the bounded citation counts for each institution ID
institution_total_citations = {
    ins_id: sum(citations)
    for ins_id, citations in institution_individual_bounded_citations.items()
}

Merge institutions that belong to the same organization under a single alias

In [79]:
# Maps a lower-case substring of an institution name to the alias it is merged under.
# Keywords are tried in insertion order and the first match wins, so a more specific
# keyword must come before a broader one that could match the same name.
institution_aliases = {
    "deepmind": "DeepMind",  # before "google" so a name like "Google DeepMind" stays DeepMind
    "google": "Google",
    "microsoft": "Microsoft",
    "facebook": "Meta",
    "meta (": "Meta",  # include extra chars to avoid false positives
    "baidu": "Baidu",
    "amazon (": "Amazon",  # include extra chars to avoid false positives
    "tencent": "Tencent",
    "alibaba": "Alibaba",
    "openai": "OpenAI",
    "nvidia": "NVIDIA",
}

In [80]:
# Sum citation totals per display name. An institution whose lower-cased name contains an
# alias keyword is folded into that alias (first matching keyword wins); any other
# institution keeps its own name.
merged_institution_total_citations = defaultdict(int)
for ins_id, total_citations in institution_total_citations.items():
    institution_name = processor.institution_id_to_name[ins_id]
    lowered_name = institution_name.lower()
    merged_name = next(
        (alias for keyword, alias in institution_aliases.items() if keyword in lowered_name),
        institution_name,
    )
    merged_institution_total_citations[merged_name] += total_citations
merged_institution_total_citations

defaultdict(int,
            {'Microsoft': 231519,
             'Meta': 164273,
             'New York University': 79745,
             'Université de Montréal': 67557,
             'Google': 393993,
             'University of Toronto': 117415,
             'Dana-Farber Cancer Institute': 23356,
             'Boston University': 77467,
             'European Molecular Biology Laboratory': 30216,
             'Max Planck Institute for Molecular Genetics': 6742,
             'Tokyo Metropolitan University': 32464,
             'Arizona State University': 73589,
             'Pennsylvania State University': 56591,
             'King Abdulaziz University': 45208,
             'National Taiwan University': 19809,
             'University of North Carolina at Chapel Hill': 62123,
             'University of Michigan–Ann Arbor': 106208,
             'Magic Leap (United States)': 8323,
             'Stanford University': 282006,
             'Massachusetts Institute of Technology': 180322,
  

In [81]:
len(merged_institution_total_citations)

16516

In [82]:
# Print every merged institution with its citation total, highest total first
ranked_citation_totals = sorted(merged_institution_total_citations.items(), key=lambda x: x[1], reverse=True)
for rank, (ins_name, total_citations) in enumerate(ranked_citation_totals, start=1):
    print(f"{rank}. {ins_name}: {total_citations}")

1. Google: 393993
2. Stanford University: 282006
3. University of California, Berkeley: 246023
4. Microsoft: 231519
5. University of Oxford: 198393
6. Harvard University: 181773
7. Massachusetts Institute of Technology: 180322
8. Chinese Academy of Sciences: 170581
9. Meta: 164273
10. Tsinghua University: 161117
11. Carnegie Mellon University: 144098
12. University of Washington: 142480
13. ETH Zurich: 119392
14. University of Toronto: 117415
15. University of Cambridge: 114923
16. Nanyang Technological University: 111322
17. Imperial College London: 109265
18. University of Michigan–Ann Arbor: 106208
19. University College London: 102694
20. Chinese University of Hong Kong: 99353
21. Cornell University: 93912
22. National University of Singapore: 90257
23. Harbin Institute of Technology: 89722
24. University of California, Los Angeles: 87753
25. University of California, San Diego: 83984
26. Columbia University: 83357
27. New York University: 79745
28. University of Pennsylvania: 7773

In [60]:
# Alias only: both names refer to the same dict object (no copy is made).
# NOTE(review): this cell's execution count (60) predates the merge cell (80). Re-run it after
# re-running the merge, otherwise `openalex_rankings` keeps pointing at the previous dict.
openalex_rankings = merged_institution_total_citations

# Aggregation

In [85]:
set(openalex_rankings.keys()).difference(pcd_rankings.keys())

{'Hygeia Hospital',
 'Imagine Institute for Genetic Diseases',
 'Health Education and Training Institute',
 'The University of Texas Health Science Center at San Antonio',
 'ConsenSys (United States)',
 'National Evolutionary Synthesis Center',
 'Kobe City Medical Center General Hospital',
 'University of Minnesota Rochester',
 "St James's University Hospital",
 'University of Agricultural Sciences, Bangalore',
 'University of Turabo',
 'Harcourt Butler Technical University',
 'Center for Life Sciences',
 'University of Guilan',
 'University of Siena',
 'Chinese Arctic and Antarctic Administration',
 'Fortiss',
 'National Marine Fisheries Service',
 'Onze Lieve Vrouwe Gasthuis',
 'Nederlands Instituut Voor Zuivel Oonderzoek',
 'New England College',
 'Max Planck Institute for Mathematics',
 'National Institute of Technology Patna',
 'Rakuten (Japan)',
 'Fatima Jinnah Women University',
 'Türk Otomobil Fabrikası (Turkey)',
 'Einstein Medical Center Philadelphia',
 'Uni Research (Norway)

In [86]:
set(pcd_rankings.keys()).difference(openalex_rankings.keys())

{'Runway'}

In [87]:
set(pcd_rankings.keys()).intersection(openalex_rankings.keys())

{'Alibaba',
 'Amazon',
 'Baidu',
 'Beijing Academy of Artificial Intelligence',
 'Beijing University of Posts and Telecommunications',
 'Brno University of Technology',
 'Carnegie Mellon University',
 'Chinese Academy of Sciences',
 'Chinese University of Hong Kong',
 'Dalle Molle Institute for Artificial Intelligence Research',
 'DeepMind',
 'Google',
 'Johns Hopkins University',
 'Massachusetts Institute of Technology',
 'Meta',
 'Microsoft',
 'NVIDIA',
 'New York University',
 'OpenAI',
 'Peking University',
 'Seoul National University',
 'Stanford University',
 'Technical University of Darmstadt',
 'The University of Texas at Austin',
 'Toyota Technological Institute at Chicago',
 'Tsinghua University',
 'University of Amsterdam',
 'University of California, Berkeley',
 'University of Maryland, College Park',
 'University of Michigan–Ann Arbor',
 'University of Oxford',
 'University of Technology Sydney',
 'University of Toronto',
 'University of Washington',
 'Université de Montré

In [88]:
# Share of notable ML systems attributed to each institution.
# NOTE(review): the denominator counts every row of `notable_df`, whereas `pcd_rankings` only
# counts systems published since 2010, and every co-authoring institution is credited for a
# system. The shares therefore need not sum to 1 — confirm this is the intended normalisation.
pcd_total = len(notable_df)
pcd_scores = {ins: n / pcd_total for ins, n in pcd_rankings.items()}
pcd_scores

{'Google': 0.17223650385604114,
 'DeepMind': 0.06940874035989718,
 'Meta': 0.05912596401028278,
 'University of Toronto': 0.04627249357326478,
 'Stanford University': 0.04113110539845758,
 'Microsoft': 0.038560411311053984,
 'OpenAI': 0.03598971722365039,
 'University of California, Berkeley': 0.028277634961439587,
 'Université de Montréal': 0.02313624678663239,
 'University of Oxford': 0.02056555269922879,
 'Chinese University of Hong Kong': 0.017994858611825194,
 'Johns Hopkins University': 0.017994858611825194,
 'Carnegie Mellon University': 0.012853470437017995,
 'New York University': 0.012853470437017995,
 'Tsinghua University': 0.012853470437017995,
 'University of Amsterdam': 0.012853470437017995,
 'Brno University of Technology': 0.010282776349614395,
 'Baidu': 0.010282776349614395,
 'University of Washington': 0.010282776349614395,
 'Chinese Academy of Sciences': 0.010282776349614395,
 'Dalle Molle Institute for Artificial Intelligence Research': 0.007712082262210797,
 'NVIDI

In [89]:
# Denominator for the citation shares: sum of the merged per-institution citation totals
openalex_total = sum(openalex_rankings.values())
openalex_total

24391287

In [90]:
# Each institution's share of the summed citation totals
openalex_scores = {ins: n / openalex_total for ins, n in openalex_rankings.items()}
openalex_scores

{'Microsoft': 0.009491873061064797,
 'Meta': 0.00673490496831922,
 'New York University': 0.00326940517734878,
 'Université de Montréal': 0.0027697185474468812,
 'Google': 0.01615302218370027,
 'University of Toronto': 0.0048138091278250306,
 'Dana-Farber Cancer Institute': 0.0009575550482432517,
 'Boston University': 0.0031760111715302274,
 'European Molecular Biology Laboratory': 0.001238803020111239,
 'Max Planck Institute for Molecular Genetics': 0.00027641017876588474,
 'Tokyo Metropolitan University': 0.0013309670785309525,
 'Arizona State University': 0.0030170199711068955,
 'Pennsylvania State University': 0.0023201317749243816,
 'King Abdulaziz University': 0.001853448733558012,
 'National Taiwan University': 0.0008121342674537838,
 'University of North Carolina at Chapel Hill': 0.002546934075270403,
 'University of Michigan–Ann Arbor': 0.004354341777865186,
 'Magic Leap (United States)': 0.0003412284066847313,
 'Stanford University': 0.01156175153857195,
 'Massachusetts Insti

In [141]:
# Aggregate score = citation share + notable-systems share.
# An institution that is missing from one ranking contributes 0 for that ranking.
merged_keys = set(openalex_scores.keys()) | set(pcd_scores.keys())
aggregate_scores = {
    ins: openalex_scores.get(ins, 0) + pcd_scores.get(ins, 0)
    for ins in merged_keys
}

In [143]:
# Print every institution with its aggregate score, highest score first
ranked_aggregate_scores = sorted(aggregate_scores.items(), key=lambda x: x[1], reverse=True)
for rank, (ins, score) in enumerate(ranked_aggregate_scores, start=1):
    print(f"{rank}. {ins}: {score:.2f}")

1. Google: 0.19
2. DeepMind: 0.07
3. Meta: 0.07
4. Stanford University: 0.05
5. University of Toronto: 0.05
6. Microsoft: 0.05
7. University of California, Berkeley: 0.04
8. OpenAI: 0.04
9. University of Oxford: 0.03
10. Université de Montréal: 0.03
11. Chinese University of Hong Kong: 0.02
12. Johns Hopkins University: 0.02
13. Tsinghua University: 0.02
14. Carnegie Mellon University: 0.02
15. Chinese Academy of Sciences: 0.02
16. University of Washington: 0.02
17. New York University: 0.02
18. University of Amsterdam: 0.01
19. Massachusetts Institute of Technology: 0.01
20. University of Michigan–Ann Arbor: 0.01
21. Baidu: 0.01
22. The University of Texas at Austin: 0.01
23. Brno University of Technology: 0.01
24. University of Maryland, College Park: 0.01
25. Xi'an Jiaotong University: 0.01
26. NVIDIA: 0.01
27. Dalle Molle Institute for Artificial Intelligence Research: 0.01
28. Runway: 0.01
29. Harvard University: 0.01
30. University of Technology Sydney: 0.01
31. Seoul National Un

# Plots

In [102]:
# Top 20 institutions by citation share, and their shares as a NumPy array in the same order
top_openalex_institutions = sorted(openalex_scores, key=openalex_scores.get, reverse=True)[:20]
top_openalex_scores = np.array([openalex_scores[ins] for ins in top_openalex_institutions])
top_openalex_institutions

['Google',
 'Stanford University',
 'University of California, Berkeley',
 'Microsoft',
 'University of Oxford',
 'Harvard University',
 'Massachusetts Institute of Technology',
 'Chinese Academy of Sciences',
 'Meta',
 'Tsinghua University',
 'Carnegie Mellon University',
 'University of Washington',
 'ETH Zurich',
 'University of Toronto',
 'University of Cambridge',
 'Nanyang Technological University',
 'Imperial College London',
 'University of Michigan–Ann Arbor',
 'University College London',
 'Chinese University of Hong Kong']

In [114]:
# Bar chart of the top institutions by citation share (shares shown as percentages)
citation_bars = go.Bar(name='Citations score', x=top_openalex_institutions, y=100*top_openalex_scores)
fig = go.Figure(data=[citation_bars])

## Plot layout
fig.update_layout(
    barmode='stack',
    yaxis_title='Share of citations in the 100K most-cited publications (%)',
    legend=dict(yanchor="top", y=0.99, xanchor="right", x=0.99),
    autosize=False,
    width=600,
    height=600,
    title_x=0.5,
    font=dict(size=10),
    margin=dict(l=20, r=20, t=20, b=20),
)

## Save plot
save_plot(fig, result_file_location, 'ranking_citations_academic_industry')

## Show plot
fig.show()

In [115]:
# Top 20 institutions by share of notable ML systems, and their shares as a NumPy array
# in the same order
top_pcd_institutions = sorted(pcd_scores, key=pcd_scores.get, reverse=True)[:20]
top_pcd_scores = np.array([pcd_scores[ins] for ins in top_pcd_institutions])
top_pcd_institutions

['Google',
 'DeepMind',
 'Meta',
 'University of Toronto',
 'Stanford University',
 'Microsoft',
 'OpenAI',
 'University of California, Berkeley',
 'Université de Montréal',
 'University of Oxford',
 'Chinese University of Hong Kong',
 'Johns Hopkins University',
 'Carnegie Mellon University',
 'New York University',
 'Tsinghua University',
 'University of Amsterdam',
 'Brno University of Technology',
 'Baidu',
 'University of Washington',
 'Chinese Academy of Sciences']

In [138]:
# Bar chart of the top institutions by share of notable ML systems (shares shown as percentages).
# The trace name describes the plotted quantity; it previously read 'Citations score',
# carried over from the citations plot.
fig = go.Figure(data=[
    go.Bar(name='Notable ML systems score', x=top_pcd_institutions, y=100*top_pcd_scores, marker_color='rgb(230, 65, 30)'),
])

## Plot layout
fig.update_layout(barmode='stack')
fig.update_layout(
    yaxis_title='Share of notable ML systems in dataset (%)',
)
fig.update_layout(
    legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="right",
        x=0.99,
    ),
)
fig.update_layout(
    autosize=False,
    width=600,
    height=600,
    title_x=0.5,
    font=dict(size=10),
    margin=dict(l=20, r=20, t=20, b=20),
)

## Save plot
save_plot(fig, result_file_location, 'ranking_pcd_academic_industry')

## Show plot
fig.show()

In [145]:
def _score_or_zero(scores, institution):
    """Return ``scores[institution]``, or 0 when the key is absent or maps to None."""
    value = scores.get(institution)
    return 0 if value is None else value


# Rank institutions by aggregate score, highest first, and keep at most the first 20.
# Names are used verbatim; no abbreviations (e.g. CUHK / CAS / BUPT) are substituted.
top_institutions = sorted(aggregate_scores, key=aggregate_scores.get, reverse=True)[:20]

# Per-source scores aligned with `top_institutions`; an institution missing from a
# source contributes 0 for that source.
# NOTE: this rebinds `top_pcd_scores` and `top_openalex_scores` to plain lists ordered
# by aggregate rank. The single-source plot cells above multiply these names by 100,
# so re-running them after this cell would pick up these lists instead.
top_pcd_scores = [_score_or_zero(pcd_scores, name) for name in top_institutions]
top_openalex_scores = [_score_or_zero(openalex_scores, name) for name in top_institutions]
top_institutions

['Google',
 'DeepMind',
 'Meta',
 'Stanford University',
 'University of Toronto',
 'Microsoft',
 'University of California, Berkeley',
 'OpenAI',
 'University of Oxford',
 'Université de Montréal',
 'Chinese University of Hong Kong',
 'Johns Hopkins University',
 'Tsinghua University',
 'Carnegie Mellon University',
 'Chinese Academy of Sciences',
 'University of Washington',
 'New York University',
 'University of Amsterdam',
 'Massachusetts Institute of Technology',
 'University of Michigan–Ann Arbor']

In [146]:
# Stacked bars per institution: citations score at the bottom, notable-ML-systems score on top.
citation_bars = go.Bar(name='Citations score', x=top_institutions, y=top_openalex_scores)
pcd_bars = go.Bar(name='Notable ML systems score', x=top_institutions, y=top_pcd_scores)
fig = go.Figure(data=[citation_bars, pcd_bars])

## Plot layout
# Single call: stacked bars, legend anchored inside the top-right corner,
# fixed 600x600 canvas with tight margins. No title or x-axis title is set.
fig.update_layout(
    barmode='stack',
    yaxis_title='Research impact score',
    legend={"yanchor": "top", "y": 0.99, "xanchor": "right", "x": 0.99},
    autosize=False,
    width=600,
    height=600,
    title_x=0.5,
    font={"size": 10},
    margin={"l": 20, "r": 20, "t": 20, "b": 20},
)

## Save plot (currently disabled)
# save_plot(fig, result_file_location, 'ranking_academic_industry')

## Show plot
fig.show()

# Final list of institutions based on ranking

Get the final list of institution IDs based on this ranking by searching OpenAlex for each ranked institution name.

In [43]:
# For every institution, in descending order of aggregate score, search OpenAlex by
# name and print each hit as a ready-to-paste `"<id>",  # <display name>` line.
institutions_by_rank = sorted(aggregate_scores, key=aggregate_scores.get, reverse=True)
for institution_name in institutions_by_rank:
    for match in Institutions().search(institution_name).get():
        print(f"\"{match['id']}\",  # {match['display_name']}")

"https://openalex.org/I1291425158",  # Google (United States)
"https://openalex.org/I4210113297",  # Google (United Kingdom)
"https://openalex.org/I4210100430",  # Google (Switzerland)
"https://openalex.org/I4210148186",  # Google (Canada)
"https://openalex.org/I4210117425",  # Google (Israel)
"https://openalex.org/I4210131802",  # Google (Ireland)
"https://openalex.org/I1290206253",  # Microsoft (United States)
"https://openalex.org/I4210164937",  # Microsoft Research (United Kingdom)
"https://openalex.org/I4210113369",  # Microsoft Research Asia (China)
"https://openalex.org/I4210124949",  # Microsoft Research (India)
"https://openalex.org/I4210105678",  # Microsoft (Finland)
"https://openalex.org/I4210087053",  # Microsoft (Germany)
"https://openalex.org/I4210125051",  # Microsoft (Israel)
"https://openalex.org/I4210162141",  # Microsoft (India)
"https://openalex.org/I4210086099",  # Microsoft (Brazil)
"https://openalex.org/I4210153468",  # Microsoft (Canada)
"https://openalex.org/I

Keep only the top 10 aliases, and drop spurious search results (e.g., Amazon Conservation Association).

In [None]:
# Hand-curated OpenAlex institution IDs for the selected companies.
# Each entry keeps the `"<id>",  # <display name>` form printed by the search cell,
# so the trailing comment records the OpenAlex display name for that ID.
selected_institution_ids = [
    "https://openalex.org/I1291425158",  # Google (United States)
    "https://openalex.org/I4210113297",  # Google (United Kingdom)
    "https://openalex.org/I4210100430",  # Google (Switzerland)
    "https://openalex.org/I4210148186",  # Google (Canada)
    "https://openalex.org/I4210117425",  # Google (Israel)
    "https://openalex.org/I4210131802",  # Google (Ireland)
    "https://openalex.org/I4210090411",  # DeepMind (United Kingdom)
    "https://openalex.org/I2252078561",  # Meta (Israel)
    "https://openalex.org/I4210114444",  # Meta (United States)
    "https://openalex.org/I4210111288",  # Meta (United Kingdom)
    "https://openalex.org/I1290206253",  # Microsoft (United States)
    "https://openalex.org/I4210164937",  # Microsoft Research (United Kingdom)
    "https://openalex.org/I4210113369",  # Microsoft Research Asia (China)
    "https://openalex.org/I4210124949",  # Microsoft Research (India)
    "https://openalex.org/I4210105678",  # Microsoft (Finland)
    "https://openalex.org/I4210087053",  # Microsoft (Germany)
    "https://openalex.org/I4210125051",  # Microsoft (Israel)
    "https://openalex.org/I4210162141",  # Microsoft (India)
    "https://openalex.org/I4210086099",  # Microsoft (Brazil)
    "https://openalex.org/I4210153468",  # Microsoft (Canada)
    "https://openalex.org/I4210161634",  # Microsoft (France)
    "https://openalex.org/I4210110431",  # Microsoft (Netherlands)
    "https://openalex.org/I4210099966",  # Microsoft (Denmark)
    "https://openalex.org/I4210108625",  # Microsoft (United Kingdom)
    "https://openalex.org/I4210135422",  # Microsoft (Norway)
    "https://openalex.org/I4210139986",  # Microsoft (Switzerland)
    "https://openalex.org/I4210109507",  # Microsoft (Ireland)
    "https://openalex.org/I4210092974",  # Microsoft (Portugal)
    "https://openalex.org/I4210151458",  # Microsoft (Belgium)
    "https://openalex.org/I4210161460",  # OpenAI (United States)
    "https://openalex.org/I45928872",  # Alibaba Group (China)
    "https://openalex.org/I4210095624",  # Alibaba Group (United States)
    "https://openalex.org/I4210086143",  # Alibaba Group (Cayman Islands)
    "https://openalex.org/I1304085615",  # Nvidia (United Kingdom)
    "https://openalex.org/I4210127875",  # Nvidia (United States)
    "https://openalex.org/I98301712",  # Baidu (China)
    "https://openalex.org/I1311688040",  # Amazon (United States)
    "https://openalex.org/I4210089985",  # Amazon (Germany)
    "https://openalex.org/I4210123934",  # Amazon (United Kingdom)
    "https://openalex.org/I2250653659",  # Tencent (China)
    "https://openalex.org/I4210103558",  # Tencent Healthcare (China)
]
# Display the number of selected IDs as a quick sanity check.
len(selected_institution_ids)

Build the mapping from institution ID to company alias for later use.

In [None]:
# Same entries as the selected ID list, kept as text so the display names in the
# trailing comments can be parsed into aliases.
selected_institutions_text = """
"https://openalex.org/I1291425158",  # Google (United States)
"https://openalex.org/I4210113297",  # Google (United Kingdom)
"https://openalex.org/I4210100430",  # Google (Switzerland)
"https://openalex.org/I4210148186",  # Google (Canada)
"https://openalex.org/I4210117425",  # Google (Israel)
"https://openalex.org/I4210131802",  # Google (Ireland)
"https://openalex.org/I4210090411",  # DeepMind (United Kingdom)
"https://openalex.org/I2252078561",  # Meta (Israel)
"https://openalex.org/I4210114444",  # Meta (United States)
"https://openalex.org/I4210111288",  # Meta (United Kingdom)
"https://openalex.org/I1290206253",  # Microsoft (United States)
"https://openalex.org/I4210164937",  # Microsoft Research (United Kingdom)
"https://openalex.org/I4210113369",  # Microsoft Research Asia (China)
"https://openalex.org/I4210124949",  # Microsoft Research (India)
"https://openalex.org/I4210105678",  # Microsoft (Finland)
"https://openalex.org/I4210087053",  # Microsoft (Germany)
"https://openalex.org/I4210125051",  # Microsoft (Israel)
"https://openalex.org/I4210162141",  # Microsoft (India)
"https://openalex.org/I4210086099",  # Microsoft (Brazil)
"https://openalex.org/I4210153468",  # Microsoft (Canada)
"https://openalex.org/I4210161634",  # Microsoft (France)
"https://openalex.org/I4210110431",  # Microsoft (Netherlands)
"https://openalex.org/I4210099966",  # Microsoft (Denmark)
"https://openalex.org/I4210108625",  # Microsoft (United Kingdom)
"https://openalex.org/I4210135422",  # Microsoft (Norway)
"https://openalex.org/I4210139986",  # Microsoft (Switzerland)
"https://openalex.org/I4210109507",  # Microsoft (Ireland)
"https://openalex.org/I4210092974",  # Microsoft (Portugal)
"https://openalex.org/I4210151458",  # Microsoft (Belgium)
"https://openalex.org/I4210161460",  # OpenAI (United States)
"https://openalex.org/I45928872",  # Alibaba Group (China)
"https://openalex.org/I4210095624",  # Alibaba Group (United States)
"https://openalex.org/I4210086143",  # Alibaba Group (Cayman Islands)
"https://openalex.org/I1304085615",  # Nvidia (United Kingdom)
"https://openalex.org/I4210127875",  # Nvidia (United States)
"https://openalex.org/I98301712",  # Baidu (China)
"https://openalex.org/I1311688040",  # Amazon (United States)
"https://openalex.org/I4210089985",  # Amazon (Germany)
"https://openalex.org/I4210123934",  # Amazon (United Kingdom)
"https://openalex.org/I2250653659",  # Tencent (China)
"https://openalex.org/I4210103558",  # Tencent Healthcare (China)
"""

# Map each institution ID to its alias: the first word of the display name that
# follows the `#` marker (so "Microsoft Research Asia (China)" -> "Microsoft").
institution_aliases = {}
for entry in selected_institutions_text.strip().splitlines():
    quoted_id = entry.partition(",")[0]      # text before the first comma, still quoted
    display_name = entry.partition("#")[2].strip()
    institution_aliases[quoted_id.strip('"')] = display_name.split()[0]
institution_aliases

# Spot checks

In [193]:
# Spot check: inspect the raw OpenAlex search results for "Montréal".
montreal_search = Institutions().search("Montréal")
montreal_search.get()

[{'id': 'https://openalex.org/I70931966',
  'ror': 'https://ror.org/0161xgx34',
  'display_name': 'Université de Montréal',
  'relevance_score': 80228.38,
  'country_code': 'CA',
  'type': 'education',
  'homepage_url': 'http://www.umontreal.ca/english/',
  'image_url': 'https://commons.wikimedia.org/w/index.php?title=Special:Redirect/file/Universit%C3%A4t%20Montreal%20Logo.svg',
  'image_thumbnail_url': 'https://commons.wikimedia.org/w/index.php?title=Special:Redirect/file/Universit%C3%A4t%20Montreal%20Logo.svg&width=300',
  'display_name_acronyms': [],
  'display_name_alternatives': ['Université de Montréal'],
  'repositories': [{'id': 'https://openalex.org/S4377196102',
    'display_name': 'Érudit (Université de Montréal)',
    'host_organization': 'https://openalex.org/I70931966',
    'host_organization_name': 'Université de Montréal',
    'host_organization_lineage': ['https://openalex.org/I70931966']},
   {'id': 'https://openalex.org/S4306402422',
    'display_name': 'Papyrus : I

In [181]:
def _has_concept(institution, concept_name):
    """Return True if any ``x_concepts`` display name of ``institution`` contains ``concept_name``."""
    return any(
        concept_name in concept["display_name"]
        for concept in institution["x_concepts"]
    )


# For the top search hit of each query, report whether its concepts mention
# Computer science, Artificial intelligence and Machine learning.
for query in ["Stanford", "University of Toronto", "OpenAI"]:
    best_match = Institutions().search(query).get()[0]
    print(
        best_match["display_name"],
        "| CS?", _has_concept(best_match, "Computer science"),
        "| AI?", _has_concept(best_match, "Artificial intelligence"),
        "| ML?", _has_concept(best_match, "Machine learning"),
    )

Stanford University | CS? True | AI? False | ML? False
University of Toronto | CS? True | AI? False | ML? False
OpenAI (United States) | CS? True | AI? True | ML? True


In [157]:
# Spot check: works of institution I97018004, ordered by citation count (highest first).
(
    Works()
    .filter(institutions={"id": "https://openalex.org/I97018004"})
    .sort(cited_by_count="desc")
    .get()
)

[{'id': 'https://openalex.org/W2117539524',
  'doi': 'https://doi.org/10.1007/s11263-015-0816-y',
  'title': 'ImageNet Large Scale Visual Recognition Challenge',
  'display_name': 'ImageNet Large Scale Visual Recognition Challenge',
  'publication_year': 2015,
  'publication_date': '2015-04-11',
  'ids': {'openalex': 'https://openalex.org/W2117539524',
   'doi': 'https://doi.org/10.1007/s11263-015-0816-y',
   'mag': '2117539524'},
  'language': 'en',
  'primary_location': {'is_oa': False,
   'landing_page_url': 'https://doi.org/10.1007/s11263-015-0816-y',
   'pdf_url': None,
   'source': {'id': 'https://openalex.org/S25538012',
    'display_name': 'International Journal of Computer Vision',
    'issn_l': '0920-5691',
    'issn': ['0920-5691', '1573-1405'],
    'is_oa': False,
    'is_in_doaj': False,
    'host_organization': 'https://openalex.org/P4310319900',
    'host_organization_name': 'Springer Science+Business Media',
    'host_organization_lineage': ['https://openalex.org/P43103

Check that works tagged Machine Learning are generally the kind of works we are interested in.

In [None]:
# Works tagged with the Machine Learning concept (C119857082), published between
# 2010-01-01 and 2023-06-15, sorted by citation count (highest first).
ml_works = (
    Works()
    .filter(concept={'id': 'https://openalex.org/C119857082'})
    .filter(from_publication_date="2010-01-01")
    .filter(to_publication_date="2023-06-15")
    .sort(cited_by_count="desc")
    .get()
)
# Show only the titles so they can be eyeballed for relevance.
[ml_work['display_name'] for ml_work in ml_works]

Compare which works are tagged Artificial Intelligence vs. Machine Learning for top institutions.

AI: "https://openalex.org/C154945302"

ML: "https://openalex.org/C119857082"

In [None]:
# Display the "level" field of the Artificial Intelligence concept
# (presumably its depth in the OpenAlex concept hierarchy; confirm against the OpenAlex docs).
Concepts()["https://openalex.org/C154945302"]["level"]

In [None]:
# Display the "level" field of the Machine Learning concept
# (presumably its depth in the OpenAlex concept hierarchy; confirm against the OpenAlex docs).
Concepts()["https://openalex.org/C119857082"]["level"]

OpenAI: "https://openalex.org/I4210161460"

DeepMind: "https://openalex.org/I4210090411"

In [None]:
# Institution whose works are inspected by the queries below.
# This ID is DeepMind; replace it with the OpenAI ID to inspect OpenAI instead.
institution_id = "https://openalex.org/I4210090411"

In [None]:
# Works by the institution selected in `institution_id` that are tagged with the
# Artificial Intelligence concept (C154945302) and have at least 10 citations.
institution_ai_query = (
    Works()
    .filter(authorships={"institutions": {"id": institution_id}})
    .filter(concepts={"id": "https://openalex.org/C154945302"})
    .filter(cited_by_count=">9")
)
# Fetch page by page (200 per page, capped at `n_max` results) and merge into one collection.
ai_works = merge_pages(institution_ai_query.paginate(per_page=200, n_max=n_max))

In [None]:
# Number of AI-tagged works fetched for the selected institution.
len(ai_works)

In [None]:
# List the titles of the AI-tagged works, one per line.
for work in ai_works:
    print(work['display_name'])

In [None]:
# Works by the institution selected in `institution_id` that are tagged with the
# Machine Learning concept (C119857082) and have at least 10 citations.
institution_ml_query = (
    Works()
    .filter(authorships={"institutions": {"id": institution_id}})
    .filter(concepts={"id": "https://openalex.org/C119857082"})
    .filter(cited_by_count=">9")
)
# Fetch page by page (200 per page, capped at `n_max` results) and merge into one collection.
ml_works = merge_pages(institution_ml_query.paginate(per_page=200, n_max=n_max))

In [None]:
# Number of ML-tagged works fetched for the selected institution.
len(ml_works)

In [None]:
# List the titles of the ML-tagged works, one per line.
for work in ml_works:
    print(work['display_name'])

In [None]:
# Works tagged Artificial Intelligence but NOT Machine Learning, printed as
# "<title>: <citation count>".
ai_ids = [work['id'] for work in ai_works]
ml_ids = [work['id'] for work in ml_works]

# Look the records up in the already-fetched `ai_works` rather than calling
# `Works()[work_id]` twice per work, which issued two API requests per printed line.
ai_works_by_id = {work['id']: work for work in ai_works}
for work_id in set(ai_ids).difference(ml_ids):
    ai_only_work = ai_works_by_id[work_id]
    print(f"{ai_only_work['display_name']}: {ai_only_work['cited_by_count']}")

In [None]:
# Works tagged Machine Learning but NOT Artificial Intelligence, printed as
# "<title>: <citation count>".
# The ID sets are derived directly from `ai_works` / `ml_works`, and the records are
# reused from `ml_works` rather than calling `Works()[work_id]` twice per work
# (two API requests per printed line).
ai_work_ids = {work['id'] for work in ai_works}
ml_only_works_by_id = {
    work['id']: work for work in ml_works if work['id'] not in ai_work_ids
}
for ml_only_work in ml_only_works_by_id.values():
    print(f"{ml_only_work['display_name']}: {ml_only_work['cited_by_count']}")

In [None]:
# Works affiliated with OpenAI (I4210161460) that have more than 100 citations.
openai_top_cited_query = (
    Works()
    .filter(authorships={"institutions": {"id": "https://openalex.org/I4210161460"}})
    .filter(cited_by_count=">100")
)
# Fetch page by page (200 per page, capped at `n_max` results) and merge into one collection.
top_cited_works = merge_pages(openai_top_cited_query.paginate(per_page=200, n_max=n_max))

# Print each title next to its bounded citation count (year_bound=3).
for cited_work in top_cited_works:
    print(cited_work['display_name'], get_bounded_citations(cited_work, year_bound=3))

In [61]:
# Most-cited works tagged with the Machine Learning concept (C119857082), published
# between 2010-01-01 and 2023-06-15, fetched with only the listed fields selected.
selected_fields = ["publication_year", "authorships", "cited_by_count", "counts_by_year"]
ml_works = (
    Works()
    .filter(concept={'id': 'https://openalex.org/C119857082'})
    .filter(from_publication_date="2010-01-01")
    .filter(to_publication_date="2023-06-15")
    .sort(cited_by_count="desc")
    .select(selected_fields)
    .get()
)
# Preview the first two records.
ml_works[:2]

[{'publication_year': 2016,
  'authorships': [{'author_position': 'first',
    'author': {'id': 'https://openalex.org/A4344207660',
     'display_name': 'Kaiming He',
     'orcid': None},
    'institutions': [{'id': 'https://openalex.org/I4210164937',
      'display_name': 'Microsoft Research (United Kingdom)',
      'ror': 'https://ror.org/05k87vq12',
      'country_code': 'GB',
      'type': 'company'}],
    'is_corresponding': False,
    'raw_affiliation_string': 'Microsoft Research#TAB#',
    'raw_affiliation_strings': ['Microsoft Research#TAB#']},
   {'author_position': 'middle',
    'author': {'id': 'https://openalex.org/A4358260579',
     'display_name': 'Xiangyu Zhang',
     'orcid': None},
    'institutions': [{'id': 'https://openalex.org/I4210164937',
      'display_name': 'Microsoft Research (United Kingdom)',
      'ror': 'https://ror.org/05k87vq12',
      'country_code': 'GB',
      'type': 'company'}],
    'is_corresponding': False,
    'raw_affiliation_string': 'Microsof