In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from collections import defaultdict
import datetime
import numpy as np
import os
import pandas as pd
import pickle
import plotly
import plotly.graph_objects as go
import plotly.io as pio
import pyalex
from pyalex import Authors, Concepts, Institutions, Works
from tqdm.notebook import tqdm

from researcher_impact.citations import *
from researcher_impact.plotting import *
from researcher_impact.processors import OpenAlexProcessor
from researcher_impact.pyalex_utils import *
from researcher_impact.regression import *
from researcher_impact.utils import *

In [3]:
# The polite pool has much faster and more consistent response times. To get into the polite pool, you set your email:
pyalex.config.email = "ben@epochai.org"

In [4]:
pio.templates.default = "plotly_white"

In [5]:
# Location to save data and results
data_file_location = 'data/'
os.makedirs(data_file_location, exist_ok=True)

result_file_location = 'results/'
os.makedirs(result_file_location, exist_ok=True)

# PCD database

In [6]:
# Download dataset from the Parameters, Compute and Data Trends in ML sheet
# sheet_id = '1AAIebjNsnJj_uKALHbXNfn3_YsT6sHXtCU0q7OIPuc4'
# data_url = f'https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet='
# notable_df = pd.read_csv(data_url + 'NOTABLE%20ML%20SYSTEMS')
# # Save frozen dataset
# notable_df.to_pickle(data_file_location + 'notable_ml_systems_2023-08-25.pkl')

In [7]:
# Use frozen dataset
notable_df = pd.read_pickle(data_file_location + 'notable_ml_systems_2023-08-25.pkl')

In [8]:
notable_df

Unnamed: 0,System,Domain,Task,Organization,Organization Categorization,Authors,Publication date,Reference,Link,Citations,...,Training time notes,Training hardware,Approach,Training compute cost (2020 USD),Compute cost notes,Self-supervised training,Compute Sponsor Categorization,Epistemic status,Abstract,Last Modified
0,Gen-2,Text-to-Video,Video generation,Runway,Industry,,2023-12-31,,https://research.runwayml.com/gen2,0.0,...,,,,,,,,Unverified,,2023-08-15 20:28:23
1,Claude 2,Language,Language modelling,Anthropic,Industry,,2023-07-11,,https://www.anthropic.com/index/claude-2,,...,,,,,,,,Unverified,,2023-08-14 21:28:51
2,InternLM,Language,Language modelling,"Shanghai AI Lab,SenseTime",Academia,,2023-07-06,,https://internlm.org/,,...,Training performance for the open-source Inter...,NVIDIA A100 SXM4 80 GB,,,,,,Speculative,Pre-training a bilingual 100B Foundation model...,2023-08-15 20:04:54
3,ERNIE 3.5,Language,Language modelling,Baidu,Industry,,2023-06-27,Introducing ERNIE 3.5: Baidu’s Knowledge-Enhan...,http://research.baidu.com/Blog/index-view?id=185,,...,,,,,,,,Unverified,,2023-07-05 16:08:00
4,Inflection-1,Language,Language modelling,Inflection AI,Industry,,2023-06-23,Inflection-1 technical memo,https://inflection.ai/assets/Inflection-1.pdf,,...,,NVIDIA H100 SXM5,,,,,Industry,Speculative,Large language models (LLMs) based on the Tran...,2023-06-27 15:14:23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
376,,Vision,,Princeton University,Academia,AM Uttley,1956-07-01,Conditional probability machines,https://www.moma.org/collection/works/illustra...,84.0,...,,,,,,,Academia,,,2023-05-29 20:51:04
377,Self Organizing System,Vision,Pattern recognition,Massachusetts Institute of Technology,Academia,W. A. Clark and B. G. Farley,1955-03-01,Generalization of pattern recognition in a sel...,https://dl.acm.org/doi/10.1145/1455292.1455309,93.0,...,,,,,,,Academia,,,2023-05-29 20:51:04
378,,Vision,Character recognition,Massachusetts Institute of Technology,Academia,O. G. Selfridge,1955-03-01,Pattern recognition and learning,https://dl.acm.org/doi/10.1145/1455292.1455310,290.0,...,,,,,,,Academia,,,2023-05-29 20:51:04
379,,,,Institute for Advanced Study,Academia,NA Barricelli,1954-07-02,Esempi numerici di processi di evoluzione,https://link.springer.com/article/10.1007/BF01...,266.0,...,,,,,,,Academia,,,2023-05-29 20:51:04


In [9]:
# Count the number of notable ML systems for each Organization since 2010.
organization_system_count = defaultdict(int)
for i, row in notable_df.iterrows():
    pub_date = row['Publication date']
    org_cat = row['Organization Categorization']
    if type(pub_date) == str and int(pub_date[:4]) >= 2010:
        orgs = row['Organization']
        if type(orgs) != str: continue
        for org in orgs.split(','):
            org = org.strip().lower()
            organization_system_count[org] += 1

In [10]:
# Print organization and its system count, in descending order of count
for org, count in sorted(organization_system_count.items(), key=lambda x: x[1], reverse=True):
    print(f"{org}: {count} systems")

google: 36 systems
deepmind: 22 systems
university of toronto: 16 systems
google brain: 12 systems
stanford: 11 systems
openai: 9 systems
facebook ai research: 9 systems
google research: 8 systems
microsoft: 8 systems
microsoft research: 8 systems
tsinghua university: 7 systems
facebook: 7 systems
university of oxford: 7 systems
google deepmind: 7 systems
university of montreal: 7 systems
meta ai: 6 systems
carnegie mellon university: 6 systems
johns hopkins university: 6 systems
university of washington: 5 systems
chinese academy of sciences: 5 systems
chinese university of hong kong: 5 systems
uc berkeley: 5 systems
nyu: 5 systems
university of michigan: 4 systems
brno university of technology: 4 systems
runway: 3 systems
baai: 3 systems
university college london: 3 systems
metaai: 3 systems
berkeley: 3 systems
mit: 3 systems
stanford university: 3 systems
the chinese university of hong kong: 3 systems
university of california: 3 systems
university of amsterdam: 3 systems
baidu: 2 sy

In [11]:
institution_aliases = {
    "google": "Google",
    "microsoft": "Microsoft",
    "facebook": "Meta",
    "deepmind": "DeepMind",
    "baidu": "Baidu",
    "amazon": "Amazon",
    "tencent": "Tencent",
    "alibaba": "Alibaba",
    "openai": "OpenAI",
    "nvidia": "NVIDIA",
    "huawei": "Huawei",
    "ibm": "IBM",
    "nec laboratories": "NEC",
    "naver": "Naver",
    "netflix": "Netflix",
    "preferred networks": "Preferred Networks",
    "salesforce": "Salesforce",
    "twitter": "Twitter",
    "uber ai": "Uber",
    "xerox": "Xerox",
    "yandex": "Yandex",
    "adobe systems": "Adobe",
    "enthought": "Enthought",
    "quansight": "Quansight",
    "group sense": "Group Sense",
    "university of toronto": "University of Toronto",
    "stanford": "Stanford University",
    "tsinghua university": "Tsinghua University",
    "university of oxford": "University of Oxford",
    "university of montreal": "University of Montreal",
    "carnegie mellon university": "CMU",
    "johns hopkins university": "Johns Hopkins University",
    "university of washington": "University of Washington",
    "chinese academy of sciences": "Chinese Academy of Sciences",
    "chinese university of hong kong": "Chinese University of Hong Kong",
    "uc berkley": "UC Berkeley",
    "nyu": "New York University",
    "university of michigan": "University of Michigan",
    "brno university of technology": "Brno University of Technology",
    "baai": "BAAI",
    "university college london": "UCL",
    "berkeley": "UC Berkeley",
    "mit": "MIT",
    "stanford university": "Stanford University",
    "the chinese university of hong kong": "Chinese University of Hong Kong",
    "university of amsterdam": "University of Amsterdam",
    "tel aviv university": "Tel Aviv University",
    "university of technology sydney": "University of Technology Sydney",
    "national university of singapore": "National University of Singapore",
    "seoul national university": "Seoul National University",
    "allen institute for ai": "AllenAI",
    "ut austin": "UT Austin",
    "university of adeleide": "University of Adelaide",
    "university of maryland": "University of Maryland",
    "universite de montréal": "University of Montreal",
    "univeristy of toronto": "University of Toronto",
    "idsia": "IDSIA",
    "inria": "INRIA",
    "ecole": "Ecole Normale Supèrieure",
    "shanghai ai lab": "Shanghai AI Laboratory",
    "bigscience": "BigScience",
    "tsinghua keg": "Tsinghua University",
    "eleutherai": "EleutherAI",
    "runway": "Runway",
    "anthropic": "Anthropic",
    "sensetime": "SenseTime",
    "inflection ai": "Inflection AI",
    "meta ai": "Meta",
    "hugging face": "HuggingFace",
    "google brain": "Google",
    "google research": "Google",
    "metaai": "Meta",
    "stability ai": "Stability AI",
    "inspur": "Inspur",
    "search solutions": "Search Solutions",
    "microsoft bing": "Microsoft",
    "ai21 labs": "AI21 Labs",
    "facebook ai research": "Meta",
    "baidu inc.": "Baidu",
    "damo academy alibaba": "Alibaba",
    "alibaba group": "Alibaba",
    "google cloud": "Google",
    "huawei noah's ark lab": "Huawei",
    "open ai": "OpenAI",
    "salesforce research": "Salesforce",
    "facebook ai": "Meta",
    "huggingface": "HuggingFace",
    "heriot-watt university": "Heriot-Watt University",
    "nanyang technological university": "Nanyang Technological University",
    "megvii inc": "Megvii",
    "northeastern university": "Northeastern University",
    "university of rochester": "University of Rochester",
    "preferred networks inc": "Preferred Networks",
    "ritsumeikan university": "Ritsumeikan University",
    "national institute of informatics": "National Institute of Informatics",
    "allenai": "AllenAI",
    "university of san francisco": "University of San Francisco",
    "insight centre nui galway": "University of Galway",
    "university of texas at san antonio": "University of Texas at San Antonio",
    "google ai": "Google",
    "visual computing institute": "Aachen University",
    "chinese academy of sciences ; university of oxford": "Chinese Academy of Sciences",
    "shandong university": "Shandong University",
    "texas a&m": "Texas A&M University",
    "columbia university": "Columbia University",
    "cmu": "CMU",
    "johannes kepler university linz": "Johannes Kepler University Linz",
    "microsoft maluuba": "Microsoft",
    "university king college": "UCL",
    "montreal institute for learning algorithms": "MILA",
    "courant institute of mathematical sciences": "NYU",
    "university of alberta": "University of Alberta",
    "charles university": "Charles University",
    "czech technical university": "Czech Technical University",
    "uc berkeley": "UC Berkeley",
    "university of adelaide": "University of Adelaide",
    "australian centre for robotic vision": "Australian Centre for Robotic Vision",
    "uc san diego": "UC San Diego",
    "google deepmind": "DeepMind",
    "université paris-est": "Université Paris-Est",
    "svcl uc san diego": "UC San Diego",
    "eth zurich": "ETH Zurich",
    "shenzhen institute of advanced technology": "Shenzhen Institute of Advanced Technology",
    "cornell university": "Cornell University",
    "microsoft research": "Microsoft",
    "graz university of technology": "Graz University of Technology",
    "nanjing university": "Nanjing University",
    "deepscale": "DeepScale",
    "the robotics institute": "CMU",
    "baidu research- silicon valley ai lab": "Baidu",
    "princeton university": "Princeton University",
    "intel labs": "Intel",
    "university of edinburgh": "University of Edinburgh",
    "university of north carolina": "University of North Carolina",
    "university of texas": "UT Austin",
    "metamind inc": "MetaMind",
    "university of california los angeles": "UCLA",
    "umass lowell": "UMass Lowell",
    "xi’an jiaotong university": "Xi’an Jiaotong University",
    "university of science and technology of china": "USTC",
    "jacobs university": "Jacobs University",
    "university du maine": "University of Maine",
    "new york university": "NYU",
    "univeristy of amsterdam": "University of Amsterdam",
    "cnrs": "CNRS",
    "universidad nacional de cordoba": "Universidad Nacional de Cordoba",
    "xerox research centre europe": "Xerox",
    "inteligent systems lab amsterdam": "University of Amsterdam",
    "inria grenoble": "INRIA",
    "univeristy of trento": "University of Trento",
    "university of sherbrooke": "University of Sherbrooke",
    "harvard university": "Harvard University",
    "university of wisconsin madison": "University of Wisconsin Madison",
    "princeton": "Princeston University",
    "collège de france": "Collège de France",
    "harvard": "Harvard University",
    "xerox research centre europe (xrce)": "Xerox",
    "university of illinois at urbana- champaigne": "University of Illinois at Urbana-Champaign",
    "idsia ; university of lugano & supsi": "IDSIA",
}

In [12]:
pcd_rankings = defaultdict(int)
for org, count in organization_system_count.items():
    if org in institution_aliases.keys():
        alias = institution_aliases[org]
        pcd_rankings[alias] += count
    else:
        print(f'"{org}": "",')

"inc.": "",
"aachen university": "",
"university of california": "",
"lear team": "",
"college park": "",


In [13]:
pcd_rankings

defaultdict(int,
            {'Runway': 3,
             'Anthropic': 1,
             'Shanghai AI Laboratory': 1,
             'SenseTime': 1,
             'Baidu': 4,
             'Inflection AI': 1,
             'Meta': 26,
             'Google': 59,
             'OpenAI': 10,
             'BAAI': 3,
             'HuggingFace': 2,
             'BigScience': 1,
             'UCL': 4,
             'University of Michigan': 4,
             'Tsinghua University': 8,
             'Amazon': 1,
             'Yandex': 1,
             'DeepMind': 29,
             'Stability AI': 1,
             'Inspur': 1,
             'Alibaba': 3,
             'Naver': 1,
             'Search Solutions': 1,
             'Microsoft': 18,
             'AI21 Labs': 1,
             'Huawei': 1,
             'CMU': 8,
             'EleutherAI': 1,
             'UC Berkeley': 8,
             'Uber': 1,
             'Salesforce': 2,
             'Stanford University': 14,
             'University of Oxford': 7,
 

In [14]:
len(pcd_rankings)

110

In [15]:
sum(pcd_rankings.values())

385

# OpenAlex

In [16]:
concept_ids = [
    # "https://openalex.org/C41008148",  # Computer science
    "https://openalex.org/C154945302",  # Artificial intelligence
    "https://openalex.org/C119857082",  # Machine learning
]

In [17]:
concept_query = "|".join(concept_ids)
concept_query

'https://openalex.org/C154945302|https://openalex.org/C119857082'

In [18]:
# Took ~15 minutes for ~100K works on Macbook Pro 2019
# n_max = int(1e5)
# works_instance = Works()
# works = merge_pages(
#     works_instance \
#         .filter(concepts={"id": concept_query}) \
#         .filter(from_publication_date="2010-01-01") \
#         .filter(to_publication_date="2023-06-15") \
#         .sort(cited_by_count="desc") \
#         .paginate(per_page=200, n_max=n_max)
# )
# len(works)

In [19]:
# Save to avoid fetching every time
# timestamp = datetime.datetime.now()
# with open(data_file_location + f"top_cited_ai_works_openalex_{timestamp.strftime('%Y-%m-%d_%H-%M-%S')}", "wb") as f:
#     obj = {
#         "params": works_instance.params,  # for reproducibility
#         "works": works,
#     }
#     pickle.dump(obj, f)

In [20]:
# Took 43s for ~100K works on Macbook Pro 2019
with open(data_file_location + "top_cited_ai_works_openalex_2023-08-11_16-26-48", "rb") as f:
    works_obj = pickle.load(f)
    works = works_obj["works"]
len(works)

100000

In [21]:
# This set of works roughly corresponds to all works with >=100 citations
print(works[97000]['cited_by_count'])
print(works[98000]['cited_by_count'])
print(works[-1]['cited_by_count'])

100
99
98


## Data processing

In [22]:
CITATION_YEAR_BOUND = 3

In [23]:
processor = OpenAlexProcessor(works, None, None, citation_year_bound=CITATION_YEAR_BOUND)

In [24]:
processor.process_works()

In [25]:
processor.institution_id_to_name

{'https://openalex.org/I4210164937': 'Microsoft Research (United Kingdom)',
 'https://openalex.org/I4210114444': 'Meta (United States)',
 'https://openalex.org/I57206974': 'New York University',
 'https://openalex.org/I70931966': 'Université de Montréal',
 'https://openalex.org/I1291425158': 'Google (United States)',
 'https://openalex.org/I185261750': 'University of Toronto',
 'https://openalex.org/I4210117453': 'Dana-Farber Cancer Institute',
 'https://openalex.org/I111088046': 'Boston University',
 'https://openalex.org/I4210138560': 'European Molecular Biology Laboratory',
 'https://openalex.org/I4210142260': 'Max Planck Institute for Molecular Genetics',
 'https://openalex.org/I69740276': 'Tokyo Metropolitan University',
 'https://openalex.org/I55732556': 'Arizona State University',
 'https://openalex.org/I130769515': 'Pennsylvania State University',
 'https://openalex.org/I185163786': 'King Abdulaziz University',
 'https://openalex.org/I16733864': 'National Taiwan University',
 '

In [26]:
institution_individual_bounded_citations = processor.get_individual_bounded_citations()

In [27]:
len(institution_individual_bounded_citations)

16704

In [28]:
institution_total_citations = {}
for ins_id, citations in institution_individual_bounded_citations.items():
    institution_total_citations[ins_id] = sum(citations)

Merge institutions to alias

Ensure we are detecting all the companies that are also in PCD so their scores can be aggregated

In [29]:
for i, (ins_id, total_citations) in enumerate(sorted(institution_total_citations.items(), key=lambda x: x[1], reverse=True)):
    ins_name = processor.institution_id_to_name[ins_id]
    print(f"{i+1}. {ins_name}: {total_citations}")

1. Google (United States): 350111
2. Stanford University: 282006
3. University of California, Berkeley: 246023
4. University of Oxford: 198393
5. Harvard University: 181773
6. Massachusetts Institute of Technology: 180322
7. Chinese Academy of Sciences: 170581
8. Tsinghua University: 161117
9. Carnegie Mellon University: 144098
10. University of Washington: 142480
11. ETH Zurich: 119392
12. University of Toronto: 117415
13. Meta (Israel): 117191
14. University of Cambridge: 114923
15. Nanyang Technological University: 111322
16. Imperial College London: 109265
17. University of Michigan–Ann Arbor: 106208
18. University College London: 102694
19. Chinese University of Hong Kong: 99353
20. Cornell University: 93912
21. National University of Singapore: 90257
22. Harbin Institute of Technology: 89531
23. Microsoft (United States): 88082
24. University of California, Los Angeles: 87753
25. University of California, San Diego: 83984
26. Columbia University: 83357
27. New York University: 79

In [30]:
set(pcd_rankings.keys()).difference(institution_total_citations.keys())

{'AI21 Labs',
 'Aachen University',
 'Alibaba',
 'AllenAI',
 'Amazon',
 'Anthropic',
 'Australian Centre for Robotic Vision',
 'BAAI',
 'Baidu',
 'BigScience',
 'Brno University of Technology',
 'CMU',
 'CNRS',
 'Charles University',
 'Chinese Academy of Sciences',
 'Chinese University of Hong Kong',
 'Collège de France',
 'Columbia University',
 'Cornell University',
 'Czech Technical University',
 'DeepMind',
 'DeepScale',
 'ETH Zurich',
 'Ecole Normale Supèrieure',
 'EleutherAI',
 'Google',
 'Graz University of Technology',
 'Harvard University',
 'Heriot-Watt University',
 'Huawei',
 'HuggingFace',
 'IBM',
 'IDSIA',
 'INRIA',
 'Inflection AI',
 'Inspur',
 'Intel',
 'Jacobs University',
 'Johannes Kepler University Linz',
 'Johns Hopkins University',
 'MILA',
 'MIT',
 'Megvii',
 'Meta',
 'MetaMind',
 'Microsoft',
 'NEC',
 'NVIDIA',
 'NYU',
 'Nanjing University',
 'Nanyang Technological University',
 'National Institute of Informatics',
 'National University of Singapore',
 'Naver',


In [31]:
institution_aliases = {
    "google": "Google",
    "microsoft": "Microsoft",
    "facebook": "Meta",
    "meta (": "Meta",  # include extra chars to avoid false positives
    "deepmind": "DeepMind",
    "baidu": "Baidu",
    "amazon (": "Amazon",  # include extra chars to avoid false positives
    "tencent": "Tencent",
    "alibaba": "Alibaba",
    "openai": "OpenAI",
    "nvidia": "NVIDIA",
    "huawei": "Huawei",
    "ibm": "IBM",
    "intel (": "Intel",  # include extra chars to avoid false positives
    "nec (": "NEC",  # include extra chars to avoid false positives
    "naver": "Naver",
    "netflix": "Netflix",
    "preferred networks": "Preferred Networks",
    "salesforce": "Salesforce",
    "twitter": "Twitter",
    "uber ai": "Uber",
    "xerox": "Xerox",
    "yandex": "Yandex",
    "adobe systems": "Adobe",
    "enthought": "Enthought",
    "quansight": "Quansight",
    "group sense": "Group Sense",
}

In [32]:
merged_institution_total_citations = defaultdict(int)
for ins_id, total_citations in institution_total_citations.items():
    institution_name = processor.institution_id_to_name[ins_id]
    alias_used = False
    for keyword, alias in institution_aliases.items():
        if keyword in institution_name.lower():
            merged_institution_total_citations[alias] += total_citations
            alias_used = True
            break
    if not alias_used:
        merged_institution_total_citations[institution_name] += total_citations
merged_institution_total_citations

defaultdict(int,
            {'Microsoft': 231519,
             'Meta': 164273,
             'New York University': 79745,
             'Université de Montréal': 67557,
             'Google': 393993,
             'University of Toronto': 117415,
             'Dana-Farber Cancer Institute': 23356,
             'Boston University': 77467,
             'European Molecular Biology Laboratory': 30216,
             'Max Planck Institute for Molecular Genetics': 6742,
             'Tokyo Metropolitan University': 32464,
             'Arizona State University': 73589,
             'Pennsylvania State University': 56591,
             'King Abdulaziz University': 45208,
             'National Taiwan University': 19809,
             'University of North Carolina at Chapel Hill': 62123,
             'University of Michigan–Ann Arbor': 106208,
             'Magic Leap (United States)': 8323,
             'Stanford University': 282006,
             'Massachusetts Institute of Technology': 180322,
  

In [33]:
len(merged_institution_total_citations)

16481

In [34]:
for i, (ins_name, total_citations) in enumerate(sorted(merged_institution_total_citations.items(), key=lambda x: x[1], reverse=True)):
    print(f"{i+1}. {ins_name}: {total_citations}")

1. Google: 393993
2. Stanford University: 282006
3. University of California, Berkeley: 246023
4. Microsoft: 231519
5. University of Oxford: 198393
6. Harvard University: 181773
7. Massachusetts Institute of Technology: 180322
8. Chinese Academy of Sciences: 170581
9. Meta: 164273
10. Tsinghua University: 161117
11. Carnegie Mellon University: 144098
12. University of Washington: 142480
13. ETH Zurich: 119392
14. University of Toronto: 117415
15. University of Cambridge: 114923
16. Nanyang Technological University: 111322
17. Imperial College London: 109265
18. University of Michigan–Ann Arbor: 106208
19. University College London: 102694
20. Chinese University of Hong Kong: 99353
21. Cornell University: 93912
22. National University of Singapore: 90257
23. Harbin Institute of Technology: 89722
24. University of California, Los Angeles: 87753
25. University of California, San Diego: 83984
26. Columbia University: 83357
27. New York University: 79745
28. University of Pennsylvania: 7773

In [35]:
openalex_rankings = merged_institution_total_citations

# Aggregation

In [36]:
set(openalex_rankings.keys()).difference(pcd_rankings.keys())

{'Amrita Vishwa Vidyapeetham University',
 'Steklov Mathematical Institute',
 'Services Australia',
 'Aletheia University',
 'Hubert Curien Pluridisciplinary Institute',
 'National University of La Plata',
 "St Vincent's Clinic",
 'Badji Mokhtar University',
 'VA Tennessee Valley Healthcare System',
 'Austrian Academy of Sciences',
 'Hertie School',
 'Centre for BioSystems Genomics',
 'University of Iowa Hospitals and Clinics',
 'University School of Physical Education in Wroclaw',
 'Pingdingshan University',
 'Martin Luther University Halle-Wittenberg',
 'Institute for Molecular Medicine',
 "Guy's Hospital",
 'EuroQol Research Foundation',
 'American Hospital Association',
 'Community Link',
 'Stockholm School of Economics in Riga',
 'Schlumberger (United States)',
 'École Pratique des Hautes Études',
 'Tianjin Polytechnic University',
 'Mustansiriyah University',
 'Zhengzhou Central Hospital',
 '22q11 Ireland',
 'Edwards Air Force Base',
 'Center for Transportation and the Environmen

In [37]:
set(pcd_rankings.keys()).intersection(openalex_rankings.keys())

{'Alibaba',
 'Amazon',
 'Australian Centre for Robotic Vision',
 'Baidu',
 'Brno University of Technology',
 'Charles University',
 'Chinese Academy of Sciences',
 'Chinese University of Hong Kong',
 'Collège de France',
 'Columbia University',
 'Cornell University',
 'DeepMind',
 'ETH Zurich',
 'Google',
 'Graz University of Technology',
 'Harvard University',
 'Heriot-Watt University',
 'Huawei',
 'IBM',
 'Intel',
 'Johns Hopkins University',
 'Meta',
 'Microsoft',
 'NEC',
 'NVIDIA',
 'Nanjing University',
 'Nanyang Technological University',
 'National Institute of Informatics',
 'National University of Singapore',
 'Naver',
 'Netflix',
 'New York University',
 'Northeastern University',
 'OpenAI',
 'Preferred Networks',
 'Princeton University',
 'Ritsumeikan University',
 'Salesforce',
 'Seoul National University',
 'Shandong University',
 'Stanford University',
 'Tel Aviv University',
 'Texas A&M University',
 'Tsinghua University',
 'Twitter',
 'Uber',
 'University of Adelaide',


In [38]:
for ins in set(openalex_rankings.keys()).difference(pcd_rankings.keys()):
    pcd_rankings[ins] = 0

In [39]:
def z_score_of_dict(d, log=True):
    if log:
        d = {k: np.log10(v+1) for k, v in d.items()}
    arr = np.array(list(d.values()))
    mean = np.mean(arr)
    std = np.std(arr)
    print(mean, std)
    return {k: (v - mean) / std for k, v in d.items()}

In [40]:
pcd_scores = z_score_of_dict(pcd_rankings, log=False)
pcd_scores

0.023289577158066663 0.6380563724893508


{'Runway': 4.665278102667354,
 'Anthropic': 1.5307588246965353,
 'Shanghai AI Laboratory': 1.5307588246965353,
 'SenseTime': 1.5307588246965353,
 'Baidu': 6.232537741652765,
 'Inflection AI': 1.5307588246965353,
 'Meta': 40.71224979933178,
 'Google': 92.4318178858503,
 'OpenAI': 15.636095575565221,
 'BAAI': 4.665278102667354,
 'HuggingFace': 3.098018463681945,
 'BigScience': 1.5307588246965353,
 'UCL': 6.232537741652765,
 'University of Michigan': 6.232537741652765,
 'Tsinghua University': 12.501576297594404,
 'Amazon': 1.5307588246965353,
 'Yandex': 1.5307588246965353,
 'DeepMind': 45.41402871628801,
 'Stability AI': 1.5307588246965353,
 'Inspur': 1.5307588246965353,
 'Alibaba': 4.665278102667354,
 'Naver': 1.5307588246965353,
 'Search Solutions': 1.5307588246965353,
 'Microsoft': 28.1741726874485,
 'AI21 Labs': 1.5307588246965353,
 'Huawei': 1.5307588246965353,
 'CMU': 12.501576297594404,
 'EleutherAI': 1.5307588246965353,
 'UC Berkeley': 12.501576297594404,
 'Uber': 1.53075882469653

In [41]:
new_openalex_rankings = {}
for k, v in openalex_rankings.items():
    if v > 0:
        new_openalex_rankings[k] = v
openalex_rankings = new_openalex_rankings

In [42]:
np.log(np.array(list(openalex_rankings.values())))

array([12.35241722, 12.00928496, 11.28658932, ...,  3.09104245,
        2.89037176,  3.33220451])

In [43]:
openalex_scores = z_score_of_dict(openalex_rankings, log=False)
openalex_scores

1480.1436373566357 8066.32542207011


{'Microsoft': 28.518420014798647,
 'Meta': 20.181786358039698,
 'New York University': 9.702665373319116,
 'Université de Montréal': 8.191692363644517,
 'Google': 48.66067705236598,
 'University of Toronto': 14.372697640667502,
 'Dana-Farber Cancer Institute': 2.7119977459363684,
 'Boston University': 9.420256732357618,
 'European Molecular Biology Laboratory': 3.5624469456711685,
 'Max Planck Institute for Molecular Genetics': 0.6523238385903085,
 'Tokyo Metropolitan University': 3.8411364210361585,
 'Arizona State University': 8.9394925929157,
 'Pennsylvania State University': 6.83221336593434,
 'King Abdulaziz University': 5.421037966432703,
 'National Taiwan University': 2.2722684002425875,
 'University of North Carolina at Chapel Hill': 7.5180275019303915,
 'University of Michigan–Ann Arbor': 12.98334134599871,
 'Magic Leap (United States)': 0.8483238655262734,
 'Stanford University': 34.777403797161746,
 'Massachusetts Institute of Technology': 22.17141597998486,
 'University of 

In [44]:
aggregate_scores = dict()
merged_keys = set(openalex_scores.keys()) | set(pcd_scores.keys())
for ins in merged_keys:
    openalex_score = openalex_scores.get(ins)
    if openalex_score is None:
        openalex_score = 0
    if openalex_score == 0:  # Need publication data to analyse
        continue
    pcd_score = pcd_scores.get(ins)
    if pcd_score is None:
        pcd_score = 0
    aggregate_score = openalex_score + pcd_score
    aggregate_scores[ins] = aggregate_score

In [45]:
for i, (ins, score) in enumerate(sorted(aggregate_scores.items(), key=lambda x: x[1], reverse=True)):
    print(f"{i+1}. {ins}: {score:.2f}")

1. Google: 141.09
2. Meta: 60.89
3. Microsoft: 56.69
4. Stanford University: 56.68
5. DeepMind: 52.80
6. University of Toronto: 42.55
7. University of Oxford: 35.35
8. Tsinghua University: 32.29
9. Chinese Academy of Sciences: 30.33
10. University of California, Berkeley: 30.28
11. Harvard University: 25.45
12. University of Washington: 25.28
13. Chinese University of Hong Kong: 24.64
14. Massachusetts Institute of Technology: 22.13
15. Johns Hopkins University: 18.73
16. Carnegie Mellon University: 17.64
17. New York University: 17.50
18. OpenAI: 17.00
19. ETH Zurich: 16.15
20. Nanyang Technological University: 15.15
21. National University of Singapore: 14.10
22. University of Cambridge: 14.03
23. University of Amsterdam: 13.61
24. Imperial College London: 13.33
25. Cornell University: 12.99
26. University of Michigan–Ann Arbor: 12.95
27. University College London: 12.51
28. Columbia University: 11.68
29. Harbin Institute of Technology: 10.90
30. University of California, Los Angeles

In [46]:
for i, (ins, score) in enumerate(sorted(aggregate_scores.items(), key=lambda x: x[1], reverse=True)):
    if any([keyword in ins.lower() for keyword in ['university', 'institute', 'college', 'academy']]):
        continue
    print(f"{i+1}. {ins}: {score:.2f}")

1. Google: 141.09
2. Meta: 60.89
3. Microsoft: 56.69
5. DeepMind: 52.80
18. OpenAI: 17.00
19. ETH Zurich: 16.15
43. Baidu: 8.57
46. Université de Montréal: 8.16
54. NVIDIA: 7.57
55. IBM: 7.57
59. École Polytechnique Fédérale de Lausanne: 6.96
65. KU Leuven: 6.78
69. Alibaba: 6.27
81. Intel: 5.44
82. Massachusetts General Hospital: 5.40
105. UNSW Sydney: 4.47
110. Mayo Clinic: 4.38
115. Inserm: 4.22
117. Huawei: 4.21
120. Brigham and Women's Hospital: 4.19
124. Virginia Tech: 4.05
127. Amazon: 3.89
128. Twitter: 3.87
132. Adobe: 3.80
143. European Molecular Biology Laboratory: 3.53
145. Los Alamos National Laboratory: 3.47
147. Lawrence Berkeley National Laboratory: 3.47
149. Xerox: 3.41
158. Salesforce: 3.18
165. Max Planck Society: 3.06
177. Tencent: 2.79
186. Politecnico di Milano: 2.72
195. Ludwig-Maximilians-Universität München: 2.60
202. Naver: 2.52
205. Polytechnique Montréal: 2.42
209. Enthought: 2.41
211. Commonwealth Scientific and Industrial Research Organisation: 2.39
216. Q

In [47]:
# Manually read private companies from above list
top_companies = [
    'Google',
    'Meta',
    'Microsoft',
    'DeepMind',
    'OpenAI',
    'Baidu',
    'NVIDIA',
    'IBM',
    'Alibaba',
    'Intel',
    'Huawei',
    'Amazon',
    'Twitter',
    'Adobe',
    'Xerox',
    'Salesforce',
    'Tencent',
    'Naver',
    'Enthought',
    'Quansight',
    'Group Sense',
    'NEC',
    'Uber',
    'Yandex',
    'Netflix',
]

In [56]:
len(top_companies)

25

In [48]:
company_rank = 0
for i, (ins, score) in enumerate(sorted(aggregate_scores.items(), key=lambda x: x[1], reverse=True)):
    if ins not in top_companies:
        continue
    company_rank += 1
    print(f"({i+1}) {company_rank}. {ins}: {score:.2f}")

(1) 1. Google: 141.09
(2) 2. Meta: 60.89
(3) 3. Microsoft: 56.69
(5) 4. DeepMind: 52.80
(18) 5. OpenAI: 17.00
(43) 6. Baidu: 8.57
(54) 7. NVIDIA: 7.57
(55) 8. IBM: 7.57
(69) 9. Alibaba: 6.27
(81) 10. Intel: 5.44
(117) 11. Huawei: 4.21
(127) 12. Amazon: 3.89
(128) 13. Twitter: 3.87
(132) 14. Adobe: 3.80
(149) 15. Xerox: 3.41
(158) 16. Salesforce: 3.18
(177) 17. Tencent: 2.79
(202) 18. Naver: 2.52
(209) 19. Enthought: 2.41
(216) 20. Quansight: 2.31
(241) 21. Group Sense: 2.18
(244) 22. NEC: 2.16
(265) 23. Uber: 1.99
(317) 24. Yandex: 1.63
(355) 25. Netflix: 1.46


# Plots

In [49]:
top_openalex_institutions = []
top_openalex_scores = []
for i, ins in enumerate(sorted(openalex_scores.keys(), key=lambda k: openalex_scores[k], reverse=True)):
    print(ins)
    if ins == "Chinese University of Hong Kong":
        ins_alias = "CUHK"
    elif ins == "Chinese Academy of Sciences":
        ins_alias = "CAS"
    elif ins == "University of California, Berkeley":
        ins_alias = "UC Berkeley"
    elif ins == "Carnegie Mellon University":
        ins_alias = "CMU"
    elif ins == "Université de Montréal":
        ins_alias = "U Montreal"
    elif ins == "Johns Hopkins University":
        ins_alias = "JHU"
    elif ins == "Brno University of Technology":
        ins_alias = "Brno U of Tech"
    elif ins == "University of Washington":
        ins_alias = "U Washington"
    elif ins == "University of Amsterdam":
        ins_alias = "U Amsterdam"
    elif ins == "New York University":
        ins_alias = "NYU"
    elif ins == "Massachusetts Institute of Technology":
        ins_alias = "MIT"
    elif ins == "Nanyang Technological University":
        ins_alias = "Nanyang TU"
    elif ins == "University of Michigan–Ann Arbor":
        ins_alias = "U Michigan"
    elif ins == "University College London":
        ins_alias = "UCL"
    elif ins == "Imperial College London":
        ins_alias = "Imperial College"
    else:
      ins_alias = ins
    # ins_alias = ins
    top_openalex_institutions.append(ins_alias)
    openalex_score = openalex_rankings[ins] / sum(openalex_rankings.values())
    top_openalex_scores.append(openalex_score)
    if i >= 19:
        break
top_openalex_scores = np.array(top_openalex_scores)
top_openalex_institutions

Google
Stanford University
University of California, Berkeley
Microsoft
University of Oxford
Harvard University
Massachusetts Institute of Technology
Chinese Academy of Sciences
Meta
Tsinghua University
Carnegie Mellon University
University of Washington
ETH Zurich
University of Toronto
University of Cambridge
Nanyang Technological University
Imperial College London
University of Michigan–Ann Arbor
University College London
Chinese University of Hong Kong


['Google',
 'Stanford University',
 'UC Berkeley',
 'Microsoft',
 'University of Oxford',
 'Harvard University',
 'MIT',
 'CAS',
 'Meta',
 'Tsinghua University',
 'CMU',
 'U Washington',
 'ETH Zurich',
 'University of Toronto',
 'University of Cambridge',
 'Nanyang TU',
 'Imperial College',
 'U Michigan',
 'UCL',
 'CUHK']

In [50]:
fig = go.Figure(data=[
    go.Bar(name='Citations score', x=top_openalex_institutions, y=100*top_openalex_scores),
])

## Plot layout
fig.update_layout(barmode='stack')
fig.update_layout(
    # title='Initial ranking of companies leading in AI research',
    # xaxis_title='Company',
    yaxis_title='Share of citations in the top 100,000 most-cited (%)',
)
fig.update_layout(
    legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="right",
        x=0.99,
    ),
)
fig.update_layout(
    autosize=False,
    width=600,
    height=400,
    title_x=0.5,
    font=dict(size=10),
    margin=dict(l=20, r=20, t=20, b=20),
)

## Save plot
save_plot(fig, result_file_location, 'ranking_citations_academic_industry')

## Show plot
fig.show()

In [51]:
top_pcd_institutions = []
top_pcd_scores = []
for i, ins in enumerate(sorted(pcd_scores.keys(), key=lambda k: pcd_scores[k], reverse=True)):
    if ins == "Chinese University of Hong Kong":
        ins_alias = "CUHK"
    elif ins == "Chinese Academy of Sciences":
        ins_alias = "CAS"
    elif ins == "University of California, Berkeley":
        ins_alias = "UC Berkeley"
    elif ins == "Carnegie Mellon University":
        ins_alias = "CMU"
    elif ins == "Université de Montréal":
        ins_alias = "U Montreal"
    elif ins == "Johns Hopkins University":
        ins_alias = "JHU"
    elif ins == "Brno University of Technology":
        ins_alias = "Brno U of Tech"
    elif ins == "University of Washington":
        ins_alias = "U Washington"
    elif ins == "University of Amsterdam":
        ins_alias = "U Amsterdam"
    elif ins == "New York University":
        ins_alias = "NYU"
    else:
      ins_alias = ins
    # ins_alias = ins
    top_pcd_institutions.append(ins_alias)
    pcd_score = pcd_rankings[ins] / len(notable_df)
    top_pcd_scores.append(pcd_score)
    if i >= 19:
        break
top_pcd_scores = np.array(top_pcd_scores)
top_pcd_institutions

['Google',
 'DeepMind',
 'Meta',
 'Microsoft',
 'University of Toronto',
 'Stanford University',
 'OpenAI',
 'University of Montreal',
 'Tsinghua University',
 'CMU',
 'UC Berkeley',
 'CUHK',
 'University of Oxford',
 'CAS',
 'JHU',
 'U Washington',
 'NYU',
 'U Amsterdam',
 'Baidu',
 'UCL']

In [52]:
fig = go.Figure(data=[
    go.Bar(name='Citations score', x=top_pcd_institutions, y=100*top_pcd_scores, marker_color='rgb(230, 65, 30)'),
])

## Plot layout
fig.update_layout(barmode='stack')
fig.update_layout(
    # title='Initial ranking of companies leading in AI research',
    # xaxis_title='Company',
    yaxis_title='Share of notable ML systems in dataset (%)',
)
fig.update_layout(
    legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="right",
        x=0.99,
    ),
)
fig.update_layout(
    autosize=False,
    width=600,
    height=400,
    title_x=0.5,
    font=dict(size=10),
    margin=dict(l=20, r=20, t=20, b=20),
)

## Save plot
save_plot(fig, result_file_location, 'ranking_pcd_academic_industry')

## Show plot
fig.show()

In [53]:
top_pcd_scores = []
top_openalex_scores = []
for ins in top_companies:
    openalex_score = openalex_scores.get(ins)
    if openalex_score is None:
        openalex_score = 0
    pcd_score = pcd_scores.get(ins)
    if pcd_score is None:
        pcd_score = 0
    top_pcd_scores.append(pcd_score)
    top_openalex_scores.append(openalex_score)

In [54]:
fig = go.Figure(data=[
    go.Bar(name='Citations (standardized score)', x=top_companies, y=top_openalex_scores),
    go.Bar(name='Notable ML systems (standardized score)', x=top_companies, y=top_pcd_scores),
])

## Plot layout
fig.update_layout(barmode='stack')
fig.update_layout(
    # title='Initial ranking of companies leading in AI research',
    # xaxis_title='Company',
    yaxis_title='Total score',
)
fig.update_layout(
    legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="right",
        x=0.99,
    ),
)
fig.update_layout(
    autosize=False,
    width=600,
    height=600,
    title_x=0.5,
    font=dict(size=10),
    margin=dict(l=20, r=20, t=20, b=20),
)

## Save plot
save_plot(fig, result_file_location, 'ranking_industry')

## Show plot
fig.show()

# Final list of institutions based on ranking

Get the final list of institution IDs based on this ranking

In [57]:
for ins in top_companies:
    search_results = Institutions().search(ins).get()
    for r in search_results:
        print(f"\"{r['id']}\",  # {r['display_name']}")

"https://openalex.org/I1291425158",  # Google (United States)
"https://openalex.org/I4210113297",  # Google (United Kingdom)
"https://openalex.org/I4210100430",  # Google (Switzerland)
"https://openalex.org/I4210148186",  # Google (Canada)
"https://openalex.org/I4210117425",  # Google (Israel)
"https://openalex.org/I4210131802",  # Google (Ireland)
"https://openalex.org/I4210149973",  # ARC Centre of Excellence for Transformative Meta-Optical Systems
"https://openalex.org/I4210120172",  # Meta Vision Systems (United Kingdom)
"https://openalex.org/I4210128585",  # META Health
"https://openalex.org/I4210118911",  # META Group
"https://openalex.org/I4210099706",  # Center For Advanced Meta-Materials
"https://openalex.org/I4210131362",  # Corporación Universitaria del Meta
"https://openalex.org/I4210158286",  # International Platform of Registered Systematic Review and Meta-analysis Protocols
"https://openalex.org/I4210158323",  # Meta House
"https://openalex.org/I1290206253",  # Microsoft

Just use top 10 aliases, and eliminate spurious results e.g. Amazon Conservation Association.

In [None]:
selected_institution_ids = [
    "https://openalex.org/I1291425158",  # Google (United States)
    "https://openalex.org/I4210113297",  # Google (United Kingdom)
    "https://openalex.org/I4210100430",  # Google (Switzerland)
    "https://openalex.org/I4210148186",  # Google (Canada)
    "https://openalex.org/I4210117425",  # Google (Israel)
    "https://openalex.org/I4210131802",  # Google (Ireland)
    "https://openalex.org/I4210090411",  # DeepMind (United Kingdom)
    "https://openalex.org/I2252078561",  # Meta (Israel)
    "https://openalex.org/I4210114444",  # Meta (United States)
    "https://openalex.org/I4210111288",  # Meta (United Kingdom)
    "https://openalex.org/I1290206253",  # Microsoft (United States)
    "https://openalex.org/I4210164937",  # Microsoft Research (United Kingdom)
    "https://openalex.org/I4210113369",  # Microsoft Research Asia (China)
    "https://openalex.org/I4210124949",  # Microsoft Research (India)
    "https://openalex.org/I4210105678",  # Microsoft (Finland)
    "https://openalex.org/I4210087053",  # Microsoft (Germany)
    "https://openalex.org/I4210125051",  # Microsoft (Israel)
    "https://openalex.org/I4210162141",  # Microsoft (India)
    "https://openalex.org/I4210086099",  # Microsoft (Brazil)
    "https://openalex.org/I4210153468",  # Microsoft (Canada)
    "https://openalex.org/I4210161634",  # Microsoft (France)
    "https://openalex.org/I4210110431",  # Microsoft (Netherlands)
    "https://openalex.org/I4210099966",  # Microsoft (Denmark)
    "https://openalex.org/I4210108625",  # Microsoft (United Kingdom)
    "https://openalex.org/I4210135422",  # Microsoft (Norway)
    "https://openalex.org/I4210139986",  # Microsoft (Switzerland)
    "https://openalex.org/I4210109507",  # Microsoft (Ireland)
    "https://openalex.org/I4210092974",  # Microsoft (Portugal)
    "https://openalex.org/I4210151458",  # Microsoft (Belgium)
    "https://openalex.org/I4210161460",  # OpenAI (United States)
    "https://openalex.org/I45928872",  # Alibaba Group (China)
    "https://openalex.org/I4210095624",  # Alibaba Group (United States)
    "https://openalex.org/I4210086143",  # Alibaba Group (Cayman Islands)
    "https://openalex.org/I1304085615",  # Nvidia (United Kingdom)
    "https://openalex.org/I4210127875",  # Nvidia (United States)
    "https://openalex.org/I98301712",  # Baidu (China)
    "https://openalex.org/I1311688040",  # Amazon (United States)
    "https://openalex.org/I4210089985",  # Amazon (Germany)
    "https://openalex.org/I4210123934",  # Amazon (United Kingdom)
    "https://openalex.org/I2250653659",  # Tencent (China)
    "https://openalex.org/I4210103558",  # Tencent Healthcare (China)
]
len(selected_institution_ids)

Have the alias mapping for later

In [None]:
selected_institutions_text = """
"https://openalex.org/I1291425158",  # Google (United States)
"https://openalex.org/I4210113297",  # Google (United Kingdom)
"https://openalex.org/I4210100430",  # Google (Switzerland)
"https://openalex.org/I4210148186",  # Google (Canada)
"https://openalex.org/I4210117425",  # Google (Israel)
"https://openalex.org/I4210131802",  # Google (Ireland)
"https://openalex.org/I4210090411",  # DeepMind (United Kingdom)
"https://openalex.org/I2252078561",  # Meta (Israel)
"https://openalex.org/I4210114444",  # Meta (United States)
"https://openalex.org/I4210111288",  # Meta (United Kingdom)
"https://openalex.org/I1290206253",  # Microsoft (United States)
"https://openalex.org/I4210164937",  # Microsoft Research (United Kingdom)
"https://openalex.org/I4210113369",  # Microsoft Research Asia (China)
"https://openalex.org/I4210124949",  # Microsoft Research (India)
"https://openalex.org/I4210105678",  # Microsoft (Finland)
"https://openalex.org/I4210087053",  # Microsoft (Germany)
"https://openalex.org/I4210125051",  # Microsoft (Israel)
"https://openalex.org/I4210162141",  # Microsoft (India)
"https://openalex.org/I4210086099",  # Microsoft (Brazil)
"https://openalex.org/I4210153468",  # Microsoft (Canada)
"https://openalex.org/I4210161634",  # Microsoft (France)
"https://openalex.org/I4210110431",  # Microsoft (Netherlands)
"https://openalex.org/I4210099966",  # Microsoft (Denmark)
"https://openalex.org/I4210108625",  # Microsoft (United Kingdom)
"https://openalex.org/I4210135422",  # Microsoft (Norway)
"https://openalex.org/I4210139986",  # Microsoft (Switzerland)
"https://openalex.org/I4210109507",  # Microsoft (Ireland)
"https://openalex.org/I4210092974",  # Microsoft (Portugal)
"https://openalex.org/I4210151458",  # Microsoft (Belgium)
"https://openalex.org/I4210161460",  # OpenAI (United States)
"https://openalex.org/I45928872",  # Alibaba Group (China)
"https://openalex.org/I4210095624",  # Alibaba Group (United States)
"https://openalex.org/I4210086143",  # Alibaba Group (Cayman Islands)
"https://openalex.org/I1304085615",  # Nvidia (United Kingdom)
"https://openalex.org/I4210127875",  # Nvidia (United States)
"https://openalex.org/I98301712",  # Baidu (China)
"https://openalex.org/I1311688040",  # Amazon (United States)
"https://openalex.org/I4210089985",  # Amazon (Germany)
"https://openalex.org/I4210123934",  # Amazon (United Kingdom)
"https://openalex.org/I2250653659",  # Tencent (China)
"https://openalex.org/I4210103558",  # Tencent Healthcare (China)
"""

# Map each institution id to the first word after the comment
institution_aliases = {}
for line in selected_institutions_text.splitlines()[1:]:
    institution_id = line.split(",")[0].strip('"')
    institution_name = line.split("#")[1].strip()
    institution_alias = institution_name.split(" ")[0].strip()
    institution_aliases[institution_id] = institution_alias
institution_aliases

# Spot checks

In [None]:
Institutions().search("Montréal").get()

[{'id': 'https://openalex.org/I70931966',
  'ror': 'https://ror.org/0161xgx34',
  'display_name': 'Université de Montréal',
  'relevance_score': 80228.38,
  'country_code': 'CA',
  'type': 'education',
  'homepage_url': 'http://www.umontreal.ca/english/',
  'image_url': 'https://commons.wikimedia.org/w/index.php?title=Special:Redirect/file/Universit%C3%A4t%20Montreal%20Logo.svg',
  'image_thumbnail_url': 'https://commons.wikimedia.org/w/index.php?title=Special:Redirect/file/Universit%C3%A4t%20Montreal%20Logo.svg&width=300',
  'display_name_acronyms': [],
  'display_name_alternatives': ['Université de Montréal'],
  'repositories': [{'id': 'https://openalex.org/S4377196102',
    'display_name': 'Érudit (Université de Montréal)',
    'host_organization': 'https://openalex.org/I70931966',
    'host_organization_name': 'Université de Montréal',
    'host_organization_lineage': ['https://openalex.org/I70931966']},
   {'id': 'https://openalex.org/S4306402422',
    'display_name': 'Papyrus : I

In [None]:
for ins in ["Stanford", "University of Toronto", "OpenAI"]:
    top_result = Institutions().search(ins).get()[0]
    print(
        top_result["display_name"],
        "| CS?", any(["Computer science" in concept["display_name"] for concept in top_result["x_concepts"]]),
        "| AI?", any(["Artificial intelligence" in concept["display_name"] for concept in top_result["x_concepts"]]),
        "| ML?", any(["Machine learning" in concept["display_name"] for concept in top_result["x_concepts"]]),
    )

Stanford University | CS? True | AI? False | ML? False
University of Toronto | CS? True | AI? False | ML? False
OpenAI (United States) | CS? True | AI? True | ML? True


In [None]:
Works().filter(institutions={"id": "https://openalex.org/I97018004"}).sort(cited_by_count="desc").get()

[{'id': 'https://openalex.org/W2117539524',
  'doi': 'https://doi.org/10.1007/s11263-015-0816-y',
  'title': 'ImageNet Large Scale Visual Recognition Challenge',
  'display_name': 'ImageNet Large Scale Visual Recognition Challenge',
  'publication_year': 2015,
  'publication_date': '2015-04-11',
  'ids': {'openalex': 'https://openalex.org/W2117539524',
   'doi': 'https://doi.org/10.1007/s11263-015-0816-y',
   'mag': '2117539524'},
  'language': 'en',
  'primary_location': {'is_oa': False,
   'landing_page_url': 'https://doi.org/10.1007/s11263-015-0816-y',
   'pdf_url': None,
   'source': {'id': 'https://openalex.org/S25538012',
    'display_name': 'International Journal of Computer Vision',
    'issn_l': '0920-5691',
    'issn': ['0920-5691', '1573-1405'],
    'is_oa': False,
    'is_in_doaj': False,
    'host_organization': 'https://openalex.org/P4310319900',
    'host_organization_name': 'Springer Science+Business Media',
    'host_organization_lineage': ['https://openalex.org/P43103

Check that works tagged Machine Learning are generally the kind of works we are interested in.

In [None]:
ml_works = Works() \
    .filter(concept={'id': 'https://openalex.org/C119857082'}) \
    .filter(from_publication_date="2010-01-01") \
    .filter(to_publication_date="2023-06-15") \
    .sort(cited_by_count="desc") \
    .get()
[work['display_name'] for work in ml_works]

What is tagged Artificial Intelligence vs. Machine Learning for top institutions.

AI: "https://openalex.org/C154945302"

ML: "https://openalex.org/C119857082"

In [None]:
Concepts()["https://openalex.org/C154945302"]["level"]

In [None]:
Concepts()["https://openalex.org/C119857082"]["level"]

OpenAI: "https://openalex.org/I4210161460"

DeepMind: "https://openalex.org/I4210090411"

In [None]:
institution_id = "https://openalex.org/I4210090411"

In [None]:
# OpenAI
ai_works = merge_pages(
    Works() \
        .filter(authorships={"institutions": {"id": institution_id}}) \
        .filter(concepts={"id": "https://openalex.org/C154945302"}) \
        .filter(cited_by_count=">9") \
        .paginate(per_page=200, n_max=n_max)
)

In [None]:
len(ai_works)

In [None]:
for work in ai_works:
    print(work['display_name'])

In [None]:
ml_works = merge_pages(
    Works() \
        .filter(authorships={"institutions": {"id": institution_id}}) \
        .filter(concepts={"id": "https://openalex.org/C119857082"}) \
        .filter(cited_by_count=">9") \
        .paginate(per_page=200, n_max=n_max)
)

In [None]:
len(ml_works)

In [None]:
for work in ml_works:
    print(work['display_name'])

In [None]:
ai_ids = [work['id'] for work in ai_works]
ml_ids = [work['id'] for work in ml_works]
for work_id in set(ai_ids).difference(set(ml_ids)):
    print(f"{Works()[work_id]['display_name']}: {Works()[work_id]['cited_by_count']}")

In [None]:
for work_id in set(ml_ids).difference(set(ai_ids)):
    print(f"{Works()[work_id]['display_name']}: {Works()[work_id]['cited_by_count']}")

In [None]:
top_cited_works = merge_pages(
    Works() \
        .filter(authorships={"institutions": {"id": "https://openalex.org/I4210161460"}}) \
        .filter(cited_by_count=">100") \
        .paginate(per_page=200, n_max=n_max)
)

for work in top_cited_works:
    print(work['display_name'], get_bounded_citations(work, year_bound=3))

In [None]:
ml_works = Works() \
    .filter(concept={'id': 'https://openalex.org/C119857082'}) \
    .filter(from_publication_date="2010-01-01") \
    .filter(to_publication_date="2023-06-15") \
    .sort(cited_by_count="desc") \
    .select(["publication_year", "authorships", "cited_by_count", "counts_by_year"]) \
    .get()
ml_works[:2]

[{'publication_year': 2016,
  'authorships': [{'author_position': 'first',
    'author': {'id': 'https://openalex.org/A4344207660',
     'display_name': 'Kaiming He',
     'orcid': None},
    'institutions': [{'id': 'https://openalex.org/I4210164937',
      'display_name': 'Microsoft Research (United Kingdom)',
      'ror': 'https://ror.org/05k87vq12',
      'country_code': 'GB',
      'type': 'company'}],
    'is_corresponding': False,
    'raw_affiliation_string': 'Microsoft Research#TAB#',
    'raw_affiliation_strings': ['Microsoft Research#TAB#']},
   {'author_position': 'middle',
    'author': {'id': 'https://openalex.org/A4358260579',
     'display_name': 'Xiangyu Zhang',
     'orcid': None},
    'institutions': [{'id': 'https://openalex.org/I4210164937',
      'display_name': 'Microsoft Research (United Kingdom)',
      'ror': 'https://ror.org/05k87vq12',
      'country_code': 'GB',
      'type': 'company'}],
    'is_corresponding': False,
    'raw_affiliation_string': 'Microsof