In [3]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to local modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [1]:
from collections import defaultdict
import datetime
import numpy as np
import os
import pandas as pd
import pickle
import pyalex
from pyalex import Authors, Concepts, Institutions, Works
from tqdm.notebook import tqdm

from researcher_impact.citations import *
from researcher_impact.plotting import *
from researcher_impact.processors import OpenAlexProcessor
from researcher_impact.pyalex_utils import *
from researcher_impact.regression import *
from researcher_impact.utils import *

In [2]:
# The polite pool has much faster and more consistent response times. To get into the polite pool, you set your email.
# The address can be overridden with the OPENALEX_EMAIL environment variable, so other
# users can run the notebook with their own contact address without editing it.
# An unset or empty variable falls back to the original default.
pyalex.config.email = os.environ.get("OPENALEX_EMAIL") or "ben@epochai.org"

In [3]:
# Directories (relative to the notebook's working directory) for cached data
# and for results. Both are created up front if they do not exist yet.
data_file_location = 'data/'
result_file_location = 'results/'

for output_dir in (data_file_location, result_file_location):
    os.makedirs(output_dir, exist_ok=True)

# PCD database

In [4]:
# Download the "Parameters, Compute and Data Trends in ML" sheet as CSV.
# This fetches the live sheet, so the contents can change between runs.
pcd_sheet_csv_url = 'https://docs.google.com/spreadsheets/d/1AAIebjNsnJj_uKALHbXNfn3_YsT6sHXtCU0q7OIPuc4/export?format=csv#gid=0'
df = pd.read_csv(pcd_sheet_csv_url)

In [5]:
# Display the downloaded table; the notebook rendering truncates rows and columns.
df

Unnamed: 0,System,Domain,Task,Organization,Organization Categorization,Authors,Publication date,Reference,Link,Citations,...,Training dataset size (GB),Approach,Training compute cost (2020 USD),Compute cost notes,Self-supervised training,Architecture,Compute Sponsor Categorization,Epistemic status,Abstract,Last Modified
0,PaLM 2,Language,Language modelling,Google Research,Industry,"Andrew M. Dai, David R. So, Dmitry Lepikhin, J...",2023-05-10,PaLM 2 Technical Report,https://ai.google/static/documents/palm2techre...,,...,,,,PaLM 2 was trained on TPU v4 according to the ...,,,Industry,,"We introduce PaLM 2, a new state-of-the-art la...",2023-06-06 18:47:36
1,GPT-4,Multimodal,Language modelling,OpenAI,Industry,OpenAI,2023-03-15,GPT-4 Technical Report,https://arxiv.org/abs/2303.08774,,...,,,,,Yes,,,,,2023-05-29 20:51:04
2,Phenaki,Vision,Video generation,"Google Brain, University College London, Unive...",Industry - Academia Collaboration (Industry le...,"Ruben Villegas, Mohammad Babaeizadeh, Pieter-J...",2022-10-05,Phenaki: Variable Length Video Generation From...,https://arxiv.org/abs/2210.02399,,...,,,,,Yes,,,,,2023-05-29 20:51:04
3,Minerva (540B),Language,Quantitative Reasoning Problems,Google Research,Industry,"Aitor Lewkowycz, Anders Andreassen, David Doha...",2022-06-29,Solving Quantitative Reasoning Problems with L...,https://arxiv.org/abs/2206.14858,,...,,,"$3,267,257.75",,Yes,,Industry,,Language models have achieved remarkable perfo...,2023-06-08 00:39:43
4,PaLM (540B),Language,Language modelling,Google Research,Industry,"Aakanksha Chowdhery, Sharan Narang, Jacob Devl...",2022-04-04,PaLM: Scaling Language Modeling with Pathways,https://arxiv.org/abs/2204.02311,228.0,...,,,"$3,232,806.53",,Yes,,Industry,,Large language models have been shown to achie...,2023-05-29 20:51:04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
546,,Vision,Image classification,"University of Guelph,Canadian Institute for Ad...",Industry - Academia Collaboration,"Terrance DeVries, Graham W. Taylor",2017-08-15,Improved Regularization of Convolutional Neura...,https://arxiv.org/abs/1708.04552,1450.0,...,,,,https://www.yuzeh.com/data/agz-cost.html,,,Industry,,,2023-06-09 16:00:52
547,AltCLIP,Multimodal,,BAAI,Academia,"Zhongzhi Chen, Guang Liu, Bo-Wen Zhang, Fulong...",2022-11-12,AltCLIP: Altering the Language Encoder in CLIP...,https://arxiv.org/abs/2211.06679,,...,,,,,,,,Likely,"In this work, we present a conceptually simple...",2023-06-09 16:04:45
548,ALM 1.0,Language,Language modelling,BAAI,Academia,,,ALM 1.0,https://github.com/FlagAI-Open/FlagAI/blob/mas...,,...,,,,,,,,Speculative,,2023-06-09 16:06:43
549,MusicGen,Audio,Audio generation,Meta AI,Industry,"Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, ...",2023-06-08,Simple and Controllable Music Generation,https://arxiv.org/abs/2306.05284,,...,,,,,,,Industry,Unverified,We tackle the task of conditional music genera...,2023-06-09 18:28:28


In [6]:
# Keep only the systems that have an entry in the 'Inclusion criteria' column.
has_inclusion_criteria = df['Inclusion criteria'].notna()
notable_df = df.loc[has_inclusion_criteria]
notable_df

Unnamed: 0,System,Domain,Task,Organization,Organization Categorization,Authors,Publication date,Reference,Link,Citations,...,Training dataset size (GB),Approach,Training compute cost (2020 USD),Compute cost notes,Self-supervised training,Architecture,Compute Sponsor Categorization,Epistemic status,Abstract,Last Modified
0,PaLM 2,Language,Language modelling,Google Research,Industry,"Andrew M. Dai, David R. So, Dmitry Lepikhin, J...",2023-05-10,PaLM 2 Technical Report,https://ai.google/static/documents/palm2techre...,,...,,,,PaLM 2 was trained on TPU v4 according to the ...,,,Industry,,"We introduce PaLM 2, a new state-of-the-art la...",2023-06-06 18:47:36
1,GPT-4,Multimodal,Language modelling,OpenAI,Industry,OpenAI,2023-03-15,GPT-4 Technical Report,https://arxiv.org/abs/2303.08774,,...,,,,,Yes,,,,,2023-05-29 20:51:04
3,Minerva (540B),Language,Quantitative Reasoning Problems,Google Research,Industry,"Aitor Lewkowycz, Anders Andreassen, David Doha...",2022-06-29,Solving Quantitative Reasoning Problems with L...,https://arxiv.org/abs/2206.14858,,...,,,"$3,267,257.75",,Yes,,Industry,,Language models have achieved remarkable perfo...,2023-06-08 00:39:43
4,PaLM (540B),Language,Language modelling,Google Research,Industry,"Aakanksha Chowdhery, Sharan Narang, Jacob Devl...",2022-04-04,PaLM: Scaling Language Modeling with Pathways,https://arxiv.org/abs/2204.02311,228.0,...,,,"$3,232,806.53",,Yes,,Industry,,Large language models have been shown to achie...,2023-05-29 20:51:04
6,Chinchilla,Language,Language modelling,DeepMind,Industry,"Jordan Hoffmann, Sebastian Borgeaud, Arthur Me...",2022-03-29,Training Compute-Optimal Large Language Models,https://arxiv.org/abs/2203.15556,,...,,,"$753,491.58",,Yes,,Industry,,We investigate the optimal model size and numb...,2023-05-29 20:51:04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
543,CogVideo,Multimodal,Video generation,"Tsinghua University,BAAI",Academia,"Wenyi Hong, Ming Ding, Wendi Zheng, Xinghan Li...",2022-05-29,CogVideo: Large-scale Pretraining for Text-to-...,https://arxiv.org/abs/2205.15868,,...,,,,,,,,Likely,Large-scale pretrained transformers have creat...,2023-06-09 15:44:54
544,Zidong Taichu,Multimodal,,Chinese Academy of Sciences,,,,Zidong Ancestral multi-modal large model,https://gitee.com/zidongtaichu/multi-modal-models,,...,,,,,,,,Likely,,2023-06-09 15:44:36
546,,Vision,Image classification,"University of Guelph,Canadian Institute for Ad...",Industry - Academia Collaboration,"Terrance DeVries, Graham W. Taylor",2017-08-15,Improved Regularization of Convolutional Neura...,https://arxiv.org/abs/1708.04552,1450.0,...,,,,https://www.yuzeh.com/data/agz-cost.html,,,Industry,,,2023-06-09 16:00:52
548,ALM 1.0,Language,Language modelling,BAAI,Academia,,,ALM 1.0,https://github.com/FlagAI-Open/FlagAI/blob/mas...,,...,,,,,,,,Speculative,,2023-06-09 16:06:43


In [7]:
# Count notable ML systems per 'Organization' string, restricted to rows
# categorised as 'Industry' and published in 2010 or later.
# The raw organization string is used as the key, so spelling variants and
# multi-organization entries are counted separately.
organization_system_count = defaultdict(int)
for _, system in notable_df.iterrows():
    published = system['Publication date']
    # Skip rows whose date is not a string (presumably missing values).
    if type(published) != str:
        continue
    # The first four characters of the date string are parsed as the year.
    if int(published[:4]) < 2010:
        continue
    if system['Organization Categorization'] != 'Industry':
        continue
    organization_system_count[system['Organization']] += 1

In [8]:
# List every organization with its number of systems, largest count first.
ranked_organizations = sorted(
    organization_system_count.items(),
    key=lambda item: item[1],
    reverse=True,
)
for org, count in ranked_organizations:
    print(f"{org}: {count} systems")

Google: 21 systems
DeepMind: 16 systems
OpenAI: 12 systems
Google Brain: 7 systems
Google DeepMind: 7 systems
Google Research: 6 systems
Microsoft Research: 5 systems
Meta AI: 4 systems
Facebook AI Research: 4 systems
Facebook AI research: 4 systems
MetaAI: 3 systems
Microsoft: 3 systems
Alibaba Group: 2 systems
Facebook: 2 systems
Facebook AI: 2 systems
Google Inc.: 2 systems
Amazon: 1 systems
Stability AI, Runway: 1 systems
Google AI, Brain team: 1 systems
Microsoft Research,Peking University: 1 systems
Open AI: 1 systems
Microsoft Bing: 1 systems
Google Research, Brain Team: 1 systems
Google Research,Brain Team: 1 systems
Google AI: 1 systems
AllenAI, University of Washington: 1 systems
Google Brain,Google Research: 1 systems
Twitter: 1 systems
Megvii Inc: 1 systems
Nvidia: 1 systems
Salesforce: 1 systems
Baidu Research- Silicon Valley AI Lab: 1 systems
Netflix: 1 systems
Xerox Research Centre Europe (XRCE): 1 systems
Google Inc: 1 systems
NVIDIA: 1 systems
Baidu: 1 systems


Industry:
1. Google: 42
  - Google: 20
  - Google Research: 7
  - Google Brain: 7
  - Google Inc.: 2
  - Google AI, Brain team: 1
  - Google Research, Brain Team: 1
  - Google Research,Brain Team: 1
  - Google AI: 1
  - Google Brain,Google Research: 1
  - Google Inc: 1
2. DeepMind: 23
  - DeepMind: 16
  - Google DeepMind: 7
3. Meta: 20
  - Meta AI: 4
  - MetaAI: 4
  - Facebook AI Research: 4
  - Facebook AI research: 4
  - Facebook: 2
  - Facebook AI: 2
4. OpenAI: 13
  - OpenAI: 12
  - Open AI: 1
5. Microsoft: 10
   - Microsoft Research: 5
   - Microsoft: 3
   - Microsoft Research,Peking University: 1
   - Microsoft Bing: 1
6. Alibaba: 2
  - Alibaba Group: 2
6. NVIDIA: 2
  - Nvidia: 1
  - NVIDIA: 1
6. Baidu: 2
  - Baidu Research- Silicon Valley AI Lab: 1
  - Baidu: 1
7. Amazon: 1
  - Amazon: 1
7. Stability: 1
  - Stability AI, Runway: 1
7. Runway: 1
  - Stability AI, Runway: 1
7. Twitter: 1
7. Megvii: 1
  - Megvii Inc: 1
7. Salesforce: 1
  - Salesforce: 1
7. Netflix: 1
  - Netflix: 1
7. Xerox: 1
  - Xerox Research Centre Europe (XRCE): 1

In [9]:
# Number of notable industry systems per company, transcribed by hand from the
# merged tally in the markdown cell above (organization spelling variants combined).
pcd_rankings = dict(
    Google=42,
    DeepMind=23,
    Meta=20,
    OpenAI=13,
    Microsoft=10,
    Alibaba=2,
    NVIDIA=2,
    Baidu=2,
    Amazon=1,
    Stability=1,
    Runway=1,
    Twitter=1,
    Megvii=1,
    Salesforce=1,
    Netflix=1,
    Xerox=1,
)

# OpenAlex

In [10]:
# OpenAlex concept ids used to restrict the institution and works queries below.
# Only "Machine learning" is active; "Artificial intelligence" is commented out.
concept_ids = [
    # 'https://openalex.org/C154945302',  # Artificial intelligence
    'https://openalex.org/C119857082',  # Machine learning
]

In [11]:
# Join the concept ids with "|" into a single filter value
# (presumably interpreted by the OpenAlex API as a logical OR — confirm in its filter docs).
concept_query = "|".join(concept_ids)
concept_query

'https://openalex.org/C119857082'

Sort by works because we are interested in impact since around 2010, in the Deep Learning era of AI.
Citations are not as informative for this, because it may include citations of works published before 2010.

We can only sort by number of works in any year.
But we want to narrow down based on number of works published since 2010.
So we fetch 200 institutions initially, assuming that this is large enough to include all of the institutions in the narrowed-down set.

In [12]:
# Fetch up to 200 institutions of type "company" that match the concept filter.
# NOTE(review): the markdown above talks about sorting by number of works, but
# this query sorts by `cited_by_count` (descending) — confirm which is intended.
institution_query = (
    Institutions()
    .filter(concepts={"id": concept_query})
    .filter(type="company")
    .sort(cited_by_count="desc")
)
top_institutions = merge_pages(institution_query.paginate(per_page=100, n_max=200))
top_institutions

[{'id': 'https://openalex.org/I1291425158',
  'ror': 'https://ror.org/00njsd438',
  'display_name': 'Google (United States)',
  'country_code': 'US',
  'type': 'company',
  'homepage_url': 'https://www.google.com/',
  'image_url': 'https://commons.wikimedia.org/w/index.php?title=Special:Redirect/file/Google%202015%20logo.svg',
  'image_thumbnail_url': 'https://commons.wikimedia.org/w/index.php?title=Special:Redirect/file/Google%202015%20logo.svg&width=300',
  'display_name_acronyms': [],
  'display_name_alternatives': ['Googleplex'],
  'repositories': [{'id': 'https://openalex.org/S4306400783',
    'display_name': 'Repositorio de Tesis USAT (Santo Toribio de Mogrovejo Catholic University)',
    'host_organization': 'https://openalex.org/I1291425158',
    'host_organization_name': 'Google (United States)',
    'host_organization_lineage': ['https://openalex.org/I1291425158']},
   {'id': 'https://openalex.org/S4306402250',
    'display_name': 'Scientia cum Industria (University of Caxias

In [13]:
# Display names of the fetched institutions, in the order they were returned.
institution_names = [
    institution['display_name']
    for institution in top_institutions
]

In [14]:
# Sanity check: does each expected company appear as a substring of at least
# one fetched institution name?
for name in ('Google', 'DeepMind', 'OpenAI'):
    is_present = any(name in display_name for display_name in institution_names)
    print(f"{name}: {is_present}")

Google: True
DeepMind: True
OpenAI: False


In [15]:
# OpenAlex ids of every fetched institution; these drive the works download below.
selected_institution_ids = [
    institution['id']
    for institution in top_institutions
]
selected_institution_ids

['https://openalex.org/I1291425158',
 'https://openalex.org/I1290206253',
 'https://openalex.org/I4210164937',
 'https://openalex.org/I4210113369',
 'https://openalex.org/I2252078561',
 'https://openalex.org/I4210143601',
 'https://openalex.org/I4210114444',
 'https://openalex.org/I1306409833',
 'https://openalex.org/I4210134091',
 'https://openalex.org/I4210090411',
 'https://openalex.org/I4210155590',
 'https://openalex.org/I1311688040',
 'https://openalex.org/I2250955327',
 'https://openalex.org/I2250653659',
 'https://openalex.org/I98301712',
 'https://openalex.org/I1304085615',
 'https://openalex.org/I4210127875',
 'https://openalex.org/I4210113297',
 'https://openalex.org/I33976269',
 'https://openalex.org/I4210128910',
 'https://openalex.org/I45928872',
 'https://openalex.org/I2800095910',
 'https://openalex.org/I122754148',
 'https://openalex.org/I180662265',
 'https://openalex.org/I55215948',
 'https://openalex.org/I4210100430',
 'https://openalex.org/I4210148872',
 'https://o

In [16]:
# Took ~15 minutes for ~100K works on Macbook Pro 2019
#
# For every selected institution, download the works that
#   - have an author affiliated with the institution,
#   - match the concept filter,
#   - were published between 2010-01-01 and 2023-06-15, and
#   - satisfy the `cited_by_count=">9"` filter.
n_max = int(1e6)  # passed to paginate() as the per-institution result cap
params = []  # query parameters of every successful request, kept for reproducibility
works = []
failed_institution_ids = []  # institutions whose request raised, so they can be retried
for institution_id in tqdm(selected_institution_ids):
    works_instance = Works()
    try:
        new_works = merge_pages(
            works_instance
            .filter(authorships={"institutions": {"id": institution_id}})
            .filter(concepts={"id": concept_query})
            .filter(from_publication_date="2010-01-01")
            .filter(to_publication_date="2023-06-15")
            .filter(cited_by_count=">9")
            .paginate(per_page=200, n_max=n_max)
        )
    except Exception as e:
        # Best effort: report the failure and move on to the next institution.
        # The name lookup is itself an API call, so it must not abort the loop.
        try:
            display_name = Institutions()[institution_id]['display_name']
        except Exception:
            display_name = "unknown institution"
        print(f"Error for {display_name} ({institution_id})")
        print(f"{works_instance.params}")
        print(e)
        failed_institution_ids.append(institution_id)
        # Skip the bookkeeping below: `new_works` is either undefined (first
        # iteration) or still holds the previous institution's results, which
        # would otherwise be added a second time.
        continue
    works.extend(new_works)
    params.append(works_instance.params)

assert len(works) < n_max
len(works)

  0%|          | 0/200 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [128]:
# Inspect the query parameters recorded for each institution request.
params

[{'filter': {'authorships': {'institutions': {'id': 'https://openalex.org/I1291425158'}},
   'concepts': {'id': 'https://openalex.org/C119857082'},
   'from_publication_date': '2010-01-01',
   'to_publication_date': '2023-06-15',
   'cited_by_count': '>9'},
  'per-page': 200,
  'page': None,
  'cursor': 'IlsxMCwgJ2h0dHBzOi8vb3BlbmFsZXgub3JnL1c0Mjk5NTEzNDkwJ10i'},
 {'filter': {'authorships': {'institutions': {'id': 'https://openalex.org/I1290206253'}},
   'concepts': {'id': 'https://openalex.org/C119857082'},
   'from_publication_date': '2010-01-01',
   'to_publication_date': '2023-06-15',
   'cited_by_count': '>9'},
  'per-page': 200,
  'page': None,
  'cursor': 'IlsxMCwgJ2h0dHBzOi8vb3BlbmFsZXgub3JnL1c4MTA4MTYxNSddIg=='},
 {'filter': {'authorships': {'institutions': {'id': 'https://openalex.org/I4210164937'}},
   'concepts': {'id': 'https://openalex.org/C119857082'},
   'from_publication_date': '2010-01-01',
   'to_publication_date': '2023-06-15',
   'cited_by_count': '>9'},
  'per-pag

In [129]:
# Cache the download on disk so the slow fetch does not have to be repeated.
# The file name carries the current timestamp, so earlier caches are never overwritten.
timestamp = datetime.datetime.now()
obj = {
    "params": params,  # for reproducibility
    "works": works,
}
cache_path = data_file_location + f"longlist_institution_works_openalex_{timestamp.strftime('%Y-%m-%d_%H-%M-%S')}"
with open(cache_path, "wb") as f:
    pickle.dump(obj, f)

In [130]:
# Took 43s for ~100K works on Macbook Pro 2019
# Load a previously saved download; the timestamp in the file name is hard-coded.
# Note: pickle.load executes arbitrary code, so only load files you created yourself.
cached_works_path = data_file_location + "longlist_institution_works_openalex_2023-06-15_15-45-48"
with open(cached_works_path, "rb") as f:
    works_obj = pickle.load(f)
works = works_obj["works"]
len(works)

10636

## Data processing

In [131]:
# NOTE(review): this binds the OpenAlexProcessor class itself, not an instance;
# presumably the methods called on it below are static/class methods — confirm.
processor = OpenAlexProcessor

In [132]:
# Passed as `citation_window_size` to processor.get_institution_counts below
# (presumably a number of years — confirm in OpenAlexProcessor).
CITATION_WINDOW_SIZE = 3

In [133]:
# Aggregate the downloaded works per institution.
# Both results are keyed by institution id in the cells below; each citation entry
# exposes `.year` and the work counts are indexed with `.loc[year]`
# (presumably one value per publication year — confirm in OpenAlexProcessor).
institution_cited_by_counts, institution_work_counts = processor.get_institution_counts(
    works,
    selected_institution_ids=selected_institution_ids,
    citation_window_size=CITATION_WINDOW_SIZE,
)

In [193]:
# Sum cited-by counts and work counts per institution over the whole period.
# The score is the TOTAL cited-by count: the division by the number of works
# (which would give an average) is deliberately left out.
institution_total_cited_by_counts = defaultdict(int)
institution_total_work_counts = defaultdict(int)
for institution_id, citation_entries in institution_cited_by_counts.items():
    for citation_entry in citation_entries:
        # Look up the work count recorded for the same year as this citation entry.
        entry_work_count = institution_work_counts[institution_id].loc[citation_entry.year].item()
        institution_total_cited_by_counts[institution_id] += citation_entry.item()
        institution_total_work_counts[institution_id] += entry_work_count

# Keep only institutions with at least 100 works in total.
institution_overall_scores = defaultdict(float)
institution_overall_scores.update(
    (institution_id, total_cited_by_count)
    for institution_id, total_cited_by_count in institution_total_cited_by_counts.items()
    if institution_total_work_counts[institution_id] >= 100
)
print(f"Number of institutions: {len(institution_overall_scores)}")
institution_overall_scores

Number of institutions: 30


defaultdict(float,
            {'https://openalex.org/I1291425158': 863026.0,
             'https://openalex.org/I4210113297': 414809.0,
             'https://openalex.org/I4210090411': 987838.0,
             'https://openalex.org/I1290206253': 122792.0,
             'https://openalex.org/I2252078561': 171478.0,
             'https://openalex.org/I1311688040': 22070.0,
             'https://openalex.org/I1316064682': 5648.0,
             'https://openalex.org/I1306409833': 31867.0,
             'https://openalex.org/I98301712': 41295.0,
             'https://openalex.org/I4210107353': 8694.0,
             'https://openalex.org/I4210127875': 18420.0,
             'https://openalex.org/I4210114444': 52844.0,
             'https://openalex.org/I1304085615': 30106.0,
             'https://openalex.org/I4210164937': 176755.0,
             'https://openalex.org/I4210089985': 14088.0,
             'https://openalex.org/I4210134091': 6898.0,
             'https://openalex.org/I2250653659': 557

In [194]:
# Print the institutions by descending score. The display name is looked up
# through the OpenAlex API, one request per institution.
ranked_institutions = sorted(
    institution_overall_scores.items(),
    key=lambda item: item[1],
    reverse=True,
)
for ins, score in ranked_institutions:
    display_name = Institutions()[ins]['display_name']
    print(f"{display_name} ({ins}): {score}")

DeepMind (United Kingdom) (https://openalex.org/I4210090411): 987838.0
Google (United States) (https://openalex.org/I1291425158): 863026.0
Google (United Kingdom) (https://openalex.org/I4210113297): 414809.0
Microsoft Research (United Kingdom) (https://openalex.org/I4210164937): 176755.0
Meta (Israel) (https://openalex.org/I2252078561): 171478.0
Microsoft (United States) (https://openalex.org/I1290206253): 122792.0
Tencent (China) (https://openalex.org/I2250653659): 55700.0
Meta (United States) (https://openalex.org/I4210114444): 52844.0
Microsoft Research Asia (China) (https://openalex.org/I4210113369): 50802.0
Baidu (China) (https://openalex.org/I98301712): 41295.0
Alibaba Group (China) (https://openalex.org/I45928872): 34295.0
Adobe Systems (United States) (https://openalex.org/I1306409833): 31867.0
Nvidia (United Kingdom) (https://openalex.org/I1304085615): 30106.0
Huawei Technologies (China) (https://openalex.org/I2250955327): 24082.0
Amazon (United States) (https://openalex.org/I

In [195]:
# Hand-written mapping from OpenAlex institution id to a company alias, so that
# regional branches and research labs of one company are counted together.
institution_aliases = dict([
    ("https://openalex.org/I4210113297", "Google"),
    ("https://openalex.org/I4210090411", "DeepMind"),
    ("https://openalex.org/I4210164937", "Microsoft"),
    ("https://openalex.org/I1291425158", "Google"),
    ("https://openalex.org/I2252078561", "Meta"),
    ("https://openalex.org/I4210114444", "Meta"),
    ("https://openalex.org/I60922564", "Naver"),
    ("https://openalex.org/I1304085615", "NVIDIA"),
    ("https://openalex.org/I4210156496", "Uber"),
    ("https://openalex.org/I4210107353", "NEC"),
    ("https://openalex.org/I4210128910", "Group Sense"),
    ("https://openalex.org/I4210127875", "NVIDIA"),
    ("https://openalex.org/I1306409833", "Adobe Systems"),
    ("https://openalex.org/I98301712", "Baidu"),
    ("https://openalex.org/I4210113369", "Microsoft"),
    ("https://openalex.org/I2250955327", "Huawei"),
    ("https://openalex.org/I4210089985", "Amazon"),
    ("https://openalex.org/I2250653659", "Tencent"),
    ("https://openalex.org/I4210159102", "Huawei"),
    ("https://openalex.org/I45928872", "Alibaba"),
    ("https://openalex.org/I1290206253", "Microsoft"),
    ("https://openalex.org/I4210095624", "Alibaba"),
    ("https://openalex.org/I4210103986", "Jingdong"),
    ("https://openalex.org/I1316064682", "LinkedIn"),
    ("https://openalex.org/I1311688040", "Amazon"),
    ("https://openalex.org/I4210087778", "Dascena"),
    ("https://openalex.org/I2800095910", "Yahoo"),
    ("https://openalex.org/I4210134091", "Yahoo"),
    ("https://openalex.org/I55215948", "Tata Consultancy Services"),
])

From inspecting the papers associated with "Decision Systems", it seems that OpenAlex (or one of its sources) has mistakenly associated the MIT Laboratory for Information and Decision Systems (and possibly similar departments) with the company Decision Systems Inc. ([example](https://arxiv.org/pdf/1606.05830.pdf)). 
Similarly for "Management Sciences", there is a mistaken association of Department of Industrial Engineering and Management Sciences at Northwestern University ([example](https://epubs.siam.org/doi/epdf/10.1137/16M1080173)) with a company called "Management Sciences".

So I will eliminate these spurious cases.

In [196]:
# Institution ids excluded from the recount because OpenAlex appears to attach
# university departments to these companies (see the markdown note above).
ignore_institutions = [
    "https://openalex.org/I4210143601",  # Decision Systems
    "https://openalex.org/I4210155590",  # presumably "Management Sciences" — TODO confirm
]

Recount using aliases

In [197]:
# Recount per company alias: sum cited-by counts and work counts over the whole
# period, merging institutions that share an alias and skipping ignored ones.
# The score is the TOTAL cited-by count: the division by the number of works
# (which would give an average) is deliberately left out.
institution_total_cited_by_counts = defaultdict(int)
institution_total_work_counts = defaultdict(int)
for institution_id, citation_entries in institution_cited_by_counts.items():
    if institution_id not in ignore_institutions:
        # Institutions without an alias keep their own id as the key.
        alias = institution_aliases.get(institution_id, institution_id)
        for citation_entry in citation_entries:
            # Look up the work count recorded for the same year as this citation entry.
            entry_work_count = institution_work_counts[institution_id].loc[citation_entry.year].item()
            institution_total_cited_by_counts[alias] += citation_entry.item()
            institution_total_work_counts[alias] += entry_work_count

# Keep only aliases with at least 100 works in total.
institution_overall_scores = defaultdict(float)
institution_overall_scores.update(
    (alias, total_cited_by_count)
    for alias, total_cited_by_count in institution_total_cited_by_counts.items()
    if institution_total_work_counts[alias] >= 100
)
print(f"Number of institutions: {len(institution_overall_scores)}")
institution_overall_scores

Number of institutions: 20


defaultdict(float,
            {'Google': 1277835.0,
             'DeepMind': 987838.0,
             'Microsoft': 350349.0,
             'Meta': 224322.0,
             'Amazon': 36158.0,
             'LinkedIn': 5648.0,
             'Adobe Systems': 31867.0,
             'Baidu': 41295.0,
             'NEC': 8694.0,
             'NVIDIA': 48526.0,
             'Yahoo': 9364.0,
             'Tencent': 55700.0,
             'Alibaba': 51822.0,
             'Uber': 8241.0,
             'Naver': 14459.0,
             'Jingdong': 7710.0,
             'Group Sense': 18208.0,
             'Huawei': 35242.0,
             'Tata Consultancy Services': 2334.0,
             'Dascena': 3789.0})

In [198]:
# Alias for the per-company totals. This is the same object as
# institution_overall_scores, not a copy.
openalex_rankings = institution_overall_scores

# Aggregation

In [199]:
# Companies that appear in the OpenAlex ranking but not in the PCD ranking.
set(openalex_rankings) - set(pcd_rankings)

{'Adobe Systems',
 'Dascena',
 'Group Sense',
 'Huawei',
 'Jingdong',
 'LinkedIn',
 'NEC',
 'Naver',
 'Tata Consultancy Services',
 'Tencent',
 'Uber',
 'Yahoo'}

In [200]:
# Companies that appear in the PCD ranking but not in the OpenAlex ranking.
set(pcd_rankings) - set(openalex_rankings)

{'Megvii',
 'Netflix',
 'OpenAI',
 'Runway',
 'Salesforce',
 'Stability',
 'Twitter',
 'Xerox'}

In [201]:
# Normalise the PCD system counts into shares that sum to 1.
pcd_total = sum(pcd_rankings.values())
pcd_scores = {}
for company, system_count in pcd_rankings.items():
    pcd_scores[company] = system_count / pcd_total
pcd_scores

{'Google': 0.3442622950819672,
 'DeepMind': 0.1885245901639344,
 'Meta': 0.16393442622950818,
 'OpenAI': 0.10655737704918032,
 'Microsoft': 0.08196721311475409,
 'Alibaba': 0.01639344262295082,
 'NVIDIA': 0.01639344262295082,
 'Baidu': 0.01639344262295082,
 'Amazon': 0.00819672131147541,
 'Stability': 0.00819672131147541,
 'Runway': 0.00819672131147541,
 'Twitter': 0.00819672131147541,
 'Megvii': 0.00819672131147541,
 'Salesforce': 0.00819672131147541,
 'Netflix': 0.00819672131147541,
 'Xerox': 0.00819672131147541}

In [202]:
# Normalise the OpenAlex cited-by totals into shares that sum to 1.
openalex_total = sum(openalex_rankings.values())
openalex_scores = {}
for company, cited_by_total in openalex_rankings.items():
    openalex_scores[company] = cited_by_total / openalex_total
openalex_scores

{'Google': 0.3969170041259228,
 'DeepMind': 0.3068390672674824,
 'Microsoft': 0.108824281287109,
 'Meta': 0.0696781792637823,
 'Amazon': 0.011231281844044901,
 'LinkedIn': 0.0017543636223011673,
 'Adobe Systems': 0.009898425203943219,
 'Baidu': 0.012826920287345379,
 'NEC': 0.0027005023605322855,
 'NVIDIA': 0.015072990286081169,
 'Yahoo': 0.0029086156089284934,
 'Tencent': 0.01730135512786385,
 'Alibaba': 0.01609678322147505,
 'Uber': 0.002559792955273357,
 'Naver': 0.004491208147105626,
 'Jingdong': 0.002394855440499646,
 'Group Sense': 0.005655710487758437,
 'Huawei': 0.010946756865640533,
 'Tata Consultancy Services': 0.0007249795847115659,
 'Dascena': 0.0011769270121988532}

In [203]:
# Combine the two normalised scores with an unweighted mean. A company missing
# from one source contributes 0 for that source.
aggregate_scores = dict()
merged_keys = set(openalex_scores.keys()) | set(pcd_scores.keys())
# Iterate in sorted order rather than over the set directly: set iteration order
# for strings depends on hash randomisation, which made the relative order of
# tied companies in the ranking below change between kernel restarts.
for ins in sorted(merged_keys):
    openalex_score = openalex_scores.get(ins, 0)
    pcd_score = pcd_scores.get(ins, 0)
    aggregate_score = (openalex_score + pcd_score) / 2
    aggregate_scores[ins] = aggregate_score

In [204]:
# Print the combined ranking as percentages, highest score first.
ranked_scores = sorted(aggregate_scores.items(), key=lambda item: item[1], reverse=True)
for rank, (ins, score) in enumerate(ranked_scores, start=1):
    print(f"{rank}. {ins}: {100 * score:.1f}%")

1. Google: 37.1%
2. DeepMind: 24.8%
3. Meta: 11.7%
4. Microsoft: 9.5%
5. OpenAI: 5.3%
6. Alibaba: 1.6%
7. NVIDIA: 1.6%
8. Baidu: 1.5%
9. Amazon: 1.0%
10. Tencent: 0.9%
11. Huawei: 0.5%
12. Adobe Systems: 0.5%
13. Stability: 0.4%
14. Twitter: 0.4%
15. Runway: 0.4%
16. Netflix: 0.4%
17. Megvii: 0.4%
18. Salesforce: 0.4%
19. Xerox: 0.4%
20. Group Sense: 0.3%
21. Naver: 0.2%
22. Yahoo: 0.1%
23. NEC: 0.1%
24. Uber: 0.1%
25. Jingdong: 0.1%
26. LinkedIn: 0.1%
27. Dascena: 0.1%
28. Tata Consultancy Services: 0.0%


# Final list of institutions based on ranking

Get the final list of institution IDs based on this ranking

In [223]:
# For each alias (highest aggregate score first), search OpenAlex for matching
# institutions and print them as ready-to-paste `"id",  # name` list entries.
ranked_aliases = [
    alias
    for alias, _ in sorted(aggregate_scores.items(), key=lambda item: item[1], reverse=True)
]
for alias in ranked_aliases:
    for r in Institutions().search(alias).get():
        print(f"\"{r['id']}\",  # {r['display_name']}")

"https://openalex.org/I1291425158",  # Google (United States)
"https://openalex.org/I4210113297",  # Google (United Kingdom)
"https://openalex.org/I4210100430",  # Google (Switzerland)
"https://openalex.org/I4210148186",  # Google (Canada)
"https://openalex.org/I4210117425",  # Google (Israel)
"https://openalex.org/I4210131802",  # Google (Ireland)
"https://openalex.org/I4210090411",  # DeepMind (United Kingdom)
"https://openalex.org/I2252078561",  # Meta (Israel)
"https://openalex.org/I4210114444",  # Meta (United States)
"https://openalex.org/I4210149973",  # ARC Centre of Excellence for Transformative Meta-Optical Systems
"https://openalex.org/I4210111288",  # Meta (United Kingdom)
"https://openalex.org/I4210120172",  # Meta Vision Systems (United Kingdom)
"https://openalex.org/I4210118911",  # META Group
"https://openalex.org/I4210099706",  # Center For Advanced Meta-Materials
"https://openalex.org/I4210131362",  # Corporación Universitaria del Meta
"https://openalex.org/I421012858

Just use top 10 aliases, and eliminate spurious results e.g. Amazon Conservation Association.

In [224]:
# Final hand-curated list of OpenAlex institution ids for the top 10 aliases,
# grouped by company. Spurious search hits have been removed.
selected_institution_ids = (
    [
        "https://openalex.org/I1291425158",  # Google (United States)
        "https://openalex.org/I4210113297",  # Google (United Kingdom)
        "https://openalex.org/I4210100430",  # Google (Switzerland)
        "https://openalex.org/I4210148186",  # Google (Canada)
        "https://openalex.org/I4210117425",  # Google (Israel)
        "https://openalex.org/I4210131802",  # Google (Ireland)
    ]
    + [
        "https://openalex.org/I4210090411",  # DeepMind (United Kingdom)
    ]
    + [
        "https://openalex.org/I2252078561",  # Meta (Israel)
        "https://openalex.org/I4210114444",  # Meta (United States)
        "https://openalex.org/I4210111288",  # Meta (United Kingdom)
    ]
    + [
        "https://openalex.org/I1290206253",  # Microsoft (United States)
        "https://openalex.org/I4210164937",  # Microsoft Research (United Kingdom)
        "https://openalex.org/I4210113369",  # Microsoft Research Asia (China)
        "https://openalex.org/I4210124949",  # Microsoft Research (India)
        "https://openalex.org/I4210105678",  # Microsoft (Finland)
        "https://openalex.org/I4210087053",  # Microsoft (Germany)
        "https://openalex.org/I4210125051",  # Microsoft (Israel)
        "https://openalex.org/I4210162141",  # Microsoft (India)
        "https://openalex.org/I4210086099",  # Microsoft (Brazil)
        "https://openalex.org/I4210153468",  # Microsoft (Canada)
        "https://openalex.org/I4210161634",  # Microsoft (France)
        "https://openalex.org/I4210110431",  # Microsoft (Netherlands)
        "https://openalex.org/I4210099966",  # Microsoft (Denmark)
        "https://openalex.org/I4210108625",  # Microsoft (United Kingdom)
        "https://openalex.org/I4210135422",  # Microsoft (Norway)
        "https://openalex.org/I4210139986",  # Microsoft (Switzerland)
        "https://openalex.org/I4210109507",  # Microsoft (Ireland)
        "https://openalex.org/I4210092974",  # Microsoft (Portugal)
        "https://openalex.org/I4210151458",  # Microsoft (Belgium)
    ]
    + [
        "https://openalex.org/I4210161460",  # OpenAI (United States)
    ]
    + [
        "https://openalex.org/I45928872",  # Alibaba Group (China)
        "https://openalex.org/I4210095624",  # Alibaba Group (United States)
        "https://openalex.org/I4210086143",  # Alibaba Group (Cayman Islands)
    ]
    + [
        "https://openalex.org/I1304085615",  # Nvidia (United Kingdom)
        "https://openalex.org/I4210127875",  # Nvidia (United States)
    ]
    + [
        "https://openalex.org/I98301712",  # Baidu (China)
    ]
    + [
        "https://openalex.org/I1311688040",  # Amazon (United States)
        "https://openalex.org/I4210089985",  # Amazon (Germany)
        "https://openalex.org/I4210123934",  # Amazon (United Kingdom)
    ]
    + [
        "https://openalex.org/I2250653659",  # Tencent (China)
        "https://openalex.org/I4210103558",  # Tencent Healthcare (China)
    ]
)
len(selected_institution_ids)

41

Have the alias mapping for later

In [232]:
# The curated institution list as text, one `"id",  # Display name` entry per line,
# so that the display-name comments can be parsed into aliases.
selected_institutions_text = """
"https://openalex.org/I1291425158",  # Google (United States)
"https://openalex.org/I4210113297",  # Google (United Kingdom)
"https://openalex.org/I4210100430",  # Google (Switzerland)
"https://openalex.org/I4210148186",  # Google (Canada)
"https://openalex.org/I4210117425",  # Google (Israel)
"https://openalex.org/I4210131802",  # Google (Ireland)
"https://openalex.org/I4210090411",  # DeepMind (United Kingdom)
"https://openalex.org/I2252078561",  # Meta (Israel)
"https://openalex.org/I4210114444",  # Meta (United States)
"https://openalex.org/I4210111288",  # Meta (United Kingdom)
"https://openalex.org/I1290206253",  # Microsoft (United States)
"https://openalex.org/I4210164937",  # Microsoft Research (United Kingdom)
"https://openalex.org/I4210113369",  # Microsoft Research Asia (China)
"https://openalex.org/I4210124949",  # Microsoft Research (India)
"https://openalex.org/I4210105678",  # Microsoft (Finland)
"https://openalex.org/I4210087053",  # Microsoft (Germany)
"https://openalex.org/I4210125051",  # Microsoft (Israel)
"https://openalex.org/I4210162141",  # Microsoft (India)
"https://openalex.org/I4210086099",  # Microsoft (Brazil)
"https://openalex.org/I4210153468",  # Microsoft (Canada)
"https://openalex.org/I4210161634",  # Microsoft (France)
"https://openalex.org/I4210110431",  # Microsoft (Netherlands)
"https://openalex.org/I4210099966",  # Microsoft (Denmark)
"https://openalex.org/I4210108625",  # Microsoft (United Kingdom)
"https://openalex.org/I4210135422",  # Microsoft (Norway)
"https://openalex.org/I4210139986",  # Microsoft (Switzerland)
"https://openalex.org/I4210109507",  # Microsoft (Ireland)
"https://openalex.org/I4210092974",  # Microsoft (Portugal)
"https://openalex.org/I4210151458",  # Microsoft (Belgium)
"https://openalex.org/I4210161460",  # OpenAI (United States)
"https://openalex.org/I45928872",  # Alibaba Group (China)
"https://openalex.org/I4210095624",  # Alibaba Group (United States)
"https://openalex.org/I4210086143",  # Alibaba Group (Cayman Islands)
"https://openalex.org/I1304085615",  # Nvidia (United Kingdom)
"https://openalex.org/I4210127875",  # Nvidia (United States)
"https://openalex.org/I98301712",  # Baidu (China)
"https://openalex.org/I1311688040",  # Amazon (United States)
"https://openalex.org/I4210089985",  # Amazon (Germany)
"https://openalex.org/I4210123934",  # Amazon (United Kingdom)
"https://openalex.org/I2250653659",  # Tencent (China)
"https://openalex.org/I4210103558",  # Tencent Healthcare (China)
"""

# Map each institution id to the first word after the comment
# (e.g. "Microsoft Research Asia (China)" -> "Microsoft").
institution_aliases = {}
for line in selected_institutions_text.splitlines():
    line = line.strip()
    if not line:
        # Tolerate blank lines anywhere in the block, not only the leading one.
        continue
    id_part, _, comment = line.partition("#")
    institution_id = id_part.strip().rstrip(",").strip('"')
    institution_name = comment.strip()
    if not institution_id or not institution_name:
        raise ValueError(f"Cannot parse institution entry: {line!r}")
    institution_alias = institution_name.split(" ")[0].strip()
    institution_aliases[institution_id] = institution_alias
institution_aliases

{'https://openalex.org/I1291425158': 'Google',
 'https://openalex.org/I4210113297': 'Google',
 'https://openalex.org/I4210100430': 'Google',
 'https://openalex.org/I4210148186': 'Google',
 'https://openalex.org/I4210117425': 'Google',
 'https://openalex.org/I4210131802': 'Google',
 'https://openalex.org/I4210090411': 'DeepMind',
 'https://openalex.org/I2252078561': 'Meta',
 'https://openalex.org/I4210114444': 'Meta',
 'https://openalex.org/I4210111288': 'Meta',
 'https://openalex.org/I1290206253': 'Microsoft',
 'https://openalex.org/I4210164937': 'Microsoft',
 'https://openalex.org/I4210113369': 'Microsoft',
 'https://openalex.org/I4210124949': 'Microsoft',
 'https://openalex.org/I4210105678': 'Microsoft',
 'https://openalex.org/I4210087053': 'Microsoft',
 'https://openalex.org/I4210125051': 'Microsoft',
 'https://openalex.org/I4210162141': 'Microsoft',
 'https://openalex.org/I4210086099': 'Microsoft',
 'https://openalex.org/I4210153468': 'Microsoft',
 'https://openalex.org/I4210161634

# Spot checks

Check that works tagged Machine Learning are generally the kind of works we are interested in.

In [212]:
# Most-cited works tagged with concept C119857082 and published between
# 2010-01-01 and 2023-06-15. Only what a single `.get()` call returns is
# inspected here; this is a spot check, not a full download.
ml_works = (
    Works()
    .filter(concept={'id': 'https://openalex.org/C119857082'})
    .filter(from_publication_date="2010-01-01")
    .filter(to_publication_date="2023-06-15")
    .sort(cited_by_count="desc")
    .get()
)
[paper['display_name'] for paper in ml_works]

['Deep Residual Learning for Image Recognition',
 'LIBSVM',
 'Adam: A Method for Stochastic Optimization',
 'Adam: A Method for Stochastic Optimization',
 'Going deeper with convolutions',
 'U-Net: Convolutional Networks for Biomedical Image Segmentation',
 'Human-level control through deep reinforcement learning',
 'MrBayes 3.2: Efficient Bayesian Phylogenetic Inference and Model Choice Across a Large Model Space',
 'Rich Feature Hierarchies for Accurate Object Detection and Semantic Segmentation',
 'Search and clustering orders of magnitude faster than BLAST',
 'ImageNet classification with deep convolutional neural networks',
 'Very Deep Convolutional Networks for Large-Scale Image Recognition',
 'Rethinking the Inception Architecture for Computer Vision',
 'ImageNet Classification with Deep Convolutional Neural Networks',
 'Geneious Basic: An integrated and extendable desktop software platform for the organization and analysis of sequence data',
 'XGBoost',
 'DADA2: High-resolution

Compare which works are tagged Artificial Intelligence vs. Machine Learning for top institutions.

AI: "https://openalex.org/C154945302"

ML: "https://openalex.org/C119857082"

In [42]:
# `level` field of the concept labelled "AI" in the note above
# (presumably its depth in the OpenAlex concept tree — confirm against the API docs).
Concepts()["https://openalex.org/C154945302"]["level"]

1

In [43]:
# `level` field of the concept labelled "ML" in the note above
# (presumably its depth in the OpenAlex concept tree — confirm against the API docs).
Concepts()["https://openalex.org/C119857082"]["level"]

1

OpenAI: "https://openalex.org/I4210161460"

DeepMind: "https://openalex.org/I4210090411"

In [33]:
# Institution used by the spot-check queries below: this is the DeepMind ID.
# Swap in "https://openalex.org/I4210161460" to run the same checks for OpenAI.
institution_id = "https://openalex.org/I4210090411"

In [34]:
# Works with an author affiliated with `institution_id` (set in an earlier
# cell; not necessarily OpenAI) that are tagged with the AI concept
# (C154945302) and have more than 9 citations.
# `n_max` must already be defined earlier in the notebook; it is passed
# through to pyalex's paginate().
ai_works = merge_pages(
    Works()
    .filter(authorships={"institutions": {"id": institution_id}})
    .filter(concepts={"id": "https://openalex.org/C154945302"})
    .filter(cited_by_count=">9")
    .paginate(per_page=200, n_max=n_max)
)

In [35]:
# Number of AI-tagged works retrieved for `institution_id`.
len(ai_works)

392

In [36]:
# Print the title of every AI-tagged work, one per line, in retrieval order.
for work in ai_works:
    title = work['display_name']
    print(title)

Human-level control through deep reinforcement learning
Mastering the game of Go with deep neural networks and tree search
Highly accurate protein structure prediction with AlphaFold
Mastering the game of Go without human knowledge
Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset
Overcoming catastrophic forgetting in neural networks
3D U-Net: Learning Dense Volumetric Segmentation from Sparse Annotation
Improved protein structure prediction using potentials from deep learning
AlphaFold Protein Structure Database: massively expanding the structural coverage of protein-sequence space with high-accuracy models
A general reinforcement learning algorithm that masters chess, shogi, and Go through self-play
Grandmaster level in StarCraft II using multi-agent reinforcement learning
Clinically applicable deep learning for diagnosis and referral in retinal disease
Deep Reinforcement Learning with Double Q-Learning
nnU-Net: a self-configuring method for deep learning-based biom

In [37]:
# Same query as for `ai_works`, but restricted to the ML concept (C119857082):
# works with an author affiliated with `institution_id` and more than 9 citations.
# Note that this rebinds `ml_works`, replacing any earlier value of that name.
ml_works = merge_pages(
    Works()
    .filter(authorships={"institutions": {"id": institution_id}})
    .filter(concepts={"id": "https://openalex.org/C119857082"})
    .filter(cited_by_count=">9")
    .paginate(per_page=200, n_max=n_max)
)

In [38]:
# Number of ML-tagged works retrieved for `institution_id`.
len(ml_works)

206

In [39]:
# Print the title of every ML-tagged work, one per line, in retrieval order.
for work in ml_works:
    title = work['display_name']
    print(title)

Human-level control through deep reinforcement learning
Mastering the game of Go with deep neural networks and tree search
Highly accurate protein structure prediction with AlphaFold
Mastering the game of Go without human knowledge
Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset
Overcoming catastrophic forgetting in neural networks
Improved protein structure prediction using potentials from deep learning
Clinically applicable deep learning for diagnosis and referral in retinal disease
Deep Reinforcement Learning with Double Q-Learning
nnU-Net: a self-configuring method for deep learning-based biomedical image segmentation
Deep Reinforcement Learning with Double Q-Learning
nnU-Net: a self-configuring method for deep learning-based biomedical image segmentation
International evaluation of an AI system for breast cancer screening
Highly accurate protein structure prediction for the human proteome
U-Net: deep learning for cell counting, detection, and morphometry
U-Net:

In [40]:
# IDs of the works in each result list (kept as notebook-level names because
# later cells reuse them).
ai_ids = [work['id'] for work in ai_works]
ml_ids = [work['id'] for work in ml_works]

# Works tagged AI but not ML, printed as "<title>: <citation count>".
# The records are already in `ai_works`, so look them up locally instead of
# calling `Works()[work_id]` twice per work (two extra API requests per line).
# Iterating the dict also gives a stable order (that of `ai_works`) rather
# than arbitrary set order, while still printing each ID only once.
ml_id_set = set(ml_ids)
ai_works_by_id = {work['id']: work for work in ai_works}
for work_id, work in ai_works_by_id.items():
    if work_id not in ml_id_set:
        print(f"{work['display_name']}: {work['cited_by_count']}")

Distilling Policy Distillation: 17
Meta-learning in natural and artificial intelligence: 34
A probabilistic approach to demixing odors: 51
High Fidelity Speech Synthesis with Adversarial Networks: 52
Computations Underlying Social Hierarchy Learning: Distinct Neural Mechanisms for Updating and Representing Self-Relevant Information: 104
Distilling Policy Distillation: 15
Applying and improving <scp>AlphaFold</scp> at <scp>CASP14</scp>: 137
Cross-Lingual Word Embeddings: 15
Deep Reinforcement Learning for Tactile Robotics: Learning to Type on a Braille Keyboard: 14
Mental labour: 105
Sample-efficient adaptive text-to-speech: 31
Massively Parallel Video Networks: 23
The NarrativeQA Reading Comprehension Challenge: 19
Protein complex prediction with AlphaFold-Multimer: 636
Agent57: Outperforming the Atari Human Benchmark: 46
Efficient Neighbourhood Consensus Networks via Submanifold Sparse Convolutions: 46
Placing language in an integrated understanding system: Next steps toward human-lev

In [44]:
# Works tagged ML but not AI, printed as "<title>: <citation count>".
# The records are already in `ml_works`, so look them up locally instead of
# calling `Works()[work_id]` twice per work (two extra API requests per line).
# Iterating the dict also gives a stable order (that of `ml_works`) rather
# than arbitrary set order, while still printing each ID only once.
ai_id_set = set(ai_ids)
ml_works_by_id = {work['id']: work for work in ml_works}
for work_id, work in ml_works_by_id.items():
    if work_id not in ai_id_set:
        print(f"{work['display_name']}: {work['cited_by_count']}")

Confidence modulates exploration and exploitation in value-based learning: 41
Minimax Regret Bounds for Reinforcement Learning: 42
A Sparse Coding Approach to Household Electricity Demand Forecasting in Smart Grids: 69
Predicting the long-term stability of compact multiplanet systems: 42
Learning Nash Equilibrium for General-Sum Markov Games from Batch Data: 15
Degenerate Feedback Loops in Recommender Systems: 86
Ranking soccer teams on the basis of their current strength: A comparison of maximum likelihood approaches: 30
Evidence integration in model-based tree search: 36
Is the Bellman residual a bad proxy?: 14
A General Approach to Fairness with Optimal Transport: 17
This time with feeling: learning expressive musical performance: 94
