In [45]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [46]:
from collections import defaultdict
import datetime
import numpy as np
import os
import pandas as pd
import pickle
import plotly.graph_objects as go
import pyalex
from pyalex import Authors, Concepts, Institutions, Works
from tqdm.notebook import tqdm

from researcher_impact.citations import *
from researcher_impact.plotting import *
from researcher_impact.processors import OpenAlexProcessor
from researcher_impact.pyalex_utils import *
from researcher_impact.regression import *
from researcher_impact.utils import *

In [4]:
# The polite pool has much faster and more consistent response times. To get into the polite pool, you set your email:
pyalex.config.email = "ben@epochai.org"

In [5]:
# Directories for cached input data and for generated results.
# Both are created up front so later cells can read/write without checking.
data_file_location = 'data/'
result_file_location = 'results/'
for directory in (data_file_location, result_file_location):
    os.makedirs(directory, exist_ok=True)

# PCD database

In [6]:
# Download dataset from the Parameters, Compute and Data Trends in ML sheet.
# The sheet selector must be a query parameter (`&gid=0`): a `#gid=0` fragment is
# never sent to the server, so the requested sheet would be silently ignored.
# NOTE: this reads the live sheet, so results change as the sheet is edited.
df = pd.read_csv('https://docs.google.com/spreadsheets/d/1AAIebjNsnJj_uKALHbXNfn3_YsT6sHXtCU0q7OIPuc4/export?format=csv&gid=0')

In [7]:
df

Unnamed: 0,System,Domain,Task,Organization,Organization Categorization,Authors,Publication date,Reference,Link,Citations,...,Training dataset size (GB),Approach,Training compute cost (2020 USD),Compute cost notes,Self-supervised training,Architecture,Compute Sponsor Categorization,Epistemic status,Abstract,Last Modified
0,PaLM 2,Language,Language modelling,Google Research,Industry,"Andrew M. Dai, David R. So, Dmitry Lepikhin, J...",2023-05-10,PaLM 2 Technical Report,https://ai.google/static/documents/palm2techre...,,...,,,,PaLM 2 was trained on TPU v4 according to the ...,,,Industry,,"We introduce PaLM 2, a new state-of-the-art la...",2023-06-06 18:47:36
1,GPT-4,Multimodal,Language modelling,OpenAI,Industry,OpenAI,2023-03-15,GPT-4 Technical Report,https://arxiv.org/abs/2303.08774,,...,,,,,Yes,,,,,2023-05-29 20:51:04
2,Phenaki,Vision,Video generation,"Google Brain, University College London, Unive...",Industry - Academia Collaboration (Industry le...,"Ruben Villegas, Mohammad Babaeizadeh, Pieter-J...",2022-10-05,Phenaki: Variable Length Video Generation From...,https://arxiv.org/abs/2210.02399,,...,,,,,Yes,,,,,2023-05-29 20:51:04
3,Minerva (540B),Language,Quantitative Reasoning Problems,Google Research,Industry,"Aitor Lewkowycz, Anders Andreassen, David Doha...",2022-06-29,Solving Quantitative Reasoning Problems with L...,https://arxiv.org/abs/2206.14858,,...,,,3267257.75,,Yes,,Industry,,Language models have achieved remarkable perfo...,2023-06-08 00:39:43
4,PaLM (540B),Language,Language modelling,Google Research,Industry,"Aakanksha Chowdhery, Sharan Narang, Jacob Devl...",2022-04-04,PaLM: Scaling Language Modeling with Pathways,https://arxiv.org/abs/2204.02311,228.0,...,,,3232806.53,,Yes,,Industry,,Large language models have been shown to achie...,2023-05-29 20:51:04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
547,ALM 1.0,Language,Language modelling,BAAI,Academia,,,ALM 1.0,https://github.com/FlagAI-Open/FlagAI/blob/mas...,,...,,,,,,,,Speculative,,2023-06-09 16:06:43
548,MusicGen,Audio,Audio generation,Meta AI,Industry,"Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, ...",2023-06-08,Simple and Controllable Music Generation,https://arxiv.org/abs/2306.05284,,...,,,,,,,Industry,Unverified,We tackle the task of conditional music genera...,2023-06-09 18:28:28
549,Unsupervised High-level Feature Learner,Vision,Image classification,Google,Industry,"Quoc V. Le, Marc'Aurelio Ranzato, Rajat Monga,...",2012-07-12,Building High-level Features Using Large Scale...,https://arxiv.org/pdf/1112.6209.pdf,2910.0,...,800,Unsupervised,,Hardware not reported,,,Industry,Likely,We consider the problem of building high-level...,2023-06-15 15:50:10
550,Gen-2,Text-to-Video,Video generation,Runway,Industry,,2023-01-01,,https://research.runwayml.com/gen2,0.0,...,,,,,,,,Unverified,,2023-06-15 19:55:47


In [8]:
# Keep only the rows that have a value in the 'Inclusion criteria' column.
has_inclusion_criteria = df['Inclusion criteria'].notna()
notable_df = df[has_inclusion_criteria]
notable_df

Unnamed: 0,System,Domain,Task,Organization,Organization Categorization,Authors,Publication date,Reference,Link,Citations,...,Training dataset size (GB),Approach,Training compute cost (2020 USD),Compute cost notes,Self-supervised training,Architecture,Compute Sponsor Categorization,Epistemic status,Abstract,Last Modified
0,PaLM 2,Language,Language modelling,Google Research,Industry,"Andrew M. Dai, David R. So, Dmitry Lepikhin, J...",2023-05-10,PaLM 2 Technical Report,https://ai.google/static/documents/palm2techre...,,...,,,,PaLM 2 was trained on TPU v4 according to the ...,,,Industry,,"We introduce PaLM 2, a new state-of-the-art la...",2023-06-06 18:47:36
1,GPT-4,Multimodal,Language modelling,OpenAI,Industry,OpenAI,2023-03-15,GPT-4 Technical Report,https://arxiv.org/abs/2303.08774,,...,,,,,Yes,,,,,2023-05-29 20:51:04
3,Minerva (540B),Language,Quantitative Reasoning Problems,Google Research,Industry,"Aitor Lewkowycz, Anders Andreassen, David Doha...",2022-06-29,Solving Quantitative Reasoning Problems with L...,https://arxiv.org/abs/2206.14858,,...,,,3267257.75,,Yes,,Industry,,Language models have achieved remarkable perfo...,2023-06-08 00:39:43
4,PaLM (540B),Language,Language modelling,Google Research,Industry,"Aakanksha Chowdhery, Sharan Narang, Jacob Devl...",2022-04-04,PaLM: Scaling Language Modeling with Pathways,https://arxiv.org/abs/2204.02311,228.0,...,,,3232806.53,,Yes,,Industry,,Large language models have been shown to achie...,2023-05-29 20:51:04
6,Chinchilla,Language,Language modelling,DeepMind,Industry,"Jordan Hoffmann, Sebastian Borgeaud, Arthur Me...",2022-03-29,Training Compute-Optimal Large Language Models,https://arxiv.org/abs/2203.15556,,...,,,753491.58,,Yes,,Industry,,We investigate the optimal model size and numb...,2023-05-29 20:51:04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
545,,Vision,Image classification,"University of Guelph,Canadian Institute for Ad...",Industry - Academia Collaboration,"Terrance DeVries, Graham W. Taylor",2017-08-15,Improved Regularization of Convolutional Neura...,https://arxiv.org/abs/1708.04552,1450.0,...,,,,https://www.yuzeh.com/data/agz-cost.html,,,Industry,,,2023-06-09 16:00:52
547,ALM 1.0,Language,Language modelling,BAAI,Academia,,,ALM 1.0,https://github.com/FlagAI-Open/FlagAI/blob/mas...,,...,,,,,,,,Speculative,,2023-06-09 16:06:43
549,Unsupervised High-level Feature Learner,Vision,Image classification,Google,Industry,"Quoc V. Le, Marc'Aurelio Ranzato, Rajat Monga,...",2012-07-12,Building High-level Features Using Large Scale...,https://arxiv.org/pdf/1112.6209.pdf,2910.0,...,800,Unsupervised,,Hardware not reported,,,Industry,Likely,We consider the problem of building high-level...,2023-06-15 15:50:10
550,Gen-2,Text-to-Video,Video generation,Runway,Industry,,2023-01-01,,https://research.runwayml.com/gen2,0.0,...,,,,,,,,Unverified,,2023-06-15 19:55:47


In [9]:
# Count the number of notable ML systems for each Organization since 2010.
# Only rows categorized as 'Industry' are counted. Organization strings are used
# verbatim, so spelling variants of the same company end up as separate keys.
organization_system_count = defaultdict(int)
for _, row in notable_df.iterrows():
    pub_date = row['Publication date']
    # Non-string values (e.g. a missing date) cannot be parsed and are skipped.
    if not isinstance(pub_date, str):
        continue
    # The first four characters of the date string are the publication year.
    published_since_2010 = int(pub_date[:4]) >= 2010
    if published_since_2010 and row['Organization Categorization'] == 'Industry':
        organization_system_count[row['Organization']] += 1

In [10]:
# Print organization and its system count, in descending order of count
# Organizations ordered from most to fewest systems (ties keep insertion order).
ranked_counts = sorted(organization_system_count.items(), key=lambda pair: pair[1], reverse=True)
for org, count in ranked_counts:
    print(f"{org}: {count} systems")

Google: 21 systems
DeepMind: 16 systems
OpenAI: 12 systems
Google Brain: 7 systems
Google DeepMind: 7 systems
Google Research: 6 systems
Microsoft Research: 5 systems
Meta AI: 4 systems
Facebook AI Research: 4 systems
Facebook AI research: 4 systems
MetaAI: 3 systems
Microsoft: 3 systems
Alibaba Group: 2 systems
Facebook: 2 systems
Facebook AI: 2 systems
Google Inc.: 2 systems
Runway: 2 systems
Amazon: 1 systems
Stability AI, Runway: 1 systems
Google AI, Brain team: 1 systems
Microsoft Research,Peking University: 1 systems
Open AI: 1 systems
Microsoft Bing: 1 systems
Google Research, Brain Team: 1 systems
Google Research,Brain Team: 1 systems
Google AI: 1 systems
AllenAI, University of Washington: 1 systems
Google Brain,Google Research: 1 systems
Twitter: 1 systems
Megvii Inc: 1 systems
Nvidia: 1 systems
Salesforce: 1 systems
Baidu Research- Silicon Valley AI Lab: 1 systems
Netflix: 1 systems
Xerox Research Centre Europe (XRCE): 1 systems
Google Inc: 1 systems
NVIDIA: 1 systems
Baidu: 

Industry:
1. Google: 42
  - Google: 20
  - Google Research: 7
  - Google Brain: 7
  - Google Inc.: 2
  - Google AI, Brain team: 1
  - Google Research, Brain Team: 1
  - Google Research,Brain Team: 1
  - Google AI: 1
  - Google Brain,Google Research: 1
  - Google Inc: 1
2. DeepMind: 23
  - DeepMind: 16
  - Google DeepMind: 7
3. Meta: 20
  - Meta AI: 4
  - MetaAI: 4
  - Facebook AI Research: 4
  - Facebook AI research: 4
  - Facebook: 2
  - Facebook AI: 2
4. OpenAI: 13
  - OpenAI: 12
  - Open AI: 1
5. Microsoft: 10
   - Microsoft Research: 5
   - Microsoft: 3
   - Microsoft Research,Peking University: 1
   - Microsoft Bing: 1
6. Alibaba: 2
  - Alibaba Group: 2
6. NVIDIA: 2
  - Nvidia: 1
  - NVIDIA: 1
6. Baidu: 2
  - Baidu Research- Silicon Valley AI Lab: 1
  - Baidu: 1
7. Amazon: 1
  - Amazon: 1
7. Stability: 1
  - Stability AI, Runway: 1
7. Runway: 1
  - Stability AI, Runway: 1
7. Twitter: 1
7. Megvii: 1
  - Megvii Inc: 1
7. Salesforce: 1
  - Salesforce: 1
7. Netflix: 1
  - Netflix: 1
7. Xerox: 1
  - Xerox Research Centre Europe (XRCE): 1

In [11]:
# Number of notable ML systems per company, with organization-name variants merged.
# These totals are transcribed by hand from the manual tally above, not computed
# from `organization_system_count`.
# NOTE(review): some sub-counts in that tally differ from the printed per-organization
# output (e.g. MetaAI 4 vs 3, Runway 1 vs 2) — re-derive before relying on these numbers.
pcd_rankings = {
    "Google": 42,
    "DeepMind": 23,
    "Meta": 20,
    "OpenAI": 13,
    "Microsoft": 10,
    "Alibaba": 2,
    "NVIDIA": 2,
    "Baidu": 2,
    "Amazon": 1,
    "Stability": 1,
    "Runway": 1,
    "Twitter": 1,
    "Megvii": 1,
    "Salesforce": 1,
    "Netflix": 1,
    "Xerox": 1
}

# OpenAlex

In [18]:
concept_ids = [
    # 'https://openalex.org/C154945302',  # Artificial intelligence
    'https://openalex.org/C119857082',  # Machine learning
]

In [19]:
concept_query = "|".join(concept_ids)
concept_query

'https://openalex.org/C119857082'

Sort by works because we are interested in impact since around 2010, in the Deep Learning era of AI.
Citations are not as informative for this, because it may include citations of works published before 2010.

We can only sort by the total number of works across all years.
But we want to narrow down based on the number of works published since 2010.
So we initially fetch 200 institutions, assuming that this is large enough to include every institution in the narrowed-down set.

In [20]:
# Fetch up to 200 institutions of type "company" that match the selected concept(s),
# ordered by descending `cited_by_count`, in pages of 100.
# NOTE(review): the accompanying text says to sort by number of works, but this query
# sorts by citations. Confirm the intended ordering before re-fetching, since the
# cached works downstream were collected for the institutions returned here.
institutions_query = (
    Institutions()
    .filter(concepts={"id": concept_query})
    .filter(type="company")
    .sort(cited_by_count="desc")
)
top_institutions = merge_pages(institutions_query.paginate(per_page=100, n_max=200))
top_institutions

0page [00:00, ?page/s]

2page [00:01,  1.21page/s]


[{'id': 'https://openalex.org/I1291425158',
  'ror': 'https://ror.org/00njsd438',
  'display_name': 'Google (United States)',
  'country_code': 'US',
  'type': 'company',
  'homepage_url': 'https://www.google.com/',
  'image_url': 'https://commons.wikimedia.org/w/index.php?title=Special:Redirect/file/Google%202015%20logo.svg',
  'image_thumbnail_url': 'https://commons.wikimedia.org/w/index.php?title=Special:Redirect/file/Google%202015%20logo.svg&width=300',
  'display_name_acronyms': [],
  'display_name_alternatives': ['Googleplex'],
  'repositories': [{'id': 'https://openalex.org/S4306400783',
    'display_name': 'Repositorio de Tesis USAT (Santo Toribio de Mogrovejo Catholic University)',
    'host_organization': 'https://openalex.org/I1291425158',
    'host_organization_name': 'Google (United States)',
    'host_organization_lineage': ['https://openalex.org/I1291425158']},
   {'id': 'https://openalex.org/S4306402250',
    'display_name': 'Scientia cum Industria (University of Caxias

In [21]:
institution_names = [ins_obj['display_name'] for ins_obj in top_institutions]

In [22]:
# Sanity check: does each key company appear (as a substring) in any fetched institution name?
for name in ('Google', 'DeepMind', 'OpenAI'):
    is_present = any(name in ins_name for ins_name in institution_names)
    print(f"{name}: {is_present}")

Google: True
DeepMind: True
OpenAI: False


In [23]:
selected_institution_ids = [ins_obj['id'] for ins_obj in top_institutions]
selected_institution_ids

['https://openalex.org/I1291425158',
 'https://openalex.org/I1290206253',
 'https://openalex.org/I4210164937',
 'https://openalex.org/I4210113369',
 'https://openalex.org/I2252078561',
 'https://openalex.org/I4210143601',
 'https://openalex.org/I4210114444',
 'https://openalex.org/I1306409833',
 'https://openalex.org/I4210134091',
 'https://openalex.org/I4210090411',
 'https://openalex.org/I4210155590',
 'https://openalex.org/I2250955327',
 'https://openalex.org/I1311688040',
 'https://openalex.org/I2250653659',
 'https://openalex.org/I98301712',
 'https://openalex.org/I1304085615',
 'https://openalex.org/I4210127875',
 'https://openalex.org/I4210113297',
 'https://openalex.org/I33976269',
 'https://openalex.org/I4210128910',
 'https://openalex.org/I45928872',
 'https://openalex.org/I2800095910',
 'https://openalex.org/I4210100430',
 'https://openalex.org/I122754148',
 'https://openalex.org/I180662265',
 'https://openalex.org/I55215948',
 'https://openalex.org/I4210148872',
 'https://o

In [24]:
# Took ~15 minutes for ~100K works on Macbook Pro 2019
# params = []
# n_max = int(1e6)
# works = []
# unique_work_ids = set()
# for institution_id in tqdm(selected_institution_ids):
#     works_instance = Works()
#     try:
#         new_works = merge_pages(
#             works_instance \
#                 .filter(authorships={"institutions": {"id": institution_id}}) \
#                 .filter(concepts={"id": concept_query}) \
#                 .filter(from_publication_date="2010-01-01") \
#                 .filter(to_publication_date="2023-06-15") \
#                 .filter(cited_by_count=">9") \
#                 .paginate(per_page=200, n_max=n_max)
#         )
#     except Exception as e:
#         print(f"Error for {Institutions()[institution_id]['display_name']} ({institution_id})")
#         print(f"{works_instance.params}")
#         print(e)
#     for work in new_works:
#         if work['id'] not in unique_work_ids:
#             works.append(work)
#             unique_work_ids.add(work['id'])
#     params.append(works_instance.params)

# assert len(works) < n_max
# len(works)

In [25]:
# Save to avoid fetching every time
# timestamp = datetime.datetime.now()
# with open(data_file_location + f"longlist_institution_works_openalex_{timestamp.strftime('%Y-%m-%d_%H-%M-%S')}", "wb") as f:
#     obj = {
#         "params": params,  # for reproducibility
#         "works": works,
#     }
#     pickle.dump(obj, f)

In [26]:
# Load the previously fetched works from the local cache instead of re-querying OpenAlex.
# Took 43s for ~100K works on Macbook Pro 2019
cached_works_path = data_file_location + "longlist_institution_works_openalex_2023-06-15_21-15-05"
with open(cached_works_path, "rb") as f:
    # pickle can execute arbitrary code on load: only open cache files you created yourself.
    works_obj = pickle.load(f)
works = works_obj["works"]
len(works)

9490

## Data processing

In [27]:
CITATION_YEAR_BOUND = 3

In [28]:
processor = OpenAlexProcessor(works, selected_institution_ids, None, citation_year_bound=CITATION_YEAR_BOUND)

In [29]:
processor.process_works()

In [30]:
institution_bounded_citations = processor.get_bounded_citations()
institution_work_counts = processor.get_work_counts()

In [31]:
# Get totals over the whole time period of the data.
# For every institution, sum its bounded citation counts and the matching work counts.
total_bounded_citations = defaultdict(int)
total_work_counts = defaultdict(int)
for ins, bounded_citations in institution_bounded_citations.items():
    for citation_count in bounded_citations:
        # Each element exposes a `year`, used to look up the work count for the same year.
        works_count = institution_work_counts[ins].loc[citation_count.year].item()
        total_bounded_citations[ins] += citation_count.item()
        total_work_counts[ins] += works_count

# The overall score is simply the total bounded citation count; the work counts are
# tallied above but do not enter the score.
overall_scores = defaultdict(float, total_bounded_citations)
print(f"Number of institutions: {len(overall_scores)}")
overall_scores

Number of institutions: 182


defaultdict(float,
            {'https://openalex.org/I1291425158': 193710.0,
             'https://openalex.org/I4210148872': 8272.0,
             'https://openalex.org/I4210113297': 12321.0,
             'https://openalex.org/I4210090411': 44566.0,
             'https://openalex.org/I1290206253': 49856.0,
             'https://openalex.org/I2252078561': 53938.0,
             'https://openalex.org/I1311688040': 9332.0,
             'https://openalex.org/I869089601': 905.0,
             'https://openalex.org/I1316064682': 2398.0,
             'https://openalex.org/I4210100430': 2623.0,
             'https://openalex.org/I1306409833': 14901.0,
             'https://openalex.org/I98301712': 15283.0,
             'https://openalex.org/I4210107353': 3126.0,
             'https://openalex.org/I4210127875': 3889.0,
             'https://openalex.org/I4210114444': 14298.0,
             'https://openalex.org/I1304085615': 6494.0,
             'https://openalex.org/I4210164937': 51882.0,
      

In [32]:
for ins, score in sorted(overall_scores.items(), key=lambda x: x[1], reverse=True):
    print(f"{Institutions()[ins]['display_name']} ({ins}): {score}")

Google (United States) (https://openalex.org/I1291425158): 193710.0
Meta (Israel) (https://openalex.org/I2252078561): 53938.0
Microsoft Research (United Kingdom) (https://openalex.org/I4210164937): 51882.0
Microsoft (United States) (https://openalex.org/I1290206253): 49856.0
DeepMind (United Kingdom) (https://openalex.org/I4210090411): 44566.0
Microsoft Research Asia (China) (https://openalex.org/I4210113369): 21778.0
Tencent (China) (https://openalex.org/I2250653659): 20001.0
Baidu (China) (https://openalex.org/I98301712): 15283.0
Adobe Systems (United States) (https://openalex.org/I1306409833): 14901.0
Meta (United States) (https://openalex.org/I4210114444): 14298.0
Google (United Kingdom) (https://openalex.org/I4210113297): 12321.0
Alibaba Group (China) (https://openalex.org/I45928872): 12125.0
Amazon (United States) (https://openalex.org/I1311688040): 9332.0
Magic Leap (United States) (https://openalex.org/I4210148872): 8272.0
Group Sense (China) (https://openalex.org/I4210128910):

In [33]:
# Maps OpenAlex institution IDs to a single company alias so that regional branches
# and research labs are counted together. Entries are grouped by company; the
# trailing comments give the OpenAlex display name where it is known.
institution_aliases = {
    # Google
    "https://openalex.org/I1291425158": "Google",  # Google (United States)
    "https://openalex.org/I4210113297": "Google",  # Google (United Kingdom)
    # DeepMind
    "https://openalex.org/I4210090411": "DeepMind",  # DeepMind (United Kingdom)
    # Microsoft
    "https://openalex.org/I1290206253": "Microsoft",  # Microsoft (United States)
    "https://openalex.org/I4210164937": "Microsoft",  # Microsoft Research (United Kingdom)
    "https://openalex.org/I4210113369": "Microsoft",  # Microsoft Research Asia (China)
    # Meta
    "https://openalex.org/I2252078561": "Meta",  # Meta (Israel)
    "https://openalex.org/I4210114444": "Meta",  # Meta (United States)
    # NVIDIA
    "https://openalex.org/I1304085615": "NVIDIA",  # Nvidia (United Kingdom)
    "https://openalex.org/I4210127875": "NVIDIA",  # Nvidia (United States)
    # Amazon
    "https://openalex.org/I1311688040": "Amazon",  # Amazon (United States)
    "https://openalex.org/I4210089985": "Amazon",  # Amazon (Germany)
    # Alibaba
    "https://openalex.org/I45928872": "Alibaba",  # Alibaba Group (China)
    "https://openalex.org/I4210095624": "Alibaba",  # Alibaba Group (United States)
    # Other companies with more than one OpenAlex entry
    "https://openalex.org/I2250955327": "Huawei",
    "https://openalex.org/I4210159102": "Huawei",
    "https://openalex.org/I2800095910": "Yahoo",
    "https://openalex.org/I4210134091": "Yahoo",
    # Companies with a single OpenAlex entry
    "https://openalex.org/I98301712": "Baidu",  # Baidu (China)
    "https://openalex.org/I2250653659": "Tencent",  # Tencent (China)
    "https://openalex.org/I1306409833": "Adobe Systems",  # Adobe Systems (United States)
    "https://openalex.org/I4210128910": "Group Sense",  # Group Sense (China)
    "https://openalex.org/I60922564": "Naver",
    "https://openalex.org/I4210156496": "Uber",
    "https://openalex.org/I4210107353": "NEC",
    "https://openalex.org/I4210103986": "Jingdong",
    "https://openalex.org/I1316064682": "LinkedIn",
    "https://openalex.org/I4210087778": "Dascena",
    "https://openalex.org/I55215948": "Tata Consultancy Services",
}

From inspecting the papers associated with "Decision Systems", it seems that OpenAlex (or one of its sources) has mistakenly associated the MIT Laboratory for Information and Decision Systems (and possibly similar departments) with the company Decision Systems Inc. ([example](https://arxiv.org/pdf/1606.05830.pdf)). 
Similarly for "Management Sciences", there is a mistaken association of the Department of Industrial Engineering and Management Sciences at Northwestern University ([example](https://epubs.siam.org/doi/epdf/10.1137/16M1080173)) with a company called "Management Sciences".

So I will eliminate these spurious cases.

In [34]:
# OpenAlex institution IDs excluded from the recount because their works are
# mis-attributed academic departments rather than company research.
ignore_institutions = [
    "https://openalex.org/I4210143601",  # Decision Systems
    "https://openalex.org/I4210155590",  # presumably "Management Sciences" (per the note above) — TODO confirm
]

Recount using aliases

In [35]:
# Get totals over the whole time period of the data, merging institutions that
# share an alias and skipping the institutions listed in `ignore_institutions`.
# NOTE(review): totals are summed per institution and then merged under the alias, so
# a work shared by two institutions with the same alias may be counted twice —
# confirm whether the processor output already accounts for this.
total_bounded_citations = defaultdict(int)
total_work_counts = defaultdict(int)
for ins, bounded_citations in institution_bounded_citations.items():
    if ins in ignore_institutions:
        continue
    # Institutions without an alias keep their OpenAlex ID as the key.
    alias = institution_aliases.get(ins, ins)
    for citation_count in bounded_citations:
        # Each element exposes a `year`, used to look up the work count for the same year.
        works_count = institution_work_counts[ins].loc[citation_count.year].item()
        total_bounded_citations[alias] += citation_count.item()
        total_work_counts[alias] += works_count

# The overall score is simply the total bounded citation count; the work counts are
# tallied above but do not enter the score.
overall_scores = defaultdict(float, total_bounded_citations)
print(f"Number of institutions: {len(overall_scores)}")
overall_scores

Number of institutions: 171


defaultdict(float,
            {'Google': 206031.0,
             'https://openalex.org/I4210148872': 8272.0,
             'DeepMind': 44566.0,
             'Microsoft': 123516.0,
             'Meta': 68236.0,
             'Amazon': 13607.0,
             'https://openalex.org/I869089601': 905.0,
             'LinkedIn': 2398.0,
             'https://openalex.org/I4210100430': 2623.0,
             'Adobe Systems': 14901.0,
             'Baidu': 15283.0,
             'NEC': 3126.0,
             'NVIDIA': 10383.0,
             'https://openalex.org/I19268510': 1014.0,
             'https://openalex.org/I1321826891': 309.0,
             'https://openalex.org/I4210126929': 728.0,
             'https://openalex.org/I4210111288': 338.0,
             'https://openalex.org/I4210148186': 584.0,
             'https://openalex.org/I4210149914': 1040.0,
             'https://openalex.org/I4210090142': 429.0,
             'https://openalex.org/I2946016260': 1410.0,
             'https://openalex.org/

In [36]:
openalex_rankings = overall_scores

# Aggregation

In [37]:
set(openalex_rankings.keys()).difference(pcd_rankings.keys())

{'Adobe Systems',
 'Dascena',
 'Group Sense',
 'Huawei',
 'Jingdong',
 'LinkedIn',
 'NEC',
 'Naver',
 'Tata Consultancy Services',
 'Tencent',
 'Uber',
 'Yahoo',
 'https://openalex.org/I122754148',
 'https://openalex.org/I1301041018',
 'https://openalex.org/I1305444813',
 'https://openalex.org/I1321826891',
 'https://openalex.org/I1330693074',
 'https://openalex.org/I136848882',
 'https://openalex.org/I180662265',
 'https://openalex.org/I19268510',
 'https://openalex.org/I276751011',
 'https://openalex.org/I2800752714',
 'https://openalex.org/I2801840469',
 'https://openalex.org/I2905213637',
 'https://openalex.org/I2946016260',
 'https://openalex.org/I33976269',
 'https://openalex.org/I4210086143',
 'https://openalex.org/I4210086396',
 'https://openalex.org/I4210086647',
 'https://openalex.org/I4210086940',
 'https://openalex.org/I4210086945',
 'https://openalex.org/I4210087272',
 'https://openalex.org/I4210089168',
 'https://openalex.org/I4210090142',
 'https://openalex.org/I42100904

In [38]:
set(pcd_rankings.keys()).difference(openalex_rankings.keys())

{'Megvii',
 'Netflix',
 'OpenAI',
 'Runway',
 'Salesforce',
 'Stability',
 'Twitter',
 'Xerox'}

In [39]:
# Normalize the system counts so each company's score is its share of all counted systems.
pcd_total = sum(pcd_rankings.values())
pcd_scores = {
    company: system_count / pcd_total
    for company, system_count in pcd_rankings.items()
}
pcd_scores

{'Google': 0.3442622950819672,
 'DeepMind': 0.1885245901639344,
 'Meta': 0.16393442622950818,
 'OpenAI': 0.10655737704918032,
 'Microsoft': 0.08196721311475409,
 'Alibaba': 0.01639344262295082,
 'NVIDIA': 0.01639344262295082,
 'Baidu': 0.01639344262295082,
 'Amazon': 0.00819672131147541,
 'Stability': 0.00819672131147541,
 'Runway': 0.00819672131147541,
 'Twitter': 0.00819672131147541,
 'Megvii': 0.00819672131147541,
 'Salesforce': 0.00819672131147541,
 'Netflix': 0.00819672131147541,
 'Xerox': 0.00819672131147541}

In [40]:
# Normalize the citation totals so each institution's score is its share of the overall total.
openalex_total = sum(openalex_rankings.values())
openalex_scores = {
    institution: citation_total / openalex_total
    for institution, citation_total in openalex_rankings.items()
}
openalex_scores

{'Google': 0.32015756841172904,
 'https://openalex.org/I4210148872': 0.012854101595885195,
 'DeepMind': 0.06925240470529734,
 'Microsoft': 0.19193510792037669,
 'Meta': 0.10603390673317484,
 'Amazon': 0.021144313396422854,
 'https://openalex.org/I869089601': 0.0014063058443273817,
 'LinkedIn': 0.0037263220051901233,
 'https://openalex.org/I4210100430': 0.004075956054884776,
 'Adobe Systems': 0.023155097664444547,
 'Baidu': 0.023748698584370582,
 'NEC': 0.004857582397091045,
 'NVIDIA': 0.016134445946575916,
 'https://openalex.org/I19268510': 0.0015756841172905691,
 'https://openalex.org/I1321826891': 0.00048016409491399,
 'https://openalex.org/I4210126929': 0.0011312603919009215,
 'https://openalex.org/I4210111288': 0.0005252280390968564,
 'https://openalex.org/I4210148186': 0.0009074946000963436,
 'https://openalex.org/I4210149914': 0.0016160862741441736,
 'https://openalex.org/I4210090142': 0.0006666355880844716,
 'https://openalex.org/I2946016260': 0.002191040044753158,
 'https://ope

In [41]:
# Aggregate score = mean of the two normalized scores.
# An institution that is absent from one source contributes 0 for that source.
merged_keys = set(openalex_scores.keys()) | set(pcd_scores.keys())
aggregate_scores = {
    ins: (openalex_scores.get(ins, 0) + pcd_scores.get(ins, 0)) / 2
    for ins in merged_keys
}

In [52]:
# Print the full ranking, highest aggregate score first, as 1-based positions with percentages.
ranked_scores = sorted(aggregate_scores.items(), key=lambda item: item[1], reverse=True)
for rank, (ins, score) in enumerate(ranked_scores, start=1):
    print(f"{rank}. {ins}: {100 * score:.1f}%")

1. Google: 33.2%
2. Microsoft: 13.7%
3. Meta: 13.5%
4. DeepMind: 12.9%
5. OpenAI: 5.3%
6. Alibaba: 2.1%
7. Baidu: 2.0%
8. NVIDIA: 1.6%
9. Tencent: 1.6%
10. Amazon: 1.5%
11. Adobe Systems: 1.2%
12. Huawei: 0.9%
13. https://openalex.org/I4210148872: 0.6%
14. Group Sense: 0.6%
15. Naver: 0.5%
16. Runway: 0.4%
17. Xerox: 0.4%
18. Salesforce: 0.4%
19. Netflix: 0.4%
20. Stability: 0.4%
21. Twitter: 0.4%
22. Megvii: 0.4%
23. Jingdong: 0.3%
24. Yahoo: 0.3%
25. NEC: 0.2%
26. https://openalex.org/I4210100430: 0.2%
27. Uber: 0.2%
28. LinkedIn: 0.2%
29. https://openalex.org/I4210128969: 0.2%
30. https://openalex.org/I4210109870: 0.1%
31. https://openalex.org/I4210142583: 0.1%
32. https://openalex.org/I4210135459: 0.1%
33. https://openalex.org/I58957048: 0.1%
34. https://openalex.org/I2946016260: 0.1%
35. https://openalex.org/I4210111607: 0.1%
36. https://openalex.org/I4210155967: 0.1%
37. https://openalex.org/I180662265: 0.1%
38. https://openalex.org/I4210155268: 0.1%
39. Tata Consultancy Services

In [58]:
# Take the 10 highest-ranked institutions and split each aggregate score into its two
# halves (each source contributes half of its normalized score), ready for stacking.
# Note: `top_institutions` is rebound here to a list of names.
ranked_names = sorted(aggregate_scores.keys(), key=lambda k: aggregate_scores[k], reverse=True)
top_institutions = ranked_names[:10]
top_pcd_scores = [pcd_scores.get(name, 0) / 2 for name in top_institutions]
top_openalex_scores = [openalex_scores.get(name, 0) / 2 for name in top_institutions]
top_institutions

['Google',
 'Microsoft',
 'Meta',
 'DeepMind',
 'OpenAI',
 'Alibaba',
 'Baidu',
 'NVIDIA',
 'Tencent',
 'Amazon']

In [61]:
# Stacked bar chart of the top institutions: one segment per score source, so the
# height of each bar equals the aggregate score.
fig = go.Figure(data=[
    go.Bar(name='Citation-weighted publications score', x=top_institutions, y=top_openalex_scores),
    go.Bar(name='Notable ML systems score', x=top_institutions, y=top_pcd_scores),
])

## Plot layout
fig.update_layout(
    barmode='stack',
    title=dict(text='Initial ranking of companies leading in AI research', x=0.5),
    xaxis_title='Company',
    yaxis_title='Research impact score',
    autosize=False,
    width=800,
    height=600,
    margin=dict(l=100, r=30, t=80, b=80),
)

## Save plot
# Write into the configured results directory rather than a hard-coded path.
fig.write_image(os.path.join(result_file_location, 'ranking.png'), scale=2)

## Show plot
fig.show()

# Final list of institutions based on ranking

Get the final list of institution IDs based on this ranking

In [43]:
# Search OpenAlex for every ranked name (highest aggregate score first) and print each
# match as a ready-to-paste list entry: "<id>",  # <display name>
ranked_names = sorted(aggregate_scores, key=aggregate_scores.get, reverse=True)
for ins in ranked_names:
    for r in Institutions().search(ins).get():
        print('"{}",  # {}'.format(r['id'], r['display_name']))

"https://openalex.org/I1291425158",  # Google (United States)
"https://openalex.org/I4210113297",  # Google (United Kingdom)
"https://openalex.org/I4210100430",  # Google (Switzerland)
"https://openalex.org/I4210148186",  # Google (Canada)
"https://openalex.org/I4210117425",  # Google (Israel)
"https://openalex.org/I4210131802",  # Google (Ireland)
"https://openalex.org/I1290206253",  # Microsoft (United States)
"https://openalex.org/I4210164937",  # Microsoft Research (United Kingdom)
"https://openalex.org/I4210113369",  # Microsoft Research Asia (China)
"https://openalex.org/I4210124949",  # Microsoft Research (India)
"https://openalex.org/I4210105678",  # Microsoft (Finland)
"https://openalex.org/I4210087053",  # Microsoft (Germany)
"https://openalex.org/I4210125051",  # Microsoft (Israel)
"https://openalex.org/I4210162141",  # Microsoft (India)
"https://openalex.org/I4210086099",  # Microsoft (Brazil)
"https://openalex.org/I4210153468",  # Microsoft (Canada)
"https://openalex.org/I

Just use top 10 aliases, and eliminate spurious results e.g. Amazon Conservation Association.

In [None]:
# Final shortlist of OpenAlex institution IDs for the top-ranked companies, picked by
# hand from the search results printed above (spurious matches removed).
# NOTE: this rebinds `selected_institution_ids`, which earlier held the IDs of all
# fetched institutions; re-running earlier cells after this one will see this list.
selected_institution_ids = [
    "https://openalex.org/I1291425158",  # Google (United States)
    "https://openalex.org/I4210113297",  # Google (United Kingdom)
    "https://openalex.org/I4210100430",  # Google (Switzerland)
    "https://openalex.org/I4210148186",  # Google (Canada)
    "https://openalex.org/I4210117425",  # Google (Israel)
    "https://openalex.org/I4210131802",  # Google (Ireland)
    "https://openalex.org/I4210090411",  # DeepMind (United Kingdom)
    "https://openalex.org/I2252078561",  # Meta (Israel)
    "https://openalex.org/I4210114444",  # Meta (United States)
    "https://openalex.org/I4210111288",  # Meta (United Kingdom)
    "https://openalex.org/I1290206253",  # Microsoft (United States)
    "https://openalex.org/I4210164937",  # Microsoft Research (United Kingdom)
    "https://openalex.org/I4210113369",  # Microsoft Research Asia (China)
    "https://openalex.org/I4210124949",  # Microsoft Research (India)
    "https://openalex.org/I4210105678",  # Microsoft (Finland)
    "https://openalex.org/I4210087053",  # Microsoft (Germany)
    "https://openalex.org/I4210125051",  # Microsoft (Israel)
    "https://openalex.org/I4210162141",  # Microsoft (India)
    "https://openalex.org/I4210086099",  # Microsoft (Brazil)
    "https://openalex.org/I4210153468",  # Microsoft (Canada)
    "https://openalex.org/I4210161634",  # Microsoft (France)
    "https://openalex.org/I4210110431",  # Microsoft (Netherlands)
    "https://openalex.org/I4210099966",  # Microsoft (Denmark)
    "https://openalex.org/I4210108625",  # Microsoft (United Kingdom)
    "https://openalex.org/I4210135422",  # Microsoft (Norway)
    "https://openalex.org/I4210139986",  # Microsoft (Switzerland)
    "https://openalex.org/I4210109507",  # Microsoft (Ireland)
    "https://openalex.org/I4210092974",  # Microsoft (Portugal)
    "https://openalex.org/I4210151458",  # Microsoft (Belgium)
    "https://openalex.org/I4210161460",  # OpenAI (United States)
    "https://openalex.org/I45928872",  # Alibaba Group (China)
    "https://openalex.org/I4210095624",  # Alibaba Group (United States)
    "https://openalex.org/I4210086143",  # Alibaba Group (Cayman Islands)
    "https://openalex.org/I1304085615",  # Nvidia (United Kingdom)
    "https://openalex.org/I4210127875",  # Nvidia (United States)
    "https://openalex.org/I98301712",  # Baidu (China)
    "https://openalex.org/I1311688040",  # Amazon (United States)
    "https://openalex.org/I4210089985",  # Amazon (Germany)
    "https://openalex.org/I4210123934",  # Amazon (United Kingdom)
    "https://openalex.org/I2250653659",  # Tencent (China)
    "https://openalex.org/I4210103558",  # Tencent Healthcare (China)
]
# Display how many institution IDs made the shortlist.
len(selected_institution_ids)

Build a mapping from each institution ID to a short alias (e.g. "Google", "Microsoft") for later use.

In [None]:
# Text copy of the selected institution entries, one per line, in the form
#   "<OpenAlex institution URL>",  # <Institution name> (<Country>)
# The trailing comments are parsed below to derive a short alias per institution.
selected_institutions_text = """
"https://openalex.org/I1291425158",  # Google (United States)
"https://openalex.org/I4210113297",  # Google (United Kingdom)
"https://openalex.org/I4210100430",  # Google (Switzerland)
"https://openalex.org/I4210148186",  # Google (Canada)
"https://openalex.org/I4210117425",  # Google (Israel)
"https://openalex.org/I4210131802",  # Google (Ireland)
"https://openalex.org/I4210090411",  # DeepMind (United Kingdom)
"https://openalex.org/I2252078561",  # Meta (Israel)
"https://openalex.org/I4210114444",  # Meta (United States)
"https://openalex.org/I4210111288",  # Meta (United Kingdom)
"https://openalex.org/I1290206253",  # Microsoft (United States)
"https://openalex.org/I4210164937",  # Microsoft Research (United Kingdom)
"https://openalex.org/I4210113369",  # Microsoft Research Asia (China)
"https://openalex.org/I4210124949",  # Microsoft Research (India)
"https://openalex.org/I4210105678",  # Microsoft (Finland)
"https://openalex.org/I4210087053",  # Microsoft (Germany)
"https://openalex.org/I4210125051",  # Microsoft (Israel)
"https://openalex.org/I4210162141",  # Microsoft (India)
"https://openalex.org/I4210086099",  # Microsoft (Brazil)
"https://openalex.org/I4210153468",  # Microsoft (Canada)
"https://openalex.org/I4210161634",  # Microsoft (France)
"https://openalex.org/I4210110431",  # Microsoft (Netherlands)
"https://openalex.org/I4210099966",  # Microsoft (Denmark)
"https://openalex.org/I4210108625",  # Microsoft (United Kingdom)
"https://openalex.org/I4210135422",  # Microsoft (Norway)
"https://openalex.org/I4210139986",  # Microsoft (Switzerland)
"https://openalex.org/I4210109507",  # Microsoft (Ireland)
"https://openalex.org/I4210092974",  # Microsoft (Portugal)
"https://openalex.org/I4210151458",  # Microsoft (Belgium)
"https://openalex.org/I4210161460",  # OpenAI (United States)
"https://openalex.org/I45928872",  # Alibaba Group (China)
"https://openalex.org/I4210095624",  # Alibaba Group (United States)
"https://openalex.org/I4210086143",  # Alibaba Group (Cayman Islands)
"https://openalex.org/I1304085615",  # Nvidia (United Kingdom)
"https://openalex.org/I4210127875",  # Nvidia (United States)
"https://openalex.org/I98301712",  # Baidu (China)
"https://openalex.org/I1311688040",  # Amazon (United States)
"https://openalex.org/I4210089985",  # Amazon (Germany)
"https://openalex.org/I4210123934",  # Amazon (United Kingdom)
"https://openalex.org/I2250653659",  # Tencent (China)
"https://openalex.org/I4210103558",  # Tencent Healthcare (China)
"""

# Alias = first word of the institution name written in the trailing comment,
# e.g. "Microsoft Research Asia (China)" -> "Microsoft".
institution_aliases = {}
# The text opens with a newline, so the first split line is empty and is dropped.
for line in selected_institutions_text.splitlines()[1:]:
    # Everything before the first comma is the quoted URL; remove the quotes.
    institution_id = line.partition(",")[0].strip('"')
    # The name is whatever follows the '#' marker.
    institution_name = line.split("#")[1].strip()
    institution_alias = institution_name.partition(" ")[0].strip()
    institution_aliases[institution_id] = institution_alias
institution_aliases

# Spot checks

Check that works tagged Machine Learning are generally the kind of works we are interested in.

In [None]:
# Spot check: works tagged with the ML concept (C119857082), published between
# 2010-01-01 and 2023-06-15, ordered by citation count (highest first).
# A single `.get()` request is made (no pagination).
ml_works = (
    Works()
    .filter(concept={'id': 'https://openalex.org/C119857082'})
    .filter(from_publication_date="2010-01-01")
    .filter(to_publication_date="2023-06-15")
    .sort(cited_by_count="desc")
    .get()
)
# Show only the titles so the list can be scanned by eye.
[ml_work['display_name'] for ml_work in ml_works]

Compare which works are tagged Artificial Intelligence vs. Machine Learning for top institutions.

AI: "https://openalex.org/C154945302"

ML: "https://openalex.org/C119857082"

In [None]:
# `level` field of the OpenAlex concept record for Artificial Intelligence (C154945302)
Concepts()["https://openalex.org/C154945302"]["level"]

In [None]:
# `level` field of the OpenAlex concept record for Machine Learning (C119857082)
Concepts()["https://openalex.org/C119857082"]["level"]

OpenAI: "https://openalex.org/I4210161460"

DeepMind: "https://openalex.org/I4210090411"

In [None]:
# DeepMind (United Kingdom). The institution-level queries below filter on this ID;
# set it to "https://openalex.org/I4210161460" to inspect OpenAI instead.
institution_id = "https://openalex.org/I4210090411"

In [None]:
# Works tagged with the AI concept (C154945302) that have more than 9 citations
# and at least one authorship at the institution currently held in `institution_id`.
# NOTE(review): `n_max` is not set in this cell; it must already be defined by an
# earlier cell.
ai_works = merge_pages(
    Works()
    .filter(authorships={"institutions": {"id": institution_id}})
    .filter(concepts={"id": "https://openalex.org/C154945302"})
    .filter(cited_by_count=">9")
    .paginate(per_page=200, n_max=n_max)
)

In [None]:
# Number of AI-tagged works retrieved for the selected institution
len(ai_works)

In [None]:
# Print one title per line for the AI-tagged works.
for work in ai_works:
    title = work['display_name']
    print(title)

In [None]:
# Works tagged with the ML concept (C119857082) that have more than 9 citations
# and at least one authorship at the institution currently held in `institution_id`.
# NOTE(review): `n_max` is not set in this cell; it must already be defined by an
# earlier cell.
ml_works = merge_pages(
    Works()
    .filter(authorships={"institutions": {"id": institution_id}})
    .filter(concepts={"id": "https://openalex.org/C119857082"})
    .filter(cited_by_count=">9")
    .paginate(per_page=200, n_max=n_max)
)

In [None]:
# Number of ML-tagged works retrieved for the selected institution
len(ml_works)

In [None]:
# Print one title per line for the ML-tagged works.
for work in ml_works:
    title = work['display_name']
    print(title)

In [None]:
# IDs of the works returned by the AI-tagged and ML-tagged queries.
# Both lists are kept as notebook variables because the next cell reuses them.
ai_ids = [work['id'] for work in ai_works]
ml_ids = [work['id'] for work in ml_works]

# Works tagged AI but not ML, printed as "<title>: <citation count>".
# The records are looked up in the already-downloaded `ai_works` instead of being
# re-fetched with `Works()[work_id]`, which cost two HTTP requests per printed line.
ai_works_by_id = {work['id']: work for work in ai_works}
for work_id in set(ai_ids).difference(set(ml_ids)):
    ai_only_work = ai_works_by_id[work_id]
    print(f"{ai_only_work['display_name']}: {ai_only_work['cited_by_count']}")

In [None]:
# Works tagged ML but not AI, printed as "<title>: <citation count>".
# The records are looked up in the already-downloaded `ml_works` instead of being
# re-fetched with `Works()[work_id]`, which cost two HTTP requests per printed line.
# NOTE(review): assumes `ml_works` is still the institution-level list that `ml_ids`
# was built from; re-run the cells above if `ml_works` has been reassigned since.
ml_works_by_id = {work['id']: work for work in ml_works}
for work_id in set(ml_ids).difference(set(ai_ids)):
    ml_only_work = ml_works_by_id[work_id]
    print(f"{ml_only_work['display_name']}: {ml_only_work['cited_by_count']}")

In [None]:
# Works with more than 100 citations that have at least one authorship at
# OpenAI (United States).
# NOTE(review): `n_max` is not set in this cell; it must already be defined by an
# earlier cell.
openai_institution_id = "https://openalex.org/I4210161460"
top_cited_works = merge_pages(
    Works()
    .filter(authorships={"institutions": {"id": openai_institution_id}})
    .filter(cited_by_count=">100")
    .paginate(per_page=200, n_max=n_max)
)

# Print each title next to the result of get_bounded_citations(..., year_bound=3).
for work in top_cited_works:
    title = work['display_name']
    bounded_citations = get_bounded_citations(work, year_bound=3)
    print(title, bounded_citations)

In [61]:
# Same ML-concept query as the earlier spot check (2010-01-01 to 2023-06-15,
# most-cited first), but requesting only the fields listed in `select`.
# A single `.get()` request is made (no pagination).
ml_works = (
    Works()
    .filter(concept={'id': 'https://openalex.org/C119857082'})
    .filter(from_publication_date="2010-01-01")
    .filter(to_publication_date="2023-06-15")
    .sort(cited_by_count="desc")
    .select(["publication_year", "authorships", "cited_by_count", "counts_by_year"])
    .get()
)
# Peek at the first two records to check the returned structure.
ml_works[:2]

[{'publication_year': 2016,
  'authorships': [{'author_position': 'first',
    'author': {'id': 'https://openalex.org/A4344207660',
     'display_name': 'Kaiming He',
     'orcid': None},
    'institutions': [{'id': 'https://openalex.org/I4210164937',
      'display_name': 'Microsoft Research (United Kingdom)',
      'ror': 'https://ror.org/05k87vq12',
      'country_code': 'GB',
      'type': 'company'}],
    'is_corresponding': False,
    'raw_affiliation_string': 'Microsoft Research#TAB#',
    'raw_affiliation_strings': ['Microsoft Research#TAB#']},
   {'author_position': 'middle',
    'author': {'id': 'https://openalex.org/A4358260579',
     'display_name': 'Xiangyu Zhang',
     'orcid': None},
    'institutions': [{'id': 'https://openalex.org/I4210164937',
      'display_name': 'Microsoft Research (United Kingdom)',
      'ror': 'https://ror.org/05k87vq12',
      'country_code': 'GB',
      'type': 'company'}],
    'is_corresponding': False,
    'raw_affiliation_string': 'Microsof