In [3]:
%load_ext autoreload
%autoreload 2

In [88]:
from collections import defaultdict
import datetime
import numpy as np
import os
import pandas as pd
import pickle
import pyalex
from pyalex import Authors, Concepts, Institutions, Works
from tqdm import tqdm

from researcher_impact.pyalex_utils import *

In [58]:
# Identify ourselves to the OpenAlex API by email. Requests that carry an email are served
# from the "polite pool", which has much faster and more consistent response times.
# NOTE(review): this hardcodes a personal email address in a shared notebook; consider
# reading it from an environment variable instead.
pyalex.config.email = "ben@epochai.org"

In [59]:
# Directories for cached input data and for generated results.
# Both paths keep their trailing slash because later cells build file names by string concatenation.
data_file_location = 'data/'
result_file_location = 'results/'

# Create each directory if it does not exist yet; an existing directory is left untouched.
for output_location in (data_file_location, result_file_location):
    os.makedirs(output_location, exist_ok=True)

# PCD database

In [106]:
# Download the "Parameters, Compute and Data Trends in ML" (PCD) sheet as CSV
# and load it into a DataFrame. Requires network access.
pcd_sheet_csv_url = 'https://docs.google.com/spreadsheets/d/1AAIebjNsnJj_uKALHbXNfn3_YsT6sHXtCU0q7OIPuc4/export?format=csv#gid=0'
df = pd.read_csv(pcd_sheet_csv_url)

In [107]:
# Display the downloaded table (pandas elides the middle rows and columns in the rendered output).
df

Unnamed: 0,System,Domain,Task,Organization,Organization Categorization,Authors,Publication date,Reference,Link,Citations,...,Training dataset size (GB),Approach,Training compute cost (2020 USD),Compute cost notes,Self-supervised training,Architecture,Compute Sponsor Categorization,Epistemic status,Abstract,Last Modified
0,PaLM 2,Language,Language modelling,Google Research,Industry,"Andrew M. Dai, David R. So, Dmitry Lepikhin, J...",2023-05-10,PaLM 2 Technical Report,https://ai.google/static/documents/palm2techre...,,...,,,,PaLM 2 was trained on TPU v4 according to the ...,,,Industry,,"We introduce PaLM 2, a new state-of-the-art la...",2023-06-06 18:47:36
1,GPT-4,Multimodal,Language modelling,OpenAI,Industry,OpenAI,2023-03-15,GPT-4 Technical Report,https://arxiv.org/abs/2303.08774,,...,,,,,Yes,,,,,2023-05-29 20:51:04
2,Phenaki,Vision,Video generation,"Google Brain, University College London, Unive...",Industry - Academia Collaboration (Industry le...,"Ruben Villegas, Mohammad Babaeizadeh, Pieter-J...",2022-10-05,Phenaki: Variable Length Video Generation From...,https://arxiv.org/abs/2210.02399,,...,,,,,Yes,,,,,2023-05-29 20:51:04
3,Minerva (540B),Language,Quantitative Reasoning Problems,Google Research,Industry,"Aitor Lewkowycz, Anders Andreassen, David Doha...",2022-06-29,Solving Quantitative Reasoning Problems with L...,https://arxiv.org/abs/2206.14858,,...,,,"$3,267,257.75",,Yes,,Industry,,Language models have achieved remarkable perfo...,2023-06-08 00:39:43
4,PaLM (540B),Language,Language modelling,Google Research,Industry,"Aakanksha Chowdhery, Sharan Narang, Jacob Devl...",2022-04-04,PaLM: Scaling Language Modeling with Pathways,https://arxiv.org/abs/2204.02311,228.0,...,,,"$3,232,806.53",,Yes,,Industry,,Large language models have been shown to achie...,2023-05-29 20:51:04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
545,AR-LDM,Multimodal,Text-to-image,Alibaba,Industry - Academia Collaboration (Industry le...,"Xichen Pan, Pengda Qin, Yuhong Li, Hui Xue, We...",2022-11-20,Synthesizing Coherent Story with Auto-Regressi...,https://arxiv.org/abs/2211.10950,,...,,,,,,,,Likely,Conditioned diffusion models have demonstrated...,2023-06-09 16:04:47
546,,Vision,Image classification,"University of Guelph,Canadian Institute for Ad...",Industry - Academia Collaboration,"Terrance DeVries, Graham W. Taylor",2017-08-15,Improved Regularization of Convolutional Neura...,https://arxiv.org/abs/1708.04552,1450.0,...,,,,https://www.yuzeh.com/data/agz-cost.html,,,Industry,,,2023-06-09 16:00:52
547,AltCLIP,Multimodal,,BAAI,Academia,"Zhongzhi Chen, Guang Liu, Bo-Wen Zhang, Fulong...",2022-11-12,AltCLIP: Altering the Language Encoder in CLIP...,https://arxiv.org/abs/2211.06679,,...,,,,,,,,Likely,"In this work, we present a conceptually simple...",2023-06-09 16:04:45
548,ALM 1.0,Language,Language modelling,BAAI,Academia,,,ALM 1.0,https://github.com/FlagAI-Open/FlagAI/blob/mas...,,...,,,,,,,,Speculative,,2023-06-09 16:06:43


In [108]:
# Keep only the rows that have a value in 'Inclusion criteria'; these are treated
# as the "notable" systems in the counts below.
has_inclusion_criteria = df['Inclusion criteria'].notna()
notable_df = df[has_inclusion_criteria]
notable_df

Unnamed: 0,System,Domain,Task,Organization,Organization Categorization,Authors,Publication date,Reference,Link,Citations,...,Training dataset size (GB),Approach,Training compute cost (2020 USD),Compute cost notes,Self-supervised training,Architecture,Compute Sponsor Categorization,Epistemic status,Abstract,Last Modified
0,PaLM 2,Language,Language modelling,Google Research,Industry,"Andrew M. Dai, David R. So, Dmitry Lepikhin, J...",2023-05-10,PaLM 2 Technical Report,https://ai.google/static/documents/palm2techre...,,...,,,,PaLM 2 was trained on TPU v4 according to the ...,,,Industry,,"We introduce PaLM 2, a new state-of-the-art la...",2023-06-06 18:47:36
1,GPT-4,Multimodal,Language modelling,OpenAI,Industry,OpenAI,2023-03-15,GPT-4 Technical Report,https://arxiv.org/abs/2303.08774,,...,,,,,Yes,,,,,2023-05-29 20:51:04
3,Minerva (540B),Language,Quantitative Reasoning Problems,Google Research,Industry,"Aitor Lewkowycz, Anders Andreassen, David Doha...",2022-06-29,Solving Quantitative Reasoning Problems with L...,https://arxiv.org/abs/2206.14858,,...,,,"$3,267,257.75",,Yes,,Industry,,Language models have achieved remarkable perfo...,2023-06-08 00:39:43
4,PaLM (540B),Language,Language modelling,Google Research,Industry,"Aakanksha Chowdhery, Sharan Narang, Jacob Devl...",2022-04-04,PaLM: Scaling Language Modeling with Pathways,https://arxiv.org/abs/2204.02311,228.0,...,,,"$3,232,806.53",,Yes,,Industry,,Large language models have been shown to achie...,2023-05-29 20:51:04
6,Chinchilla,Language,Language modelling,DeepMind,Industry,"Jordan Hoffmann, Sebastian Borgeaud, Arthur Me...",2022-03-29,Training Compute-Optimal Large Language Models,https://arxiv.org/abs/2203.15556,,...,,,"$753,491.58",,Yes,,Industry,,We investigate the optimal model size and numb...,2023-05-29 20:51:04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
542,Taiyi-Stable\nDiffusion,Drawing,Text-to-image,,,,2022-10-31,,https://huggingface.co/IDEA-CCNL/Taiyi-Stable-...,,...,,,,,,,,Likely,,2023-06-09 16:04:42
543,CogVideo,Multimodal,Video generation,"Tsinghua University,BAAI",Academia,"Wenyi Hong, Ming Ding, Wendi Zheng, Xinghan Li...",2022-05-29,CogVideo: Large-scale Pretraining for Text-to-...,https://arxiv.org/abs/2205.15868,,...,,,,,,,,Likely,Large-scale pretrained transformers have creat...,2023-06-09 15:44:54
544,Zidong Taichu,Multimodal,,Chinese Academy of Sciences,,,,Zidong Ancestral multi-modal large model,https://gitee.com/zidongtaichu/multi-modal-models,,...,,,,,,,,Likely,,2023-06-09 15:44:36
546,,Vision,Image classification,"University of Guelph,Canadian Institute for Ad...",Industry - Academia Collaboration,"Terrance DeVries, Graham W. Taylor",2017-08-15,Improved Regularization of Convolutional Neura...,https://arxiv.org/abs/1708.04552,1450.0,...,,,,https://www.yuzeh.com/data/agz-cost.html,,,Industry,,,2023-06-09 16:00:52


In [109]:
# Tally notable ML systems per Organization string, restricted to systems that
# (a) have a publication date, (b) were published in 2010 or later, and
# (c) are categorized exactly as 'Industry' (collaborations are not counted).
organization_system_count = defaultdict(int)
system_fields = zip(
    notable_df['Publication date'],
    notable_df['Organization Categorization'],
    notable_df['Organization'],
)
for pub_date, categorization, org in system_fields:
    # Missing dates are not strings (pandas stores them as NaN), so skip them.
    if type(pub_date) is not str:
        continue
    # Dates are formatted 'YYYY-MM-DD'; the first four characters are the year.
    if int(pub_date[:4]) < 2010:
        continue
    if categorization != 'Industry':
        continue
    organization_system_count[org] += 1

In [110]:
# List every organization with its system count, largest count first.
organizations_by_count = sorted(
    organization_system_count.items(),
    key=lambda org_and_count: org_and_count[1],
    reverse=True,
)
for org, count in organizations_by_count:
    print(f"{org}: {count} systems")

Google: 20 systems
DeepMind: 16 systems
OpenAI: 12 systems
Google Brain: 7 systems
Google DeepMind: 7 systems
Google Research: 6 systems
Microsoft Research: 5 systems
Meta AI: 4 systems
Facebook AI Research: 4 systems
Facebook AI research: 4 systems
MetaAI: 3 systems
Microsoft: 3 systems
Alibaba Group: 2 systems
Facebook: 2 systems
Facebook AI: 2 systems
Google Inc.: 2 systems
Amazon: 1 systems
Stability AI, Runway: 1 systems
Google AI, Brain team: 1 systems
Microsoft Research,Peking University: 1 systems
Open AI: 1 systems
Microsoft Bing: 1 systems
Google Research, Brain Team: 1 systems
Google Research,Brain Team: 1 systems
Google AI: 1 systems
AllenAI, University of Washington: 1 systems
Google Brain,Google Research: 1 systems
Twitter: 1 systems
Megvii Inc: 1 systems
Nvidia: 1 systems
Salesforce: 1 systems
Baidu Research- Silicon Valley AI Lab: 1 systems
Netflix: 1 systems
Xerox Research Centre Europe (XRCE): 1 systems
Google Inc: 1 systems
NVIDIA: 1 systems
Baidu: 1 systems


Industry:
1. Google: 42
  - Google: 20
  - Google Research: 7
  - Google Brain: 7
  - Google Inc.: 2
  - Google AI, Brain team: 1
  - Google Research, Brain Team: 1
  - Google Research,Brain Team: 1
  - Google AI: 1
  - Google Brain,Google Research: 1
  - Google Inc: 1
2. DeepMind: 23
  - DeepMind: 16
  - Google DeepMind: 7
3. Meta: 20
  - Meta AI: 4
  - MetaAI: 4
  - Facebook AI Research: 4
  - Facebook AI research: 4
  - Facebook: 2
  - Facebook AI: 2
4. OpenAI: 13
  - OpenAI: 12
  - Open AI: 1
5. Microsoft: 10
   - Microsoft Research: 5
   - Microsoft: 3
   - Microsoft Research,Peking University: 1
   - Microsoft Bing: 1
6. Alibaba: 2
  - Alibaba Group: 2
6. NVIDIA: 2
  - Nvidia: 1
  - NVIDIA: 1
6. Baidu: 2
  - Baidu Research- Silicon Valley AI Lab: 1
  - Baidu: 1
7. Amazon: 1
  - Amazon: 1
7. Stability: 1
  - Stability AI, Runway: 1
7. Runway: 1
  - Stability AI, Runway: 1
7. Twitter: 1
7. Megvii: 1
  - Megvii Inc: 1
7. Salesforce: 1
  - Salesforce: 1
7. Netflix: 1
  - Netflix: 1
7. Xerox: 1
  - Xerox Research Centre Europe (XRCE): 1

In [111]:
# Number of notable industry ML systems per parent company since 2010.
# These totals were aggregated by hand from the per-organization counts printed earlier:
# spelling variants and sub-teams (e.g. "Google Brain", "Google Inc.") are merged into one
# parent, and the joint "Stability AI, Runway" system is credited to both companies.
# NOTE(review): the values are hardcoded, so they will not update if the sheet changes.
pcd_rankings = {
    "Google": 42,
    "DeepMind": 23,
    "Meta": 20,
    "OpenAI": 13,
    "Microsoft": 10,
    "Alibaba": 2,
    "NVIDIA": 2,
    "Baidu": 2,
    "Amazon": 1,
    "Stability": 1,
    "Runway": 1,
    "Twitter": 1,
    "Megvii": 1,
    "Salesforce": 1,
    "Netflix": 1,
    "Xerox": 1
}

# OpenAlex

In [91]:
# OpenAlex concept IDs used to restrict the institution and work queries to AI/ML research.
concept_ids = [
    'https://openalex.org/C154945302',  # Artificial intelligence
    'https://openalex.org/C119857082',  # Machine learning
]

In [92]:
# Join the concept IDs with "|" so that a single filter value covers all of them.
# NOTE(review): OpenAlex filters are understood to treat "|" as OR — confirm against the API docs.
concept_query = "|".join(concept_ids)
concept_query

'https://openalex.org/C154945302|https://openalex.org/C119857082'

We rank by number of works because we are interested in impact since around 2010, in the deep learning era of AI.
Citation counts are less informative for this, because they may include citations of works published before 2010.

We can only sort by number of works in any year.
But we want to narrow down based on number of works published since 2010.
So we fetch 200 institutions initially, assuming that this is large enough to include all of the institutions in the narrowed-down set.

In [93]:
# Fetch up to 200 company-type institutions matching the AI/ML concept filter
# (pages of 100), ordered by total citation count, and merge the pages into one result.
# NOTE(review): the accompanying prose talks about sorting by number of works, but this
# query sorts by cited_by_count — confirm which ordering is intended.
# NOTE(review): merge_pages is not defined in this notebook; presumably it comes from the
# star import of researcher_impact.pyalex_utils and concatenates the paginated results — verify.
top_institutions = merge_pages(
    Institutions() \
        .filter(concepts={"id": concept_query}) \
        .filter(type="company") \
        .sort(cited_by_count="desc") \
        .paginate(per_page=100, n_max=200)
)
top_institutions

[{'id': 'https://openalex.org/I1283103587',
  'ror': 'https://ror.org/02bbd5539',
  'display_name': 'AT&T (United States)',
  'country_code': 'US',
  'type': 'company',
  'homepage_url': 'https://www.att.com/',
  'image_url': None,
  'image_thumbnail_url': None,
  'display_name_acronyms': [],
  'display_name_alternatives': ['American Telephone and Telegraph Company'],
  'repositories': [],
  'works_count': 33295,
  'cited_by_count': 1726556,
  'summary_stats': {'2yr_mean_citedness': 1.6395348837209303,
   'h_index': 472,
   'i10_index': 15432},
  'ids': {'openalex': 'https://openalex.org/I1283103587',
   'ror': 'https://ror.org/02bbd5539',
   'mag': '1283103587',
   'grid': 'grid.431860.8',
   'wikipedia': 'https://en.wikipedia.org/wiki/AT',
   'wikidata': 'https://www.wikidata.org/wiki/Q210323'},
  'geo': {'city': 'Bedminster',
   'geonames_city_id': '5095528',
   'region': 'New Jersey',
   'country_code': 'US',
   'country': 'United States',
   'latitude': 40.66958,
   'longitude': -

In [94]:
# Display names of the fetched institutions, in the same order as top_institutions.
institution_names = [ins_obj['display_name'] for ins_obj in top_institutions]

In [95]:
# Sanity check: confirm that each key organization appears in at least one fetched
# institution name. The inner loop uses its own variable so that the organization being
# checked is the one actually searched for (previously 'Google' was tested every time).
for name in ['Google', 'DeepMind', 'OpenAI']:
    is_present = any(name in display_name for display_name in institution_names)
    print(f"{name}: {is_present}")

Google: True
DeepMind: True
OpenAI: True


In [96]:
# OpenAlex IDs of all fetched institutions; these are the institutions whose works are queried.
selected_institution_ids = [ins_obj['id'] for ins_obj in top_institutions]
selected_institution_ids

['https://openalex.org/I1283103587',
 'https://openalex.org/I1291425158',
 'https://openalex.org/I1290206253',
 'https://openalex.org/I1341412227',
 'https://openalex.org/I4210131439',
 'https://openalex.org/I4210164937',
 'https://openalex.org/I1343180700',
 'https://openalex.org/I2250650973',
 'https://openalex.org/I4210122849',
 'https://openalex.org/I1324840837',
 'https://openalex.org/I4210127672',
 'https://openalex.org/I65143321',
 'https://openalex.org/I1325886976',
 'https://openalex.org/I2251713219',
 'https://openalex.org/I118136607',
 'https://openalex.org/I4210133369',
 'https://openalex.org/I4210113369',
 'https://openalex.org/I118347220',
 'https://openalex.org/I1292974536',
 'https://openalex.org/I74760111',
 'https://openalex.org/I2252078561',
 'https://openalex.org/I4210122332',
 'https://openalex.org/I1287521167',
 'https://openalex.org/I1292669757',
 'https://openalex.org/I4210143601',
 'https://openalex.org/I200576644',
 'https://openalex.org/I4210158702',
 'https:

In [97]:
# Took ~15 minutes for ~100K works on Macbook Pro 2019
# params = []
# n_max = int(1e6)
# works = []
# for institution_id in tqdm(selected_institution_ids):
#     works_instance = Works()
#     try:
#         new_works = merge_pages(
#             works_instance \
#                 .filter(authorships={"institutions": {"id": institution_id}}) \
#                 .filter(concepts={"id": concept_query}) \
#                 .filter(from_publication_date="2010-01-01") \
#                 .filter(to_publication_date="2023-06-15") \
#                 .filter(cited_by_count=">9") \
#                 .paginate(per_page=200, n_max=n_max)
#         )
#     except Exception as e:
#         print(f"Error for {Institutions()[institution_id]['display_name']} ({institution_id})")
#         print(f"{works_instance.params}")
#         print(e)
#     works.extend(new_works)
#     params.append(works_instance.params)

# assert len(works) < n_max
# len(works)

  0%|          | 0/200 [00:00<?, ?it/s]

100%|██████████| 200/200 [09:05<00:00,  2.73s/it]


53607

In [86]:
# Display the query parameters recorded for each institution's works request.
# NOTE(review): `params` is only assigned in the commented-out fetch cell, so this cell
# fails on a fresh kernel unless that cell is re-enabled or `params` is loaded from the cache.
params

[{'filter': {'authorships': {'institutions': {'id': 'https://openalex.org/I1283103587'}},
   'concepts': {'id': 'https://openalex.org/C154945302|https://openalex.org/C119857082'},
   'from_publication_date': '2010-01-01',
   'to_publication_date': '2023-06-15',
   'cited_by_count': '>9'},
  'per-page': 200,
  'page': None,
  'cursor': '*'},
 {'filter': {'authorships': {'institutions': {'id': 'https://openalex.org/I1291425158'}},
   'concepts': {'id': 'https://openalex.org/C154945302|https://openalex.org/C119857082'},
   'from_publication_date': '2010-01-01',
   'to_publication_date': '2023-06-15',
   'cited_by_count': '>9'},
  'per-page': 200,
  'page': None,
  'cursor': '*'},
 {'filter': {'authorships': {'institutions': {'id': 'https://openalex.org/I1290206253'}},
   'concepts': {'id': 'https://openalex.org/C154945302|https://openalex.org/C119857082'},
   'from_publication_date': '2010-01-01',
   'to_publication_date': '2023-06-15',
   'cited_by_count': '>9'},
  'per-page': 200,
  'pa

In [98]:
# Save to avoid fetching every time
# timestamp = datetime.datetime.now()
# with open(data_file_location + f"longlist_institution_works_openalex_{timestamp.strftime('%Y-%m-%d_%H-%M-%S')}", "wb") as f:
#     obj = {
#         "params": params,  # for reproducibility
#         "works": works,
#     }
#     pickle.dump(obj, f)

In [99]:
# Took 43s for ~100K works on Macbook Pro 2019
# Load the cached works from a previous fetch instead of querying OpenAlex again.
# NOTE(review): pickle.load executes arbitrary code from the file — only load caches you created yourself.
works_cache_path = data_file_location + "longlist_institution_works_openalex_2023-06-15_13-24-38"
with open(works_cache_path, "rb") as f:
    works_obj = pickle.load(f)
works = works_obj["works"]
len(works)

53607

In [68]:
# Per-institution totals of citations and works, summed over the yearly counts from 2010 onwards.
# Keys are OpenAlex institution IDs; an institution with no qualifying year gets no entry.
# NOTE(review): OpenAlex documents counts_by_year as covering only the most recent ten years —
# confirm whether the 2010 cutoff actually reaches back that far.
ins_cited_by_count = defaultdict(int)
ins_works_count = defaultdict(int)
for ins_obj in top_institutions:
    ins_id = ins_obj['id']
    for yearly in ins_obj['counts_by_year']:
        if yearly['year'] < 2010:
            continue
        ins_cited_by_count[ins_id] += yearly['cited_by_count']
        ins_works_count[ins_id] += yearly['works_count']

In [127]:
# Show the top 20 institutions by number of works since 2010, considering only
# institutions with at least 100K citations since 2010.
# Names come from the already-fetched institution records rather than from one API request per row.
display_name_by_id = {ins_obj['id']: ins_obj['display_name'] for ins_obj in top_institutions}

i = 0  # number of institutions printed so far
for ins, works_count in sorted(ins_works_count.items(), key=lambda x: x[1], reverse=True):
    if ins_cited_by_count[ins] < 1e5:
        continue
    # Stop once 20 rows have been printed (the previous `i > 20` check let a 21st row through).
    if i >= 20:
        print("Done")
        break
    i += 1
    print(f"{display_name_by_id[ins]}: {works_count} works")

Google (United States): 19083 works
Microsoft (United States): 15487 works
Microsoft Research (United Kingdom): 7212 works
Huawei Technologies (China): 7017 works
Amazon (United States): 5459 works
Tencent (China): 4535 works
Meta (Israel): 4244 works
Alibaba Group (China): 3428 works
Adobe Systems (United States): 3415 works
Microsoft Research Asia (China): 3374 works
Baidu (China): 2823 works
Decision Systems (United States): 2765 works
Meta (United States): 2173 works
Nvidia (United States): 1782 works
Nvidia (United Kingdom): 1455 works
DeepMind (United Kingdom): 1123 works
Yahoo (United States): 896 works
Google (United Kingdom): 301 works


Some of these institutions sound weird.
Let's investigate.

In [128]:
# Look up the "Decision Systems" entry to inspect what OpenAlex records for it.
Institutions().search("Decision Systems").get()

[{'id': 'https://openalex.org/I4210143601',
  'ror': 'https://ror.org/0434dpa13',
  'display_name': 'Decision Systems (United States)',
  'relevance_score': 28400.467,
  'country_code': 'US',
  'type': 'company',
  'homepage_url': 'http://www.decisionsystems.com/',
  'image_url': None,
  'image_thumbnail_url': None,
  'display_name_acronyms': [],
  'display_name_alternatives': [],
  'repositories': [],
  'works_count': 5140,
  'cited_by_count': 209992,
  'summary_stats': {'2yr_mean_citedness': 4.631051752921536,
   'h_index': 197,
   'i10_index': 2172},
  'ids': {'openalex': 'https://openalex.org/I4210143601',
   'ror': 'https://ror.org/0434dpa13',
   'grid': 'grid.421803.c'},
  'geo': {'city': 'Northbrook',
   'geonames_city_id': '4904056',
   'region': 'Illinois',
   'country_code': 'US',
   'country': 'United States',
   'latitude': 42.151894,
   'longitude': -87.80496},
  'international': {'display_name': {'en': 'Decision Systems (United States)'}},
  'associated_institutions': [],

In [129]:
# Look up the "Management Sciences" entry to inspect what OpenAlex records for it.
Institutions().search("Management Sciences").get()

[{'id': 'https://openalex.org/I4210155590',
  'ror': 'https://ror.org/05shz5j84',
  'display_name': 'Management Sciences (United States)',
  'relevance_score': 12529.667,
  'country_code': 'US',
  'type': 'company',
  'homepage_url': 'http://www.mgtsciences.com/',
  'image_url': None,
  'image_thumbnail_url': None,
  'display_name_acronyms': ['MSI'],
  'display_name_alternatives': [],
  'repositories': [],
  'works_count': 4018,
  'cited_by_count': 93228,
  'summary_stats': {'2yr_mean_citedness': 1.6057529610829102,
   'h_index': 132,
   'i10_index': 1123},
  'ids': {'openalex': 'https://openalex.org/I4210155590',
   'ror': 'https://ror.org/05shz5j84',
   'grid': 'grid.455730.0'},
  'geo': {'city': 'Albuquerque',
   'geonames_city_id': '5454711',
   'region': 'New Mexico',
   'country_code': 'US',
   'country': 'United States',
   'latitude': 35.09488,
   'longitude': -106.576614},
  'international': {'display_name': {'en': 'Management Sciences (United States)'}},
  'associated_institu

From inspecting the papers associated with "Decision Systems", it seems that OpenAlex (or one of its sources) has mistakenly associated the MIT Laboratory for Information and Decision Systems (and possibly similar departments) with the company Decision Systems Inc. ([example](https://arxiv.org/pdf/1606.05830.pdf)). 
Similarly for "Management Sciences", there is a mistaken association of the Department of Industrial Engineering and Management Sciences at Northwestern University ([example](https://epubs.siam.org/doi/epdf/10.1137/16M1080173)) with a company called "Management Sciences".

So I will eliminate these spurious cases.

In [130]:
# OpenAlex institution IDs to exclude because their works are misattributed
# (university departments mistakenly linked to similarly named companies).
ignore_institutions = [
    "https://openalex.org/I4210143601",  # Decision Systems (United States)
    "https://openalex.org/I4210155590",  # Management Sciences (United States)
]

We are also getting departments or branches of the same parent company, e.g. Google.
So we will allow more than 20 to be printed.

In [136]:
# List every institution with at least 100K citations since 2010, ordered by number of
# works since 2010, skipping the spurious entries in ignore_institutions.
# There is deliberately no cap on the number of rows, because several rows can belong to
# the same parent company.
# Names come from the already-fetched institution records rather than from one API request per row.
display_name_by_id = {ins_obj['id']: ins_obj['display_name'] for ins_obj in top_institutions}

i = 0  # number of institutions printed
for ins, works_count in sorted(ins_works_count.items(), key=lambda x: x[1], reverse=True):
    if ins_cited_by_count[ins] < 1e5:
        continue
    if ins in ignore_institutions:
        continue
    i += 1
    print(f"{display_name_by_id[ins]}: {works_count} works")
print(f"Got {i} institutions")

Google (United States): 19083 works
Microsoft (United States): 15487 works
Microsoft Research (United Kingdom): 7212 works
Huawei Technologies (China): 7017 works
Amazon (United States): 5459 works
Tencent (China): 4535 works
Meta (Israel): 4244 works
Alibaba Group (China): 3428 works
Adobe Systems (United States): 3415 works
Microsoft Research Asia (China): 3374 works
Baidu (China): 2823 works
Meta (United States): 2173 works
Nvidia (United States): 1782 works
Nvidia (United Kingdom): 1455 works
DeepMind (United Kingdom): 1123 works
Yahoo (United States): 896 works
Google (United Kingdom): 301 works
Got 17 institutions


In [70]:
# Number of works since 2010 per parent company, entered by hand from OpenAlex listings.
# A sum means that several OpenAlex institution entries were merged into one parent.
# NOTE(review): several entries (e.g. Tata Consultancy Services, China Mobile, Aditya Birla,
# Aselsan, Jingdong, Samsung, United Imaging Healthcare) and some summands do not appear in
# the printed listing that is filtered to >= 100K citations; presumably they come from a run
# without that filter — confirm the source of these numbers.
openalex_rankings = {
    "Google": 19083,  # Google (United States) only; Google (United Kingdom) is not added
    "Microsoft": 15487 + 7212 + 3374,  # Microsoft (US) + Microsoft Research (UK) + Microsoft Research Asia (China)
    "Huawei": 7017,
    "Amazon": 5459 + 2104,  # Amazon (US) + a second entry not shown in the printed listing — TODO confirm
    "Tencent": 4535,
    "Meta": 4244 + 2173,  # Meta (Israel) + Meta (United States)
    "Tata Consultancy Services": 3574,
    "China Mobile": 3467,
    "Alibaba": 3428 + 1675,  # Alibaba Group (China) + a second entry not shown in the printed listing — TODO confirm
    "Adobe Systems": 3415,
    "Baidu": 2823,
    "Aditya Birla": 2529,
    "Aselsan": 1872,
    "NVIDIA": 1782 + 1455,  # Nvidia (United States) + Nvidia (United Kingdom)
    "Jingdong": 1249,
    "Samsung": 1236,
    "United Imaging Healthcare": 1164,
    "DeepMind": 1123,
}

In [124]:
# Show the top 20 institutions by citations received since 2010, with no minimum-citation filter.
# Names come from the already-fetched institution records rather than from one API request per row.
display_name_by_id = {ins_obj['id']: ins_obj['display_name'] for ins_obj in top_institutions}

institutions_by_citations = sorted(ins_cited_by_count.items(), key=lambda x: x[1], reverse=True)
# Slice to exactly 20 rows (the previous `i > 20` check on a zero-based index printed 21).
for ins, cited_by_count in institutions_by_citations[:20]:
    print(f"{display_name_by_id[ins]}: {cited_by_count} citations since 2010")

Google (United States): 4093366 citations since 2010
Microsoft (United States): 1702011 citations since 2010
DeepMind (United Kingdom): 1434751 citations since 2010
Microsoft Research (United Kingdom): 1330987 citations since 2010
Meta (Israel): 741232 citations since 2010
Microsoft Research Asia (China): 632097 citations since 2010
Google (United Kingdom): 576345 citations since 2010
Meta (United States): 297247 citations since 2010
Decision Systems (United States): 234505 citations since 2010
Adobe Systems (United States): 218333 citations since 2010
Yahoo (United States): 199194 citations since 2010
Baidu (China): 197226 citations since 2010
Tencent (China): 157185 citations since 2010
Huawei Technologies (China): 153542 citations since 2010
Amazon (United States): 142423 citations since 2010
Nvidia (United Kingdom): 134422 citations since 2010
Nvidia (United States): 118260 citations since 2010
Alibaba Group (China): 102661 citations since 2010
Group Sense (China): 80619 citations 

# Aggregation

In [97]:
# Companies that appear in the OpenAlex ranking but not in the PCD ranking.
openalex_rankings.keys() - pcd_rankings.keys()

{'Aditya Birla',
 'Adobe Systems',
 'Aselsan',
 'China Mobile',
 'Decision Systems',
 'Huawei',
 'Management Sciences',
 'Tata Consultancy Services',
 'Tencent'}

In [98]:
# Companies that appear in the PCD ranking but not in the OpenAlex ranking.
pcd_rankings.keys() - openalex_rankings.keys()

{'DeepMind',
 'Megvii',
 'Netflix',
 'OpenAI',
 'Runway',
 'Salesforce',
 'Stability',
 'Twitter',
 'Xerox'}

In [78]:
# Convert the PCD system counts into shares of the total, so that the scores sum to 1.
pcd_total = sum(pcd_rankings.values())
pcd_scores = {company: pcd_rankings[company] / pcd_total for company in pcd_rankings}
pcd_scores

{'Google': 0.3442622950819672,
 'DeepMind': 0.1885245901639344,
 'Meta': 0.16393442622950818,
 'OpenAI': 0.10655737704918032,
 'Microsoft': 0.08196721311475409,
 'Alibaba': 0.01639344262295082,
 'NVIDIA': 0.01639344262295082,
 'Baidu': 0.01639344262295082,
 'Amazon': 0.00819672131147541,
 'Stability': 0.00819672131147541,
 'Runway': 0.00819672131147541,
 'Twitter': 0.00819672131147541,
 'Megvii': 0.00819672131147541,
 'Salesforce': 0.00819672131147541,
 'Netflix': 0.00819672131147541,
 'Xerox': 0.00819672131147541}

In [80]:
# Convert the OpenAlex work counts into shares of the total, so that the scores sum to 1.
openalex_total = sum(openalex_rankings.values())
openalex_scores = {company: openalex_rankings[company] / openalex_total for company in openalex_rankings}
openalex_scores

{'Google': 0.19020801977533466,
 'Microsoft': 0.2598801917729026,
 'Huawei': 0.06994129197524096,
 'Amazon': 0.07538349596818404,
 'Tencent': 0.04520218884248507,
 'Meta': 0.06396084802695187,
 'Tata Consultancy Services': 0.03562351111864204,
 'China Mobile': 0.03455699861453049,
 'Alibaba': 0.05086367578019875,
 'Adobe Systems': 0.03403869347234543,
 'Baidu': 0.02813798877670019,
 'Decision Systems': 0.027559879195032246,
 'Aditya Birla': 0.025207571242038533,
 'Management Sciences': 0.023014741794332532,
 'Aselsan': 0.018658985118661976,
 'NVIDIA': 0.01776191852641861}

In [88]:
# Average the two normalized scores for every company found in either ranking.
# A company missing from one ranking contributes a score of 0 for that ranking.
merged_keys = set(openalex_scores.keys()) | set(pcd_scores.keys())
aggregate_scores = {
    company: (openalex_scores.get(company, 0) + pcd_scores.get(company, 0)) / 2
    for company in merged_keys
}

In [93]:
# Print the aggregate score of each company as a percentage, highest first.
companies_by_score = sorted(
    aggregate_scores.items(),
    key=lambda company_and_score: company_and_score[1],
    reverse=True,
)
for ins, score in companies_by_score:
    print(f"{ins}: {100 * score:.1f}%")

Google: 26.7%
Microsoft: 17.1%
Meta: 11.4%
DeepMind: 9.4%
OpenAI: 5.3%
Amazon: 4.2%
Huawei: 3.5%
Alibaba: 3.4%
Tencent: 2.3%
Baidu: 2.2%
Tata Consultancy Services: 1.8%
China Mobile: 1.7%
NVIDIA: 1.7%
Adobe Systems: 1.7%
Decision Systems: 1.4%
Aditya Birla: 1.3%
Management Sciences: 1.2%
Aselsan: 0.9%
Stability: 0.4%
Twitter: 0.4%
Runway: 0.4%
Netflix: 0.4%
Megvii: 0.4%
Xerox: 0.4%
Salesforce: 0.4%
