In [2]:
%load_ext autoreload
%autoreload 2

In [158]:
from collections import defaultdict
import datetime
import numpy as np
from numpy.random import default_rng
import os
import pandas as pd
import pickle
import plotly
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
import pyalex
from pyalex import Authors, Concepts, Institutions, Works
from scipy.stats import mannwhitneyu
import seaborn as sns
from tqdm.notebook import tqdm
import xarray as xr

from researcher_impact.bootstrap import bootstrap_wrapper, propagate_bootstrap_list, propagate_bootstrap_dict, bootstrap_stats
from researcher_impact.citations import get_bounded_citations
from researcher_impact.processors import OpenAlexProcessor
from researcher_impact.regression import fit_linear_regression, predict
from researcher_impact.utils import dict_to_dataarray
from researcher_impact.plotting import save_plot
from researcher_impact.pyalex_utils import merge_pages, merge_sample

In [132]:
CITATION_YEAR_BOUND = 3

In [4]:
# Location to save data
data_file_location = 'data/'
os.makedirs(data_file_location, exist_ok=True)

# Load dataset

In [136]:
with open(data_file_location + "selected_institution_works_openalex_deduplicated_2023-08-29_12-22-34", "rb") as f:
    works_obj = pickle.load(f)
    works = works_obj["works"]
    selected_institution_ids = works_obj["selected_institution_ids"]
    institution_aliases = works_obj["institution_aliases"]

In [137]:
len(works)

66093

In [138]:
works[0]

{'id': 'https://openalex.org/W2919115771',
 'doi': 'https://doi.org/10.1038/nature14539',
 'title': 'Deep learning',
 'display_name': 'Deep learning',
 'publication_year': 2015,
 'publication_date': '2015-05-27',
 'ids': {'openalex': 'https://openalex.org/W2919115771',
  'doi': 'https://doi.org/10.1038/nature14539',
  'mag': '2919115771',
  'pmid': 'https://pubmed.ncbi.nlm.nih.gov/26017442'},
 'language': 'en',
 'primary_location': {'is_oa': False,
  'landing_page_url': 'https://doi.org/10.1038/nature14539',
  'pdf_url': None,
  'source': {'id': 'https://openalex.org/S137773608',
   'display_name': 'Nature',
   'issn_l': '0028-0836',
   'issn': ['1476-4687', '0028-0836'],
   'is_oa': False,
   'is_in_doaj': False,
   'host_organization': 'https://openalex.org/P4310319908',
   'host_organization_name': 'Nature Portfolio',
   'host_organization_lineage': ['https://openalex.org/P4310319908',
    'https://openalex.org/P4310319965'],
   'host_organization_lineage_names': ['Nature Portfolio'

In [139]:
selected_institution_ids

['https://openalex.org/I1291425158',
 'https://openalex.org/I4210113297',
 'https://openalex.org/I4210100430',
 'https://openalex.org/I4210148186',
 'https://openalex.org/I4210117425',
 'https://openalex.org/I4210131802',
 'https://openalex.org/I4210090411',
 'https://openalex.org/I2252078561',
 'https://openalex.org/I4210114444',
 'https://openalex.org/I4210111288',
 'https://openalex.org/I1290206253',
 'https://openalex.org/I4210164937',
 'https://openalex.org/I4210113369',
 'https://openalex.org/I4210124949',
 'https://openalex.org/I4210105678',
 'https://openalex.org/I4210087053',
 'https://openalex.org/I4210125051',
 'https://openalex.org/I4210162141',
 'https://openalex.org/I4210086099',
 'https://openalex.org/I4210153468',
 'https://openalex.org/I4210161634',
 'https://openalex.org/I4210110431',
 'https://openalex.org/I4210099966',
 'https://openalex.org/I4210108625',
 'https://openalex.org/I4210135422',
 'https://openalex.org/I4210139986',
 'https://openalex.org/I4210109507',
 

In [140]:
institution_aliases

{'https://openalex.org/I1291425158': 'Google',
 'https://openalex.org/I4210113297': 'Google',
 'https://openalex.org/I4210100430': 'Google',
 'https://openalex.org/I4210148186': 'Google',
 'https://openalex.org/I4210117425': 'Google',
 'https://openalex.org/I4210131802': 'Google',
 'https://openalex.org/I4210090411': 'DeepMind',
 'https://openalex.org/I2252078561': 'Meta',
 'https://openalex.org/I4210114444': 'Meta',
 'https://openalex.org/I4210111288': 'Meta',
 'https://openalex.org/I1290206253': 'Microsoft',
 'https://openalex.org/I4210164937': 'Microsoft',
 'https://openalex.org/I4210113369': 'Microsoft',
 'https://openalex.org/I4210124949': 'Microsoft',
 'https://openalex.org/I4210105678': 'Microsoft',
 'https://openalex.org/I4210087053': 'Microsoft',
 'https://openalex.org/I4210125051': 'Microsoft',
 'https://openalex.org/I4210162141': 'Microsoft',
 'https://openalex.org/I4210086099': 'Microsoft',
 'https://openalex.org/I4210153468': 'Microsoft',
 'https://openalex.org/I4210161634

Add extra OpenAI data and merge

In [141]:
with open("data/openai/openai_research_works_processed_2023-09-07_17-16-25", "rb") as f:
    openai_research_works = pickle.load(f)

In [142]:
len(openai_research_works)

95

In [143]:
openai_research_works[0]

{'id': 'https://openalex.org/W3200980294',
 'doi': None,
 'title': 'Recursively Summarizing Books with Human Feedback',
 'display_name': 'Recursively Summarizing Books with Human Feedback',
 'relevance_score': 486.11057,
 'publication_year': 2021,
 'publication_date': '2021-09-22',
 'ids': {'openalex': 'https://openalex.org/W3200980294', 'mag': '3200980294'},
 'language': 'en',
 'primary_location': {'is_oa': False,
  'landing_page_url': 'http://arxiv.org/pdf/2109.10862.pdf',
  'pdf_url': None,
  'source': {'id': 'https://openalex.org/S4306400194',
   'display_name': 'arXiv (Cornell University)',
   'issn_l': None,
   'issn': None,
   'is_oa': True,
   'is_in_doaj': False,
   'host_organization': 'https://openalex.org/I205783295',
   'host_organization_name': 'Cornell University',
   'host_organization_lineage': ['https://openalex.org/I205783295'],
   'host_organization_lineage_names': ['Cornell University'],
   'type': 'repository'},
  'license': None,
  'version': 'submittedVersion',


In [144]:
# If the work is already present, then replace it with the version from the extra OpenAI dataset
# Else, append it
openai_works_dict = {work["id"]:work for work in openai_research_works}
new_works = []
openai_works_replaced = set()
for work in works:
    if work["id"] in openai_works_dict.keys():
        openai_work = openai_works_dict[work["id"]]
        new_works.append(openai_work)
        openai_works_replaced.add(work["id"])
    else:
        new_works.append(work)

openai_works_added = set()
for work_id in openai_works_dict.keys():
    if work_id not in openai_works_replaced:
        new_works.append(openai_works_dict[work_id])
        openai_works_added.add(work_id)

In [145]:
openai_works_replaced

{'https://openalex.org/W2560512785',
 'https://openalex.org/W2762117857',
 'https://openalex.org/W2762872434',
 'https://openalex.org/W2898917980',
 'https://openalex.org/W2950602864',
 'https://openalex.org/W2963989027',
 'https://openalex.org/W2964075320',
 'https://openalex.org/W2964263543',
 'https://openalex.org/W2973525135',
 'https://openalex.org/W3030163527',
 'https://openalex.org/W3082115681',
 'https://openalex.org/W3093419064'}

In [146]:
openai_works_added

{'https://openalex.org/W2462906003',
 'https://openalex.org/W2530944449',
 'https://openalex.org/W2548137223',
 'https://openalex.org/W2566467060',
 'https://openalex.org/W2578206533',
 'https://openalex.org/W2591957724',
 'https://openalex.org/W2595180411',
 'https://openalex.org/W2596367596',
 'https://openalex.org/W2606347107',
 'https://openalex.org/W2606433045',
 'https://openalex.org/W2609650878',
 'https://openalex.org/W2623491082',
 'https://openalex.org/W2736601468',
 'https://openalex.org/W2749928749',
 'https://openalex.org/W2765602917',
 'https://openalex.org/W2766774033',
 'https://openalex.org/W2767313115',
 'https://openalex.org/W2785397462',
 'https://openalex.org/W2786303200',
 'https://openalex.org/W2787887017',
 'https://openalex.org/W2789008106',
 'https://openalex.org/W2795900505',
 'https://openalex.org/W2796979132',
 'https://openalex.org/W2798877128',
 'https://openalex.org/W2807324060',
 'https://openalex.org/W2883433335',
 'https://openalex.org/W2885550588',
 

In [147]:
Works()[list(openai_works_added)[2]]

{'id': 'https://openalex.org/W2766774033',
 'doi': None,
 'title': 'Interpretable and Pedagogical Examples',
 'display_name': 'Interpretable and Pedagogical Examples',
 'publication_year': 2017,
 'publication_date': '2017-11-02',
 'ids': {'openalex': 'https://openalex.org/W2766774033', 'mag': '2766774033'},
 'language': 'en',
 'primary_location': {'is_oa': False,
  'landing_page_url': 'https://arxiv.org/pdf/1711.00694.pdf',
  'pdf_url': None,
  'source': {'id': 'https://openalex.org/S4306400194',
   'display_name': 'arXiv (Cornell University)',
   'issn_l': None,
   'issn': None,
   'is_oa': True,
   'is_in_doaj': False,
   'host_organization': 'https://openalex.org/I205783295',
   'host_organization_name': 'Cornell University',
   'host_organization_lineage': ['https://openalex.org/I205783295'],
   'host_organization_lineage_names': ['Cornell University'],
   'type': 'repository'},
  'license': None,
  'version': 'submittedVersion',
  'is_accepted': False,
  'is_published': False},
 '

In [148]:
len(new_works) - len(works)

83

In [149]:
works = new_works

In [150]:
len(works)

66176

# Statistical significance

In [151]:
processor = OpenAlexProcessor(works, selected_institution_ids, institution_aliases, citation_year_bound=CITATION_YEAR_BOUND)
processor.process_works()

In [152]:
institution_individual_citations = processor.get_individual_bounded_citations()

In [154]:
for k, v in sorted(institution_individual_citations.items(), key=lambda x: np.mean(x[1]), reverse=True):
    print(k, np.mean(v))

Quansight 6808.333333333333
Enthought 1770.3333333333333
OpenAI 153.20731707317074
DeepMind 91.3682119205298
Meta 56.33914088314809
Google 43.36305101535414
Group Sense 40.59084194977844
Uber 31.40948275862069
Twitter 31.40548780487805
Nvidia 30.762755102040817
Naver 23.32075471698113
Microsoft 23.10701030927835
Adobe 23.092077087794433
Baidu 18.891666666666666
Salesforce 18.578199052132703
Tencent 16.056938483547924
Yandex 14.34959349593496
Amazon 13.60769750168805
Netflix 12.564705882352941
Alibaba 12.111857476635514
Intel 11.713271103896103
Xerox 11.691729323308271
IBM 10.792873279934279
Huawei 10.694357976653697
NEC 8.998475609756097


In [155]:
for k, v in sorted(institution_individual_citations.items(), key=lambda x: np.median(x[1]), reverse=True):
    print(k, np.median(v))

Quansight 7530.0
OpenAI 40.5
Enthought 13.5
DeepMind 9.0
Meta 7.0
Group Sense 7.0
Google 6.0
Adobe 6.0
Twitter 6.0
Uber 6.0
Microsoft 5.0
Netflix 5.0
IBM 3.0
Salesforce 3.0
Baidu 3.0
Nvidia 3.0
Tencent 3.0
Alibaba 3.0
Amazon 2.0
Intel 2.0
Huawei 2.0
Yandex 2.0
Naver 2.0
Xerox 2.0
NEC 1.0


In [156]:
for k, v in sorted(institution_individual_citations.items(), key=lambda x: np.sum(x[1]), reverse=True):
    print(k, np.sum(v))

Google 437750
Microsoft 336207
Meta 187553
IBM 105101
DeepMind 68983
Intel 57723
Tencent 56119
Huawei 54969
Adobe 53920
Nvidia 48236
Alibaba 41471
Amazon 40306
Baidu 36272
Group Sense 27480
OpenAI 25126
Enthought 21244
Quansight 20425
Naver 13596
NEC 11806
Twitter 10301
Uber 7287
Xerox 6220
Salesforce 3920
Yandex 3530
Netflix 2136


In [159]:
# Perform the Mann-Whitney U test
ins1 = 'Google'
ins2 = 'Microsoft'
array1 = np.copy(institution_individual_citations[ins1])
array2 = np.copy(institution_individual_citations[ins2])
u_statistic, p_value = mannwhitneyu(array1, array2)

print(f"{ins1}: sample_size={len(array1)}, mean={np.mean(array1):.2f}, median={np.median(array1):.2f}")
print(f"{ins2}: sample_size={len(array2)}, mean={np.mean(array2):.2f}, median={np.median(array2):.2f}")
print("U statistic:", u_statistic)
print("Common language effect size:", u_statistic / (len(array1) * len(array2)))
print("P-value:", p_value)

Google: sample_size=10095, mean=43.36, median=6.00
Microsoft: sample_size=14550, mean=23.11, median=5.00
U statistic: 77637855.5
Common language effect size: 0.5285720738891186
P-value: 1.5935126077562757e-14


# Sample of papers

In [20]:
for work in works[:10]:
    has_affiliation = False
    for authorship in work['authorships']:
        if len(authorship['institutions']) > 0:
            has_affiliation = True
    if has_affiliation:
        authors = [authorship['author']['display_name'] for authorship in work['authorships']]
        institutions = [[ins['display_name'] for ins in authorship['institutions']] for authorship in work['authorships']]
        authors_institutions = []
        for auth, inss in zip(authors, institutions):
            if len(inss) == 0:
                authors_institutions.append(f"{auth} - [unknown]")
            else:
                authors_institutions.append(f"{auth} - {', '.join(inss)}")
        
        print(f"\"{work['title']}\"")
        for auth_ins in authors_institutions:
            print(f"    {auth_ins}")
        print()

"Deep learning"
    Yann LeCun - Meta (United States), New York University
    Yoshua Bengio - Université de Montréal
    Geoffrey E. Hinton - Google (United States), University of Toronto

"Going deeper with convolutions"
    Christian Szegedy - Google (United States)
    Wei Liu - University of North Carolina at Chapel Hill
    Yangqing Jia - Google (United States)
    Pierre Sermanet - Google (United States)
    Scott Reed - University of Michigan–Ann Arbor
    Dragomir Anguelov - Google (United States)
    Dumitru Erhan - Google (United States)
    Vincent Vanhoucke - Google (United States)
    Andrew Rabinovich - Magic Leap (United States)

"ImageNet classification with deep convolutional neural networks"
    Alex Krizhevsky - Google (United States)
    Ilya Sutskever - Google (United States)
    Geoffrey E. Hinton - OpenAI (United States)

"Attention is All you Need"
    Ashish Vaswani - Google (United States)
    Noam Shazeer - Google (United States)
    Niki Parmar - University

In [21]:
# Create a random number generator, with a fixed random seed for reproducibility
SEED = 20230105
rng = default_rng(seed=SEED)

for work in rng.choice(works, size=100):
    has_affiliation = False
    for authorship in work['authorships']:
        if len(authorship['institutions']) > 0:
            has_affiliation = True
    if has_affiliation:
        authors = [authorship['author']['display_name'] for authorship in work['authorships']]
        institutions = [[ins['display_name'] for ins in authorship['institutions']] for authorship in work['authorships']]
        authors_institutions = []
        for auth, inss in zip(authors, institutions):
            if len(inss) == 0:
                authors_institutions.append(f"{auth} - [unknown]")
            else:
                authors_institutions.append(f"{auth} - {', '.join(inss)}")
        
        print(f"\"{work['title']}\"")
        for auth_ins in authors_institutions:
            print(f"    {auth_ins}")
        print()


"Closed-Loop DPD with Dynamic Resource Block Scheduling"
    Peter Pawliuk - Intel (United States)
    Benjamin Jann - Intel (United States)

"Vamsa: Automated Provenance Tracking in Data Science Scripts"
    Mohammad Hossein Namaki - Washington State University
    Avrilia Floratou - Microsoft (United States)
    Fotis Psallidas - Microsoft (United States)
    Subru Krishnan - Microsoft (United States)
    Ashvin Agrawal - Microsoft (United States)
    Yinghui Wu - Case Western Reserve University
    Yiwen Zhu - Microsoft (United States)
    Markus Weimer - Microsoft (United States)

"The basics of natural language processing"
    Chenguang Zhu - Microsoft (United States)

"Is this Change the Answer to that Problem?"
    Haoye Tian - University of Luxembourg
    Tang, Xunzhu - University of Luxembourg
    Andrew Habib - University of Luxembourg
    Shangwen Wang - National University of Defense Technology
    Kui Liu - Huawei Technologies (China)
    Xin Xia - Huawei Technologies (Chi

# Sense checks

We had not heard of Quansight or Enthought before - what are they known for?

In [22]:
quansight_id = 'https://openalex.org/I4210098317'
enthought_id = 'https://openalex.org/I4210121859'

In [23]:
Works().filter(authorships={"institutions": {"id": quansight_id}}).get()

[{'id': 'https://openalex.org/W3103145119',
  'doi': 'https://doi.org/10.1038/s41592-019-0686-2',
  'title': 'SciPy 1.0: fundamental algorithms for scientific computing in Python',
  'display_name': 'SciPy 1.0: fundamental algorithms for scientific computing in Python',
  'publication_year': 2020,
  'publication_date': '2020-02-03',
  'ids': {'openalex': 'https://openalex.org/W3103145119',
   'doi': 'https://doi.org/10.1038/s41592-019-0686-2',
   'mag': '3103145119'},
  'language': 'en',
  'primary_location': {'is_oa': True,
   'landing_page_url': 'https://doi.org/10.1038/s41592-019-0686-2',
   'pdf_url': 'https://www.nature.com/articles/s41592-019-0686-2.pdf',
   'source': {'id': 'https://openalex.org/S127827428',
    'display_name': 'Nature Methods',
    'issn_l': '1548-7091',
    'issn': ['1548-7105', '1548-7091'],
    'is_oa': False,
    'is_in_doaj': False,
    'host_organization': 'https://openalex.org/P4310319908',
    'host_organization_name': 'Nature Portfolio',
    'host_orga

In [24]:
Works().filter(authorships={"institutions": {"id": enthought_id}}).get()

[{'id': 'https://openalex.org/W3103145119',
  'doi': 'https://doi.org/10.1038/s41592-019-0686-2',
  'title': 'SciPy 1.0: fundamental algorithms for scientific computing in Python',
  'display_name': 'SciPy 1.0: fundamental algorithms for scientific computing in Python',
  'publication_year': 2020,
  'publication_date': '2020-02-03',
  'ids': {'openalex': 'https://openalex.org/W3103145119',
   'doi': 'https://doi.org/10.1038/s41592-019-0686-2',
   'mag': '3103145119'},
  'language': 'en',
  'primary_location': {'is_oa': True,
   'landing_page_url': 'https://doi.org/10.1038/s41592-019-0686-2',
   'pdf_url': 'https://www.nature.com/articles/s41592-019-0686-2.pdf',
   'source': {'id': 'https://openalex.org/S127827428',
    'display_name': 'Nature Methods',
    'issn_l': '1548-7091',
    'issn': ['1548-7105', '1548-7091'],
    'is_oa': False,
    'is_in_doaj': False,
    'host_organization': 'https://openalex.org/P4310319908',
    'host_organization_name': 'Nature Portfolio',
    'host_orga

What is Group Sense?

In [25]:
group_sense_id = 'https://openalex.org/I4210128910'
group_sense_works = merge_pages(
    Works() \
        .filter(authorships={"institutions": {"id": group_sense_id}}) \
        .filter(concepts={"id": 'https://openalex.org/C154945302|https://openalex.org/C119857082'}) \
        .filter(from_publication_date="2010-01-01") \
        .filter(to_publication_date="2023-06-01") \
        .paginate(per_page=200, n_max=int(1e6))        
)
print(len(group_sense_works), 'works')

for work in group_sense_works[:25]:
    print(work['title'])
    for authorship in work['authorships']:
        if group_sense_id in [ins['id'] for ins in authorship['institutions']]:
            print(authorship['raw_affiliation_string'])
    print()

5page [00:08,  1.74s/page]

808 works
Pyramid Scene Parsing Network
[Sensetime Group Limited]

Residual Attention Network for Image Classification
[Sensetime Group Limited]
[Sensetime Group Limited]
[Sensetime Group Limited]

High Performance Visual Tracking with Siamese Region Proposal Network
[Sensetime Group Limited]

DeepFashion: Powering Robust Clothes Recognition and Retrieval with Rich Annotations
[Sensetime Group Limited]

Context Encoding for Semantic Segmentation
[SenseTime]

Libra R-CNN: Towards Balanced Learning for Object Detection
[Sensetime Group Limited]

GeoNet: Unsupervised Learning of Dense Depth, Optical Flow and Camera Pose
[Sense Time Research]
[Sense Time Research]

NTIRE 2017 Challenge on Single Image Super-Resolution: Methods and Results
SenseTime
SenseTime

Distractor-Aware Siamese Networks for Visual Object Tracking
SenseTime Group Limited, Beijing, China
SenseTime Group Limited, Beijing, China
SenseTime Group Limited, Beijing, China

ICNet for Real-Time Semantic Segmentation on High-Re




In [26]:
sensetime_count = 0
for work in group_sense_works:
    for authorship in work['authorships']:
        if group_sense_id in [ins['id'] for ins in authorship['institutions']]:
            raw_affiliation_string = authorship['raw_affiliation_string']
            if 'sensetime' in raw_affiliation_string.lower() or 'sense time' in raw_affiliation_string.lower():
                sensetime_count += 1
                break
sensetime_count / len(group_sense_works)

0.9752475247524752

In [27]:
Institutions().search('Sense').get()

[{'id': 'https://openalex.org/I4210128910',
  'ror': 'https://ror.org/036wd5777',
  'display_name': 'Group Sense (China)',
  'relevance_score': 9980.91,
  'country_code': 'CN',
  'type': 'company',
  'lineage': ['https://openalex.org/I4210128910'],
  'homepage_url': 'http://www.gsl.com.hk/',
  'image_url': None,
  'image_thumbnail_url': None,
  'display_name_acronyms': ['GSL'],
  'display_name_alternatives': [],
  'repositories': [],
  'works_count': 1006,
  'cited_by_count': 47359,
  'summary_stats': {'2yr_mean_citedness': 8.174193548387096,
   'h_index': 96,
   'i10_index': 382},
  'ids': {'openalex': 'https://openalex.org/I4210128910',
   'ror': 'https://ror.org/036wd5777',
   'grid': 'grid.493556.8'},
  'geo': {'city': 'Hong Kong',
   'geonames_city_id': '1819729',
   'region': None,
   'country_code': 'CN',
   'country': 'China',
   'latitude': 22.330235,
   'longitude': 114.19292},
  'international': {'display_name': {'en': 'Group Sense (China)'}},
  'associated_institutions': []

Checking what some well-known papers look like and how accurate the authorship is

In [110]:
name_to_id = defaultdict(list)
for institution_id, institution_name in institution_aliases.items():
    name_to_id[institution_name].append(institution_id)

institution_keywords = {
    'Adobe': ['adobe'],
    'Alibaba': ['alibaba', 'alipay', 'ablibaba'],
    'Amazon': ['amazon'],
    'Baidu': ['baidu'],
    'DeepMind': ['deepmind', 'deep mind'],
    'Google': ['google'],
    'Group Sense': ['sensetime', 'sense time', 'sense\'time', 'sense-time'],
    'Huawei': ['huawei'],
    'IBM': ['ibm', 'international business machines'],
    'Intel': ['intel'],
    'NEC': ['nec', 'nippon electric company'],
    'Naver': ['naver'],
    'Netflix': ['netflix'],
    'Nvidia': ['nvidia'],
    'Meta': ['facebook', 'meta'],
    'Microsoft': ['microsoft'],
    'OpenAI': ['openai', 'open ai'],
    'Salesforce': ['salesforce', 'sales force'],
    'Tencent': ['tencent'],
    'Twitter': ['twitter'],
    'Uber': ['uber'],
    'Xerox': ['xerox'],
    'Yandex': ['yandex'],
    'Enthought': ['enthought'],
    'Quansight': ['quansight'],
}

raw_affiliation_fractions = {}

def get_institution_works(institution_ids):
    institution_works = []
    for institution_id in institution_ids:
        institution_works.extend(
            merge_pages(
                Works() \
                    .filter(authorships={"institutions": {"id": institution_id}}) \
                    .filter(concepts={"id": 'https://openalex.org/C154945302|https://openalex.org/C119857082'}) \
                    .filter(from_publication_date="2010-01-01") \
                    .filter(to_publication_date="2023-06-01") \
                    .paginate(per_page=200, n_max=int(1e6)) 
            )
        )
    unique_titles = set()
    unique_works = []
    for work in institution_works:
        title = work['display_name']
        if title is not None:
            title = title.lower()
            for char in title:
                if not char.isalnum():
                    title = title.replace(char, '')
            if title not in unique_titles:
                unique_titles.add(title)
                unique_works.append(work)
    institution_works = unique_works
    return institution_works

def get_raw_affiliation_count(institution_name, institution_ids, institution_works):
    raw_affiliation_count = 0
    raw_affiliation_keywords = institution_keywords[institution_name]
    print(f'Raw affiliation keywords: {raw_affiliation_keywords}')
    for work in institution_works:
        raw_affiliation_found = False
        for authorship in work['authorships']:
            author_institution_ids = [ins['id'] for ins in authorship['institutions']]
            if any([institution_id in author_institution_ids for institution_id in institution_ids]):
                raw_affiliation_string = authorship['raw_affiliation_string']
                if any([keyword in raw_affiliation_string.lower() for keyword in raw_affiliation_keywords]):
                    raw_affiliation_count += 1
                    raw_affiliation_found = True
            if raw_affiliation_found:
                break
        # if not raw_affiliation_found:
        #     raw_strings = [authorship["raw_affiliation_string"] for authorship in work["authorships"]]
        #     print(f'No raw affiliation match: {work["id"]} {raw_strings}')
    return raw_affiliation_count

for institution_name in institution_keywords.keys():
    print(f'==== {institution_name} ====')
    institution_ids = name_to_id[institution_name]
    institution_works = get_institution_works(institution_ids)
    raw_affiliation_count = get_raw_affiliation_count(institution_name, institution_ids, institution_works)
    raw_affiliation_fractions[institution_name] = raw_affiliation_count / len(institution_works)
    print()
    print(f'Works: {len(institution_works)}')
    print(f'Raw affiliation match: {raw_affiliation_count}')
    print(f'No raw affiliation match: {len(institution_works) - raw_affiliation_count}')
    print(f'Raw affiliation match fraction: {raw_affiliation_count / len(institution_works):.3f}')
    print()

==== Microsoft ====


45page [01:40,  2.23s/page]
21page [00:50,  2.41s/page]
18page [00:52,  2.94s/page]
4page [00:09,  2.31s/page]
1page [00:02,  2.97s/page]
1page [00:01,  1.88s/page]
1page [00:01,  1.93s/page]
1page [00:02,  2.50s/page]
1page [00:01,  1.79s/page]
1page [00:01,  1.70s/page]
1page [00:01,  1.45s/page]
1page [00:01,  1.02s/page]
1page [00:01,  1.17s/page]
1page [00:01,  1.58s/page]
1page [00:01,  1.46s/page]
1page [00:01,  1.52s/page]
1page [00:01,  1.28s/page]
1page [00:01,  1.37s/page]
1page [00:01,  1.07s/page]


Raw affiliation keywords: ['microsoft']
No raw affiliation match: https://openalex.org/W2470673105 ['', '', '', '', '', '']
No raw affiliation match: https://openalex.org/W1524333225 ['', '', '', '', '', '', '', '', '', '', '', '', '']
No raw affiliation match: https://openalex.org/W2141599568 ['', '', '']
No raw affiliation match: https://openalex.org/W2963206148 ['', '', '', '', '']
No raw affiliation match: https://openalex.org/W2951548327 ['']
No raw affiliation match: https://openalex.org/W2407521645 ['', '', '', '']
No raw affiliation match: https://openalex.org/W2599765304 ['', '', '', '']
No raw affiliation match: https://openalex.org/W2950898568 ['', '', '', '']
No raw affiliation match: https://openalex.org/W2963168371 ['', '', '']
No raw affiliation match: https://openalex.org/W2740747242 ['', '', '', '', '']
No raw affiliation match: https://openalex.org/W2407022425 ['', '', '', '', '']
No raw affiliation match: https://openalex.org/W2971274815 ['', '', '', '', '', '', '', 

In [96]:
raw_affiliation_fractions

{'Adobe': 0.8805474906677727,
 'Alibaba': 0.9272624753312658,
 'Amazon': 0.8445901639344262,
 'Baidu': 0.8728943338437979,
 'DeepMind': 0.9824561403508771,
 'Google': 0.8185350689722168,
 'Group Sense': 0.9786931818181818,
 'Huawei': 0.9106606606606606,
 'IBM': 0.8880985774393909,
 'Intel': 0.9620328257860391,
 'NEC': 0.9565217391304348,
 'Naver': 0.8396694214876033,
 'Netflix': 0.9265536723163842,
 'Nvidia': 0.9189526184538653,
 'Meta': 0.8218934911242604,
 'Microsoft': 0.8883103010881901,
 'OpenAI': 1.0,
 'Salesforce': 1.0,
 'Tencent': 0.897364771151179,
 'Twitter': 0.9449275362318841,
 'Uber': 0.782051282051282,
 'Xerox': 0.8087954110898662,
 'Yandex': 0.8862745098039215,
 'Enthought': 1.0,
 'Quansight': 1.0}

In [98]:
1 - sum(raw_affiliation_fractions.values()) / len(raw_affiliation_fractions)

0.09051657309065997

In [99]:
1 - min(raw_affiliation_fractions.values())

0.21794871794871795

In [None]:
institution_negative_keywords = {
    'Alibaba': [],  # unfinished
    'Amazon': ['Amazônia'],
    'Baidu': ['Baifendian', 'Baihe', 'Baiyining', 'Baihu', 'BAI YongQiang', 'Baihui', 'Baixing', 'Baiwang', ],  # unfinished
    'DeepMind': ['AdeptMind', 'Secondmind', 'VivoMind', 'InMind', 'Vastmindz', 'Nethermind', 'Medmind'],
    'Group Sense': ['SenseNets', 'Sense Cost', 'Sensel', 'SENSE', 'TECH/SENSE', 'GoodNotes', 'Senseg', 'Pollen Sense', 'We-Sense', 'SensEcho', 'Sense Future', 'SENSeOR' ],
    'Meta': ['Go Meta', 'GE Intelligent Platforms'],
    'OpenAI': ['Norwegian Open AI Lab'],  # unfinished
    'Yandex': ['Yanfang', 'Yanshan', 'Yanda'],
}

False negatives

In [128]:
institution_works = get_institution_works(name_to_id['OpenAI'])
institution_works_raw_search = merge_pages(
    Works() \
        .filter(raw_affiliation_string={'search': 'openai|open ai'}) \
        .filter(concepts={"id": 'https://openalex.org/C154945302|https://openalex.org/C119857082'}) \
        .filter(from_publication_date="2010-01-01") \
        .filter(to_publication_date="2023-06-01") \
        .paginate(per_page=200, n_max=int(1e6))        
)

1page [00:01,  1.95s/page]
2page [00:03,  1.95s/page]


In [130]:
[work['title'] for work in institution_works_raw_search]

['ImageNet classification with deep convolutional neural networks',
 'Deep Learning with Differential Privacy',
 'Domain randomization for transferring deep neural networks from simulation to the real world',
 'Practical Black-Box Attacks against Machine Learning',
 'Learning Transferable Visual Models From Natural Language Supervision',
 'Model-agnostic meta-learning for fast adaptation of deep networks',
 'InfoGAN: interpretable representation learning by information maximizing generative adversarial nets',
 'Learning dexterous in-hand manipulation',
 'Sim-to-Real Transfer of Robotic Control with Dynamics Randomization',
 'Generative Adversarial Imitation Learning',
 'Weight normalization: a simple reparameterization to accelerate training of deep neural networks',
 'Overcoming Exploration in Reinforcement Learning with Demonstrations',
 'InfoGAN: Interpretable Representation Learning by Information Maximizing Generative Adversarial Nets',
 'Scaling Laws for Neural Language Models',


In [126]:
institution_works_ids = set([work['id'] for work in institution_works])
institution_works_raw_search_ids = set([work['id'] for work in institution_works_raw_search])

In [None]:
for work_id in (institution_works_raw_search_ids - institution_works_ids):
    print(Works()[work_id]['title'])

In [108]:
print(len(institution_works_raw_search_ids.symmetric_difference(institution_works_ids)))

157


In [127]:
len(institution_works_raw_search_ids), len(institution_works_ids)

(1182, 801)

In [None]:
Works().search("Attention is all you need").get()

[{'id': 'https://openalex.org/W2963403868',
  'doi': None,
  'title': 'Attention is All you Need',
  'display_name': 'Attention is All you Need',
  'relevance_score': 19693.951,
  'publication_year': 2017,
  'publication_date': '2017-06-12',
  'ids': {'openalex': 'https://openalex.org/W2963403868', 'mag': '2963403868'},
  'language': 'en',
  'primary_location': {'is_oa': False,
   'landing_page_url': 'https://arxiv.org/pdf/1706.03762v5',
   'pdf_url': None,
   'source': {'id': 'https://openalex.org/S4306400194',
    'display_name': 'arXiv (Cornell University)',
    'issn_l': None,
    'issn': None,
    'is_oa': True,
    'is_in_doaj': False,
    'host_organization': 'https://openalex.org/I205783295',
    'host_organization_name': 'Cornell University',
    'host_organization_lineage': ['https://openalex.org/I205783295'],
    'host_organization_lineage_names': ['Cornell University'],
    'type': 'repository'},
   'license': None,
   'version': None,
   'is_accepted': False,
   'is_publis

In [None]:
Works().search("Language models are few-shot learners").get()[0]

{'id': 'https://openalex.org/W3030163527',
 'doi': None,
 'title': 'Language Models are Few-Shot Learners',
 'display_name': 'Language Models are Few-Shot Learners',
 'relevance_score': 6745.986,
 'publication_year': 2020,
 'publication_date': '2020-05-28',
 'ids': {'openalex': 'https://openalex.org/W3030163527', 'mag': '3030163527'},
 'language': 'en',
 'primary_location': {'is_oa': False,
  'landing_page_url': 'https://arxiv.org/pdf/2005.14165.pdf',
  'pdf_url': None,
  'source': {'id': 'https://openalex.org/S4306400194',
   'display_name': 'arXiv (Cornell University)',
   'issn_l': None,
   'issn': None,
   'is_oa': True,
   'is_in_doaj': False,
   'host_organization': 'https://openalex.org/I205783295',
   'host_organization_name': 'Cornell University',
   'host_organization_lineage': ['https://openalex.org/I205783295'],
   'host_organization_lineage_names': ['Cornell University'],
   'type': 'repository'},
  'license': None,
  'version': 'submittedVersion',
  'is_accepted': False,


In [None]:
Authors().search("Marcin Andrychowicz").get()

[{'id': 'https://openalex.org/A5091819924',
  'orcid': None,
  'display_name': 'Marcin Andrychowicz',
  'display_name_alternatives': ['Marcin Andrychowicz'],
  'relevance_score': 11873.169,
  'works_count': 61,
  'cited_by_count': 4342,
  'summary_stats': {'2yr_mean_citedness': 0.07692307692307693,
   'h_index': 23,
   'i10_index': 32},
  'ids': {'openalex': 'https://openalex.org/A5091819924'},
  'last_known_institution': {'id': 'https://openalex.org/I1291425158',
   'ror': 'https://ror.org/00njsd438',
   'display_name': 'Google (United States)',
   'country_code': 'US',
   'type': 'company',
   'lineage': ['https://openalex.org/I1291425158',
    'https://openalex.org/I4210128969']},
  'x_concepts': [{'id': 'https://openalex.org/C41008148',
    'wikidata': 'https://www.wikidata.org/wiki/Q21198',
    'display_name': 'Computer science',
    'level': 0,
    'score': 100.0},
   {'id': 'https://openalex.org/C33923547',
    'wikidata': 'https://www.wikidata.org/wiki/Q395',
    'display_name'

How many affiliations are we missing?

In [None]:
num_works_with_missing_affiliation = 0
num_works_with_missing_affiliation_id = 0
for work in works:
    for authorship in work["authorships"]:
        if len(authorship["institutions"]) == 0:
            num_works_with_missing_affiliation += 1
            break
        if all([ins.get("id") is None for ins in authorship["institutions"]]):
            # Means there's a display name but no ID - I exclude these cases
            num_works_with_missing_affiliation_id += 1
            break
num_works_with_missing_affiliation, num_works_with_missing_affiliation_id

(10214, 0)

In [None]:
len(works)

66176

In [None]:
(num_works_with_missing_affiliation + num_works_with_missing_affiliation_id) / len(works)

0.15434598646034817

In [None]:
len(works) - (num_works_with_missing_affiliation + num_works_with_missing_affiliation_id)

55962

In [None]:
processor.author_id_to_name

defaultdict(<function researcher_impact.processors.OpenAlexProcessor.__init__.<locals>.<lambda>()>,
            {'Meta': defaultdict(dict,
                         {2015: {'https://openalex.org/A5001226970': 'Yann LeCun',
                           'https://openalex.org/A5076651586': 'Wojciech Zaremba',
                           'https://openalex.org/A5019934281': 'Ming Yang',
                           'https://openalex.org/A5077887756': "Marc'Aurelio Ranzato",
                           'https://openalex.org/A5047847314': 'Lada A. Adamic',
                           'https://openalex.org/A5086635908': 'Yunchao Gong',
                           'https://openalex.org/A5016319243': 'Du Tran',
                           'https://openalex.org/A5044186270': 'Lubomir Bourdev',
                           'https://openalex.org/A5089960673': 'Rob Fergus',
                           'https://openalex.org/A5054437548': 'Manohar Paluri',
                           'https://openalex.org/A50020785

In [None]:
institution_author_name_data["OpenAI"][2018]

['Prafulla Dhariwal',
 'Richard Y. Chen',
 'Miles Brundage',
 'Dario Amodei',
 'Bob McGrew',
 'John Schulman',
 'Tae-Hoon Kim',
 'Maciek Chociej',
 'Jonas Schneider',
 'Ilya Sutskever',
 'Rafal Jozefowicz',
 'Alec Radford',
 'Joshua Achiam',
 'Phillip Isola',
 'Matthias Plappert',
 'Xue Bin Peng',
 'Glenn Powell',
 'Igor Mordatch',
 'Wojciech Zaremba',
 'Marcin Andrychowicz',
 'Pieter Abbeel',
 'Harri Edwards',
 'Alex Ray',
 'Josh Tobin',
 'Rein Houthooft',
 'Oleg Klimov',
 'Bowen Baker',
 'Harrison Edwards',
 'Karl Cobbe',
 'Yuri Burda',
 'Peter Welinder',
 'Jack Clark']

Validate author de-duplication

In [None]:
total_authors = 0
total_unique_authors = 0

for alias in set(institution_aliases.values()):
    ins_authors = 0
    ins_unique_authors = 0
    for year in all_years:
        author_names_example = institution_author_name_data[alias][year]
        ins_authors += len(author_names_example)
        ins_unique_authors += len(set(author_names_example))
        total_authors += len(author_names_example)
        total_unique_authors += len(set(author_names_example))
    print(f"{alias}: {ins_authors} authors, {ins_unique_authors} unique authors ({100 * (ins_authors - ins_unique_authors) / ins_authors:.2f}% duplication)")
total_authors, total_unique_authors

Adobe: 1712 authors, 1712 unique authors (0.00% duplication)
Microsoft: 13635 authors, 13635 unique authors (0.00% duplication)
Yandex: 284 authors, 284 unique authors (0.00% duplication)
Twitter: 715 authors, 715 unique authors (0.00% duplication)
Huawei: 6387 authors, 6387 unique authors (0.00% duplication)
Amazon: 4255 authors, 4255 unique authors (0.00% duplication)
Google: 13733 authors, 13733 unique authors (0.00% duplication)
Alibaba: 4831 authors, 4831 unique authors (0.00% duplication)
IBM: 14344 authors, 14344 unique authors (0.00% duplication)
DeepMind: 1505 authors, 1505 unique authors (0.00% duplication)
OpenAI: 248 authors, 248 unique authors (0.00% duplication)
Netflix: 203 authors, 203 unique authors (0.00% duplication)
Meta: 4108 authors, 4108 unique authors (0.00% duplication)
Tencent: 3716 authors, 3716 unique authors (0.00% duplication)
Group Sense: 875 authors, 875 unique authors (0.00% duplication)
Enthought: 19 authors, 19 unique authors (0.00% duplication)
Quans

(86437, 86437)

In [119]:
meta_israel = Institutions().search('Meta (Israel)').get()[0]
meta_us = Institutions().search('Meta (United States)').get()[0]

In [117]:
Works().filter(authorships={"institutions": {"id": meta_israel['id']}}).get()

[{'id': 'https://openalex.org/W2963446712',
  'doi': 'https://doi.org/10.1109/cvpr.2017.243',
  'title': 'Densely Connected Convolutional Networks',
  'display_name': 'Densely Connected Convolutional Networks',
  'publication_year': 2017,
  'publication_date': '2017-07-01',
  'ids': {'openalex': 'https://openalex.org/W2963446712',
   'doi': 'https://doi.org/10.1109/cvpr.2017.243',
   'mag': '2963446712'},
  'language': 'en',
  'primary_location': {'is_oa': False,
   'landing_page_url': 'https://doi.org/10.1109/cvpr.2017.243',
   'pdf_url': None,
   'source': None,
   'license': None,
   'version': None,
   'is_accepted': False,
   'is_published': False},
  'type': 'article',
  'type_crossref': 'proceedings-article',
  'open_access': {'is_oa': True,
   'oa_status': 'green',
   'oa_url': 'https://arxiv.org/pdf/1608.06993',
   'any_repository_has_fulltext': True},
  'authorships': [{'author_position': 'first',
    'author': {'id': 'https://openalex.org/A5013240918',
     'display_name': '

In [118]:
len(get_institution_works([meta_israel['id']]))

16page [00:25,  1.60s/page]


2304

In [120]:
len(get_institution_works([meta_us['id']]))

7page [00:11,  1.65s/page]


1155