In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from collections import defaultdict
import datetime
import numpy as np
import os
import pandas as pd
import pickle
import plotly
import plotly.graph_objects as go
import plotly.io as pio
import pyalex
from pyalex import Authors, Concepts, Institutions, Works
from tqdm.notebook import tqdm

from researcher_impact.citations import *
from researcher_impact.plotting import *
from researcher_impact.processors import OpenAlexProcessor
from researcher_impact.pyalex_utils import *
from researcher_impact.regression import *
from researcher_impact.utils import *

In [3]:
# The polite pool has much faster and more consistent response times. To get into the polite pool, you set your email:
pyalex.config.email = "ben@epochai.org"

In [4]:
pio.templates.default = "plotly_white"

In [5]:
# Location to save data and results
data_file_location = 'data/'
os.makedirs(data_file_location, exist_ok=True)

result_file_location = 'results/'
os.makedirs(result_file_location, exist_ok=True)

In [128]:
concept_ids = [
    # "https://openalex.org/C41008148",  # Computer science
    # "https://openalex.org/C154945302",  # Artificial intelligence
    "https://openalex.org/C119857082",  # Machine learning
    "https://openalex.org/C108583219",  # Deep learning
]

In [129]:
concept_query = "|".join(concept_ids)
# concept_query = concept_ids
concept_query

'https://openalex.org/C119857082|https://openalex.org/C108583219'

In [130]:
# institution_id =  'https://openalex.org/I70931966'  # Université de Montréal
# institution_id = 'https://openalex.org/I97018004'  # Stanford University
# institution_id = 'https://openalex.org/I63966007'  # MIT
institution_id = 'https://openalex.org/I95457486'  # UC Berkeley

Fetch the most-cited authors in AI at the given institution.

In [131]:
authors_instance = Authors()
most_cited_ai_authors = authors_instance \
    .filter(last_known_institution={'id': institution_id}) \
    .filter(concepts={'id': concept_query}) \
    .sort(summary_stats={'h_index': 'desc'}) \
    .select('display_name') \
    .get()

In [132]:
for author in most_cited_ai_authors:
    print(author)
    # print(author['display_name'])

{'display_name': 'Michael I. Jordan'}
{'display_name': 'Rasmus Nielsen'}
{'display_name': 'Jitendra Malik'}
{'display_name': 'Trevor Darrell'}
{'display_name': 'Leon O. Chua'}
{'display_name': 'Pieter Abbeel'}
{'display_name': 'S. Shankar Sastry'}
{'display_name': 'Dawn Song'}
{'display_name': 'Montgomery Slatkin'}
{'display_name': 'Alexei A. Efros'}
{'display_name': 'Nicholas R. Jennings'}
{'display_name': 'Peter J. Bickel'}
{'display_name': 'Masayoshi Tomizuka'}
{'display_name': 'Timothy A. Brown'}
{'display_name': 'Kurt Keutzer'}
{'display_name': 'Leo A. Goodman'}
{'display_name': 'Claire J. Tomlin'}
{'display_name': 'Benjamin Recht'}
{'display_name': 'Alan Hubbard'}
{'display_name': 'Walter J. Freeman'}
{'display_name': 'Sophia Rabe-Hesketh'}
{'display_name': 'Stuart Russell'}
{'display_name': 'Peter L. Bartlett'}
{'display_name': 'Sandrine Dudoit'}
{'display_name': 'Frank S. Werblin'}


In [109]:
Authors().search('Pierre Jolicoeur').get()

[{'id': 'https://openalex.org/A5015753341',
  'orcid': None,
  'display_name': 'Pierre Jolicoeur',
  'display_name_alternatives': ['P. Jolicœur',
   'Jolicoeur Pierre',
   'Pierre Jolicoeur',
   'P. Jolicoeur',
   'P Jolicoeur',
   'Pierre Jolicœur'],
  'relevance_score': 10547.988,
  'works_count': 466,
  'cited_by_count': 13870,
  'summary_stats': {'2yr_mean_citedness': 0.8181818181818182,
   'h_index': 59,
   'i10_index': 222},
  'ids': {'openalex': 'https://openalex.org/A5015753341'},
  'last_known_institution': {'id': 'https://openalex.org/I70931966',
   'ror': 'https://ror.org/0161xgx34',
   'display_name': 'Université de Montréal',
   'country_code': 'CA',
   'type': 'education'},
  'x_concepts': [{'id': 'https://openalex.org/C86803240',
    'wikidata': 'https://www.wikidata.org/wiki/Q420',
    'display_name': 'Biology',
    'level': 0,
    'score': 74.2},
   {'id': 'https://openalex.org/C15744967',
    'wikidata': 'https://www.wikidata.org/wiki/Q9418',
    'display_name': 'Psyc

In [133]:
authors_instance.url

'https://api.openalex.org/authors?filter=last_known_institution.id:https://openalex.org/I95457486,concepts.id:https://openalex.org/C119857082|https://openalex.org/C108583219&sort=summary_stats.h_index:desc&select=display_name'

In [111]:
institution_instance = Institutions()
institution_instance.search('University of California, Berkeley').get()

[{'id': 'https://openalex.org/I95457486',
  'ror': 'https://ror.org/01an7q238',
  'display_name': 'University of California, Berkeley',
  'relevance_score': 317626.2,
  'country_code': 'US',
  'type': 'education',
  'homepage_url': 'http://www.berkeley.edu/',
  'image_url': 'https://commons.wikimedia.org/w/index.php?title=Special:Redirect/file/Seal%20of%20University%20of%20California%2C%20Berkeley.svg',
  'image_thumbnail_url': 'https://commons.wikimedia.org/w/index.php?title=Special:Redirect/file/Seal%20of%20University%20of%20California%2C%20Berkeley.svg&width=300',
  'display_name_acronyms': ['UCB'],
  'display_name_alternatives': ['UC Berkeley'],
  'repositories': [{'id': 'https://openalex.org/S4306402430',
    'display_name': 'eScholarship University of California (University of California)',
    'host_organization': 'https://openalex.org/I95457486',
    'host_organization_name': 'University of California, Berkeley',
    'host_organization_lineage': ['https://openalex.org/I95457486

In [112]:
institution_instance.url

'https://api.openalex.org/institutions?search=University+of+California%2C+Berkeley'

Alternative: get all the AI/ML works of the institution. Then run the processor on that. Then get the list of authors from that. Then sort by h-index or whatever metric.

In [31]:
institution_works = merge_pages(
    Works() \
        .filter(authorships={'institutions': {'id': institution_id}}) \
        .filter(concepts={'id': concept_query}) \
        .paginate(per_page=200, n_max=int(1e6))
)

51page [01:26,  1.69s/page]


In [32]:
processor = OpenAlexProcessor(institution_works, institution_aliases={'https://openalex.org/I70931966': 'Université de Montréal'}, selected_institution_ids=['https://openalex.org/I70931966'], citation_year_bound=3)

In [33]:
processor.process_works()

In [34]:
institution_author_data = processor.get_author_data()

In [35]:
institution_author_data

defaultdict(<function researcher_impact.processors.OpenAlexProcessor.deduplicate_authors.<locals>.<lambda>()>,
            {'Université de Montréal': defaultdict(set,
                         {2015: {'https://openalex.org/A5000181169',
                           'https://openalex.org/A5000372059',
                           'https://openalex.org/A5000656520',
                           'https://openalex.org/A5000884746',
                           'https://openalex.org/A5000971220',
                           'https://openalex.org/A5001883668',
                           'https://openalex.org/A5002101391',
                           'https://openalex.org/A5002300977',
                           'https://openalex.org/A5002574588',
                           'https://openalex.org/A5002700508',
                           'https://openalex.org/A5002807478',
                           'https://openalex.org/A5002832972',
                           'https://openalex.org/A5003094142',
        

In [43]:
all_author_ids = set()
for year_author_ids in institution_author_data['Université de Montréal'].values():
    for author_id in year_author_ids:
        all_author_ids.add(author_id)
len(all_author_ids)

6360

In [44]:
all_author_ids

{'https://openalex.org/A5030514644',
 'https://openalex.org/A5044681452',
 'https://openalex.org/A5090777482',
 'https://openalex.org/A5062263560',
 'https://openalex.org/A5018000450',
 'https://openalex.org/A5016766547',
 'https://openalex.org/A5018953179',
 'https://openalex.org/A5052833118',
 'https://openalex.org/A5003253618',
 'https://openalex.org/A5055558936',
 'https://openalex.org/A5008669266',
 'https://openalex.org/A5082680857',
 'https://openalex.org/A5020156205',
 'https://openalex.org/A5071782046',
 'https://openalex.org/A5075110348',
 'https://openalex.org/A5089841192',
 'https://openalex.org/A5089598314',
 'https://openalex.org/A5082385549',
 'https://openalex.org/A5050101462',
 'https://openalex.org/A5039348167',
 'https://openalex.org/A5080225460',
 'https://openalex.org/A5001883668',
 'https://openalex.org/A5007481373',
 'https://openalex.org/A5071164882',
 'https://openalex.org/A5036439345',
 'https://openalex.org/A5081725332',
 'https://openalex.org/A5071238636',
 

In [45]:
author_metric = {}
for i, author_id in enumerate(all_author_ids):
    author_obj = Authors()[author_id]
    author_metric[author_id] = author_obj['summary_stats']['h_index']
    if i > 10: break
author_metric

{'https://openalex.org/A5030514644': 34,
 'https://openalex.org/A5044681452': 26,
 'https://openalex.org/A5090777482': 17,
 'https://openalex.org/A5062263560': 11,
 'https://openalex.org/A5018000450': 25,
 'https://openalex.org/A5016766547': 5,
 'https://openalex.org/A5018953179': 16,
 'https://openalex.org/A5052833118': 24,
 'https://openalex.org/A5003253618': 10,
 'https://openalex.org/A5055558936': 6,
 'https://openalex.org/A5008669266': 4,
 'https://openalex.org/A5082680857': 22}