In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from bs4 import BeautifulSoup
import datetime
import os
import pickle
import pyalex
from pyalex import Authors, Works
import requests

from research_impact.processors import OpenAlexProcessor
from research_impact.pyalex_utils import merge_pages

In [3]:
# The polite pool has much faster and more consistent response times. To get into the polite pool, you set your email:
pyalex.config.email = "ben@epochai.org"

In [4]:
data_file_location = "data/"
os.makedirs(data_file_location, exist_ok=True)

# Extract additional OpenAI works based on the Research webpage

In [5]:
def extract_urls(html_folder, html_filename, page_number):
    with open(html_folder + html_filename + str(page_number) + ".html", 'r') as f:
        text = f.read()
    
    soup = BeautifulSoup(text, 'html.parser')

    # Find the specific unordered list
    ul = soup.find('ul', {'aria-labelledby': 'feedHeading'})

    urls = []

    # If the unordered list was found, find all 'a' tags within it
    if ul:
        for link in ul.find_all('a'):
            href = link.get('href')
            if href.startswith('http'):
                urls.append(href)
                # print(href)

    return urls

In [6]:
def extract_title(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Try to get title from the <title> tag
    title_tag = soup.find('title')
    if title_tag:
        return title_tag.text

    # If no <title> tag, try <h1> through <h6>
    for i in range(1, 7):
        header_tag = soup.find(f'h{i}')
        if header_tag:
            return header_tag.text

    # If no title or header tags found, return None
    return None

In [7]:
extract_urls("data/openai/", "openai_research_index_page_", 7)

['https://arxiv.org/abs/1706.01905',
 'https://arxiv.org/abs/1707.06347',
 'https://arxiv.org/abs/1707.07397',
 'https://arxiv.org/abs/1707.01495',
 'https://arxiv.org/abs/1707.00183',
 'https://arxiv.org/abs/1706.03741',
 'https://arxiv.org/abs/1706.02275',
 'https://arxiv.org/abs/1706.01502',
 'https://arxiv.org/abs/1704.06440',
 'https://arxiv.org/abs/1704.03012',
 'https://arxiv.org/abs/1704.01444',
 'https://arxiv.org/abs/1703.03864',
 'https://arxiv.org/abs/1703.07326']

In [8]:
html_folder = "data/openai/"
html_filename = "openai_research_index_page_"

In [9]:
openai_research_urls = []
for page in range(1, 9 + 1):
    page_urls = extract_urls(html_folder, html_filename, page)
    print(len(page_urls))
    openai_research_urls.extend(page_urls)
len(openai_research_urls)

17
15
18
16
15
14
13
14
3


125

In [10]:
openai_research_urls

['https://arxiv.org/abs/2307.03718',
 'https://arxiv.org/abs/2305.20050',
 'https://openaipublic.blob.core.windows.net/neuron-explainer/paper/index.html',
 'https://arxiv.org/abs/2303.10130',
 'https://arxiv.org/abs/2303.08774',
 'https://arxiv.org/abs/2301.04246',
 'https://arxiv.org/abs/2212.08751',
 'https://arxiv.org/abs/2210.10760',
 'https://cdn.openai.com/papers/whisper.pdf',
 'https://arxiv.org/abs/2207.14255',
 'https://arxiv.org/abs/2207.14157',
 'https://arxiv.org/abs/2206.11795',
 'https://arxiv.org/abs/2206.08896',
 'https://arxiv.org/abs/2206.05802',
 'https://arxiv.org/abs/2205.14334',
 'https://arxiv.org/abs/2204.06125',
 'https://cdn.openai.com/papers/Economic_Impacts_Research_Agenda.pdf',
 'https://arxiv.org/abs/2202.01344',
 'https://arxiv.org/abs/2203.02155',
 'https://arxiv.org/abs/2201.10005',
 'https://arxiv.org/abs/2112.09332',
 'http://arxiv.org/abs/2110.14168',
 'https://arxiv.org/abs/2109.07958',
 'https://arxiv.org/abs/2107.03374',
 'https://cdn.openai.com/p

In [11]:
# Additional URLs from manual browsing
additional_urls = [
    "https://arxiv.org/abs/2109.10862",
    "https://arxiv.org/abs/1710.06537",
    "https://arxiv.org/abs/1710.06542",
    "https://arxiv.org/abs/1703.06907",
    "https://arxiv.org/abs/1703.07326",
]
openai_research_urls.extend(additional_urls)
# Remove duplicates. Example: multiple GPT-2 announcements.
openai_research_urls = list(set(openai_research_urls))
len(openai_research_urls)

121

In [12]:
openai_research_titles = []
for url in openai_research_urls:
    title = extract_title(url)
    # Remove the square bracket part of the title, if present
    if title is not None:
        if '] ' in title:
            title = title.split('] ')[1]
        title = title.strip()
        openai_research_titles.append(title)
    print(title)
len(openai_research_titles)


Recursively Summarizing Books with Human Feedback
One-Shot Imitation Learning
Video PreTraining (VPT): Learning to Act by Watching Unlabeled Online Videos
Parameter Space Noise for Exploration
Efficient Training of Language Models to Fill in the Middle
Some Considerations on Learning to Explore via Meta-Reinforcement Learning
Solving Rubik's Cube with a Robot Hand
None
Learning to Generate Reviews and Discovering Sentiment
Prediction and Control with Temporal Segment Models
Teaching Models to Express Their Uncertainty in Words
Variational Option Discovery Algorithms
Semi-supervised Knowledge Transfer for Deep Learning from Private Training Data
Implicit Generation and Generalization in Energy-Based Models
Large-Scale Study of Curiosity-Driven Learning
None
AI safety via debate
OpenAI Gym
Sim-to-Real Transfer of Robotic Control with Dynamics Randomization
None
Supervising strong learners by amplifying weak experts
On First-Order Meta-Learning Algorithms
None
Understanding the Capabiliti

112

In [13]:
for title in openai_research_titles:
    print("Original title: ", title)
    openalex_results = Works().search(title).get()
    if len(openalex_results) > 0:
        top_result = openalex_results[0]
        print("Search result: ", top_result['display_name'])
        print(top_result['id'])
        if len(openalex_results) > 1:
            print("Relevance score: ", top_result['relevance_score'])
            print("Match score: ", top_result['relevance_score'] / top_result['cited_by_count']**0.5)
    print()


Original title:  Recursively Summarizing Books with Human Feedback
Search result:  Recursively Summarizing Books with Human Feedback
https://openalex.org/W3200980294
Relevance score:  486.11057
Match score:  217.39525582023398

Original title:  One-Shot Imitation Learning
Search result:  One-Shot Imitation Learning
https://openalex.org/W2963094133
Relevance score:  2516.118
Match score:  184.4907228766487

Original title:  Video PreTraining (VPT): Learning to Act by Watching Unlabeled Online Videos
Search result:  Video PreTraining (VPT): Learning to Act by Watching Unlabeled Online
  Videos
https://openalex.org/W4283460722

Original title:  Parameter Space Noise for Exploration
Search result:  Parameter Space Noise for Exploration
https://openalex.org/W2623491082
Relevance score:  1871.2231
Match score:  152.27797625659912

Original title:  Efficient Training of Language Models to Fill in the Middle
Search result:  Efficient Training of Language Models to Fill in the Middle
https://op

In [14]:
Works().search("Extensions and Limitations of the Neural GPU").get()[0]

{'id': 'https://openalex.org/W2548137223',
 'doi': None,
 'title': 'Extensions and Limitations of the Neural GPU',
 'display_name': 'Extensions and Limitations of the Neural GPU',
 'relevance_score': 395.41043,
 'publication_year': 2016,
 'publication_date': '2016-11-04',
 'ids': {'openalex': 'https://openalex.org/W2548137223', 'mag': '2548137223'},
 'language': 'en',
 'primary_location': {'is_oa': False,
  'landing_page_url': 'https://openreview.net/pdf?id=ryjp1c9xg',
  'pdf_url': None,
  'source': {'id': 'https://openalex.org/S4306400194',
   'display_name': 'arXiv (Cornell University)',
   'issn_l': None,
   'issn': None,
   'is_oa': True,
   'is_in_doaj': False,
   'host_organization': 'https://openalex.org/I205783295',
   'host_organization_name': 'Cornell University',
   'host_organization_lineage': ['https://openalex.org/I205783295'],
   'host_organization_lineage_names': ['Cornell University'],
   'type': 'repository'},
  'license': None,
  'version': 'submittedVersion',
  'is_

Filter OpenAlex works that sufficiently match by title.

In [15]:
openai_research_works = []
for title in openai_research_titles:
    print("Original title: ", title)
    openalex_results = Works().search(title).get()
    if len(openalex_results) > 0:
        top_result = openalex_results[0]
        print("Search result: ", top_result['display_name'])
        print(top_result['id'])
        if len(openalex_results) > 1:
            print("Relevance score: ", top_result['relevance_score'])
            match_score = top_result['relevance_score'] / top_result['cited_by_count']**0.5
            print("Match score: ", match_score)
            if match_score >= 100:
                openai_research_works.append(top_result)
        else:
            openai_research_works.append(top_result)
    print()
len(openai_research_works)

Original title:  Recursively Summarizing Books with Human Feedback
Search result:  Recursively Summarizing Books with Human Feedback
https://openalex.org/W3200980294
Relevance score:  486.11057
Match score:  217.39525582023398

Original title:  One-Shot Imitation Learning
Search result:  One-Shot Imitation Learning
https://openalex.org/W2963094133
Relevance score:  2516.118
Match score:  184.4907228766487

Original title:  Video PreTraining (VPT): Learning to Act by Watching Unlabeled Online Videos
Search result:  Video PreTraining (VPT): Learning to Act by Watching Unlabeled Online
  Videos
https://openalex.org/W4283460722

Original title:  Parameter Space Noise for Exploration
Search result:  Parameter Space Noise for Exploration
https://openalex.org/W2623491082
Relevance score:  1871.2231
Match score:  152.27797625659912

Original title:  Efficient Training of Language Models to Fill in the Middle
Search result:  Efficient Training of Language Models to Fill in the Middle
https://op

104

In [16]:
openai_research_works

[{'id': 'https://openalex.org/W3200980294',
  'doi': None,
  'title': 'Recursively Summarizing Books with Human Feedback',
  'display_name': 'Recursively Summarizing Books with Human Feedback',
  'relevance_score': 486.11057,
  'publication_year': 2021,
  'publication_date': '2021-09-22',
  'ids': {'openalex': 'https://openalex.org/W3200980294', 'mag': '3200980294'},
  'language': 'en',
  'primary_location': {'is_oa': False,
   'landing_page_url': 'http://arxiv.org/pdf/2109.10862.pdf',
   'pdf_url': None,
   'source': {'id': 'https://openalex.org/S4306400194',
    'display_name': 'arXiv (Cornell University)',
    'issn_l': None,
    'issn': None,
    'is_oa': True,
    'is_in_doaj': False,
    'host_organization': 'https://openalex.org/I205783295',
    'host_organization_name': 'Cornell University',
    'host_organization_lineage': ['https://openalex.org/I205783295'],
    'host_organization_lineage_names': ['Cornell University'],
    'type': 'repository'},
   'license': None,
   'versi

In [17]:
with open("data/openai/openai_research_works", "wb") as f:
    pickle.dump(openai_research_works, f)

# Compare with OpenAlex works that have an OpenAI affiliation recorded

In [5]:
with open("data/openai/openai_research_works", "rb") as f:
    openai_research_works = pickle.load(f)

In [6]:
len(openai_research_works)

104

In [8]:
with open(data_file_location + "selected_institution_works_openalex_deduplicated_2023-08-29_12-22-34", "rb") as f:
    works_obj = pickle.load(f)
    all_works = works_obj["works"]

In [9]:
len(all_works)

66093

In [10]:
selected_institution_ids = [
    "https://openalex.org/I4210161460",  # OpenAI (United States)
]

In [11]:
institution_aliases = {selected_institution_ids[0]: 'OpenAI'}

In [12]:
processor = OpenAlexProcessor(all_works, selected_institution_ids, institution_aliases, citation_year_bound=3)
processor.process_works()

In [13]:
institution_authors_by_work = processor.get_authors_by_work()
institution_authors_by_work

defaultdict(<function research_impact.processors.OpenAlexProcessor.process_works.<locals>.<lambda>()>,
            {'OpenAI': defaultdict(list,
                         {'https://openalex.org/W2618530766': ['https://openalex.org/A5024209719'],
                          'https://openalex.org/W2473418344': ['https://openalex.org/A5004271128'],
                          'https://openalex.org/W2951777958': ['https://openalex.org/A5067370075'],
                          'https://openalex.org/W2965130685': ['https://openalex.org/A5041662257',
                           'https://openalex.org/A5068388812'],
                          'https://openalex.org/W3198675127': ['https://openalex.org/A5061984381'],
                          'https://openalex.org/W2976896402': ['https://openalex.org/A5039751155'],
                          'https://openalex.org/W3129659662': ['https://openalex.org/A5031789995'],
                          'https://openalex.org/W4308995511': ['https://openalex.org/A5073086

In [14]:
openai_authors_by_work = institution_authors_by_work["OpenAI"]
openai_authors_by_work

defaultdict(list,
            {'https://openalex.org/W2618530766': ['https://openalex.org/A5024209719'],
             'https://openalex.org/W2473418344': ['https://openalex.org/A5004271128'],
             'https://openalex.org/W2951777958': ['https://openalex.org/A5067370075'],
             'https://openalex.org/W2965130685': ['https://openalex.org/A5041662257',
              'https://openalex.org/A5068388812'],
             'https://openalex.org/W3198675127': ['https://openalex.org/A5061984381'],
             'https://openalex.org/W2976896402': ['https://openalex.org/A5039751155'],
             'https://openalex.org/W3129659662': ['https://openalex.org/A5031789995'],
             'https://openalex.org/W4308995511': ['https://openalex.org/A5073086818'],
             'https://openalex.org/W4376455648': ['https://openalex.org/A5070645440'],
             'https://openalex.org/W4207074987': ['https://openalex.org/A5000711679'],
             'https://openalex.org/W2983335573': ['https://ope

In [15]:
openai_openalex_work_ids = list(openai_authors_by_work.keys())
len(openai_openalex_work_ids)

77

In [16]:
openai_research_work_ids = [work["id"] for work in openai_research_works]
len(openai_research_work_ids)

104

In [17]:
len(openai_research_work_ids) - len(set(openai_research_work_ids))

0

In [18]:
len(openai_openalex_work_ids) - len(set(openai_openalex_work_ids))

0

In [19]:
for work_id in set(openai_research_work_ids).intersection(set(openai_openalex_work_ids)):
    print(Works()[work_id]['title'])

Scaling Laws for Neural Language Models
Domain randomization for transferring deep neural networks from simulation to the real world
Teacher–Student Curriculum Learning
Learning Transferable Visual Models From Natural Language Supervision
#Exploration: a study of count-based exploration for deep reinforcement learning
Multimodal Neurons in Artificial Neural Networks
Sim-to-Real Transfer of Robotic Control with Dynamics Randomization
Domain Randomization and Generative Models for Robotic Grasping
Learning dexterous in-hand manipulation


In [20]:
new_openai_research_works = [work for work in openai_research_works if work["id"] not in set(openai_openalex_work_ids)]
len(new_openai_research_works)

95

In [21]:
all_time_unique_authors = {}
for work in new_openai_research_works:
    print("Title: ", work["title"])
    for authorship in work["authorships"]:
        print("Author: ", authorship["author"]["display_name"])
        print("Institutions: ", authorship["institutions"])
        print("Raw affiliation strings: ", authorship["raw_affiliation_strings"])
        print()
        all_time_unique_authors[authorship["author"]["id"]] = authorship["author"]["display_name"]
    print()

Title:  Recursively Summarizing Books with Human Feedback
Author:  Jeff Wu
Institutions:  []
Raw affiliation strings:  ['[OpenAI.]']

Author:  Long Ouyang
Institutions:  []
Raw affiliation strings:  []

Author:  Daniel M. Ziegler
Institutions:  []
Raw affiliation strings:  []

Author:  Nisan Stiennon
Institutions:  []
Raw affiliation strings:  []

Author:  Ryan Lowe
Institutions:  []
Raw affiliation strings:  []

Author:  Jan Leike
Institutions:  []
Raw affiliation strings:  []

Author:  Paul F. Christiano
Institutions:  []
Raw affiliation strings:  []


Title:  One-Shot Imitation Learning
Author:  Yan Duan
Institutions:  [{'id': 'https://openalex.org/I95457486', 'display_name': 'University of California, Berkeley', 'ror': 'https://ror.org/01an7q238', 'country_code': 'US', 'type': 'education'}]
Raw affiliation strings:  []

Author:  Marcin Andrychowicz
Institutions:  [{'id': 'https://openalex.org/I4654613', 'display_name': 'University of Warsaw', 'ror': 'https://ror.org/039bjqg32', 'co

In [22]:
print("Total unique authors: ", len(all_time_unique_authors))
all_time_unique_authors

Total unique authors:  343


{'https://openalex.org/A5007570707': 'Jeff Wu',
 'https://openalex.org/A5068949174': 'Long Ouyang',
 'https://openalex.org/A5089747113': 'Daniel M. Ziegler',
 'https://openalex.org/A5019575601': 'Nisan Stiennon',
 'https://openalex.org/A5004295653': 'Ryan Lowe',
 'https://openalex.org/A5090592321': 'Jan Leike',
 'https://openalex.org/A5037736834': 'Paul F. Christiano',
 'https://openalex.org/A5027941146': 'Yan Duan',
 'https://openalex.org/A5091819924': 'Marcin Andrychowicz',
 'https://openalex.org/A5004762590': 'Bradly C. Stadie',
 'https://openalex.org/A5067435818': 'OpenAI Jonathan Ho',
 'https://openalex.org/A5009926169': 'Jonas Schneider',
 'https://openalex.org/A5006446297': 'Ilya Sutskever',
 'https://openalex.org/A5049349154': 'Pieter Abbeel',
 'https://openalex.org/A5076651586': 'Wojciech Zaremba',
 'https://openalex.org/A5048522044': 'Bowen Baker',
 'https://openalex.org/A5046491870': 'Ilge Akkaya',
 'https://openalex.org/A5018957231': 'P. A. Zhokhov',
 'https://openalex.org/

In [23]:
num_authors_confirmed_openai = 0
for author_id, author_name in all_time_unique_authors.items():
    last_known_affiliation = Authors()[author_id]["last_known_institution"]
    if last_known_affiliation is not None: 
        if last_known_affiliation["id"] == selected_institution_ids[0]:
            num_authors_confirmed_openai += 1
        print(f"{author_name}: {last_known_affiliation['display_name']}")
num_authors_confirmed_openai

Jeff Wu: Visual Sciences (United States)
Long Ouyang: Plastic Surgery Hospital
Daniel M. Ziegler: Massachusetts Institute of Technology
Ryan Lowe: Facebook (Israel)
Jan Leike: DeepMind (United Kingdom)
Paul F. Christiano: University of California, Berkeley
Yan Duan: Qingdao Agricultural University
Marcin Andrychowicz: Google (United States)
Bradly C. Stadie: University of California, Berkeley
Jonas Schneider: University of Basel
Ilya Sutskever: OpenAI (United States)
Pieter Abbeel: University of California, Berkeley
Wojciech Zaremba: New York University
Bowen Baker: International Commission on Missing Persons
Ilge Akkaya: OpenAI (United States)
P. A. Zhokhov: Texas A&M University
Joost Huizinga: OpenAI (United States)
Jie Tang: University of Chinese Academy of Sciences
Adrien Ecoffet: International Commission on Missing Persons
Brandon Houghton: International Commission on Missing Persons
Raul Sampedro: OpenAI (United States)
Jeff Clune: OpenAI (United States)
Matthias Plappert: OpenAI

30

In [24]:
openai_openalex_works = [Works()[work_id] for work_id in openai_openalex_work_ids]

In [25]:
len(openai_openalex_works)

77

In [26]:
all_time_unique_authors_openalex = {}
for work in openai_openalex_works:
    print("Title: ", work["title"])
    for authorship in work["authorships"]:
        print("Author: ", authorship["author"]["display_name"])
        print("Institutions: ", authorship["institutions"])
        print("Raw affiliation strings: ", authorship["raw_affiliation_strings"])
        print()
        for institution in authorship["institutions"]:
            if institution["id"] == selected_institution_ids[0]:
                all_time_unique_authors_openalex[authorship["author"]["id"]] = authorship["author"]["display_name"]
                break
    print()

Title:  ImageNet classification with deep convolutional neural networks
Author:  Alex Krizhevsky
Institutions:  [{'id': 'https://openalex.org/I1291425158', 'display_name': 'Google (United States)', 'ror': 'https://ror.org/00njsd438', 'country_code': 'US', 'type': 'company', 'lineage': ['https://openalex.org/I1291425158', 'https://openalex.org/I4210128969']}]
Raw affiliation strings:  ['GOOGLE INC.']

Author:  Ilya Sutskever
Institutions:  [{'id': 'https://openalex.org/I1291425158', 'display_name': 'Google (United States)', 'ror': 'https://ror.org/00njsd438', 'country_code': 'US', 'type': 'company', 'lineage': ['https://openalex.org/I1291425158', 'https://openalex.org/I4210128969']}]
Raw affiliation strings:  ['GOOGLE INC.']

Author:  Geoffrey E. Hinton
Institutions:  [{'id': 'https://openalex.org/I4210161460', 'display_name': 'OpenAI (United States)', 'ror': 'https://ror.org/05wx9n238', 'country_code': 'US', 'type': None, 'lineage': ['https://openalex.org/I4210161460']}]
Raw affiliatio

In [27]:
all_time_unique_authors_openalex

{'https://openalex.org/A5024209719': 'Geoffrey E. Hinton',
 'https://openalex.org/A5004271128': 'Ian Goodfellow',
 'https://openalex.org/A5067370075': 'Rafal Jozefowicz',
 'https://openalex.org/A5041662257': 'Gabriel Goh',
 'https://openalex.org/A5068388812': 'Venkatesan Guruswami',
 'https://openalex.org/A5061984381': 'Jong Wook Kim',
 'https://openalex.org/A5039751155': 'Chris Olah',
 'https://openalex.org/A5031789995': 'Lukasz Kaiser',
 'https://openalex.org/A5073086818': 'Scott McKinney',
 'https://openalex.org/A5070645440': 'Samuel S. Schoenholz',
 'https://openalex.org/A5000711679': 'Gillian K. Hadfield',
 'https://openalex.org/A5014407395': 'Georgia Gkioxari',
 'https://openalex.org/A5084821923': 'Yuandong Tian',
 'https://openalex.org/A5076651586': 'Wojciech Zaremba',
 'https://openalex.org/A5008951080': 'Yi Wu',
 'https://openalex.org/A5006797018': 'Jeff Clune',
 'https://openalex.org/A5014959740': 'Rosie Campbell',
 'https://openalex.org/A5009970175': 'Li Weng',
 'https://ope

In [28]:
set(all_time_unique_authors_openalex.keys()).intersection(set(all_time_unique_authors.keys()))

{'https://openalex.org/A5000711679',
 'https://openalex.org/A5000965900',
 'https://openalex.org/A5002281342',
 'https://openalex.org/A5003630193',
 'https://openalex.org/A5004271128',
 'https://openalex.org/A5006446297',
 'https://openalex.org/A5006797018',
 'https://openalex.org/A5007406730',
 'https://openalex.org/A5007861777',
 'https://openalex.org/A5008951080',
 'https://openalex.org/A5009037058',
 'https://openalex.org/A5009926169',
 'https://openalex.org/A5009970175',
 'https://openalex.org/A5010167098',
 'https://openalex.org/A5010674841',
 'https://openalex.org/A5011053473',
 'https://openalex.org/A5014193325',
 'https://openalex.org/A5016448989',
 'https://openalex.org/A5017456911',
 'https://openalex.org/A5018598100',
 'https://openalex.org/A5019471288',
 'https://openalex.org/A5025285284',
 'https://openalex.org/A5026541761',
 'https://openalex.org/A5026829243',
 'https://openalex.org/A5028772381',
 'https://openalex.org/A5028891032',
 'https://openalex.org/A5030305998',
 

In [29]:
overlapping_openai_author_ids = list(
    set(all_time_unique_authors_openalex.keys()).intersection(set(all_time_unique_authors.keys()))
)

In [30]:
for author_id in overlapping_openai_author_ids:
    print(all_time_unique_authors_openalex[author_id])

Ilya Sutskever
Joshua Achiam
Stanislas Polu
Yi Wu
Chris Hallacy
Jeff Clune
Prafulla Dhariwal
Ingmar Kanitscheider
Bob McGrew
Jack Clark
Phillip Isola
Benjamin Chess
Bowen Baker
M. N. Petrov
Jeffrey Wu
Rein Houthooft
Askell, Amanda
Jong Wook Kim
Dario Amodei
Jakub Pachocki
Adrien Ecoffet
Miles Brundage
Pamela Mishkin
A. Ramesh
Arthur Petron
Josh Tobin
Wojciech Zaremba
Lukasz Kaiser
Jonas Schneider
Pieter Abbeel
McCandlish, Sam
Szymon Sidor
Henighan, Tom
Henrique Ponde de Oliveira Pinto
Glenn Powell
Ilge Akkaya
Gillian K. Hadfield
Rewon Child
Chris Olah
Maciek Chociej
Jacob Hilton
Li Weng
Alex Paino
Marcin Andrychowicz
Jesse Michael Han
Qiming Yuan
Rafal Jozefowicz
Rachel Fong
Alex Ray
Chelsea Voss
Alec Radford
Peter Welinder
Gretchen Krueger
T. B. Brown
Ian Goodfellow
Igor Mordatch
Matthias Plappert
Sandhini Agarwal
Girish Sastry
John Schulman
Tae-Hoon Kim
Harrison Edwards
Venkatesan Guruswami
Raul Sampedro
Scott Gray
Joost Huizinga


Modify the extra works so that the authorships which we know are affiliated with OpenAI have their affiliation recorded.

Test case: work that doesn't have an OpenAI affiliation

In [31]:
new_openai_research_works[0]["authorships"]

[{'author_position': 'first',
  'author': {'id': 'https://openalex.org/A5007570707',
   'display_name': 'Jeff Wu',
   'orcid': None},
  'institutions': [],
  'countries': [],
  'is_corresponding': False,
  'raw_author_name': 'Jeff Wu',
  'raw_affiliation_string': '[OpenAI.]',
  'raw_affiliation_strings': ['[OpenAI.]']},
 {'author_position': 'middle',
  'author': {'id': 'https://openalex.org/A5068949174',
   'display_name': 'Long Ouyang',
   'orcid': None},
  'institutions': [],
  'countries': [],
  'is_corresponding': False,
  'raw_author_name': 'Long Ouyang',
  'raw_affiliation_string': '',
  'raw_affiliation_strings': []},
 {'author_position': 'middle',
  'author': {'id': 'https://openalex.org/A5089747113',
   'display_name': 'Daniel M. Ziegler',
   'orcid': None},
  'institutions': [],
  'countries': [],
  'is_corresponding': False,
  'raw_author_name': 'Daniel M. Ziegler',
  'raw_affiliation_string': '',
  'raw_affiliation_strings': []},
 {'author_position': 'middle',
  'author': {

In [35]:
len(new_openai_research_works)

95

In [32]:
openai_institution_data = {
    'id': 'https://openalex.org/I4210161460',
    'display_name': 'OpenAI (United States)',
    'ror': 'https://ror.org/05wx9n238',
    'country_code': 'US',
    'type': None
}
num_works_added = 0
for work in new_openai_research_works:
    work_added = False
    for authorship in work["authorships"]:
        if authorship["author"]["id"] in overlapping_openai_author_ids:
            authorship["institutions"].append(openai_institution_data)
            if not work_added:
                num_works_added += 1
                work_added = True
        elif any([text in authorship["raw_affiliation_string"].lower() for text in ("openai", "open ai")]):
            authorship["institutions"].append(openai_institution_data)
            if not work_added:
                num_works_added += 1
                work_added = True
num_works_added

87

In [33]:
new_openai_research_works[0]["authorships"]

[{'author_position': 'first',
  'author': {'id': 'https://openalex.org/A5007570707',
   'display_name': 'Jeff Wu',
   'orcid': None},
  'institutions': [{'id': 'https://openalex.org/I4210161460',
    'display_name': 'OpenAI (United States)',
    'ror': 'https://ror.org/05wx9n238',
    'country_code': 'US',
    'type': None}],
  'countries': [],
  'is_corresponding': False,
  'raw_author_name': 'Jeff Wu',
  'raw_affiliation_string': '[OpenAI.]',
  'raw_affiliation_strings': ['[OpenAI.]']},
 {'author_position': 'middle',
  'author': {'id': 'https://openalex.org/A5068949174',
   'display_name': 'Long Ouyang',
   'orcid': None},
  'institutions': [],
  'countries': [],
  'is_corresponding': False,
  'raw_author_name': 'Long Ouyang',
  'raw_affiliation_string': '',
  'raw_affiliation_strings': []},
 {'author_position': 'middle',
  'author': {'id': 'https://openalex.org/A5089747113',
   'display_name': 'Daniel M. Ziegler',
   'orcid': None},
  'institutions': [],
  'countries': [],
  'is_cor

Save as new dataset

In [34]:
timestamp = datetime.datetime.now()
fname = f"openai_research_works_processed_{timestamp.strftime('%Y-%m-%d_%H-%M-%S')}"
with open(f"data/openai/{fname}", "wb") as f:
    pickle.dump(new_openai_research_works, f)