In [23]:
import datetime
import json
import os
import pickle
from pyalex import Works

In [3]:
# Directory (relative path) that holds the input and output data files.
# The trailing slash matters: other cells build file paths by plain string concatenation.
data_file_location = 'data/'
# Create the directory up front; exist_ok=True makes this cell safe to re-run.
os.makedirs(data_file_location, exist_ok=True)

# Load initial data

In [4]:
# Read the pickled bundle of institution works.
# NOTE: pickle.load can execute arbitrary code, so only load files produced by a trusted source.
initial_pickle_path = data_file_location + "selected_institution_works_openalex_deduplicated_2023-08-29_12-22-34"
with open(initial_pickle_path, "rb") as f:
    works_obj = pickle.load(f)

# Pull the three parts used by the rest of the notebook out of the bundle.
works = works_obj["works"]
selected_institution_ids = works_obj["selected_institution_ids"]
institution_aliases = works_obj["institution_aliases"]

In [5]:
len(works)

66093

In [6]:
works[0]

{'id': 'https://openalex.org/W2919115771',
 'doi': 'https://doi.org/10.1038/nature14539',
 'title': 'Deep learning',
 'display_name': 'Deep learning',
 'publication_year': 2015,
 'publication_date': '2015-05-27',
 'ids': {'openalex': 'https://openalex.org/W2919115771',
  'doi': 'https://doi.org/10.1038/nature14539',
  'mag': '2919115771',
  'pmid': 'https://pubmed.ncbi.nlm.nih.gov/26017442'},
 'language': 'en',
 'primary_location': {'is_oa': False,
  'landing_page_url': 'https://doi.org/10.1038/nature14539',
  'pdf_url': None,
  'source': {'id': 'https://openalex.org/S137773608',
   'display_name': 'Nature',
   'issn_l': '0028-0836',
   'issn': ['1476-4687', '0028-0836'],
   'is_oa': False,
   'is_in_doaj': False,
   'host_organization': 'https://openalex.org/P4310319908',
   'host_organization_name': 'Nature Portfolio',
   'host_organization_lineage': ['https://openalex.org/P4310319908',
    'https://openalex.org/P4310319965'],
   'host_organization_lineage_names': ['Nature Portfolio'

In [7]:
selected_institution_ids

['https://openalex.org/I1291425158',
 'https://openalex.org/I4210113297',
 'https://openalex.org/I4210100430',
 'https://openalex.org/I4210148186',
 'https://openalex.org/I4210117425',
 'https://openalex.org/I4210131802',
 'https://openalex.org/I4210090411',
 'https://openalex.org/I2252078561',
 'https://openalex.org/I4210114444',
 'https://openalex.org/I4210111288',
 'https://openalex.org/I1290206253',
 'https://openalex.org/I4210164937',
 'https://openalex.org/I4210113369',
 'https://openalex.org/I4210124949',
 'https://openalex.org/I4210105678',
 'https://openalex.org/I4210087053',
 'https://openalex.org/I4210125051',
 'https://openalex.org/I4210162141',
 'https://openalex.org/I4210086099',
 'https://openalex.org/I4210153468',
 'https://openalex.org/I4210161634',
 'https://openalex.org/I4210110431',
 'https://openalex.org/I4210099966',
 'https://openalex.org/I4210108625',
 'https://openalex.org/I4210135422',
 'https://openalex.org/I4210139986',
 'https://openalex.org/I4210109507',
 

In [8]:
institution_aliases

{'https://openalex.org/I1291425158': 'Google',
 'https://openalex.org/I4210113297': 'Google',
 'https://openalex.org/I4210100430': 'Google',
 'https://openalex.org/I4210148186': 'Google',
 'https://openalex.org/I4210117425': 'Google',
 'https://openalex.org/I4210131802': 'Google',
 'https://openalex.org/I4210090411': 'DeepMind',
 'https://openalex.org/I2252078561': 'Meta',
 'https://openalex.org/I4210114444': 'Meta',
 'https://openalex.org/I4210111288': 'Meta',
 'https://openalex.org/I1290206253': 'Microsoft',
 'https://openalex.org/I4210164937': 'Microsoft',
 'https://openalex.org/I4210113369': 'Microsoft',
 'https://openalex.org/I4210124949': 'Microsoft',
 'https://openalex.org/I4210105678': 'Microsoft',
 'https://openalex.org/I4210087053': 'Microsoft',
 'https://openalex.org/I4210125051': 'Microsoft',
 'https://openalex.org/I4210162141': 'Microsoft',
 'https://openalex.org/I4210086099': 'Microsoft',
 'https://openalex.org/I4210153468': 'Microsoft',
 'https://openalex.org/I4210161634

In [9]:
# Correction: OpenAlex has systematically labelled affiliations containing "SenseTime" or
# "Sense Time" as the institution "Group Sense"; 97.5% of "Group Sense" papers are actually SenseTime.
# There is no institution called SenseTime in OpenAlex, so the best we can do is simply
# override the alias stored for this institution ID.
# See `publication_sense_checks.ipynb` for more details.
institution_aliases['https://openalex.org/I4210128910'] = 'SenseTime'

# Add extra OpenAI data and merge

In [10]:
# Load the extra OpenAI works (a pickled collection of OpenAlex work dicts).
# The path is built from `data_file_location`, like every other load/save in this notebook,
# so moving the data directory only requires changing that one setting.
# NOTE: pickle.load can execute arbitrary code, so only load files produced by a trusted source.
with open(data_file_location + "openai/openai_research_works_processed_2023-09-07_17-16-25", "rb") as f:
    openai_research_works = pickle.load(f)

In [11]:
len(openai_research_works)

95

In [12]:
openai_research_works[0]

{'id': 'https://openalex.org/W3200980294',
 'doi': None,
 'title': 'Recursively Summarizing Books with Human Feedback',
 'display_name': 'Recursively Summarizing Books with Human Feedback',
 'relevance_score': 486.11057,
 'publication_year': 2021,
 'publication_date': '2021-09-22',
 'ids': {'openalex': 'https://openalex.org/W3200980294', 'mag': '3200980294'},
 'language': 'en',
 'primary_location': {'is_oa': False,
  'landing_page_url': 'http://arxiv.org/pdf/2109.10862.pdf',
  'pdf_url': None,
  'source': {'id': 'https://openalex.org/S4306400194',
   'display_name': 'arXiv (Cornell University)',
   'issn_l': None,
   'issn': None,
   'is_oa': True,
   'is_in_doaj': False,
   'host_organization': 'https://openalex.org/I205783295',
   'host_organization_name': 'Cornell University',
   'host_organization_lineage': ['https://openalex.org/I205783295'],
   'host_organization_lineage_names': ['Cornell University'],
   'type': 'repository'},
  'license': None,
  'version': 'submittedVersion',


In [13]:
# OpenAlex work IDs to drop from the dataset; the merge step matches on the work's 'id' field.
excluded_works = [
    'https://openalex.org/W2618530766',  # 2017 AlexNet paper with incorrect affiliations
]

In [14]:
# Merge the extra OpenAI works into the main list:
#   * a work present in both datasets is swapped for the OpenAI version, keeping its position;
#   * a work listed in `excluded_works` (and absent from the OpenAI dataset) is dropped and reported;
#   * OpenAI works that matched nothing are appended at the end.
openai_works_dict = {}
for openai_entry in openai_research_works:
    openai_works_dict[openai_entry["id"]] = openai_entry

new_works = []
openai_works_replaced = set()
for work in works:
    work_id = work["id"]
    if work_id in openai_works_dict:
        new_works.append(openai_works_dict[work_id])
        openai_works_replaced.add(work_id)
        continue
    # Exclusion is decided by OpenAlex work ID, not by title
    if work_id in excluded_works:
        print(f"Skipping \"{work['display_name']}\"")
        continue
    new_works.append(work)

# Any OpenAI work that was not used as a replacement is new to the dataset
openai_works_added = set()
for work_id, openai_entry in openai_works_dict.items():
    if work_id in openai_works_replaced:
        continue
    new_works.append(openai_entry)
    openai_works_added.add(work_id)

Skipping "ImageNet classification with deep convolutional neural networks"


In [15]:
openai_works_replaced

{'https://openalex.org/W2560512785',
 'https://openalex.org/W2762117857',
 'https://openalex.org/W2762872434',
 'https://openalex.org/W2898917980',
 'https://openalex.org/W2950602864',
 'https://openalex.org/W2963989027',
 'https://openalex.org/W2964075320',
 'https://openalex.org/W2964263543',
 'https://openalex.org/W2973525135',
 'https://openalex.org/W3030163527',
 'https://openalex.org/W3082115681',
 'https://openalex.org/W3093419064'}

In [16]:
openai_works_added

{'https://openalex.org/W2462906003',
 'https://openalex.org/W2530944449',
 'https://openalex.org/W2548137223',
 'https://openalex.org/W2566467060',
 'https://openalex.org/W2578206533',
 'https://openalex.org/W2591957724',
 'https://openalex.org/W2595180411',
 'https://openalex.org/W2596367596',
 'https://openalex.org/W2606347107',
 'https://openalex.org/W2606433045',
 'https://openalex.org/W2609650878',
 'https://openalex.org/W2623491082',
 'https://openalex.org/W2736601468',
 'https://openalex.org/W2749928749',
 'https://openalex.org/W2765602917',
 'https://openalex.org/W2766774033',
 'https://openalex.org/W2767313115',
 'https://openalex.org/W2785397462',
 'https://openalex.org/W2786303200',
 'https://openalex.org/W2787887017',
 'https://openalex.org/W2789008106',
 'https://openalex.org/W2795900505',
 'https://openalex.org/W2796979132',
 'https://openalex.org/W2798877128',
 'https://openalex.org/W2807324060',
 'https://openalex.org/W2883433335',
 'https://openalex.org/W2885550588',
 

In [18]:
# Spot check: look up one of the newly added OpenAI work IDs through pyalex's `Works`.
# NOTE(review): `openai_works_added` is a set, so the order of `list(...)` - and therefore which
# work index 2 picks - is not guaranteed to be the same between kernel sessions.
Works()[list(openai_works_added)[2]]

{'id': 'https://openalex.org/W2609650878',
 'doi': None,
 'title': 'Equivalence Between Policy Gradients and Soft Q-Learning',
 'display_name': 'Equivalence Between Policy Gradients and Soft Q-Learning',
 'publication_year': 2017,
 'publication_date': '2017-04-21',
 'ids': {'openalex': 'https://openalex.org/W2609650878', 'mag': '2609650878'},
 'language': 'en',
 'primary_location': {'is_oa': False,
  'landing_page_url': 'https://arxiv.org/pdf/1704.06440.pdf',
  'pdf_url': None,
  'source': {'id': 'https://openalex.org/S4306400194',
   'display_name': 'arXiv (Cornell University)',
   'issn_l': None,
   'issn': None,
   'is_oa': True,
   'is_in_doaj': False,
   'host_organization': 'https://openalex.org/I205783295',
   'host_organization_name': 'Cornell University',
   'host_organization_lineage': ['https://openalex.org/I205783295'],
   'host_organization_lineage_names': ['Cornell University'],
   'type': 'repository'},
  'license': None,
  'version': 'submittedVersion',
  'is_accepted':

In [19]:
len(new_works) - len(works)

82

In [20]:
# From here on `works` refers to the merged list built in `new_works`.
# NOTE(review): after this rebinding, re-running the merge cell would iterate over the merged
# list rather than the originally loaded one - restart the kernel to reproduce the counts above.
works = new_works

In [21]:
len(works)

66175

# Save

In [33]:
def write_jsonl(entries, filename, datatype=list):
    """Serialise `entries` to `filename` as JSON, overwriting any existing file.

    Parameters
    ----------
    entries : iterable or dict
        The data to write. Each item (or the whole dict) must be JSON-serialisable.
    filename : str
        Path of the output file.
    datatype : type, default list
        Pass `dict` to write `entries` as one single JSON document. Any other value
        writes JSON Lines: one JSON document per item, separated by newlines.

    Notes
    -----
    * No newline is written after the last record.
    * An empty `entries` produces an empty file instead of raising IndexError.
    * Any iterable is accepted (list, tuple, set, generator), not just sliceable sequences.
    * A dict is always written as a single JSON document, even if `datatype` was left at
      its default, because iterating it would silently write only its keys.
    """
    with open(filename, "w", encoding="utf-8") as f:
        if datatype == dict or isinstance(entries, dict):
            json.dump(entries, f)
            return
        for index, entry in enumerate(entries):
            # Newline goes *between* records so the file has no trailing newline
            if index:
                f.write("\n")
            json.dump(entry, f)

In [35]:
# Use one timestamp for all three outputs so files written by the same run share a suffix.
timestamp = datetime.datetime.now()
timestamp_str = timestamp.strftime('%Y-%m-%d_%H-%M-%S')

works_fname = "selected_institution_works_openalex_deduplicated_extra_openai_" + timestamp_str
ids_fname = "selected_institution_ids_" + timestamp_str
aliases_fname = "institution_aliases_" + timestamp_str

# Works and institution IDs are written one JSON record per line.
write_jsonl(works, f"{data_file_location}{works_fname}.jsonl")
write_jsonl(selected_institution_ids, f"{data_file_location}{ids_fname}.jsonl")
# The alias mapping is written as a single JSON object (datatype=dict), despite the .jsonl extension.
write_jsonl(institution_aliases, f"{data_file_location}{aliases_fname}.jsonl", datatype=dict)