# Setup

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from collections import defaultdict
import datetime
import os
import pandas as pd
import pickle
import pyalex
from pyalex import Works
from tqdm.notebook import tqdm

from researcher_impact.pyalex_utils import merge_pages

In [3]:
# The polite pool has much faster and more consistent response times. To get into the polite pool, you set your email:
pyalex.config.email = "ben@epochai.org"

In [4]:
data_file_location = 'data/'
os.makedirs(data_file_location, exist_ok=True)

# Gather raw data

In [5]:
# From institutions.ipynb
selected_institution_ids = [
    "https://openalex.org/I1291425158",  # Google (United States)
    "https://openalex.org/I4210113297",  # Google (United Kingdom)
    "https://openalex.org/I4210100430",  # Google (Switzerland)
    "https://openalex.org/I4210148186",  # Google (Canada)
    "https://openalex.org/I4210117425",  # Google (Israel)
    "https://openalex.org/I4210131802",  # Google (Ireland)
    "https://openalex.org/I4210090411",  # DeepMind (United Kingdom)
    "https://openalex.org/I2252078561",  # Meta (Israel)
    "https://openalex.org/I4210114444",  # Meta (United States)
    "https://openalex.org/I4210111288",  # Meta (United Kingdom)
    "https://openalex.org/I1290206253",  # Microsoft (United States)
    "https://openalex.org/I4210164937",  # Microsoft Research (United Kingdom)
    "https://openalex.org/I4210113369",  # Microsoft Research Asia (China)
    "https://openalex.org/I4210124949",  # Microsoft Research (India)
    "https://openalex.org/I4210105678",  # Microsoft (Finland)
    "https://openalex.org/I4210087053",  # Microsoft (Germany)
    "https://openalex.org/I4210125051",  # Microsoft (Israel)
    "https://openalex.org/I4210162141",  # Microsoft (India)
    "https://openalex.org/I4210086099",  # Microsoft (Brazil)
    "https://openalex.org/I4210153468",  # Microsoft (Canada)
    "https://openalex.org/I4210161634",  # Microsoft (France)
    "https://openalex.org/I4210110431",  # Microsoft (Netherlands)
    "https://openalex.org/I4210099966",  # Microsoft (Denmark)
    "https://openalex.org/I4210108625",  # Microsoft (United Kingdom)
    "https://openalex.org/I4210135422",  # Microsoft (Norway)
    "https://openalex.org/I4210139986",  # Microsoft (Switzerland)
    "https://openalex.org/I4210109507",  # Microsoft (Ireland)
    "https://openalex.org/I4210092974",  # Microsoft (Portugal)
    "https://openalex.org/I4210151458",  # Microsoft (Belgium)
    "https://openalex.org/I4210161460",  # OpenAI (United States)
    "https://openalex.org/I45928872",  # Alibaba Group (China)
    "https://openalex.org/I4210095624",  # Alibaba Group (United States)
    "https://openalex.org/I4210086143",  # Alibaba Group (Cayman Islands)
    "https://openalex.org/I1304085615",  # Nvidia (United Kingdom)
    "https://openalex.org/I4210127875",  # Nvidia (United States)
    "https://openalex.org/I98301712",  # Baidu (China)
    "https://openalex.org/I1311688040",  # Amazon (United States)
    "https://openalex.org/I4210089985",  # Amazon (Germany)
    "https://openalex.org/I4210123934",  # Amazon (United Kingdom)
    "https://openalex.org/I2250653659",  # Tencent (China)
    "https://openalex.org/I4210103558",  # Tencent Healthcare (China)
    "https://openalex.org/I1341412227",  # IBM (United States)
    "https://openalex.org/I4210114115",  # IBM Research - Thomas J. Watson Research Center
    "https://openalex.org/I4210085935",  # IBM Research - Almaden
    "https://openalex.org/I4210126328",  # IBM Research - Zurich
    "https://openalex.org/I4210167297",  # IBM Research - Haifa
    "https://openalex.org/I4210156936",  # IBM Research - Austin
    "https://openalex.org/I4210145865",  # IBM Research - Tokyo
    "https://openalex.org/I4210103279",  # IBM Research - India
    "https://openalex.org/I4210126794",  # IBM Research - China
    "https://openalex.org/I4210113654",  # IBM (Canada)
    "https://openalex.org/I4210095996",  # IBM (Germany)
    "https://openalex.org/I4210145784",  # IBM Research - Ireland
    "https://openalex.org/I4210120068",  # IBM Research - Australia
    "https://openalex.org/I4210121844",  # IBM (United Kingdom)
    "https://openalex.org/I4210113516",  # IBM Research - Brazil
    "https://openalex.org/I4210112067",  # IBM (France)
    "https://openalex.org/I4210167072",  # IBM (Brazil)
    "https://openalex.org/I4210145636",  # IBM (Spain)
    "https://openalex.org/I4210129961",  # IBM (India)
    "https://openalex.org/I4210152544",  # IBM (Egypt)
    "https://openalex.org/I4210122524",  # IBM (Italy)
    "https://openalex.org/I4210100977",  # IBM (Ireland)
    "https://openalex.org/I4210148065",  # IBM (Netherlands)
    "https://openalex.org/I4210131585",  # IBM (Belgium)
    "https://openalex.org/I4210129932",  # IBM (Portugal)
    "https://openalex.org/I1343180700",  # Intel (United States)
    "https://openalex.org/I4210158342",  # Intel (United Kingdom)
    "https://openalex.org/I4210104622",  # Intel (Israel)
    "https://openalex.org/I4210094487",  # Intel (Germany)
    "https://openalex.org/I4210142644",  # Intel (Malaysia)
    "https://openalex.org/I131781684",  # Intel (Ireland)
    "https://openalex.org/I4210146682",  # Intel (India)
    "https://openalex.org/I4210133876",  # Intel (Taiwan)
    "https://openalex.org/I4210140447",  # Intel (Brazil)
    "https://openalex.org/I4210155185",  # Intel (Poland)
    "https://openalex.org/I4210135487",  # Intel (United Arab Emirates)
    "https://openalex.org/I2250955327",  # Huawei Technologies (China)
    "https://openalex.org/I4210123571",  # Huawei Technologies (France)
    "https://openalex.org/I4210146936",  # Huawei Technologies (United States)
    "https://openalex.org/I4210159102",  # Huawei Technologies (Sweden)
    "https://openalex.org/I4210115038",  # Huawei Technologies (Canada)
    "https://openalex.org/I4210129353",  # Huawei Technologies (Germany)
    "https://openalex.org/I4210160618",  # Huawei Technologies (United Kingdom)
    "https://openalex.org/I4210166625",  # Huawei German Research Center
    "https://openalex.org/I113979032",  # Twitter (United States)
    "https://openalex.org/I1306409833",  # Adobe Systems (United States)
    "https://openalex.org/I4210132870",  # Xerox (United States)
    "https://openalex.org/I33976269",  # Xerox (France)
    "https://openalex.org/I4210159094",  # Xerox (Canada)
    "https://openalex.org/I4210138183",  # Xerox (United Kingdom)
    "https://openalex.org/I4210155268",  # Salesforce (United States)
    "https://openalex.org/I60922564",  # Naver (South Korea)
    "https://openalex.org/I4210121859",  # Enthought (United States)
    "https://openalex.org/I4210098317",  # Quansight (United States)
    "https://openalex.org/I4210128910",  # Group Sense (China)
    "https://openalex.org/I118347220",  # NEC (Japan)
    "https://openalex.org/I4210107353",  # NEC (United States)
    "https://openalex.org/I4210164122",  # NEC (Germany)
    "https://openalex.org/I4210149379",  # NEC (China)
    "https://openalex.org/I4210161085",  # NEC Technologies (United Kingdom)
    "https://openalex.org/I4210087554",  # NEC (United Kingdom)
    "https://openalex.org/I4210156496",  # Uber AI (United States)
    "https://openalex.org/I2946016260",  # Uber AI (United States)
    "https://openalex.org/I58957048",  # Yandex (Russia)
    "https://openalex.org/I869089601",  # Netflix (United States)
]
len(selected_institution_ids)

106

In [6]:
selected_institutions_text = """
"https://openalex.org/I1291425158",  # Google (United States)
"https://openalex.org/I4210113297",  # Google (United Kingdom)
"https://openalex.org/I4210100430",  # Google (Switzerland)
"https://openalex.org/I4210148186",  # Google (Canada)
"https://openalex.org/I4210117425",  # Google (Israel)
"https://openalex.org/I4210131802",  # Google (Ireland)
"https://openalex.org/I4210090411",  # DeepMind (United Kingdom)
"https://openalex.org/I2252078561",  # Meta (Israel)
"https://openalex.org/I4210114444",  # Meta (United States)
"https://openalex.org/I4210111288",  # Meta (United Kingdom)
"https://openalex.org/I1290206253",  # Microsoft (United States)
"https://openalex.org/I4210164937",  # Microsoft Research (United Kingdom)
"https://openalex.org/I4210113369",  # Microsoft Research Asia (China)
"https://openalex.org/I4210124949",  # Microsoft Research (India)
"https://openalex.org/I4210105678",  # Microsoft (Finland)
"https://openalex.org/I4210087053",  # Microsoft (Germany)
"https://openalex.org/I4210125051",  # Microsoft (Israel)
"https://openalex.org/I4210162141",  # Microsoft (India)
"https://openalex.org/I4210086099",  # Microsoft (Brazil)
"https://openalex.org/I4210153468",  # Microsoft (Canada)
"https://openalex.org/I4210161634",  # Microsoft (France)
"https://openalex.org/I4210110431",  # Microsoft (Netherlands)
"https://openalex.org/I4210099966",  # Microsoft (Denmark)
"https://openalex.org/I4210108625",  # Microsoft (United Kingdom)
"https://openalex.org/I4210135422",  # Microsoft (Norway)
"https://openalex.org/I4210139986",  # Microsoft (Switzerland)
"https://openalex.org/I4210109507",  # Microsoft (Ireland)
"https://openalex.org/I4210092974",  # Microsoft (Portugal)
"https://openalex.org/I4210151458",  # Microsoft (Belgium)
"https://openalex.org/I4210161460",  # OpenAI (United States)
"https://openalex.org/I45928872",  # Alibaba Group (China)
"https://openalex.org/I4210095624",  # Alibaba Group (United States)
"https://openalex.org/I4210086143",  # Alibaba Group (Cayman Islands)
"https://openalex.org/I1304085615",  # Nvidia (United Kingdom)
"https://openalex.org/I4210127875",  # Nvidia (United States)
"https://openalex.org/I98301712",  # Baidu (China)
"https://openalex.org/I1311688040",  # Amazon (United States)
"https://openalex.org/I4210089985",  # Amazon (Germany)
"https://openalex.org/I4210123934",  # Amazon (United Kingdom)
"https://openalex.org/I2250653659",  # Tencent (China)
"https://openalex.org/I4210103558",  # Tencent Healthcare (China)
"https://openalex.org/I1341412227",  # IBM (United States)
"https://openalex.org/I4210114115",  # IBM Research - Thomas J. Watson Research Center
"https://openalex.org/I4210085935",  # IBM Research - Almaden
"https://openalex.org/I4210126328",  # IBM Research - Zurich
"https://openalex.org/I4210167297",  # IBM Research - Haifa
"https://openalex.org/I4210156936",  # IBM Research - Austin
"https://openalex.org/I4210145865",  # IBM Research - Tokyo
"https://openalex.org/I4210103279",  # IBM Research - India
"https://openalex.org/I4210126794",  # IBM Research - China
"https://openalex.org/I4210113654",  # IBM (Canada)
"https://openalex.org/I4210095996",  # IBM (Germany)
"https://openalex.org/I4210145784",  # IBM Research - Ireland
"https://openalex.org/I4210120068",  # IBM Research - Australia
"https://openalex.org/I4210121844",  # IBM (United Kingdom)
"https://openalex.org/I4210113516",  # IBM Research - Brazil
"https://openalex.org/I4210112067",  # IBM (France)
"https://openalex.org/I4210167072",  # IBM (Brazil)
"https://openalex.org/I4210145636",  # IBM (Spain)
"https://openalex.org/I4210129961",  # IBM (India)
"https://openalex.org/I4210152544",  # IBM (Egypt)
"https://openalex.org/I4210122524",  # IBM (Italy)
"https://openalex.org/I4210100977",  # IBM (Ireland)
"https://openalex.org/I4210148065",  # IBM (Netherlands)
"https://openalex.org/I4210131585",  # IBM (Belgium)
"https://openalex.org/I4210129932",  # IBM (Portugal)
"https://openalex.org/I1343180700",  # Intel (United States)
"https://openalex.org/I4210158342",  # Intel (United Kingdom)
"https://openalex.org/I4210104622",  # Intel (Israel)
"https://openalex.org/I4210094487",  # Intel (Germany)
"https://openalex.org/I4210142644",  # Intel (Malaysia)
"https://openalex.org/I131781684",  # Intel (Ireland)
"https://openalex.org/I4210146682",  # Intel (India)
"https://openalex.org/I4210133876",  # Intel (Taiwan)
"https://openalex.org/I4210140447",  # Intel (Brazil)
"https://openalex.org/I4210155185",  # Intel (Poland)
"https://openalex.org/I4210135487",  # Intel (United Arab Emirates)
"https://openalex.org/I2250955327",  # Huawei Technologies (China)
"https://openalex.org/I4210123571",  # Huawei Technologies (France)
"https://openalex.org/I4210146936",  # Huawei Technologies (United States)
"https://openalex.org/I4210159102",  # Huawei Technologies (Sweden)
"https://openalex.org/I4210115038",  # Huawei Technologies (Canada)
"https://openalex.org/I4210129353",  # Huawei Technologies (Germany)
"https://openalex.org/I4210160618",  # Huawei Technologies (United Kingdom)
"https://openalex.org/I4210166625",  # Huawei German Research Center
"https://openalex.org/I113979032",  # Twitter (United States)
"https://openalex.org/I1306409833",  # Adobe Systems (United States)
"https://openalex.org/I4210132870",  # Xerox (United States)
"https://openalex.org/I33976269",  # Xerox (France)
"https://openalex.org/I4210159094",  # Xerox (Canada)
"https://openalex.org/I4210138183",  # Xerox (United Kingdom)
"https://openalex.org/I4210155268",  # Salesforce (United States)
"https://openalex.org/I60922564",  # Naver (South Korea)
"https://openalex.org/I4210121859",  # Enthought (United States)
"https://openalex.org/I4210098317",  # Quansight (United States)
"https://openalex.org/I4210128910",  # Group Sense (China)
"https://openalex.org/I118347220",  # NEC (Japan)
"https://openalex.org/I4210107353",  # NEC (United States)
"https://openalex.org/I4210164122",  # NEC (Germany)
"https://openalex.org/I4210149379",  # NEC (China)
"https://openalex.org/I4210161085",  # NEC Technologies (United Kingdom)
"https://openalex.org/I4210087554",  # NEC (United Kingdom)
"https://openalex.org/I4210156496",  # Uber AI (United States)
"https://openalex.org/I2946016260",  # Uber AI (United States)
"https://openalex.org/I58957048",  # Yandex (Russia)
"https://openalex.org/I869089601",  # Netflix (United States)
"""

# Map each institution ID to a short alias: the first word of the institution
# name written in the trailing "#" comment of each line, e.g.
# "Microsoft Research (India)" -> "Microsoft". "Group Sense" is special-cased
# so that it keeps both words.
institution_aliases = {}
for line in selected_institutions_text.splitlines():
    line = line.strip()
    if not line:
        continue  # blank lines (such as the one right after the opening quotes)
    id_part, _, institution_name = line.partition("#")
    institution_id = id_part.split(",")[0].strip().strip('"')
    institution_name = institution_name.strip()
    if not institution_id or not institution_name:
        # A malformed line would otherwise surface as an opaque IndexError
        raise ValueError(f"Cannot parse institution line: {line!r}")
    if "Group Sense" in institution_name:
        institution_alias = "Group Sense"
    else:
        institution_alias = institution_name.split(" ")[0].strip()
    institution_aliases[institution_id] = institution_alias

# The text block duplicates `selected_institution_ids`; fail loudly if the two
# copies ever drift apart instead of silently losing aliases.
assert set(institution_aliases) == set(selected_institution_ids), \
    "selected_institutions_text is out of sync with selected_institution_ids"
institution_aliases

{'https://openalex.org/I1291425158': 'Google',
 'https://openalex.org/I4210113297': 'Google',
 'https://openalex.org/I4210100430': 'Google',
 'https://openalex.org/I4210148186': 'Google',
 'https://openalex.org/I4210117425': 'Google',
 'https://openalex.org/I4210131802': 'Google',
 'https://openalex.org/I4210090411': 'DeepMind',
 'https://openalex.org/I2252078561': 'Meta',
 'https://openalex.org/I4210114444': 'Meta',
 'https://openalex.org/I4210111288': 'Meta',
 'https://openalex.org/I1290206253': 'Microsoft',
 'https://openalex.org/I4210164937': 'Microsoft',
 'https://openalex.org/I4210113369': 'Microsoft',
 'https://openalex.org/I4210124949': 'Microsoft',
 'https://openalex.org/I4210105678': 'Microsoft',
 'https://openalex.org/I4210087053': 'Microsoft',
 'https://openalex.org/I4210125051': 'Microsoft',
 'https://openalex.org/I4210162141': 'Microsoft',
 'https://openalex.org/I4210086099': 'Microsoft',
 'https://openalex.org/I4210153468': 'Microsoft',
 'https://openalex.org/I4210161634

In [7]:
len(selected_institution_ids)

106

In [8]:
# Join all selected institution IDs with "|" into a single string.
# NOTE(review): the fetch loop in this notebook filters on one institution ID
# at a time and does not appear to use this string — confirm it is still needed.
institution_query = "|".join(selected_institution_ids)
institution_query

'https://openalex.org/I1291425158|https://openalex.org/I4210113297|https://openalex.org/I4210100430|https://openalex.org/I4210148186|https://openalex.org/I4210117425|https://openalex.org/I4210131802|https://openalex.org/I4210090411|https://openalex.org/I2252078561|https://openalex.org/I4210114444|https://openalex.org/I4210111288|https://openalex.org/I1290206253|https://openalex.org/I4210164937|https://openalex.org/I4210113369|https://openalex.org/I4210124949|https://openalex.org/I4210105678|https://openalex.org/I4210087053|https://openalex.org/I4210125051|https://openalex.org/I4210162141|https://openalex.org/I4210086099|https://openalex.org/I4210153468|https://openalex.org/I4210161634|https://openalex.org/I4210110431|https://openalex.org/I4210099966|https://openalex.org/I4210108625|https://openalex.org/I4210135422|https://openalex.org/I4210139986|https://openalex.org/I4210109507|https://openalex.org/I4210092974|https://openalex.org/I4210151458|https://openalex.org/I4210161460|https://o

In [9]:
concept_ids = [
    'https://openalex.org/C154945302',  # Artificial intelligence
    'https://openalex.org/C119857082',  # Machine learning
]

In [10]:
concept_query = "|".join(concept_ids)
concept_query

'https://openalex.org/C154945302|https://openalex.org/C119857082'

In [11]:
# Fetch the works of every selected institution that match the AI/ML concept
# filter and were published between 2010-01-01 and 2023-06-01, keeping each
# OpenAlex work ID only once across institutions.
# Took ~15 minutes for ~75K works on Macbook Pro 2022
params = []  # query parameters used for each institution, kept for reproducibility
failed_institution_ids = []  # institutions whose fetch raised, so they can be retried
n_max = int(1e6)  # passed to paginate() for each institution
works = []
unique_work_ids = set()
for institution_id in tqdm(selected_institution_ids):
    works_instance = Works()
    try:
        # merge_pages presumably flattens the paginator into one list of works —
        # confirm in researcher_impact.pyalex_utils
        new_works = merge_pages(
            works_instance
            .filter(authorships={"institutions": {"id": institution_id}})
            .filter(concepts={"id": concept_query})
            .filter(from_publication_date="2010-01-01")
            .filter(to_publication_date="2023-06-01")
            .paginate(per_page=200, n_max=n_max)
        )
    except Exception as e:
        print(f"Error for {institution_id}")
        print(f"{works_instance.params}")
        print(e)
        failed_institution_ids.append(institution_id)
        # Reset explicitly: otherwise the loop below would raise NameError on the
        # first institution, or silently re-process the previous institution's works.
        new_works = []
    for work in new_works:
        if work['id'] not in unique_work_ids:
            works.append(work)
            unique_work_ids.add(work['id'])
    params.append(works_instance.params)

if failed_institution_ids:
    print(f"Failed to fetch {len(failed_institution_ids)} institution(s): {failed_institution_ids}")

assert len(works) < n_max
len(works)

  0%|          | 0/106 [00:00<?, ?it/s]

62page [02:10,  2.10s/page]
1page [00:02,  2.93s/page]
2page [00:03,  1.86s/page]
1page [00:01,  1.82s/page]
1page [00:01,  1.65s/page]
1page [00:00,  1.03page/s]
5page [00:10,  2.14s/page]
16page [00:31,  1.99s/page]
7page [00:15,  2.25s/page]
1page [00:01,  1.85s/page]
44page [01:34,  2.14s/page]
21page [00:44,  2.14s/page]
17page [00:34,  2.00s/page]
4page [00:05,  1.42s/page]
1page [00:01,  1.81s/page]
1page [00:02,  2.02s/page]
1page [00:01,  1.87s/page]
1page [00:01,  1.98s/page]
1page [00:01,  1.60s/page]
1page [00:01,  1.84s/page]
1page [00:01,  1.35s/page]
1page [00:01,  1.10s/page]
1page [00:01,  1.23s/page]
1page [00:01,  1.48s/page]
1page [00:01,  1.38s/page]
1page [00:01,  1.50s/page]
1page [00:01,  1.19s/page]
1page [00:01,  1.38s/page]
1page [00:01,  1.09s/page]
1page [00:01,  1.65s/page]
13page [00:25,  1.97s/page]
7page [00:16,  2.29s/page]
1page [00:02,  2.10s/page]
5page [00:08,  1.74s/page]
5page [00:10,  2.00s/page]
11page [00:19,  1.80s/page]
13page [00:24,  1.86s

76544

In [12]:
# Persist the raw fetch (plus the query parameters that produced it) under a
# timestamped name, so later sessions can skip the slow API calls.
timestamp = datetime.datetime.now()
raw_works_fname = "selected_institution_works_openalex_" + timestamp.strftime('%Y-%m-%d_%H-%M-%S')
with open(data_file_location + raw_works_fname, "wb") as f:
    obj = dict(
        params=params,  # for reproducibility
        works=works,
    )
    pickle.dump(obj, f)

In [11]:
# Reload a specific, previously saved raw fetch instead of querying the API again.
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
raw_works_path = data_file_location + "selected_institution_works_openalex_2023-08-28_16-23-39"
with open(raw_works_path, "rb") as f:
    works_obj = pickle.load(f)
works = works_obj["works"]
len(works)

76544

In [12]:
# Sanity check: every work ID should appear exactly once at this point
work_ids = [work['id'] for work in works]
assert len(work_ids) == len(set(work_ids))

# Deduplicate

In [13]:
# First pass at de-duplication: keep the first work seen for each normalized
# title (lower-cased, with every non-alphanumeric character removed).
# Works whose display_name is None are left out of `unique_works`.
unique_titles = set()
unique_works = []
for work in works:
    raw_title = work['display_name']
    if raw_title is None:
        continue
    title = ''.join(ch for ch in raw_title.lower() if ch.isalnum())
    if title in unique_titles:
        continue
    unique_titles.add(title)
    unique_works.append(work)
len(unique_titles), len(unique_works)

(66093, 66093)

In [14]:
len(unique_works) / len(works)

0.8634641513377926

In [15]:
# Fraction of all citations that remain after title-based de-duplication
citations_kept = sum(w['cited_by_count'] for w in unique_works)
citations_total = sum(w['cited_by_count'] for w in works)
citations_kept / citations_total

0.9343082322183108

Remove the less-cited duplicates

This is the safe option, to avoid double-counting citations.

In [16]:
# Second pass at de-duplication: for each normalized title (lower-cased,
# non-alphanumeric characters removed) keep the work with the most citations.
# On a tie the work seen first is kept. Works without a title are skipped.
# Note that `unique_works` is now a dict keyed by normalized title.
unique_works = {}
for work in works:
    raw_title = work['display_name']
    if raw_title is None:
        continue
    title = ''.join(ch for ch in raw_title.lower() if ch.isalnum())
    if title not in unique_works:
        unique_works[title] = work
        continue

    duplicate_work = unique_works[title]

    # Log both records so the duplicate pairs can be inspected by eye
    print(work['id'], work['display_name'])
    print(duplicate_work['id'], duplicate_work['display_name'])
    print(work['cited_by_count'], duplicate_work['cited_by_count'])
    print()

    # Only a strictly higher citation count displaces the stored work
    if work['cited_by_count'] > duplicate_work['cited_by_count']:
        unique_works[title] = work

https://openalex.org/W1836465849 Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift
https://openalex.org/W2949117887 Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift
3448 7349

https://openalex.org/W2626778328 Attention Is All You Need
https://openalex.org/W2963403868 Attention is All you Need
3052 15573

https://openalex.org/W2557283755 Deep Learning
https://openalex.org/W2919115771 Deep learning
3034 48856

https://openalex.org/W2950133940 Distributed Representations of Words and Phrases and their Compositionality
https://openalex.org/W2153579005 Distributed Representations of Words and Phrases and their Compositionality
2562 8521

https://openalex.org/W1614298861 Efficient Estimation of Word Representations in Vector Space
https://openalex.org/W2950577311 Efficient Estimation of Word Representations in Vector Space
2356 6666

https://openalex.org/W2949888546 Sequence to Sequence Learning with Neura

In [17]:
len(unique_works)

66093

In [18]:
# Keep a reference to the pre-deduplication list, then rebind `works` to the
# values of the `unique_works` dict (one work per normalized title).
# NOTE(review): this cell is not idempotent — running it a second time makes
# `works_with_duplicates` point at the already-deduplicated list.
works_with_duplicates = works  # for safekeeping
works = list(unique_works.values())

In [19]:
len(works)

66093

# Save

In [21]:
# Persist the deduplicated works together with the institution IDs and aliases
# under a timestamped name. The query parameters are not stored in this file.
timestamp = datetime.datetime.now()
fname = "selected_institution_works_openalex_deduplicated_" + timestamp.strftime('%Y-%m-%d_%H-%M-%S')
with open(data_file_location + fname, "wb") as f:
    obj = dict(
        works=works,
        selected_institution_ids=selected_institution_ids,
        institution_aliases=institution_aliases,
    )
    pickle.dump(obj, f)
    print("Saved to", fname)

Saved to selected_institution_works_openalex_deduplicated_2023-08-29_12-22-34


# EAC corpus filtering (experimental, not in use)

In [23]:
# Load the experimental AI corpus (EAC) works table.
# NOTE(review): the displayed frame has an "Unnamed: 0" column, which looks like
# a saved index — consider index_col=0 if that is confirmed.
eac_works_path = data_file_location + 'experimental_ai_corpus/ai_openalex_works.csv'
eac_works_df = pd.read_csv(eac_works_path)
eac_works_df

Unnamed: 0,work_id,doi,display_name,publication_year,publication_date,cited_by_count,is_retracted,venue_id,venue_display_name,venue_url,predicted_language,language_probability,has_abstract,arxiv_id,ambiguous
0,https://openalex.org/W1479999262,,Optimal EEG feature extraction based on R-squa...,2012,2012-12-31,2,False,,"international conference on control, automatio...",http://ieeexplore.ieee.org/xpls/abs_all.jsp?ar...,en,0.710485,True,,False
1,https://openalex.org/W1483393056,,Natural corners extraction algorithm in 2D unk...,2012,2012-12-31,6,False,,"international conference on control, automatio...",https://scholarworks.bwise.kr/erica/handle/202...,en,0.810423,True,,False
2,https://openalex.org/W1485658267,,Adaptive synergy control for a dexterous artif...,2012,2012-12-31,3,False,,"international conference on control, automatio...",http://ieeexplore.ieee.org/iel5/6375948/639302...,en,0.888194,True,,False
3,https://openalex.org/W1485753303,,Endoscopic vision based tracking of multiple s...,2012,2012-12-31,4,False,,"international conference on control, automatio...",http://ieeexplore.ieee.org/iel5/6375948/639302...,en,0.916199,True,,False
4,https://openalex.org/W1487477006,,Departability motion for car-like robot based ...,2012,2012-12-31,0,False,,"international conference on control, automatio...",http://ieeexplore.ieee.org/abstract/document/6...,en,0.961740,True,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1008707,https://openalex.org/W4255677482,https://doi.org/10.4018/978-1-7998-3441-0.ch015,Dealing With Noise and Partial Volume Effects ...,2021,2021-01-01,0,False,,Research Anthology on Diagnosing and Treating ...,https://doi.org/10.4018/978-1-7998-3441-0.ch015,en,0.906979,True,,False
1008708,https://openalex.org/W4255966921,https://doi.org/10.1007/978-3-030-76445-6_4,Problem-Solving,2021,2021-01-01,0,False,,Augmented Humanity,https://doi.org/10.1007/978-3-030-76445-6_4,en,0.754651,True,,False
1008709,https://openalex.org/W4256321930,https://doi.org/10.1049/icp.2021.2552,Power system transient stability assessment ba...,2021,2021-01-01,0,False,,2021 Annual Meeting of CSEE Study Committee of...,https://doi.org/10.1049/icp.2021.2552,en,0.885797,True,,False
1008710,https://openalex.org/W4256479831,https://doi.org/10.1093/bioadv/vbab039,Genomic style: yet another deep-learning appro...,2021,2021-01-01,0,False,https://openalex.org/V4210234069,Bioinformatics advances,https://doi.org/10.1093/bioadv/vbab039,en,0.689846,True,,False


In [24]:
eac_work_ids = set(eac_works_df['work_id'])
len(eac_work_ids)

1008712

In [25]:
# Tally the deduplicated works by publication year
work_counts_by_year = defaultdict(int)
for year in (w['publication_year'] for w in works):
    work_counts_by_year[year] += 1
work_counts_by_year

defaultdict(int,
            {2015: 3135,
             2017: 4003,
             2016: 3401,
             2020: 9019,
             2018: 5278,
             2014: 2918,
             2013: 2956,
             2011: 2565,
             2012: 2707,
             2019: 7505,
             2010: 2348,
             2021: 10250,
             2022: 6883,
             2023: 3125})

In [26]:
# Spot check on the first 100 works: print the titles of those published in
# 2021 or earlier whose OpenAlex ID does not appear in the EAC corpus.
for work in works[:100]:
    if work['id'] in eac_work_ids:
        continue
    if work['publication_year'] <= 2021:
        print(work['display_name'])

SciPy 1.0: fundamental algorithms for scientific computing in Python
DeepLab: Semantic Image Segmentation with Deep Convolutional Nets, Atrous Convolution, and Fully Connected CRFs
MobileNetV2: Inverted Residuals and Linear Bottlenecks
Scikit-learn: Machine Learning in Python
Deep Neural Networks for Acoustic Modeling in Speech Recognition: The Shared Views of Four Research Groups
Array programming with NumPy
Google Earth Engine: Planetary-scale geospatial analysis for everyone
Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation
Adaptive Subgradient Methods for Online Learning and Stochastic Optimization
Natural Language Processing (Almost) from Scratch
Generative adversarial networks
Searching for MobileNetV3
Accurate, Dense, and Robust Multiview Stereopsis
EfficientDet: Scalable and Efficient Object Detection
Reading Digits in Natural Images with Unsupervised Feature Learning
Unsupervised Learning of Depth and Ego-Motion from Video
MnasNet: Platform-Awar

In [27]:
eac_work_titles = set(eac_works_df['display_name'])
len(eac_work_titles)

975895

In [28]:
# Normalize every EAC title the same way as the works' titles (lower-case, drop
# all non-alphanumeric characters) so the two sets can be compared by title.
eac_work_titles_reduced = set()
for title in eac_work_titles:
    if not isinstance(title, str):
        # A missing display_name is read from the CSV as NaN (a float), which
        # has no .lower(); there is nothing to normalize, so skip it.
        continue
    # Single pass over the characters instead of one str.replace per character
    eac_work_titles_reduced.add(''.join(ch for ch in title.lower() if ch.isalnum()))
len(eac_work_titles_reduced)

951704

In [29]:
# Spot check on the first 100 works: print the titles of those published in
# 2021 or earlier whose normalized title does not appear in the EAC corpus.
for work in works[:100]:
    title = work['display_name']
    if title is None:
        # An untitled work cannot be matched by title; skip it rather than
        # reporting it as missing from the corpus.
        continue
    title = ''.join(ch for ch in title.lower() if ch.isalnum())
    if title not in eac_work_titles_reduced and work['publication_year'] <= 2021:
        print(work['display_name'])

SciPy 1.0: fundamental algorithms for scientific computing in Python
GAN（Generative Adversarial Nets）
MobileNetV2: Inverted Residuals and Linear Bottlenecks
Caffe
Deep Neural Networks for Acoustic Modeling in Speech Recognition: The Shared Views of Four Research Groups
Array programming with NumPy
Google Earth Engine: Planetary-scale geospatial analysis for everyone
Adaptive Subgradient Methods for Online Learning and Stochastic Optimization
Natural Language Processing (Almost) from Scratch
Searching for MobileNetV3
Accurate, Dense, and Robust Multiview Stereopsis
EfficientDet: Scalable and Efficient Object Detection
Reading Digits in Natural Images with Unsupervised Feature Learning
Wide &amp; Deep Learning for Recommender Systems
MnasNet: Platform-Aware Neural Architecture Search for Mobile
A guide to deep learning in healthcare
Listen, attend and spell: A neural network for large vocabulary conversational speech recognition
AutoAugment: Learning Augmentation Strategies From Data
Int

The EAC corpus may have filtered out a lot of false positives for AI/ML publications, but it also appears to have filtered out significant true positives.