In [11]:
import numpy as np
from numpy.random import default_rng
import pickle
import pyalex
from pyalex import Authors, Concepts, Institutions, Works
from scholarly import scholarly
from scholarly import ProxyGenerator
import tqdm

In [2]:
# The polite pool has much faster and more consistent response times. To get into the polite pool, you set your email:
pyalex.config.email = "ben@epochai.org"

In [3]:
data_file_location = 'data/'

In [4]:
SEED = 20230105
rng = default_rng(seed=SEED)

In [5]:
# Set up a ProxyGenerator object to use free proxies
# This needs to be done only once per session
# pg = ProxyGenerator()
# pg.FreeProxies()
# scholarly.use_proxy(pg)

In [6]:
with open(data_file_location + "selected_institution_works_openalex_2023-05-29_15-29-57", "rb") as f:
    works_obj = pickle.load(f)
    works = works_obj["works"]
len(works)

134178

In [9]:
works[9]

{'id': 'https://openalex.org/W2618530766',
 'doi': 'https://doi.org/10.1145/3065386',
 'title': 'ImageNet classification with deep convolutional neural networks',
 'display_name': 'ImageNet classification with deep convolutional neural networks',
 'publication_year': 2017,
 'publication_date': '2017-05-24',
 'ids': {'openalex': 'https://openalex.org/W2618530766',
  'doi': 'https://doi.org/10.1145/3065386',
  'mag': '2618530766'},
 'language': 'en',
 'primary_location': {'is_oa': True,
  'landing_page_url': 'https://doi.org/10.1145/3065386',
  'pdf_url': 'http://dl.acm.org/ft_gateway.cfm?id=3065386&type=pdf',
  'source': {'id': 'https://openalex.org/S103482838',
   'display_name': 'Communications of The ACM',
   'issn_l': '0001-0782',
   'issn': ['1557-7317', '0001-0782'],
   'host_organization': 'https://openalex.org/P4310319798',
   'host_organization_name': 'Association for Computing Machinery',
   'host_organization_lineage': ['https://openalex.org/P4310319798'],
   'host_organizati

In [19]:
works_sample = rng.choice(works, size=10000, replace=False)
# works_sample = works[:20]

In [25]:
from researcher_impact.pyalex_utils import merge_sample

works_sample = merge_sample(
    Works().search_filter(abstract="imagenet"),
    sample_size=1000,
    seed=SEED,
)

In [27]:
imagenet_papers = 0
imagenet_count = 0
non_imagenet_count = 0
null_count = 0
for work in tqdm.tqdm(works_sample):
    inv_idx = work['abstract_inverted_index']
    if inv_idx is None:
        continue
    if 'ImageNet' in inv_idx.keys():
        imagenet_papers += 1
        for referenced_work_id in work['referenced_works']:
            referenced_work = Works()[referenced_work_id]
            referenced_inv_idx = referenced_work['abstract_inverted_index']
            if referenced_inv_idx is None:
                null_count += 1
            else:
                if 'ImageNet' in referenced_work['abstract_inverted_index'].keys():
                    imagenet_count += 1
                else:
                    non_imagenet_count += 1

 32%|███▏      | 322/1000 [29:43<1:02:34,  5.54s/it]


HTTPError: 404 Client Error: NOT FOUND for url: https://api.openalex.org/works/W2426267443

In [28]:
print(imagenet_papers, imagenet_count, non_imagenet_count, null_count)

200 650 2546 202


In [22]:
imagenet_count / (imagenet_count + non_imagenet_count)

0.2103225806451613

In [25]:
max_works = len(works_sample)
oa_cited_by_counts = []
scholar_cited_by_counts = []
for work in works_sample:
    work_display_name = work['display_name']
    print(work_display_name)
    work_query = scholarly.search_pubs(work_display_name)
    try:
        first_work_result = next(work_query)
    except StopIteration:
        print("Work not found in Google Scholar - skipping")
        continue
    scholar_citations = first_work_result['num_citations']
    scholar_cited_by_counts.append(scholar_citations)
    oa_cited_by_counts.append(work['cited_by_count'])
    if len(oa_cited_by_counts) >= max_works:
        break

Deep Residual Learning for Image Recognition
Deep learning
ImageNet Large Scale Visual Recognition Challenge
Going deeper with convolutions
Densely Connected Convolutional Networks
Human-level control through deep reinforcement learning
You Only Look Once: Unified, Real-Time Object Detection
Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks
Attention is All you Need
ImageNet classification with deep convolutional neural networks
Rethinking the Inception Architecture for Computer Vision


MaxTriesExceededException: Cannot Fetch from Google Scholar.

In [26]:
scholar_cited_by_counts = np.array(scholar_cited_by_counts)
oa_cited_by_counts = np.array(oa_cited_by_counts)

In [27]:
scholar_cited_by_counts

array([166591,  64970,  37745,  50480,  35050,  24374,  35425,  59381,
        76820, 133192])

In [28]:
oa_cited_by_counts

array([98132, 46365, 26234, 26126, 18987, 17161, 16115, 16074, 15711,
       15059])

In [35]:
works[9]['display_name']

'ImageNet classification with deep convolutional neural networks'

In [29]:
np.mean(scholar_cited_by_counts - oa_cited_by_counts)

38806.4

In [30]:
np.mean(np.abs(scholar_cited_by_counts - oa_cited_by_counts))

38806.4

In [31]:
np.sqrt(np.mean((scholar_cited_by_counts - oa_cited_by_counts)**2))

50996.60420067203

In [32]:
np.mean(scholar_cited_by_counts[oa_cited_by_counts > 0] / oa_cited_by_counts[oa_cited_by_counts > 0])

2.9362898667387656

In [33]:
np.std(scholar_cited_by_counts[oa_cited_by_counts > 0] / oa_cited_by_counts[oa_cited_by_counts > 0])

2.2465921884986715