In [1]:
from collections import defaultdict
import numpy as np
from numpy.random import default_rng
import pickle
import pyalex
import plotly.graph_objects as go
from pyalex import Authors, Concepts, Institutions, Works
from semanticscholar import SemanticScholar
import time
from tqdm.notebook import tqdm

In [2]:
# The polite pool has much faster and more consistent response times.
# To get into the polite pool, set a contact email on the pyalex config:
pyalex.config.email = "ben@epochai.org"

In [3]:
# Directory prefix for the pickled data files. The trailing slash matters because
# the paths below are built by plain string concatenation.
data_file_location = 'data/'

In [4]:
# Semantic Scholar client, created with default settings (no arguments passed).
sch = SemanticScholar()

In [5]:
# NOTE: pickle.load can execute arbitrary code, so only open files this project produced.
with open(data_file_location + "selected_institution_works_openalex_deduplicated_2023-08-29_12-22-34", "rb") as f:
    works_obj = pickle.load(f)

# Unpack once the file handle is closed: the collection of institution IDs to keep,
# and the ID -> alias mapping used when grouping works by institution.
selected_institution_ids, institution_aliases = (
    works_obj["selected_institution_ids"],
    works_obj["institution_aliases"],
)

In [6]:
# Sanity-check the Semantic Scholar client with a title search, then print the
# total hit count and the title of the first hit.
results = sch.search_paper("Attention is all you need")
print(f'{results.total} results.', f'First occurrence: {results[0].title}.')

6690 results. First occurrence: Attention is All you Need.


# By institution

## Fetch data

Fetching the Semantic Scholar data takes a full day.

Not all works end up in the list due to missing DOIs, DOIs that can't be found in Semantic Scholar, and timeout errors.

This section is commented out by default, to avoid re-running when we have saved data already.

In [7]:
# works_sample = works
# max_works = len(works)
# oa_works = []
# semantic_works = []
# oa_cited_by_counts = []
# semantic_cited_by_counts = []

# pbar = tqdm(total=max_works)

# for work in works_sample:
#     work_display_name = work['display_name']
#     # print(work_display_name)
#     try:
#         if work['doi'] is not None:
#             semantic_work = sch.get_paper(work['doi'].replace('https://doi.org/', ''))
#         else:
#             continue
#     except Exception as e:
#         print(e)
#         continue
#     oa_works.append(work)
#     semantic_works.append(semantic_work)
#     semantic_citations = semantic_work.citationCount
#     semantic_cited_by_counts.append(semantic_citations)
#     oa_cited_by_counts.append(work['cited_by_count'])
#     pbar.update(1)
#     if len(oa_works) >= max_works:
#         pbar.close()
#         break

In [8]:
# assert len(oa_works) == len(semantic_works)

In [9]:
# reduced_semantic_data = []
# for work in semantic_works:
#     data = {
#         'id': work.paperId,
#         'title': work.title,
#         'cited_by_count': work.citationCount,
#         'authorships': work.authors,
#         'concepts': work.fieldsOfStudy,
#         'publication_date': work.publicationDate,
#         'publiation_year': work.year,
#     }
#     reduced_semantic_data.append(data)

In [10]:
# with open(data_file_location + 'openalex_semantic_scholar_comparison_data', 'wb') as f:
#     pickle.dump({
#         'oa_works': oa_works,
#         'semantic_works': reduced_semantic_data,
#         'oa_cited_by_counts': oa_cited_by_counts,
#         'semantic_cited_by_counts': semantic_cited_by_counts,
#     }, f)

## Process (saved) data

In [11]:
# NOTE: pickle.load can execute arbitrary code, so only open files this project produced.
with open(data_file_location + 'openalex_semantic_scholar_comparison_data', 'rb') as f:
    data_obj = pickle.load(f)

# Unpack once the file handle is closed: the paired work records first,
# then the paired citation counts.
oa_works, semantic_works = data_obj['oa_works'], data_obj['semantic_works']
oa_cited_by_counts, semantic_cited_by_counts = (
    data_obj['oa_cited_by_counts'],
    data_obj['semantic_cited_by_counts'],
)

In [27]:
# Both lists should be the same length; the comparisons below pair them element-wise.
len(oa_works), len(semantic_works)

(42255, 42255)

In [12]:
# Convert both citation-count sequences to NumPy arrays so the cells below can
# use element-wise arithmetic and boolean masks on them.
semantic_cited_by_counts, oa_cited_by_counts = (
    np.array(semantic_cited_by_counts),
    np.array(oa_cited_by_counts),
)

In [13]:
# Preview the Semantic Scholar citation counts.
semantic_cited_by_counts

array([ 60062,  37466, 102513, ...,      0,      0,      1])

In [14]:
# Preview the OpenAlex citation counts.
oa_cited_by_counts

array([48856, 27353, 16712, ...,     0,     0,     0])

In [15]:
# Mean signed difference (Semantic Scholar minus OpenAlex); a positive value means
# Semantic Scholar reports more citations on average.
(semantic_cited_by_counts - oa_cited_by_counts).mean()

17.825393444562774

In [28]:
# Mean absolute error between the two sources' citation counts
np.abs(semantic_cited_by_counts - oa_cited_by_counts).mean()

20.297100934800614

In [29]:
# Root mean squared error
# Cast the integer differences to float64 before squaring: the largest gaps here are
# in the tens of thousands, so their squares exceed the 32-bit integer range and would
# wrap silently on platforms where NumPy's default integer is 32-bit.
np.sqrt(np.mean((semantic_cited_by_counts - oa_cited_by_counts).astype(np.float64)**2))

479.9554426524655

In [42]:
# Mean ratio (Semantic Scholar / OpenAlex), restricted to works with at least one
# OpenAlex citation so the division is defined
np.divide(
    semantic_cited_by_counts[oa_cited_by_counts > 0],
    oa_cited_by_counts[oa_cited_by_counts > 0],
).mean()

1.9037512643613745

In [43]:
# Median ratio (Semantic Scholar / OpenAlex), restricted to works with at least one
# OpenAlex citation so the division is defined
np.median(
    np.divide(
        semantic_cited_by_counts[oa_cited_by_counts > 0],
        oa_cited_by_counts[oa_cited_by_counts > 0],
    )
)

1.375

In [44]:
# Standard deviation of the ratio (Semantic Scholar / OpenAlex), restricted to works
# with at least one OpenAlex citation so the division is defined
np.divide(
    semantic_cited_by_counts[oa_cited_by_counts > 0],
    oa_cited_by_counts[oa_cited_by_counts > 0],
).std()

2.4993584887895413

In [21]:
# Overlay the OpenAlex and Semantic Scholar citation-count distributions.
# Counts are log10-transformed (zeros dropped first), and the x-axis ticks are
# relabelled with the raw counts they correspond to.
fig = go.Figure(
    data=[
        go.Histogram(
            x=np.log10(oa_cited_by_counts[oa_cited_by_counts > 0]),
            name='OpenAlex',
            opacity=0.6,
        ),
        go.Histogram(
            x=np.log10(semantic_cited_by_counts[semantic_cited_by_counts > 0]),
            name='Semantic Scholar',
            opacity=0.6,
        ),
    ]
)
fig.update_layout(
    title_text='Histogram of OpenAlex and Semantic Scholar citation counts for most-cited publications',
    barmode='overlay',
    width=800,
    height=500,
    xaxis=dict(
        tickvals=[0, 1, 2, 3, 4, 5],
        ticktext=[1, 10, 100, 1000, 10000, 100000],
        title_text='Citation count',
    ),
    yaxis=dict(title_text='Number of publications'),
)
fig.show()

In [22]:
# Group the paired citation counts by institution alias. A work contributes at most
# once per alias, however many of its authors are affiliated with that institution.
institution_oa_cited_by_counts = defaultdict(list)
institution_semantic_cited_by_counts = defaultdict(list)
for oa_record, s2_record in zip(oa_works, semantic_works):
    oa_citations = oa_record["cited_by_count"]
    s2_citations = s2_record["cited_by_count"]
    # Aliases already credited for this work.
    seen_aliases = set()
    for authorship in oa_record["authorships"]:
        for affiliation in authorship["institutions"]:
            institution_id = affiliation.get("id")
            # Skip affiliations without an ID or outside the selected set.
            if institution_id is None or institution_id not in selected_institution_ids:
                continue
            # Fall back to the raw ID when no alias is registered.
            alias = institution_aliases.get(institution_id, institution_id)
            if alias in seen_aliases:
                continue
            seen_aliases.add(alias)
            institution_oa_cited_by_counts[alias].append(oa_citations)
            institution_semantic_cited_by_counts[alias].append(s2_citations)

In [23]:
# Inspect the per-institution OpenAlex citation lists.
# NOTE(review): this prints every list in full, which makes for a very long output.
institution_oa_cited_by_counts

defaultdict(list,
            {'Meta': [48856,
              561,
              485,
              384,
              208,
              205,
              200,
              191,
              186,
              162,
              154,
              142,
              135,
              111,
              902,
              97,
              96,
              92,
              90,
              69,
              67,
              58,
              58,
              57,
              51,
              49,
              48,
              41,
              41,
              41,
              41,
              39,
              39,
              34,
              33,
              30,
              30,
              28,
              26,
              25,
              25,
              25,
              25,
              23,
              23,
              22,
              22,
              21,
              19,
              18,
              18,
              17,
              17,
   

In [24]:
# Inspect the per-institution Semantic Scholar citation lists.
# NOTE(review): this prints every list in full, which makes for a very long output.
institution_semantic_cited_by_counts

defaultdict(list,
            {'Meta': [60062,
              697,
              731,
              482,
              321,
              344,
              305,
              272,
              325,
              319,
              199,
              170,
              180,
              273,
              1486,
              135,
              144,
              138,
              106,
              103,
              94,
              65,
              244,
              72,
              101,
              133,
              98,
              47,
              97,
              67,
              120,
              54,
              40,
              66,
              4,
              59,
              51,
              43,
              25,
              31,
              21,
              58,
              49,
              0,
              36,
              41,
              35,
              24,
              40,
              23,
              18,
              10,
             

In [46]:
# For each institution, summarise the per-work ratio of Semantic Scholar to OpenAlex
# citation counts, and keep the mean ratio in `institution_citation_ratios`.
institution_citation_ratios = {}

for institution, oa_list in institution_oa_cited_by_counts.items():
    oa_counts = np.array(oa_list)
    semantic_counts = np.array(institution_semantic_cited_by_counts[institution])

    # Only works with at least one OpenAlex citation have a defined ratio.
    has_oa_citations = oa_counts > 0
    oa_nonzero_counts = oa_counts[has_oa_citations]
    citation_ratios = semantic_counts[has_oa_citations] / oa_nonzero_counts

    mean_ratio = np.mean(citation_ratios)
    median_ratio = np.median(citation_ratios)
    std_ratio = np.std(citation_ratios)

    institution_citation_ratios[institution] = mean_ratio

    # One report per institution, followed by a blank separator line.
    print(
        f"{institution} ({len(oa_nonzero_counts)} works)",
        f"Mean ratio: {mean_ratio}",
        f"Median ratio: {median_ratio}",
        f"Std ratio: {std_ratio}",
        "",
        sep="\n",
    )

Meta (89 works)
Mean ratio: 1.896489463923849
Median ratio: 1.5
Std ratio: 2.3966775254687116

Google (4745 works)
Mean ratio: 2.125162511711967
Median ratio: 1.446808510638298
Std ratio: 3.163094871588936

OpenAI (48 works)
Mean ratio: 4.195710141475012
Median ratio: 1.7225490196078432
Std ratio: 10.123030634727396

Quansight (2 works)
Mean ratio: 1.0878747103236899
Median ratio: 1.0878747103236899
Std ratio: 0.026650220527771554

Enthought (8 works)
Mean ratio: 1.3421375634061228
Median ratio: 1.0878747103236899
Std ratio: 0.5436382899090877

DeepMind (105 works)
Mean ratio: 3.602968531075915
Median ratio: 1.78
Std ratio: 8.384972810243548

Microsoft (6063 works)
Mean ratio: 1.9897740057098392
Median ratio: 1.4
Std ratio: 3.0399758889439563

IBM (6262 works)
Mean ratio: 1.5940899080573192
Median ratio: 1.3
Std ratio: 2.040493327646889

Amazon (1517 works)
Mean ratio: 2.312106270683693
Median ratio: 1.5041322314049588
Std ratio: 2.93420658934553

Adobe (1503 works)
Mean ratio: 2.01895

In [47]:
# Mean Semantic Scholar / OpenAlex citation ratio per institution, as filled in by the loop above.
institution_citation_ratios

{'Meta': 1.896489463923849,
 'Google': 2.125162511711967,
 'OpenAI': 4.195710141475012,
 'Quansight': 1.0878747103236899,
 'Enthought': 1.3421375634061228,
 'DeepMind': 3.602968531075915,
 'Microsoft': 1.9897740057098392,
 'IBM': 1.5940899080573192,
 'Amazon': 2.312106270683693,
 'Adobe': 2.018958974526198,
 'Netflix': 1.7066145168013023,
 'Intel': 1.5304073375544947,
 'Huawei': 1.8590031653202637,
 'Salesforce': 2.730892192212752,
 'Baidu': 1.9553524367120914,
 'Nvidia': 2.1296540292409865,
 'Yandex': 1.8705082271871645,
 'NEC': 1.544143939987172,
 'Twitter': 1.5433443217405982,
 'Tencent': 2.087809403113888,
 'Naver': 2.135714036751277,
 'Uber': 2.3951002077014047,
 'Alibaba': 2.1228645192955518,
 'Xerox': 1.4120552646159417,
 'Group Sense': 2.059920063914083}