# Setup

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from collections import defaultdict
import datetime
import json
import numpy as np
from numpy.random import default_rng
import os
import pandas as pd
import pickle
import plotly
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
import pyalex
from pyalex import Authors, Concepts, Institutions, Works
import seaborn as sns
from tqdm.notebook import tqdm
import xarray as xr
import re

from research_impact import plotting
from research_impact.bootstrap import bootstrap_wrapper, propagate_bootstrap_list, propagate_bootstrap_dict, bootstrap_stats
from research_impact.citations import get_bounded_citations
from research_impact.processors import OpenAlexProcessor
from research_impact.regression import fit_linear_regression, predict
from research_impact.utils import dict_to_dataarray
from research_impact.plotting import save_plot
from research_impact.pyalex_utils import merge_pages, merge_sample

In [3]:
# Number of years after publication year to count citations
CITATION_YEAR_BOUND = 3

In [4]:
# Bootstrap with this number of iterations
# Bootstrap sampling is performed on the set of publications
# This is to estimate variance in the results, to measure the potential effect of false positives and missing publications
# 1000 iterations take about 15 minutes to process on a 2022 MacBook Pro
USE_BOOTSTRAP = False
BOOTSTRAP_ITERATIONS = 1000

In [5]:
# Create a random number generator, with a fixed random seed for reproducibility
SEED = 20230105
rng = default_rng(seed=SEED)

In [6]:
# The polite pool has much faster and more consistent response times. To get into the polite pool, you set your email:
pyalex.config.email = "ben@epochai.org"

In [7]:
# Configure numpy to display numbers in scientific notation
np.set_printoptions(formatter={'float': '{: .2e}'.format})

In [8]:
# Apply the default theme
sns.set_theme()

In [9]:
pio.templates.default = "plotly_white"

In [10]:
DATA_START_YEAR = 2010  # Inclusive. Default: 2010
DATA_END_YEAR = 2023  # Inclusive. Default: 2023

In [11]:
all_years = np.arange(DATA_START_YEAR, DATA_END_YEAR + 1)

In [12]:
# Location to save data and results
data_file_location = 'data/publications/'
os.makedirs(data_file_location, exist_ok=True)

result_file_location = f'results/publications/extra_openai_{DATA_START_YEAR}-{DATA_END_YEAR}_cyb={CITATION_YEAR_BOUND}/'
os.makedirs(result_file_location, exist_ok=True)

# Load data

In [13]:
# Read the deduplicated OpenAlex works dump; every line of the file is one JSON document
works_path = data_file_location + "selected_institution_works_openalex_deduplicated_extra_openai_2023-11-01_12-03-19.jsonl"
works = []
with open(works_path, "rb") as f:
    works.extend(map(json.loads, f))

In [14]:
# Read the selected institution IDs; every line of the file is one JSON value
institution_ids_path = data_file_location + "selected_institution_ids_2023-11-01_12-03-19.jsonl"
selected_institution_ids = []
with open(institution_ids_path, "rb") as f:
    selected_institution_ids.extend(map(json.loads, f))

In [15]:
# Read the institution aliases; every line is a JSON object that is merged into one dict
# (a key repeated on a later line overrides the earlier value)
institution_aliases_path = data_file_location + "institution_aliases_2023-11-01_12-03-19.jsonl"
institution_aliases = {}
with open(institution_aliases_path, "rb") as f:
    for alias_record in map(json.loads, f):
        institution_aliases.update(alias_record)

In [16]:
len(works)

66175

In [17]:
works[0]

{'id': 'https://openalex.org/W2919115771',
 'doi': 'https://doi.org/10.1038/nature14539',
 'title': 'Deep learning',
 'display_name': 'Deep learning',
 'publication_year': 2015,
 'publication_date': '2015-05-27',
 'ids': {'openalex': 'https://openalex.org/W2919115771',
  'doi': 'https://doi.org/10.1038/nature14539',
  'mag': '2919115771',
  'pmid': 'https://pubmed.ncbi.nlm.nih.gov/26017442'},
 'language': 'en',
 'primary_location': {'is_oa': False,
  'landing_page_url': 'https://doi.org/10.1038/nature14539',
  'pdf_url': None,
  'source': {'id': 'https://openalex.org/S137773608',
   'display_name': 'Nature',
   'issn_l': '0028-0836',
   'issn': ['1476-4687', '0028-0836'],
   'is_oa': False,
   'is_in_doaj': False,
   'host_organization': 'https://openalex.org/P4310319908',
   'host_organization_name': 'Nature Portfolio',
   'host_organization_lineage': ['https://openalex.org/P4310319908',
    'https://openalex.org/P4310319965'],
   'host_organization_lineage_names': ['Nature Portfolio'

In [18]:
selected_institution_ids

['https://openalex.org/I1291425158',
 'https://openalex.org/I4210113297',
 'https://openalex.org/I4210100430',
 'https://openalex.org/I4210148186',
 'https://openalex.org/I4210117425',
 'https://openalex.org/I4210131802',
 'https://openalex.org/I4210090411',
 'https://openalex.org/I2252078561',
 'https://openalex.org/I4210114444',
 'https://openalex.org/I4210111288',
 'https://openalex.org/I1290206253',
 'https://openalex.org/I4210164937',
 'https://openalex.org/I4210113369',
 'https://openalex.org/I4210124949',
 'https://openalex.org/I4210105678',
 'https://openalex.org/I4210087053',
 'https://openalex.org/I4210125051',
 'https://openalex.org/I4210162141',
 'https://openalex.org/I4210086099',
 'https://openalex.org/I4210153468',
 'https://openalex.org/I4210161634',
 'https://openalex.org/I4210110431',
 'https://openalex.org/I4210099966',
 'https://openalex.org/I4210108625',
 'https://openalex.org/I4210135422',
 'https://openalex.org/I4210139986',
 'https://openalex.org/I4210109507',
 

In [19]:
institution_aliases

{'https://openalex.org/I1291425158': 'Google',
 'https://openalex.org/I4210113297': 'Google',
 'https://openalex.org/I4210100430': 'Google',
 'https://openalex.org/I4210148186': 'Google',
 'https://openalex.org/I4210117425': 'Google',
 'https://openalex.org/I4210131802': 'Google',
 'https://openalex.org/I4210090411': 'DeepMind',
 'https://openalex.org/I2252078561': 'Meta',
 'https://openalex.org/I4210114444': 'Meta',
 'https://openalex.org/I4210111288': 'Meta',
 'https://openalex.org/I1290206253': 'Microsoft',
 'https://openalex.org/I4210164937': 'Microsoft',
 'https://openalex.org/I4210113369': 'Microsoft',
 'https://openalex.org/I4210124949': 'Microsoft',
 'https://openalex.org/I4210105678': 'Microsoft',
 'https://openalex.org/I4210087053': 'Microsoft',
 'https://openalex.org/I4210125051': 'Microsoft',
 'https://openalex.org/I4210162141': 'Microsoft',
 'https://openalex.org/I4210086099': 'Microsoft',
 'https://openalex.org/I4210153468': 'Microsoft',
 'https://openalex.org/I4210161634

# Data processing

## Per-publication

In [20]:
works = np.array(works)  # For indexing

In [21]:
def processor_iter(works):
    """Create an OpenAlexProcessor for `works` and run its processing pass.

    The institution IDs, the institution aliases and the citation window are
    read from the notebook-level `selected_institution_ids`,
    `institution_aliases` and `CITATION_YEAR_BOUND`.

    Returns:
        The processor, after `process_works()` has been called on it.
    """
    work_processor = OpenAlexProcessor(
        works,
        selected_institution_ids,
        institution_aliases,
        citation_year_bound=CITATION_YEAR_BOUND,
    )
    work_processor.process_works()
    return work_processor

In [22]:
# Without bootstrapping
processor = processor_iter(works)

In [23]:
# How many distinct works did processing actually produce data for?
# A work may be listed under several top-level keys, so collect the IDs into a set.
authors_by_work = processor.get_authors_by_work()
works_processed = {
    work_id
    for work_authors in authors_by_work.values()
    for work_id in work_authors
}
len(works_processed)

66168

In [24]:
# With bootstrapping
# 1000 iterations take ~15 minutes on a 2022 MacBook Pro
processor_sample = bootstrap_wrapper(processor_iter, works, rng, bootstrap_size=BOOTSTRAP_ITERATIONS, mock=(not USE_BOOTSTRAP))

## Aggregation

In [25]:
institution_author_data = propagate_bootstrap_list(lambda processor: processor.get_author_data(), processor_sample)

In [26]:
institution_author_data[0]

defaultdict(<function research_impact.processors.OpenAlexProcessor.deduplicate_authors.<locals>.<lambda>()>,
            {'Meta': defaultdict(set,
                         {2015: {'https://openalex.org/A5000684736',
                           'https://openalex.org/A5000788715',
                           'https://openalex.org/A5001226970',
                           'https://openalex.org/A5001239550',
                           'https://openalex.org/A5001493943',
                           'https://openalex.org/A5002078510',
                           'https://openalex.org/A5002419451',
                           'https://openalex.org/A5004095177',
                           'https://openalex.org/A5005519086',
                           'https://openalex.org/A5007720098',
                           'https://openalex.org/A5008151243',
                           'https://openalex.org/A5009121120',
                           'https://openalex.org/A5009706553',
                           '

In [27]:
institution_authors_by_work = propagate_bootstrap_list(lambda processor: processor.get_authors_by_work(), processor_sample)

In [28]:
institution_authors_by_work[0]

defaultdict(<function research_impact.processors.OpenAlexProcessor.process_works.<locals>.<lambda>()>,
            {'Meta': defaultdict(list,
                         {'https://openalex.org/W2919115771': ['https://openalex.org/A5001226970'],
                          'https://openalex.org/W2964153729': ['https://openalex.org/A5089960673'],
                          'https://openalex.org/W2963373786': ['https://openalex.org/A5076651586'],
                          'https://openalex.org/W2962896489': ['https://openalex.org/A5031540264',
                           'https://openalex.org/A5036069974',
                           'https://openalex.org/A5083212922'],
                          'https://openalex.org/W581956982': ['https://openalex.org/A5076651586'],
                          'https://openalex.org/W2118434577': ['https://openalex.org/A5076651586'],
                          'https://openalex.org/W2951714314': ['https://openalex.org/A5020917394'],
                          'https:

In [29]:
institution_author_name_data = processor.get_author_name_data()

In [30]:
institution_author_counts = propagate_bootstrap_list(lambda processor: processor.get_author_counts(), processor_sample)

In [31]:
institution_author_counts

[{'Meta': <xarray.DataArray (year: 14)>
  array([ 1.00e+01,  2.90e+01,  2.50e+01,  4.20e+01,  7.30e+01,  9.10e+01,
          1.21e+02,  1.96e+02,  3.40e+02,  6.03e+02,  9.27e+02,  1.10e+03,
          4.53e+02,  9.60e+01])
  Coordinates:
    * year     (year) int64 2010 2011 2012 2013 2014 ... 2019 2020 2021 2022 2023,
  'Google': <xarray.DataArray (year: 14)>
  array([ 2.41e+02,  2.89e+02,  3.27e+02,  3.53e+02,  3.84e+02,  5.27e+02,
          7.96e+02,  8.71e+02,  1.32e+03,  1.86e+03,  2.15e+03,  2.22e+03,
          1.51e+03,  8.90e+02])
  Coordinates:
    * year     (year) int64 2010 2011 2012 2013 2014 ... 2019 2020 2021 2022 2023,
  'Quansight': <xarray.DataArray (year: 2)>
  array([ 4.00e+00,  1.00e+00])
  Coordinates:
    * year     (year) int64 2020 2022,
  'Enthought': <xarray.DataArray (year: 8)>
  array([ 1.00e+00,  1.00e+00,  1.00e+00,  2.00e+00,  1.00e+00,  7.00e+00,
          3.00e+00,  3.00e+00])
  Coordinates:
    * year     (year) int64 2010 2011 2014 2016 2017 2019 2020

In [32]:
institution_citations = propagate_bootstrap_list(lambda processor: processor.get_bounded_citations(), processor_sample)

In [33]:
institution_citations

[{'Meta': <xarray.DataArray (year: 14)>
  array([ 5.10e+01,  8.22e+02,  7.01e+02,  1.07e+03,  3.04e+03,  1.67e+04,
          1.16e+04,  5.17e+04,  2.45e+04,  3.56e+04,  2.72e+04,  1.26e+04,
          2.04e+03,  3.80e+01])
  Coordinates:
    * year     (year) int64 2010 2011 2012 2013 2014 ... 2019 2020 2021 2022 2023,
  'Google': <xarray.DataArray (year: 14)>
  array([ 2.94e+03,  4.63e+03,  5.50e+03,  8.55e+03,  1.53e+04,  3.73e+04,
          4.04e+04,  6.28e+04,  8.37e+04,  7.45e+04,  7.36e+04,  1.99e+04,
          3.76e+03,  1.34e+02])
  Coordinates:
    * year     (year) int64 2010 2011 2012 2013 2014 ... 2019 2020 2021 2022 2023,
  'Quansight': <xarray.DataArray (year: 2)>
  array([ 2.04e+04,  0.00e+00])
  Coordinates:
    * year     (year) int64 2020 2022,
  'Enthought': <xarray.DataArray (year: 8)>
  array([ 0.00e+00,  4.87e+02,  2.73e+02,  3.00e+01,  2.40e+01,  4.00e+00,
          2.04e+04,  1.00e+00])
  Coordinates:
    * year     (year) int64 2010 2011 2014 2016 2017 2019 2020

In [34]:
institution_work_counts = propagate_bootstrap_list(lambda processor: processor.get_work_counts(), processor_sample)

In [35]:
institution_work_counts

[{'Meta': <xarray.DataArray (year: 14)>
  array([ 1.00e+01,  2.10e+01,  3.30e+01,  3.30e+01,  5.60e+01,  9.00e+01,
          1.21e+02,  2.04e+02,  3.41e+02,  5.34e+02,  7.35e+02,  8.85e+02,
          2.14e+02,  5.20e+01])
  Coordinates:
    * year     (year) int64 2010 2011 2012 2013 2014 ... 2019 2020 2021 2022 2023,
  'Google': <xarray.DataArray (year: 14)>
  array([ 2.09e+02,  2.64e+02,  2.81e+02,  3.27e+02,  3.54e+02,  4.51e+02,
          5.71e+02,  6.41e+02,  1.00e+03,  1.46e+03,  1.74e+03,  1.68e+03,
          7.39e+02,  3.68e+02])
  Coordinates:
    * year     (year) int64 2010 2011 2012 2013 2014 ... 2019 2020 2021 2022 2023,
  'Quansight': <xarray.DataArray (year: 2)>
  array([ 2.00e+00,  1.00e+00])
  Coordinates:
    * year     (year) int64 2020 2022,
  'Enthought': <xarray.DataArray (year: 8)>
  array([ 1.00e+00,  1.00e+00,  1.00e+00,  2.00e+00,  1.00e+00,  2.00e+00,
          2.00e+00,  2.00e+00])
  Coordinates:
    * year     (year) int64 2010 2011 2014 2016 2017 2019 2020

In [36]:
institution_individual_citations = propagate_bootstrap_list(lambda processor: processor.get_individual_bounded_citations(), processor_sample)

In [37]:
institution_individual_citations

[{'Meta': array([9053,  307,  281, ...,    0,    0,    0]),
  'Google': array([9053, 6642, 3100, ...,    0,    0,    0]),
  'Quansight': array([12895,  7530,     0]),
  'Enthought': array([12895,  7530,   487,     0,     1,   273,    30,     0,    24,
             3,     1,     0]),
  'DeepMind': array([ 4591,  1239,   164,   258,   114,   551,    38,    25,    12,
             4,     2,     1,   468,   328,   188,   159,    92,    34,
            41,    53,    27,    37,    28,    31,    32,    25,    31,
            23,     2,    10,     8,     7,     5,     3,     1,   167,
            23,     0,  1251,   103,    85,    66,    64,    55,    51,
            50,   297,    35,    37,    34,    31,    25,    25,    26,
            25,    20,    19,    19,    16,    51,    21,    25,    12,
            11,    11,     9,   107,     8,     7,     6,     6,    51,
             5,     5,    30,     4,     3,     3,     3,    34,    29,
             2,    27,     2,    17,    13,     1,     1

In [38]:
institutions_coauthor_counts = propagate_bootstrap_list(lambda processor: processor.get_coauthor_counts(), processor_sample)

In [39]:
def get_combined_coauthor_counts(institutions_coauthor_counts):
    """Pool co-author counts from every institution into one list per year.

    Args:
        institutions_coauthor_counts: mapping of institution -> {year -> iterable of counts}.

    Returns:
        A `defaultdict(list)` mapping year -> all counts for that year,
        concatenated in the iteration order of the institutions.
    """
    pooled_by_year = defaultdict(list)
    # Which institution a count came from is irrelevant here, so only the values are walked
    for yearly_counts in institutions_coauthor_counts.values():
        for year, counts in yearly_counts.items():
            pooled_by_year[year].extend(counts)
    return pooled_by_year

combined_coauthor_counts = propagate_bootstrap_list(get_combined_coauthor_counts, institutions_coauthor_counts)

In [40]:
combined_avg_coauthor_counts = propagate_bootstrap_list(
    lambda combined_coauthor_counts: dict_to_dataarray(combined_coauthor_counts, 'year', lambda x: np.mean(x)),
    combined_coauthor_counts,
)
combined_avg_coauthor_counts

[<xarray.DataArray (year: 14)>
 array([ 3.71e+00,  3.87e+00,  3.89e+00,  4.02e+00,  4.17e+00,  4.29e+00,
         4.51e+00,  4.66e+00,  4.95e+00,  5.23e+00,  5.64e+00,  5.79e+00,
         6.19e+00,  6.36e+00])
 Coordinates:
   * year     (year) int64 2010 2011 2012 2013 2014 ... 2019 2020 2021 2022 2023]

In [41]:
combined_std_coauthor_counts = propagate_bootstrap_list(
    lambda combined_coauthor_counts: dict_to_dataarray(combined_coauthor_counts, 'year', lambda x: np.std(x)),
    combined_coauthor_counts,
)
combined_std_coauthor_counts

[<xarray.DataArray (year: 14)>
 array([ 2.19e+00,  2.76e+00,  2.09e+00,  2.33e+00,  2.71e+00,  2.62e+00,
         3.42e+00,  3.13e+00,  2.96e+00,  4.27e+00,  4.98e+00,  4.41e+00,
         5.30e+00,  5.16e+00])
 Coordinates:
   * year     (year) int64 2010 2011 2012 2013 2014 ... 2019 2020 2021 2022 2023]

In [42]:
def get_institution_citations_per_author(idx):
    """Compute citations per author for each institution in bootstrap sample `idx`.

    Reads entry `idx` of the notebook-level `institution_citations` and
    `institution_author_counts` lists, divides the two arrays of each
    institution, and passes the ratio through `np.nan_to_num` with its
    default settings (so NaN, e.g. from 0 / 0, becomes 0).

    Returns:
        dict mapping institution -> array of citations per author.
    """
    citations_by_institution = institution_citations[idx]
    authors_by_institution = institution_author_counts[idx]

    ratios = {}
    for ins in citations_by_institution:
        ratio = citations_by_institution[ins] / authors_by_institution[ins]
        # Overwrite the underlying data so the array's coordinates are kept as they are
        ratio.values = np.nan_to_num(ratio.values)
        ratios[ins] = ratio
    return ratios

institution_citations_per_author = propagate_bootstrap_list(get_institution_citations_per_author, range(len(processor_sample)))
institution_citations_per_author

[{'Meta': <xarray.DataArray (year: 14)>
  array([ 5.10e+00,  2.83e+01,  2.80e+01,  2.54e+01,  4.16e+01,  1.83e+02,
          9.55e+01,  2.64e+02,  7.21e+01,  5.90e+01,  2.94e+01,  1.14e+01,
          4.51e+00,  3.96e-01])
  Coordinates:
    * year     (year) int64 2010 2011 2012 2013 2014 ... 2019 2020 2021 2022 2023,
  'Google': <xarray.DataArray (year: 14)>
  array([ 1.22e+01,  1.60e+01,  1.68e+01,  2.42e+01,  3.98e+01,  7.08e+01,
          5.08e+01,  7.21e+01,  6.36e+01,  4.02e+01,  3.42e+01,  8.94e+00,
          2.48e+00,  1.51e-01])
  Coordinates:
    * year     (year) int64 2010 2011 2012 2013 2014 ... 2019 2020 2021 2022 2023,
  'Quansight': <xarray.DataArray (year: 2)>
  array([ 5.11e+03,  0.00e+00])
  Coordinates:
    * year     (year) int64 2020 2022,
  'Enthought': <xarray.DataArray (year: 8)>
  array([ 0.00e+00,  4.87e+02,  2.73e+02,  1.50e+01,  2.40e+01,  5.71e-01,
          6.81e+03,  3.33e-01])
  Coordinates:
    * year     (year) int64 2010 2011 2014 2016 2017 2019 2020

In [43]:
def get_total_author_counts(institution_author_counts):
    """Sum the yearly author counts of all institutions.

    Args:
        institution_author_counts: mapping of institution -> xarray.DataArray of
            author counts indexed by a `year` coordinate. Institutions may cover
            different subsets of years.
            NOTE(review): assumes each array's `year` index has no duplicates
            (required by `reindex`) — confirm against `dict_to_dataarray`.

    Returns:
        xarray.DataArray over the notebook-level `all_years`, holding the total
        author count per year. Years outside `all_years` are ignored; years an
        institution has no entry for contribute 0.
    """
    total_author_counts = xr.DataArray(np.zeros(len(all_years)), dims=['year'], coords={'year': all_years})
    for author_counts in institution_author_counts.values():
        # Align onto all_years in one vectorized step instead of walking the array
        # element by element: out-of-range years are dropped, missing years become 0.
        total_author_counts += author_counts.reindex(year=all_years, fill_value=0)
    return total_author_counts

total_author_counts = propagate_bootstrap_list(
    lambda institution_author_counts_: get_total_author_counts(institution_author_counts_),
    institution_author_counts,
)
total_author_counts

[<xarray.DataArray (year: 14)>
 array([ 2.73e+03,  3.02e+03,  3.07e+03,  3.38e+03,  3.40e+03,  3.69e+03,
         4.32e+03,  4.95e+03,  6.77e+03,  9.56e+03,  1.16e+04,  1.34e+04,
         1.08e+04,  5.77e+03])
 Coordinates:
   * year     (year) int64 2010 2011 2012 2013 2014 ... 2019 2020 2021 2022 2023]

In [44]:
# Scratch check of the `year.isin(...)` masking used by the aggregation helpers.
# 2024 lies outside the configured DATA_START_YEAR..DATA_END_YEAR range, so presumably
# only the 2016 and 2017 entries come back — TODO confirm, the cell output is not recorded.
dummy_data = institution_author_counts[0]['Google']
mask = dummy_data.year.isin([2016, 2017, 2024])
dummy_data[mask]

In [45]:
def mean_func(x):
    """Mean of `x` over its entries whose year is in `all_years`, as a plain scalar.

    The average is taken over the years present in `x` only; years of
    `all_years` that `x` has no entry for do not count as zeros.
    """
    in_range = x.year.isin(all_years)
    return x[in_range].mean().item()


def sum_func(x):
    """Sum of `x` over its entries whose year is in `all_years`, as a plain scalar."""
    in_range = x.year.isin(all_years)
    return x[in_range].sum().item()

In [46]:
institution_mean_authors = propagate_bootstrap_dict(mean_func, institution_author_counts)
institution_mean_authors

[{'Meta': 293.42857142857144,
  'Google': 980.9285714285714,
  'Quansight': 2.5,
  'Enthought': 2.375,
  'DeepMind': 125.41666666666667,
  'Microsoft': 973.9285714285714,
  'IBM': 1024.5714285714287,
  'Amazon': 303.92857142857144,
  'OpenAI': 30.875,
  'Adobe': 122.28571428571429,
  'Netflix': 16.916666666666668,
  'Intel': 527.6428571428571,
  'Huawei': 456.2142857142857,
  'Salesforce': 19.90909090909091,
  'NEC': 149.21428571428572,
  'Baidu': 176.07142857142858,
  'Nvidia': 142.0,
  'Yandex': 20.285714285714285,
  'Twitter': 51.07142857142857,
  'Uber': 33.333333333333336,
  'Tencent': 265.42857142857144,
  'Naver': 56.5,
  'Alibaba': 345.07142857142856,
  'Xerox': 47.92307692307692,
  'SenseTime': 72.91666666666667}]

In [47]:
institution_mean_authors_stats = bootstrap_stats(institution_mean_authors)
institution_mean_authors_stats

defaultdict(dict,
            {'Meta': {'mean': 293.42857142857144,
              'median': 293.42857142857144,
              'std': 0.0,
              'ci': array([ 2.93e+02,  2.93e+02])},
             'Google': {'mean': 980.9285714285714,
              'median': 980.9285714285714,
              'std': 0.0,
              'ci': array([ 9.81e+02,  9.81e+02])},
             'Quansight': {'mean': 2.5,
              'median': 2.5,
              'std': 0.0,
              'ci': array([ 2.50e+00,  2.50e+00])},
             'Enthought': {'mean': 2.375,
              'median': 2.375,
              'std': 0.0,
              'ci': array([ 2.38e+00,  2.38e+00])},
             'DeepMind': {'mean': 125.41666666666667,
              'median': 125.41666666666667,
              'std': 0.0,
              'ci': array([ 1.25e+02,  1.25e+02])},
             'Microsoft': {'mean': 973.9285714285714,
              'median': 973.9285714285714,
              'std': 0.0,
              'ci': array([ 9.74e+02,  9.

In [48]:
institution_total_work_counts = propagate_bootstrap_dict(sum_func, institution_work_counts)
institution_total_work_counts

[{'Meta': 3329.0,
  'Google': 10094.0,
  'Quansight': 3.0,
  'Enthought': 12.0,
  'DeepMind': 755.0,
  'Microsoft': 14550.0,
  'IBM': 9738.0,
  'Amazon': 2962.0,
  'OpenAI': 163.0,
  'Adobe': 2335.0,
  'Netflix': 170.0,
  'Intel': 4928.0,
  'Huawei': 5140.0,
  'Salesforce': 211.0,
  'NEC': 1312.0,
  'Baidu': 1920.0,
  'Nvidia': 1568.0,
  'Yandex': 246.0,
  'Twitter': 328.0,
  'Uber': 232.0,
  'Tencent': 3495.0,
  'Naver': 583.0,
  'Alibaba': 3424.0,
  'Xerox': 532.0,
  'SenseTime': 677.0}]

In [49]:
institution_total_work_counts_stats = bootstrap_stats(institution_total_work_counts)
institution_total_work_counts_stats

defaultdict(dict,
            {'Meta': {'mean': 3329.0,
              'median': 3329.0,
              'std': 0.0,
              'ci': array([ 3.33e+03,  3.33e+03])},
             'Google': {'mean': 10094.0,
              'median': 10094.0,
              'std': 0.0,
              'ci': array([ 1.01e+04,  1.01e+04])},
             'Quansight': {'mean': 3.0,
              'median': 3.0,
              'std': 0.0,
              'ci': array([ 3.00e+00,  3.00e+00])},
             'Enthought': {'mean': 12.0,
              'median': 12.0,
              'std': 0.0,
              'ci': array([ 1.20e+01,  1.20e+01])},
             'DeepMind': {'mean': 755.0,
              'median': 755.0,
              'std': 0.0,
              'ci': array([ 7.55e+02,  7.55e+02])},
             'Microsoft': {'mean': 14550.0,
              'median': 14550.0,
              'std': 0.0,
              'ci': array([ 1.46e+04,  1.46e+04])},
             'IBM': {'mean': 9738.0,
              'median': 9738.0,
            

In [50]:
institution_total_citations = propagate_bootstrap_dict(sum_func, institution_citations)
institution_total_citations

[{'Meta': 187553.0,
  'Google': 433009.0,
  'Quansight': 20425.0,
  'Enthought': 21244.0,
  'DeepMind': 68983.0,
  'Microsoft': 336207.0,
  'IBM': 105101.0,
  'Amazon': 40306.0,
  'OpenAI': 20385.0,
  'Adobe': 53920.0,
  'Netflix': 2136.0,
  'Intel': 57723.0,
  'Huawei': 54969.0,
  'Salesforce': 3920.0,
  'NEC': 11806.0,
  'Baidu': 36272.0,
  'Nvidia': 48236.0,
  'Yandex': 3530.0,
  'Twitter': 10301.0,
  'Uber': 7287.0,
  'Tencent': 56119.0,
  'Naver': 13596.0,
  'Alibaba': 41471.0,
  'Xerox': 6220.0,
  'SenseTime': 27480.0}]

In [51]:
institution_total_citations_stats = bootstrap_stats(institution_total_citations)
institution_total_citations_stats

defaultdict(dict,
            {'Meta': {'mean': 187553.0,
              'median': 187553.0,
              'std': 0.0,
              'ci': array([ 1.88e+05,  1.88e+05])},
             'Google': {'mean': 433009.0,
              'median': 433009.0,
              'std': 0.0,
              'ci': array([ 4.33e+05,  4.33e+05])},
             'Quansight': {'mean': 20425.0,
              'median': 20425.0,
              'std': 0.0,
              'ci': array([ 2.04e+04,  2.04e+04])},
             'Enthought': {'mean': 21244.0,
              'median': 21244.0,
              'std': 0.0,
              'ci': array([ 2.12e+04,  2.12e+04])},
             'DeepMind': {'mean': 68983.0,
              'median': 68983.0,
              'std': 0.0,
              'ci': array([ 6.90e+04,  6.90e+04])},
             'Microsoft': {'mean': 336207.0,
              'median': 336207.0,
              'std': 0.0,
              'ci': array([ 3.36e+05,  3.36e+05])},
             'IBM': {'mean': 105101.0,
              'm

In [52]:
institution_mean_citations = propagate_bootstrap_dict(mean_func, institution_citations)
institution_mean_citations

[{'Meta': 13396.642857142857,
  'Google': 30929.214285714286,
  'Quansight': 10212.5,
  'Enthought': 2655.5,
  'DeepMind': 5748.583333333333,
  'Microsoft': 24014.785714285714,
  'IBM': 7507.214285714285,
  'Amazon': 2879.0,
  'OpenAI': 2548.125,
  'Adobe': 3851.4285714285716,
  'Netflix': 178.0,
  'Intel': 4123.071428571428,
  'Huawei': 3926.3571428571427,
  'Salesforce': 356.3636363636364,
  'NEC': 843.2857142857143,
  'Baidu': 2590.8571428571427,
  'Nvidia': 3445.4285714285716,
  'Yandex': 252.14285714285714,
  'Twitter': 735.7857142857143,
  'Uber': 809.6666666666666,
  'Tencent': 4008.5,
  'Naver': 971.1428571428571,
  'Alibaba': 2962.214285714286,
  'Xerox': 478.46153846153845,
  'SenseTime': 2290.0}]

In [53]:
institution_mean_citations_stats = bootstrap_stats(institution_mean_citations)
institution_mean_citations_stats

defaultdict(dict,
            {'Meta': {'mean': 13396.642857142857,
              'median': 13396.642857142857,
              'std': 0.0,
              'ci': array([ 1.34e+04,  1.34e+04])},
             'Google': {'mean': 30929.214285714286,
              'median': 30929.214285714286,
              'std': 0.0,
              'ci': array([ 3.09e+04,  3.09e+04])},
             'Quansight': {'mean': 10212.5,
              'median': 10212.5,
              'std': 0.0,
              'ci': array([ 1.02e+04,  1.02e+04])},
             'Enthought': {'mean': 2655.5,
              'median': 2655.5,
              'std': 0.0,
              'ci': array([ 2.66e+03,  2.66e+03])},
             'DeepMind': {'mean': 5748.583333333333,
              'median': 5748.583333333333,
              'std': 0.0,
              'ci': array([ 5.75e+03,  5.75e+03])},
             'Microsoft': {'mean': 24014.785714285714,
              'median': 24014.785714285714,
              'std': 0.0,
              'ci': array([ 2

In [54]:
institution_mean_citations_per_author = propagate_bootstrap_dict(mean_func, institution_citations_per_author)
institution_mean_citations_per_author

[{'Meta': 60.53879585692334,
  'Google': 32.30786347758447,
  'Quansight': 2553.125,
  'Enthought': 951.0297619047618,
  'DeepMind': 46.734182979096886,
  'Microsoft': 25.54660959797563,
  'IBM': 6.820683327831922,
  'Amazon': 9.927198901406697,
  'OpenAI': 95.53395552609315,
  'Adobe': 33.79890448139291,
  'Netflix': 10.793690225669394,
  'Intel': 7.631702033424398,
  'Huawei': 8.749056362329444,
  'Salesforce': 25.029527608958503,
  'NEC': 5.924201886674041,
  'Baidu': 14.690704379222769,
  'Nvidia': 20.45413012983975,
  'Yandex': 11.291748132819562,
  'Twitter': 13.655817775121848,
  'Uber': 17.11749774071386,
  'Tencent': 13.40648849473885,
  'Naver': 12.125248946931388,
  'Alibaba': 9.231962369048231,
  'Xerox': 8.739549483738429,
  'SenseTime': 36.90711549655804}]

In [55]:
institution_mean_citations_per_author_stats = bootstrap_stats(institution_mean_citations_per_author)
institution_mean_citations_per_author_stats

defaultdict(dict,
            {'Meta': {'mean': 60.53879585692334,
              'median': 60.53879585692334,
              'std': 0.0,
              'ci': array([ 6.05e+01,  6.05e+01])},
             'Google': {'mean': 32.30786347758447,
              'median': 32.30786347758447,
              'std': 0.0,
              'ci': array([ 3.23e+01,  3.23e+01])},
             'Quansight': {'mean': 2553.125,
              'median': 2553.125,
              'std': 0.0,
              'ci': array([ 2.55e+03,  2.55e+03])},
             'Enthought': {'mean': 951.0297619047618,
              'median': 951.0297619047618,
              'std': 0.0,
              'ci': array([ 9.51e+02,  9.51e+02])},
             'DeepMind': {'mean': 46.734182979096886,
              'median': 46.734182979096886,
              'std': 0.0,
              'ci': array([ 4.67e+01,  4.67e+01])},
             'Microsoft': {'mean': 25.54660959797563,
              'median': 25.54660959797563,
              'std': 0.0,
         

In [56]:
institution_mean_citations_per_author_means = {ins:stats['mean'] for ins, stats in institution_mean_citations_per_author_stats.items()}
institution_mean_citations_per_author_means

{'Meta': 60.53879585692334,
 'Google': 32.30786347758447,
 'Quansight': 2553.125,
 'Enthought': 951.0297619047618,
 'DeepMind': 46.734182979096886,
 'Microsoft': 25.54660959797563,
 'IBM': 6.820683327831922,
 'Amazon': 9.927198901406697,
 'OpenAI': 95.53395552609315,
 'Adobe': 33.79890448139291,
 'Netflix': 10.793690225669394,
 'Intel': 7.631702033424398,
 'Huawei': 8.749056362329444,
 'Salesforce': 25.029527608958503,
 'NEC': 5.924201886674041,
 'Baidu': 14.690704379222769,
 'Nvidia': 20.45413012983975,
 'Yandex': 11.291748132819562,
 'Twitter': 13.655817775121848,
 'Uber': 17.11749774071386,
 'Tencent': 13.40648849473885,
 'Naver': 12.125248946931388,
 'Alibaba': 9.231962369048231,
 'Xerox': 8.739549483738429,
 'SenseTime': 36.90711549655804}

In [57]:
for ins, stats in institution_mean_citations_per_author_stats.items():
    print(ins, stats['ci'])

Meta [ 6.05e+01  6.05e+01]
Google [ 3.23e+01  3.23e+01]
Quansight [ 2.55e+03  2.55e+03]
Enthought [ 9.51e+02  9.51e+02]
DeepMind [ 4.67e+01  4.67e+01]
Microsoft [ 2.55e+01  2.55e+01]
IBM [ 6.82e+00  6.82e+00]
Amazon [ 9.93e+00  9.93e+00]
OpenAI [ 9.55e+01  9.55e+01]
Adobe [ 3.38e+01  3.38e+01]
Netflix [ 1.08e+01  1.08e+01]
Intel [ 7.63e+00  7.63e+00]
Huawei [ 8.75e+00  8.75e+00]
Salesforce [ 2.50e+01  2.50e+01]
NEC [ 5.92e+00  5.92e+00]
Baidu [ 1.47e+01  1.47e+01]
Nvidia [ 2.05e+01  2.05e+01]
Yandex [ 1.13e+01  1.13e+01]
Twitter [ 1.37e+01  1.37e+01]
Uber [ 1.71e+01  1.71e+01]
Tencent [ 1.34e+01  1.34e+01]
Naver [ 1.21e+01  1.21e+01]
Alibaba [ 9.23e+00  9.23e+00]
Xerox [ 8.74e+00  8.74e+00]
SenseTime [ 3.69e+01  3.69e+01]


# Results

## Publications

In [58]:
# Quansight and Enthought are left out because a small number of outlier citations dominate them
excluded_institutions = {'Quansight', 'Enthought'}
institutions = sorted(set(institution_aliases.values()) - excluded_institutions)

# Bootstrap statistics of the total publication count, in the same order as `institutions`
work_count_stats = [institution_total_work_counts_stats[ins] for ins in institutions]

# Asymmetric error bars: distance from the mean to the upper / lower end of the 'ci' entry
error_y = [s['ci'][1] - s['mean'] for s in work_count_stats]
error_y_minus = [s['mean'] - s['ci'][0] for s in work_count_stats]

bar_trace = go.Bar(
    x=institutions,
    y=[s['mean'] for s in work_count_stats],
    error_y={
        'type': 'data',
        'array': error_y,
        'arrayminus': error_y_minus,
        # The error bars are attached but hidden in this "no_error" variant of the chart
        'visible': False,
    },
)
fig = go.Figure([bar_trace])

## Plot layout
fig.update_yaxes(type="log", tickvals=[100, 300, 1000, 3000, 10000])

# Bars sorted from largest to smallest total, plus the text labels
fig.update_layout(
    xaxis={'categoryorder': 'total descending'},
    title=f'Publication count, {DATA_START_YEAR} to {DATA_END_YEAR}',
    yaxis_title='Total publications',
)

# Fixed small figure size
fig.update_layout(
    autosize=False,
    width=480,
    height=200,
    title_x=0.5,
    font={'size': 10},
    margin={'l': 0, 'r': 20, 't': 10, 'b': 75},
)

plotting.prettify_bar_chart(fig)

## Save plot
save_plot(fig, result_file_location, 'publications_bar_no_error')

## Show plot
fig.show()

## Citations

In [59]:
# Quansight and Enthought are left out because a small number of outlier citations dominate them
excluded_institutions = {'Quansight', 'Enthought'}
institutions = sorted(set(institution_aliases.values()) - excluded_institutions)

# Bootstrap statistics of the total citation count, in the same order as `institutions`
citation_stats = [institution_total_citations_stats[ins] for ins in institutions]

# Asymmetric error bars: distance from the mean to the upper / lower end of the 'ci' entry
error_y = [s['ci'][1] - s['mean'] for s in citation_stats]
error_y_minus = [s['mean'] - s['ci'][0] for s in citation_stats]

bar_trace = go.Bar(
    x=institutions,
    y=[s['mean'] for s in citation_stats],
    error_y={
        'type': 'data',
        'array': error_y,
        'arrayminus': error_y_minus,
        # The error bars are attached but hidden in this "no_error" variant of the chart
        'visible': False,
    },
)
fig = go.Figure([bar_trace])

## Plot layout
fig.update_yaxes(type="log", tickvals=[1000, 3000, 10000, 30000, 100000, 300000, 1000000])

# Bars sorted from largest to smallest total, plus the text labels
fig.update_layout(
    xaxis={'categoryorder': 'total descending'},
    title=f'Citation count in {CITATION_YEAR_BOUND}-year window after publication, {DATA_START_YEAR} to {DATA_END_YEAR}',
    yaxis_title='Total citations',
)

# Fixed small figure size
fig.update_layout(
    autosize=False,
    width=480,
    height=200,
    title_x=0.5,
    font={'size': 10},
    margin={'l': 20, 'r': 10, 't': 20, 'b': 10},
)

plotting.prettify_bar_chart(fig)

## Save plot
save_plot(fig, result_file_location, 'citations_bar_no_error')

## Show plot
fig.show()

In [60]:
# From Semantic Scholar comparison
# One multiplier per institution, keyed by the institution name used throughout this notebook.
# Later cells multiply the citation statistics by these values to produce the
# "semantic_scholar_adjustment" plots.
# NOTE(review): presumably each value is (Semantic Scholar citations / OpenAlex citations)
# for that institution - the direction and the exact computation are not shown here; confirm
# against the comparison notebook before reusing these numbers.
institution_citation_ratios = {
    'Meta': 1.896489463923849,
    'Google': 2.125162511711967,
    'OpenAI': 4.195710141475012,
    'Quansight': 1.0878747103236899,
    'Enthought': 1.3421375634061228,
    'DeepMind': 3.602968531075915,
    'Microsoft': 1.9897740057098392,
    'IBM': 1.5940899080573192,
    'Amazon': 2.312106270683693,
    'Adobe': 2.018958974526198,
    'Netflix': 1.7066145168013023,
    'Intel': 1.5304073375544947,
    'Huawei': 1.8590031653202637,
    'Salesforce': 2.730892192212752,
    'Baidu': 1.9553524367120914,
    'Nvidia': 2.1296540292409865,
    'Yandex': 1.8705082271871645,
    'NEC': 1.544143939987172,
    'Twitter': 1.5433443217405982,
    'Tencent': 2.087809403113888,
    'Naver': 2.135714036751277,
    'Uber': 2.3951002077014047,
    'Alibaba': 2.1228645192955518,
    'Xerox': 1.4120552646159417,
    'SenseTime': 2.059920063914083
}

We multiply each institution's citation total by that institution's average ratio of citations between OpenAlex and Semantic Scholar, just to get a rough sense of the effect.

In [61]:
# Exclude Enthought and Quansight due to having a small number of outlier citations
institutions = sorted(set(v for v in institution_aliases.values() if v not in ['Quansight', 'Enthought']))

# Bar heights are the mean citation totals multiplied by the per-institution
# ratio in `institution_citation_ratios`. The error-bar extents (distance from
# the mean to each CI bound) are multiplied by the same ratio so that they stay
# on the same scale as the bars; previously they were left unscaled, which
# would have drawn too-small error bars as soon as `visible` was switched on.
adjusted_mean_citations = []
error_y = []
error_y_minus = []
for ins in institutions:
    citation_stats = institution_total_citations_stats[ins]
    ratio = institution_citation_ratios[ins]
    adjusted_mean_citations.append(citation_stats['mean'] * ratio)
    error_y.append((citation_stats['ci'][1] - citation_stats['mean']) * ratio)
    error_y_minus.append((citation_stats['mean'] - citation_stats['ci'][0]) * ratio)

fig = go.Figure([go.Bar(
    x=institutions,
    y=adjusted_mean_citations,
    error_y=dict(
        type='data',
        array=error_y,
        arrayminus=error_y_minus,
        # Error bars are computed but intentionally hidden in this chart
        visible=False,
        # color='rgb(180, 180, 200)',
        # color='black',
    ),
)])

## Plot layout
fig.update_yaxes(type="log", tickvals=[1000, 3000, 10000, 30000, 100000, 300000, 1000000])

# Order the bars from the largest to the smallest adjusted total
fig.update_layout(xaxis={'categoryorder':'total descending'})

fig.update_layout(
    title=f'Citation count in {CITATION_YEAR_BOUND}-year window after publication, {DATA_START_YEAR} to {DATA_END_YEAR}',
    # xaxis_title='Company',
    yaxis_title='Total citations',
)

fig.update_layout(
    autosize=False,
    width=480,
    height=200,
    title_x=0.5,
    font=dict(size=10),
    margin=dict(l=20, r=20, t=10, b=20),
)

plotting.prettify_bar_chart(fig)

## Save plot
save_plot(fig, result_file_location, 'citations_bar_semantic_scholar_adjustment')

## Show plot
fig.show()

## Citations-authors phase plot

In [62]:
# Inspect the per-institution author statistics ('mean', 'median', 'std', 'ci');
# the 'mean' entry is used as the x value of the citations-authors phase plot.
institution_mean_authors_stats

defaultdict(dict,
            {'Meta': {'mean': 293.42857142857144,
              'median': 293.42857142857144,
              'std': 0.0,
              'ci': array([ 2.93e+02,  2.93e+02])},
             'Google': {'mean': 980.9285714285714,
              'median': 980.9285714285714,
              'std': 0.0,
              'ci': array([ 9.81e+02,  9.81e+02])},
             'Quansight': {'mean': 2.5,
              'median': 2.5,
              'std': 0.0,
              'ci': array([ 2.50e+00,  2.50e+00])},
             'Enthought': {'mean': 2.375,
              'median': 2.375,
              'std': 0.0,
              'ci': array([ 2.38e+00,  2.38e+00])},
             'DeepMind': {'mean': 125.41666666666667,
              'median': 125.41666666666667,
              'std': 0.0,
              'ci': array([ 1.25e+02,  1.25e+02])},
             'Microsoft': {'mean': 973.9285714285714,
              'median': 973.9285714285714,
              'std': 0.0,
              'ci': array([ 9.74e+02,  9.

In [63]:
# List every institution with its mean citations per author, highest first
ranked_citations_per_author = sorted(
    institution_mean_citations_per_author_means.items(),
    key=lambda item: item[1],
    reverse=True,
)
for ins, mean in ranked_citations_per_author:
    print(f"{ins}: {mean:.2f}")

Quansight: 2553.12
Enthought: 951.03
OpenAI: 95.53
Meta: 60.54
DeepMind: 46.73
SenseTime: 36.91
Adobe: 33.80
Google: 32.31
Microsoft: 25.55
Salesforce: 25.03
Nvidia: 20.45
Uber: 17.12
Baidu: 14.69
Twitter: 13.66
Tencent: 13.41
Naver: 12.13
Yandex: 11.29
Netflix: 10.79
Amazon: 9.93
Alibaba: 9.23
Huawei: 8.75
Xerox: 8.74
Intel: 7.63
IBM: 6.82
NEC: 5.92


In [64]:
# Institutions for the phase plot, ordered by mean citations per author (highest first).
# Quansight and Enthought are rejected because they are one-hit wonders.
citations_authors_institutions = [
    ins
    for ins, mean in sorted(
        institution_mean_citations_per_author_means.items(),
        key=lambda item: item[1],
        reverse=True,
    )
    if ins not in ('Quansight', 'Enthought')
]
citations_authors_institutions

['OpenAI',
 'Meta',
 'DeepMind',
 'SenseTime',
 'Adobe',
 'Google',
 'Microsoft',
 'Salesforce',
 'Nvidia',
 'Uber',
 'Baidu',
 'Twitter',
 'Tencent',
 'Naver',
 'Yandex',
 'Netflix',
 'Amazon',
 'Alibaba',
 'Huawei',
 'Xerox',
 'Intel',
 'IBM',
 'NEC']

In [65]:
fig = go.Figure()

# Hand-tuned label placements for the 2010-2023 plot.
# Format: "<top|middle|bottom> [y offset] <left|center|right> [x offset]".
# The optional signed offsets are passed to the annotation's yshift/xshift to
# nudge a label away from its marker.
# (A duplicate 'Huawei': 'top center' entry was removed: the later
# 'top +2 right -2' entry always overrode it.)
annotation_positions_for_2010_to_2023 = {
    'Twitter':  'top center',
    'Google':   'top center',
    'Yandex':   'top center',
    'DeepMind': 'top center',
    'Tencent':  'top center',

    'Alibaba': 'bottom center',
    'Huawei': 'top +2 right -2',

    'Naver': 'middle -3 right +4',
    'Netflix': 'bottom -3 center',
    'NEC': 'top +2 center',
    'Amazon': 'middle left',
    'Adobe': 'bottom -2 center',
    'Baidu': 'bottom -2 center -6',
    'Nvidia': 'bottom -2 center -6',
}

# Label placements keyed by (DATA_START_YEAR, DATA_END_YEAR).
# Institutions that are not listed for a period use `default_text_position`.
default_text_position = 'bottom center'
annotation_positions_by_period = {
    (2010, 2023): annotation_positions_for_2010_to_2023,
    (2010, 2016): dict.fromkeys(['Huawei', 'Baidu', 'Yandex', 'Naver'], 'top center'),
    (2017, 2023): {
        'Yandex': 'top center',
        'Alibaba': 'top center',
        'Amazon': 'bottom left',
        'Huawei': 'bottom right',
    },
}
# Any other year range falls back to the default placement for every label,
# instead of failing on an unbound (or stale) `text_position`.
period_annotation_positions = annotation_positions_by_period.get(
    (DATA_START_YEAR, DATA_END_YEAR), {}
)

# Parser for the position strings above (compiled once, outside the loop)
position_pattern = re.compile(
    r"(top|middle|bottom)"          # Vertical position
    r"(?: *([+-]?\d+\.?\d*))?"      # Optional signed offset for vertical position
    r" +(left|center|right)"        # Horizontal position
    r"(?: *([+-]?\d+\.?\d*))?"      # Optional signed offset for horizontal position
)

# When True, labels that carry an offset are drawn as layout annotations so the
# offset can be honoured; otherwise every label is drawn as trace text.
static_plot_mode = True

for ins in citations_authors_institutions:
    num_authors = institution_mean_authors_stats[ins]['mean']
    num_citations = institution_mean_citations_per_author_stats[ins]['mean']

    text_position = period_annotation_positions.get(ins, default_text_position)

    # One single-point trace per institution. Raw values are passed here; the
    # log scaling is applied by the axes (see "Plot layout" below).
    fig.add_trace(
        go.Scatter(
            x=[num_authors],
            y=[num_citations],
            mode='markers',
            showlegend=False,
            # marker_opacity=0,
            marker_color='#02767C',
            name=ins,
        )
    )

    position_match = position_pattern.search(text_position)
    if position_match is None:
        raise ValueError(f"Unrecognised label position {text_position!r} for {ins}")
    ypos, yoffset, xpos, xoffset = position_match.groups()
    yoffset = float(yoffset or 0)
    xoffset = float(xoffset or 0)

    if static_plot_mode and (yoffset or xoffset):
        # Precise positioning. Add annotation.
        # A label placed to the left of the marker is anchored at its right
        # edge (and vice versa); likewise for top/bottom.
        xanchor = {'left': 'right', 'right': 'left'}.get(xpos, xpos)
        yanchor = {'top': 'bottom', 'bottom': 'top'}.get(ypos, ypos)
        fig.add_annotation(
            x=np.log10(num_authors),   # We have to take logs manually :(
            y=np.log10(num_citations), # (see https://github.com/plotly/plotly.js/issues/1258#issuecomment-267654806)
            text=ins,
            showarrow=False,
            xanchor=xanchor,
            yanchor=yanchor,
            xshift=xoffset,
            yshift=yoffset,
        )
    else:
        # Regular positioning. Just add a label to the last trace.
        fig.data[-1].mode = 'markers+text'
        fig.data[-1].text = [ins]
        fig.data[-1].textposition = f'{ypos} {xpos}'

# Add image overlays
# for ins in citations_authors_institutions:
#     num_authors = institution_mean_authors_stats[ins]['mean']
#     num_citations = institution_mean_citations_per_author_stats[ins]['mean']
#     fig.add_layout_image(
#         dict(source=logo_urls[ins]),  # URL of the logo
#         xref="x",
#         yref="y",
#         x=np.log10(num_authors),
#         y=np.log10(num_citations),
#         sizex=0.15,  # you may need to adjust this
#         sizey=0.15,  # you may need to adjust this
#         xanchor="center",
#         yanchor="middle",
#         layer="above",
#     )

## Plot layout
fig.update_xaxes(type="log", tickvals=[1, 2, 5, 10, 20, 50, 100, 200, 500])
fig.update_yaxes(type="log", tickvals=[1, 2, 5, 10, 20, 50, 100, 200, 500])

fig.update_layout(
    # title='Citations vs. number of authors',
    xaxis_title=f'Mean unique authors from organization per year ({DATA_START_YEAR} to {DATA_END_YEAR})',
    yaxis_title='Mean citations per author per year',
)

fig.update_layout(
    autosize=False,
    width=480,
    height=250,
    # title_x=0.5,
    font=dict(size=10),
    margin=dict(l=20, r=20, t=20, b=20),
)

plotting.prettify_figure(fig, highlight_ticks=True)

## Save plot
save_plot(fig, result_file_location, 'citations_per_author_authors_phase', scale=1)

## Show plot
fig.show()

In [66]:
# Ratio of Google's and Microsoft's mean citations per author to the average
# over the five Chinese companies listed below
chinese_institutions = ['SenseTime', 'Alibaba', 'Baidu', 'Huawei', 'Tencent']
chinese_avg = np.mean([
    institution_mean_citations_per_author_stats[ins]['mean']
    for ins in chinese_institutions
])
for ins in ['Google', 'Microsoft']:
    print(institution_mean_citations_per_author_stats[ins]['mean'] / chinese_avg)

1.9466009598247276
1.539224492457989


In [67]:
fig = go.Figure()

# Label placements keyed by (DATA_START_YEAR, DATA_END_YEAR).
# Institutions that are not listed for a period use `default_text_position`.
default_text_position = 'bottom center'
adjusted_text_positions_by_period = {
    (2010, 2023): {
        'Twitter': 'top center',
        'Google': 'top center',
        'Yandex': 'top center',
        'Huawei': 'top center',
        'Amazon': 'bottom left',
    },
    (2010, 2016): dict.fromkeys(['Huawei', 'Baidu', 'Yandex', 'Naver'], 'top center'),
    (2017, 2023): {
        'Yandex': 'top center',
        'Alibaba': 'top center',
        'Amazon': 'bottom left',
        'Huawei': 'bottom right',
    },
}
# Any other year range falls back to the default placement for every label,
# instead of failing on an unbound (or stale) `text_position`.
period_text_positions = adjusted_text_positions_by_period.get(
    (DATA_START_YEAR, DATA_END_YEAR), {}
)

for ins in citations_authors_institutions:
    num_authors = institution_mean_authors_stats[ins]['mean']
    # Citations per author scaled by the per-institution ratio in `institution_citation_ratios`
    num_citations = institution_mean_citations_per_author_stats[ins]['mean'] * institution_citation_ratios[ins]

    text_position = period_text_positions.get(ins, default_text_position)

    # One single-point trace per institution. Raw values are passed here; the
    # log scaling is applied by the axes (see "Plot layout" below).
    fig.add_trace(
        go.Scatter(
            x=[num_authors],
            y=[num_citations],
            # 'markers+text' is required for `text`/`textposition` to be drawn;
            # with plain 'markers' the labels were missing from the saved plot.
            mode='markers+text',
            showlegend=False,
            # marker_opacity=0,
            marker_color='#02767C',
            name=ins,
            text=[ins],
            textposition=[text_position],
        )
    )

# Add image overlays
# for ins in citations_authors_institutions:
#     num_authors = institution_mean_authors_stats[ins]['mean']
#     num_citations = institution_mean_citations_per_author_stats[ins]['mean']
#     fig.add_layout_image(
#         dict(source=logo_urls[ins]),  # URL of the logo
#         xref="x",
#         yref="y",
#         x=np.log10(num_authors),
#         y=np.log10(num_citations),
#         sizex=0.15,  # you may need to adjust this
#         sizey=0.15,  # you may need to adjust this
#         xanchor="center",
#         yanchor="middle",
#         layer="above",
#     )

## Plot layout
fig.update_xaxes(type="log", tickvals=[1, 2, 5, 10, 20, 50, 100, 200, 500])
fig.update_yaxes(type="log", tickvals=[1, 2, 5, 10, 20, 50, 100, 200, 500])

fig.update_layout(
    # title='Citations vs. number of authors',
    xaxis_title=f'Mean unique authors from organization per year ({DATA_START_YEAR} to {DATA_END_YEAR})',
    yaxis_title='Mean citations per author per year',
)

fig.update_layout(
    autosize=False,
    width=480,
    height=250,
    # title_x=0.5,
    font=dict(size=10),
    margin=dict(l=20, r=20, t=20, b=20),
)

# Same styling call as the unadjusted citations-authors phase plot
plotting.prettify_figure(fig, highlight_ticks=True)

## Save plot
save_plot(fig, result_file_location, 'citations_per_author_authors_phase_semantic_scholar_adjustment', scale=1)

## Show plot
fig.show()