In [1]:
from scholarly import scholarly
from scholarly import ProxyGenerator
import time
from tqdm import tqdm

In [2]:
# Set up a ProxyGenerator object to use free proxies.
# This needs to be done only once per session.
# NOTE(review): the value returned by FreeProxies() is discarded here; if it
# reports whether a working proxy was found, it should be checked -- TODO confirm
# against the scholarly documentation.
pg = ProxyGenerator()
pg.FreeProxies()
# Route all subsequent scholarly requests through the configured proxy generator.
scholarly.use_proxy(pg)

# Examples

## `search_author`

In [23]:
# Start an author search by name.
# The result is an iterator over matching author entries; it is consumed with
# next() in the following cell.
search_query = scholarly.search_author('Steven A Cholewiak')

In [24]:
# Retrieve the first result from the iterator.
# NOTE: next() raises StopIteration if the search produced no results.
first_author_result = next(search_query)
# At this stage the entry is only a search snippet ('filled': [] in the output).
first_author_result

{'container_type': 'Author',
 'filled': [],
 'source': <AuthorSource.SEARCH_AUTHOR_SNIPPETS: 'SEARCH_AUTHOR_SNIPPETS'>,
 'scholar_id': '4bahYMkAAAAJ',
 'url_picture': 'https://scholar.google.com/citations?view_op=medium_photo&user=4bahYMkAAAAJ',
 'name': 'Steven A. Cholewiak, PhD',
 'affiliation': 'Vision Scientist at Google LLC',
 'email_domain': '@google.com',
 'interests': ['Depth Cues',
  '3D Shape',
  'Shape from Texture & Shading',
  'Naive Physics',
  'Haptics'],
 'citedby': 510}

In [25]:
# Retrieve all the details for the author.
# In the displayed result, 'filled' lists the sections that were populated:
# basics, indices, counts, coauthors, publications and public_access.
author = scholarly.fill(first_author_result)
author

{'container_type': 'Author',
 'filled': ['basics',
  'indices',
  'counts',
  'coauthors',
  'publications',
  'public_access'],
 'source': <AuthorSource.SEARCH_AUTHOR_SNIPPETS: 'SEARCH_AUTHOR_SNIPPETS'>,
 'scholar_id': '4bahYMkAAAAJ',
 'url_picture': 'https://scholar.google.com/citations?view_op=medium_photo&user=4bahYMkAAAAJ',
 'name': 'Steven A. Cholewiak, PhD',
 'affiliation': 'Vision Scientist at Google LLC',
 'email_domain': '@google.com',
 'interests': ['Depth Cues',
  '3D Shape',
  'Shape from Texture & Shading',
  'Naive Physics',
  'Haptics'],
 'citedby': 510,
 'organization': 6518679690484165796,
 'homepage': 'http://steven.cholewiak.com/',
 'citedby5y': 342,
 'hindex': 12,
 'hindex5y': 11,
 'i10index': 14,
 'i10index5y': 12,
 'cites_per_year': {2008: 3,
  2009: 5,
  2010: 7,
  2011: 9,
  2012: 22,
  2013: 17,
  2014: 16,
  2015: 18,
  2016: 35,
  2017: 32,
  2018: 29,
  2019: 61,
  2020: 66,
  2021: 82,
  2022: 70,
  2023: 32},
 'coauthors': [{'container_type': 'Author',
  

In [26]:
# Take a closer look at the first publication.
# As listed on the author profile it is only a summary ('filled': False):
# title, year, citation string and citation count.
first_publication = author['publications'][0]
first_publication

{'container_type': 'Publication',
 'source': <PublicationSource.AUTHOR_PUBLICATION_ENTRY: 'AUTHOR_PUBLICATION_ENTRY'>,
 'bib': {'title': 'A frequency-domain analysis of haptic gratings',
  'pub_year': '2009',
  'citation': 'IEEE Transactions on Haptics 3 (1), 3-14, 2009'},
 'filled': False,
 'author_pub_id': '4bahYMkAAAAJ:u5HHmVD_uO8C',
 'num_citations': 67,
 'citedby_url': 'https://scholar.google.com/scholar?oi=bibs&hl=en&cites=18104797610932568627',
 'cites_id': ['18104797610932568627']}

In [11]:

# Fetch the full record for the first publication; the output shows 'bib'
# gaining author, journal, volume, number, pages, publisher and abstract.
first_publication_filled = scholarly.fill(first_publication)
first_publication_filled

{'container_type': 'Publication',
 'source': <PublicationSource.AUTHOR_PUBLICATION_ENTRY: 'AUTHOR_PUBLICATION_ENTRY'>,
 'bib': {'title': 'A frequency-domain analysis of haptic gratings',
  'pub_year': 2009,
  'citation': 'IEEE Transactions on Haptics 3 (1), 3-14, 2009',
  'author': 'Steven A Cholewiak and Kwangtaek Kim and Hong Z Tan and Bernard D Adelstein',
  'journal': 'IEEE Transactions on Haptics',
  'volume': '3',
  'number': '1',
  'pages': '3-14',
  'publisher': 'IEEE',
  'abstract': 'The detectability and discriminability of virtual haptic gratings were analyzed in the frequency domain. Detection (Exp. 1) and discrimination (Exp. 2) thresholds for virtual haptic gratings were estimated using a force-feedback device that simulated sinusoidal and square-wave gratings with spatial periods from 0.2 to 38.4 mm. The detection threshold results indicated that for spatial periods up to 6.4 mm (i.e., spatial frequencies >0.156 cycle/mm), the detectability of square-wave gratings could 

In [12]:
# Per-year citation counts of the filled publication: a dict of year -> count.
first_publication_filled['cites_per_year']

{2010: 2,
 2011: 7,
 2012: 10,
 2013: 5,
 2014: 3,
 2015: 4,
 2016: 4,
 2017: 5,
 2018: 3,
 2019: 3,
 2020: 9,
 2021: 7,
 2022: 4,
 2023: 1}

In [18]:
# Show the author's publication list.
# NOTE(review): the first entry is displayed with the filled fields (author,
# journal, abstract, ...), which suggests scholarly.fill() updated the
# publication dict in place -- TODO confirm.
# NOTE(review): this dumps the entire list; consider slicing (e.g. [:3]) to
# keep the notebook output short.
author['publications']

[{'container_type': 'Publication',
  'source': <PublicationSource.AUTHOR_PUBLICATION_ENTRY: 'AUTHOR_PUBLICATION_ENTRY'>,
  'bib': {'title': 'A frequency-domain analysis of haptic gratings',
   'pub_year': 2009,
   'citation': 'IEEE Transactions on Haptics 3 (1), 3-14, 2009',
   'author': 'Steven A Cholewiak and Kwangtaek Kim and Hong Z Tan and Bernard D Adelstein',
   'journal': 'IEEE Transactions on Haptics',
   'volume': '3',
   'number': '1',
   'pages': '3-14',
   'publisher': 'IEEE',
   'abstract': 'The detectability and discriminability of virtual haptic gratings were analyzed in the frequency domain. Detection (Exp. 1) and discrimination (Exp. 2) thresholds for virtual haptic gratings were estimated using a force-feedback device that simulated sinusoidal and square-wave gratings with spatial periods from 0.2 to 38.4 mm. The detection threshold results indicated that for spatial periods up to 6.4 mm (i.e., spatial frequencies >0.156 cycle/mm), the detectability of square-wave gra

## `search_pubs`

**IMPORTANT:** Making certain types of queries, such as `scholarly.citedby` or `scholarly.search_pubs`, will lead to Google Scholar blocking your requests and may eventually get your IP address blocked. You must use proxy services to avoid this situation. See the "Using proxies" section in the `scholarly` documentation for more details.

In [14]:
# Search publications by title. The result is an iterator over matching
# publications, consumed with next() in the following cell.
work_query = scholarly.search_pubs("Language models are few-shot learners")

In [15]:
# Take the first search hit ('gsrank': 1 in the output).
# NOTE: next() raises StopIteration if the search produced no results.
first_work_result = next(work_query)
first_work_result

{'container_type': 'Publication',
 'source': <PublicationSource.PUBLICATION_SEARCH_SNIPPET: 'PUBLICATION_SEARCH_SNIPPET'>,
 'bib': {'title': 'Language models are few-shot learners',
  'author': ['T Brown', 'B Mann', 'N Ryder'],
  'pub_year': '2020',
  'venue': 'Advances in neural …',
  'abstract': 'We demonstrate that scaling up language models greatly improves task-agnostic, few-shot performance, sometimes even becoming competitive with prior state-of-the-art fine-tuning approaches. Specifically, we train GPT-3, an autoregressive language model with 175 billion parameters, 10x more than any previous non-sparse language model, and test its performance in the few-shot setting. For all tasks, GPT-3 is applied without any gradient updates or fine-tuning, with tasks and few-shot demonstrations specified purely via text'},
 'filled': False,
 'gsrank': 1,
 'pub_url': 'https://proceedings.neurips.cc/paper/2020/hash/1457c0d6bfcb4967418bfb8ac142f64a-Abstract.html',
 'author_id': ['RLvsC94AAAAJ'

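Note that a publication retrieved directly through `search_pubs` has no `cites_per_year` field, unlike a publication reached through an author's profile. This is consistent with what I observed when browsing Google Scholar myself.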

In [21]:
# Fill in the search result. In the output, the abbreviated author list
# (['T Brown', 'B Mann', 'N Ryder']) is replaced by a full
# 'Last, First and ...' author string.
first_work_filled = scholarly.fill(first_work_result)
first_work_filled

{'container_type': 'Publication',
 'source': <PublicationSource.PUBLICATION_SEARCH_SNIPPET: 'PUBLICATION_SEARCH_SNIPPET'>,
 'bib': {'title': 'Language models are few-shot learners',
  'author': 'Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and others',
  'pub_year': '2020',
  'venue': 'Advances in neural …',
  'abstract': 'We demonstrate that scaling up language models greatly improves task-agnostic, few-shot performance, sometimes even becoming competitive with prior state-of-the-art fine-tuning approaches. Specifically, we train GPT-3, an autoregressive language model with 175 billion parameters, 10x more than any previous non-sparse language model, and test its performance in the few-shot setting. For all tasks, GPT-3 is applied without any gradient updates or fine-tuning, with tasks and few-shot demonstrations specified purely via text',
 

# Speed test

In [3]:
# Author names used as search queries in the speed test below.
author_list = [
    'Kaiwen Guo',
    'M. Zając',
    'Amelia Archer',
    'Alex Irpan',
    'Michael Krainin',
    'Peter Thomas Denny',
    'Wu Yonghui',
    'Yuhua Chen',
    'David F. Steiner',
    'Angeliki Lazaridou',
]

In [4]:
# Sanity check: number of author queries in the speed test.
len(author_list)

10

In [5]:
# Speed test: for every name in `author_list`, look up the first matching
# Google Scholar author, fill the profile, then fill every publication, and
# report the overall throughput.
# WARNING: this cell is slow. The recorded run was interrupted after about
# 21 minutes, before the first author had finished.
t0 = time.perf_counter()  # monotonic clock, suited to measuring elapsed time
num_works = 0
for author_name in tqdm(author_list, desc="authors"):
    # Get an iterator for the author results
    search_query = scholarly.search_author(author_name)
    # Retrieve the first result from the iterator; skip names with no match
    try:
        first_author_result = next(search_query)
    except StopIteration:
        continue
    # Retrieve all the details for the author. The profile gets its own name so
    # that neither the loop variable (a str) nor the `author` dict from the
    # example cells is overwritten.
    author_filled = scholarly.fill(first_author_result)
    author_works = author_filled['publications']
    # leave=False removes each inner bar once an author is done, so finished
    # bars do not pile up under the outer one.
    for work in tqdm(author_works, desc=author_name, leave=False):
        work_filled = scholarly.fill(work)
        # .get() avoids a KeyError if a filled publication has no per-year data.
        cites_per_year = work_filled.get('cites_per_year', {})
        # Count only works that were actually processed.
        num_works += 1

# Compute the elapsed time once so both figures in the report agree.
elapsed = time.perf_counter() - t0
print(f"took {elapsed} seconds to process {num_works} works: average {num_works / elapsed} works per second")

 67%|██████▋   | 18/27 [16:52<08:26, 56.24s/it]
  0%|          | 0/10 [21:06<?, ?it/s]


KeyboardInterrupt: 