The first step is find open access literature reviews. We will use the Semantic Scholar API.

In [40]:
import requests
import json
from tqdm import tqdm
import os
from dotenv import load_dotenv

load_dotenv()

# API key is optional, but can help with rate limiting
SEMANTIC_SCHOLAR_API_KEY = os.getenv("SEMANTIC_SCHOLAR_API_KEY")

data_dir = "data"

In [2]:
# Get latest dataset
dataset = requests.get('https://api.semanticscholar.org/datasets/v1/release/latest').json()

{'release_id': '2024-01-24', 'README': 'Semantic Scholar Academic Graph Datasets\n\nThese datasets provide a variety of information about research papers taken from a snapshot in time of the Semantic Scholar corpus.\n\nThis site is provided by The Allen Institute for Artificial Intelligence (“AI2”) as a service to the\nresearch community. The site is covered by AI2 Terms of Use and Privacy Policy. AI2 does not claim\nownership of any materials on this site unless specifically identified. AI2 does not exercise editorial\ncontrol over the contents of this site. AI2 respects the intellectual property rights of others. If\nyou believe your copyright or trademark is being infringed by something on this site, please follow\nthe "DMCA Notice" process set out in the Terms of Use (https://allenai.org/terms).\n\nSAMPLE DATA ACCESS\nSample data files can be downloaded with the following UNIX command:\n\nfor f in $(curl https://s3-us-west-2.amazonaws.com/ai2-s2ag/samples/MANIFEST.txt)\n  do curl -

In [67]:
# Get open access reviews with a minimum citation count. The citation count serves as a proxy for quality.
# See See https://api.semanticscholar.org/api-docs/#tag/Paper-Data/operation/get_graph_paper_bulk_search
query = f"https://api.semanticscholar.org/graph/v1/paper/search/bulk?isOpenAccess&openAccessPdf&publicationTypes=Review&minCitationCount=200&fields=title,externalIds,citationCount,fieldsOfStudy,venue,year"
headers = {
    'x-api-key': SEMANTIC_SCHOLAR_API_KEY,
    'Content-Type': 'application/json'
}
response = requests.get(query, headers=headers)
data = response.json()
total = data['total']
data

{'total': 117090,
 'token': 'PCOKWVSKJJGM4TWNJNI3EUSQJIWVNUSRKBFEYK2JFUBHFI4VBQGJGUQMZSGM3DIMSOGVFDMNKPGS2LCN2MGFEDGSJTJMYTJNSKJZELJNZXGS2ERNSRRGWAPKNQKBM',
 'data': [{'paperId': '00008d6c3dc6ad96622d601c8dad4a1b05c33946',
   'externalIds': {'MAG': '2018797083',
    'DOI': '10.1063/1.3552291',
    'CorpusId': 121773812},
   'title': 'Tunable, continuous-wave Terahertz photomixer sources and applications',
   'venue': '',
   'year': 2011,
   'citationCount': 432,
   'fieldsOfStudy': ['Physics']},
  {'paperId': '0000bd1f21da8b28e58287842958ba781a051b9a',
   'externalIds': {'MAG': '1972641293',
    'DOI': '10.1182/BLOOD-2002-04-1149',
    'CorpusId': 16283567,
    'PubMed': '12351373'},
   'title': 'Current molecular models for NADPH oxidase regulation by Rac GTPase.',
   'venue': 'Blood',
   'year': 2002,
   'citationCount': 350,
   'fieldsOfStudy': ['Biology', 'Medicine']},
  {'paperId': '000153cd09823a5b65d904553b9c1d5c97054d0a',
   'externalIds': {'MAG': '2232335510',
    'DOI': '10.1

In [68]:
# Get all paper IDs in batches using the continuation token
batch_size = 1000
n_batches = total // batch_size + 1
papers = []
for i in tqdm(range(n_batches)):
    batch_q = query
    cont_token = data['token']
    if cont_token:
        batch_q += f"&token={cont_token}"
    response = requests.get(batch_q, headers=headers)
    data = response.json()
    papers.extend(data['data'])

100%|██████████| 118/118 [03:52<00:00,  1.97s/it]


In [73]:
# Assert that IDs are unique
paper_ids = [paper['paperId'] for paper in papers]
assert len(list(set(paper_ids))) == len(paper_ids) == total

In [74]:
len(papers)

117090

In [75]:
# Save paper IDs
papers_path = os.path.join(data_dir, "papers.json")
with open(papers_path, 'w') as f:
    json.dump(papers, f)

In [92]:
# Get citations for each paper
paper_details_path = os.path.join(data_dir, "paper_details.json")
details_query = 'https://api.semanticscholar.org/graph/v1/paper/batch'
batch_size = 20
n_batches = total // batch_size + 1
if os.path.exists(paper_details_path):
    with open(paper_details_path, 'r') as f:
        paper_details = json.load(f)
paper_details = {}
skip_existing = True
for i in tqdm(range(n_batches)):
    batch_q = details_query
    ids = paper_ids[i*batch_size:(i+1)*batch_size]
    if skip_existing:
        ids = [id for id in ids if id not in paper_details]
    if len(ids) == 0:
        continue
    response = requests.post(batch_q, headers=headers, json={"ids": ids}, params={"fields": "citations.title,citations.year,citations.abstract"})
    data = response.json()
    for paper in data:
        try:
            paper_details[paper['paperId']] = paper
        except TypeError:
            print(f"Error: {paper}")
            continue
    with open(paper_details_path, 'w') as f:
        json.dump(paper_details, f)

  0%|          | 5/5855 [00:44<14:42:31,  9.05s/it]

Error: error


  0%|          | 7/5855 [01:07<17:57:14, 11.05s/it]

Error: error


  0%|          | 8/5855 [01:23<20:36:20, 12.69s/it]

Error: error


  0%|          | 12/5855 [02:33<28:12:43, 17.38s/it]

Error: error


  0%|          | 13/5855 [02:51<28:20:48, 17.47s/it]

Error: error


  0%|          | 14/5855 [03:07<27:40:13, 17.05s/it]

Error: error


  0%|          | 16/5855 [03:38<26:09:55, 16.13s/it]

Error: error


  0%|          | 17/5855 [03:54<25:52:18, 15.95s/it]

Error: error


  0%|          | 20/5855 [04:43<26:10:01, 16.14s/it]

Error: error


  0%|          | 21/5855 [05:00<26:35:39, 16.41s/it]

Error: error


  0%|          | 29/5855 [07:21<29:03:14, 17.95s/it]

Error: error


  1%|          | 36/5855 [09:22<28:32:10, 17.65s/it]

Error: error


  1%|          | 37/5855 [09:41<28:57:25, 17.92s/it]

Error: error


  1%|          | 42/5855 [11:06<27:45:25, 17.19s/it]

Error: error


  1%|          | 45/5855 [11:59<28:02:26, 17.37s/it]

Error: error


  1%|          | 46/5855 [12:16<27:49:38, 17.25s/it]

Error: error


  1%|          | 49/5855 [13:12<29:30:50, 18.30s/it]

Error: error


  1%|          | 50/5855 [13:30<29:30:39, 18.30s/it]

Error: error


  1%|          | 52/5855 [14:04<28:21:02, 17.59s/it]

Error: error


  1%|          | 54/5855 [14:40<28:33:50, 17.73s/it]

Error: error


  1%|          | 55/5855 [14:58<28:44:24, 17.84s/it]

Error: error


  1%|          | 56/5855 [15:17<29:18:11, 18.19s/it]

Error: error


  1%|          | 57/5855 [15:34<28:57:30, 17.98s/it]

Error: error


  1%|          | 58/5855 [15:51<28:13:17, 17.53s/it]

Error: error


  1%|          | 60/5855 [16:39<32:56:52, 20.47s/it]

Error: error


  1%|          | 63/5855 [17:32<29:12:39, 18.16s/it]

Error: error


  1%|          | 67/5855 [18:47<29:54:50, 18.61s/it]

Error: error


  1%|          | 68/5855 [19:05<29:32:09, 18.37s/it]

Error: error


  1%|▏         | 76/5855 [21:29<28:33:07, 17.79s/it]

Error: error


  1%|▏         | 78/5855 [22:05<28:59:27, 18.07s/it]

Error: error


  1%|▏         | 81/5855 [23:01<29:47:14, 18.57s/it]

Error: error


  1%|▏         | 82/5855 [23:19<29:13:38, 18.23s/it]

Error: error


  1%|▏         | 83/5855 [23:37<29:12:45, 18.22s/it]

Error: error


  1%|▏         | 84/5855 [23:57<30:02:11, 18.74s/it]

Error: error


  1%|▏         | 85/5855 [24:15<29:52:45, 18.64s/it]

Error: error


  1%|▏         | 86/5855 [24:35<30:11:59, 18.85s/it]

Error: error


  2%|▏         | 89/5855 [25:33<30:58:56, 19.34s/it]

Error: error


  2%|▏         | 92/5855 [26:31<30:28:55, 19.04s/it]

Error: error


  2%|▏         | 94/5855 [27:06<29:04:10, 18.17s/it]

Error: error


  2%|▏         | 95/5855 [27:25<29:44:01, 18.58s/it]

Error: error


  2%|▏         | 96/5855 [27:46<30:46:56, 19.24s/it]

Error: error


  2%|▏         | 97/5855 [28:05<30:49:43, 19.27s/it]

Error: error


  2%|▏         | 98/5855 [28:27<31:49:29, 19.90s/it]

Error: error


  2%|▏         | 104/5855 [30:24<30:42:49, 19.23s/it]

Error: error


  2%|▏         | 106/5855 [31:03<30:51:56, 19.33s/it]

Error: error


  2%|▏         | 107/5855 [31:22<30:39:10, 19.20s/it]

Error: error


  2%|▏         | 108/5855 [31:42<31:11:24, 19.54s/it]

Error: error


  2%|▏         | 112/5855 [32:56<29:06:05, 18.24s/it]

Error: error


  2%|▏         | 113/5855 [33:17<30:16:38, 18.98s/it]

Error: error


  2%|▏         | 115/5855 [33:58<31:52:00, 19.99s/it]

Error: error


  2%|▏         | 116/5855 [34:19<31:59:41, 20.07s/it]

Error: error


  2%|▏         | 117/5855 [34:41<32:53:52, 20.64s/it]

Error: error


  2%|▏         | 121/5855 [36:06<33:53:50, 21.28s/it]

Error: error


  2%|▏         | 122/5855 [36:29<34:17:15, 21.53s/it]

Error: error


  2%|▏         | 123/5855 [36:51<34:39:33, 21.77s/it]

Error: error


  2%|▏         | 124/5855 [37:12<34:16:51, 21.53s/it]

Error: error


  2%|▏         | 126/5855 [37:54<34:13:06, 21.50s/it]

Error: error


  2%|▏         | 135/5855 [41:09<34:27:45, 21.69s/it]

Error: error


  2%|▏         | 136/5855 [41:32<34:49:42, 21.92s/it]

Error: error


  2%|▏         | 137/5855 [41:53<34:35:28, 21.78s/it]

Error: error


  2%|▏         | 140/5855 [43:06<37:06:12, 23.37s/it]

Error: error


  2%|▏         | 142/5855 [43:52<36:58:42, 23.30s/it]

Error: error


  2%|▏         | 143/5855 [44:13<36:17:03, 22.87s/it]

Error: error


  2%|▏         | 144/5855 [44:35<35:39:16, 22.48s/it]

Error: error


  2%|▏         | 145/5855 [45:00<36:39:39, 23.11s/it]

Error: error


  2%|▏         | 146/5855 [45:22<36:25:54, 22.97s/it]

Error: error


  3%|▎         | 147/5855 [45:46<37:02:55, 23.37s/it]

Error: error


  3%|▎         | 149/5855 [46:37<38:25:55, 24.25s/it]

Error: error


  3%|▎         | 150/5855 [46:58<37:01:48, 23.37s/it]

Error: error


  3%|▎         | 151/5855 [47:22<37:04:53, 23.40s/it]

Error: error


  3%|▎         | 153/5855 [48:07<36:50:57, 23.27s/it]

Error: error


  3%|▎         | 155/5855 [48:50<35:02:48, 22.13s/it]

Error: error


  3%|▎         | 158/5855 [50:00<37:16:32, 23.55s/it]

Error: error


  3%|▎         | 160/5855 [50:45<36:17:03, 22.94s/it]

Error: error


  3%|▎         | 161/5855 [51:06<35:29:07, 22.44s/it]

Error: error


  3%|▎         | 163/5855 [51:49<34:16:05, 21.67s/it]

Error: error


  3%|▎         | 165/5855 [52:32<34:20:33, 21.73s/it]

Error: error


  3%|▎         | 172/5855 [55:01<33:59:38, 21.53s/it]

Error: error


  3%|▎         | 173/5855 [55:24<34:24:56, 21.81s/it]

Error: error


  3%|▎         | 174/5855 [55:45<34:13:07, 21.68s/it]

Error: error


  3%|▎         | 180/5855 [58:06<38:21:00, 24.33s/it]

Error: error


  3%|▎         | 181/5855 [58:30<38:10:36, 24.22s/it]

Error: error


  3%|▎         | 187/5855 [1:01:01<40:00:23, 25.41s/it]

Error: error


  3%|▎         | 187/5855 [1:01:23<31:01:01, 19.70s/it]


KeyboardInterrupt: 