# Crawl a dataset with all submission info
OpenReview Venue Crawling

In [2]:
%load_ext autoreload
%autoreload 2

import time
import pandas as pd
from multiprocessing import Pool
from tqdm import tqdm
from tqdm.notebook import tqdm
import requests
import openreview
import json
import numpy as np
import os
# from get_paper_data import get_paper_data_multi
import json

## Crawl list of all submissions
Here we fetch the _notes_ (the list of all submissions) through OpenReview's API, which is much faster than Selenium-based scraping.


In [3]:
# Build the OpenReview client against the `api2.openreview.net` endpoint; the
# following cells issue their requests through `client`.
# NOTE(review): username and password are empty strings here — presumably
# blanked before sharing or intended as anonymous access; confirm, and keep
# real credentials out of the notebook (e.g. read them from the environment).
client = openreview.api.OpenReviewClient(
    baseurl='https://api2.openreview.net',
    username="",
    password=""
)

In [4]:
# Venue whose submissions are crawled.
venue_id = 'ICLR.cc/2025/Conference'

# The venue group's settings carry the name of its submission invitation.
venue_group = client.get_group(venue_id)
submission_name = venue_group.content['submission_name']['value']

# Download every note posted under that invitation. `details='directReplies'`
# is requested because later cells read `submission.details['directReplies']`.
submissions = client.get_all_notes(
    invitation=f'{venue_id}/-/{submission_name}',
    details='directReplies',
)

Retrying request: GET /groups?id=ICLR.cc%2F2025%2FConference, response: no response, error: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


Getting V2 Notes: 100%|█████████▉| 11660/11672 [03:52<00:00, 50.05it/s]


In [5]:
# Collect the decision reply (if any) attached to each submission.
#
# The venue settings are read from `venue_group`, which was fetched when the
# submissions were downloaded, instead of issuing a second identical
# `client.get_group(venue_id)` request.
venue_group_settings = venue_group.content
decision_invitation_name = venue_group_settings['decision_name']['value']

# A reply counts as a decision when one of its invitation ids ends with this
# suffix; it is built once rather than on every inner iteration.
decision_suffix = f'/-/{decision_invitation_name}'
decisions = [
    reply
    for submission in submissions
    for reply in submission.details['directReplies']
    if any(invitation.endswith(decision_suffix) for invitation in reply['invitations'])
]

In [6]:
# Take one submission as a sample and list every attribute name it exposes,
# to see which fields a note object carries.
sub = submissions[1]
print(dir(sub))

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', 'cdate', 'content', 'ddate', 'details', 'domain', 'forum', 'from_json', 'id', 'invitations', 'license', 'mdate', 'nonreaders', 'number', 'odate', 'pdate', 'readers', 'replyto', 'signatures', 'tcdate', 'tmdate', 'to_json', 'writers']


In [7]:
# Dump the sample note's plain attributes: every name containing an underscore
# is skipped, dict values are summarised by their keys, and anything else is
# printed as is.
for attr_name in dir(sub):
    if '_' in attr_name:
        continue
    attr_value = getattr(sub, attr_name)
    shown = attr_value.keys() if isinstance(attr_value, dict) else attr_value
    print(attr_name, shown)

cdate 1726738855000
content dict_keys(['title', 'authors', 'authorids', 'keywords', 'abstract', 'pdf', 'primary_area', 'code_of_ethics', 'submission_guidelines', 'reciprocal_reviewing', 'anonymous_url', 'no_acknowledgement_section', 'venue', 'venueid', 'supplementary_material', '_bibtex', 'paperhash'])
ddate None
details dict_keys(['directReplies'])
domain ICLR.cc/2025/Conference
forum zz9jAssrwL
id zz9jAssrwL
invitations ['ICLR.cc/2025/Conference/-/Submission', 'ICLR.cc/2025/Conference/-/Post_Submission', 'ICLR.cc/2025/Conference/-/Withdrawn_Submission']
license CC BY 4.0
mdate 1731477746113
nonreaders None
number 1812
odate 1728008565725
pdate None
readers ['everyone']
replyto None
signatures ['ICLR.cc/2025/Conference/Submission1812/Authors']
tcdate 1726738855000
tmdate 1731477746113
writers ['ICLR.cc/2025/Conference', 'ICLR.cc/2025/Conference/Submission1812/Authors']


In [17]:
def _as_int_score(value):
    """Coerce a review score to ``int``.

    Numbers go straight through ``int()``. Strings are cut at the first ``':'``
    so a labelled score such as ``"6: weak accept"`` yields ``6``.
    """
    if isinstance(value, str):
        value = value.split(":", 1)[0].strip()
    return int(value)


def _content_value(content, key, default):
    """Return ``content[key]["value"]``, or ``default`` when the field is absent."""
    field = content.get(key)
    return field["value"] if field is not None else default


def submission2note(submission, idx, decision_name=None):
    """Flatten one submission and its direct replies into a JSON-friendly dict.

    Args:
        submission: Note object exposing ``id``, ``content`` (a dict of
            ``{"value": ...}`` fields) and ``details["directReplies"]`` (a list
            of reply dicts with ``"invitations"`` and ``"content"``).
        idx: Stored unchanged under ``"rank"``.
        decision_name: Name of the decision invitation; a reply is treated as
            the decision when one of its invitation ids ends with
            ``/-/<decision_name>``. When omitted, the notebook-level
            ``decision_invitation_name`` is used.

    Returns:
        dict with keys ``id``, ``decision``, ``authors``, ``emails``, ``rank``,
        ``title``, ``keywords``, ``ratings``, ``rating``, ``confidences``,
        ``variance``, ``withdraw``, ``abstract`` and ``url``. ``rating`` (mean)
        and ``variance`` (population variance, ``0`` for fewer than two
        ratings) are strings formatted with two decimals.

    Raises:
        KeyError: if ``title`` is missing, or a reply that has a rating lacks
            a ``confidence`` field.
    """
    if decision_name is None:
        # Fall back to the value defined by the decisions cell of the notebook.
        decision_name = decision_invitation_name
    decision_suffix = f'/-/{decision_name}'

    direct_replies = submission.details["directReplies"]

    # If several replies match, the last one wins.
    decision = 'no decision'
    for rep in direct_replies:
        if any(invitation.endswith(decision_suffix) for invitation in rep['invitations']):
            decision = rep['content']['decision']['value']

    # Only replies that carry a rating are treated as reviews.
    reviews = [rep["content"] for rep in direct_replies if "rating" in rep["content"]]
    ratings = [_as_int_score(review["rating"]["value"]) for review in reviews]
    confidences = [_as_int_score(review["confidence"]["value"]) for review in reviews]

    # Use the exact mean for the variance; rounding happens only when formatting.
    mean = sum(ratings) / len(ratings) if ratings else 0.
    if len(ratings) > 1:
        spread = sum((mean - r) ** 2 for r in ratings) / len(ratings)
    else:
        spread = 0.

    # Optional fields get a neutral default instead of raising KeyError,
    # matching how a missing "authors" field is handled.
    content = submission.content
    venue = _content_value(content, "venue", "")

    note = {
        "id": submission.id,
        "decision": decision,
        "authors": _content_value(content, "authors", []),
        "emails": [],
        "rank": idx,
        "title": content["title"]["value"],
        "keywords": _content_value(content, "keywords", []),
        "ratings": ratings,
        "rating": "{:.2f}".format(mean),
        "confidences": confidences,
        "variance": "{:.2f}".format(spread),
        "withdraw": 1 if "Withdrawn" in venue else 0,
        "abstract": _content_value(content, "abstract", ""),
        "url": f"https://openreview.net/forum?id={submission.id}",
    }
    return note

In [10]:
# Print the first submission in full to inspect its raw structure.
print(submissions[0])

{'cdate': 1727453482945,
 'content': {'TLDR': {'value': 'We give a nearly optimal high probability '
                               'bound for the cross-learning contextual '
                               'bandits with unknown context distributions.'},
             '_bibtex': {'value': '@misc{\n'
                                  'anonymous2025high,\n'
                                  'title={High Probability Bounds for '
                                  'Cross-Learning Contextual Bandits with '
                                  'Unknown Context Distributions},\n'
                                  'author={Anonymous},\n'
                                  'year={2025},\n'
                                  'url={https://openreview.net/forum?id=zzR1Uskhj0}\n'
                                  '}'},
             'abstract': {'value': 'Motivated by applications in online '
                                   'bidding and sleeping bandits, we examine '
                                   't

In [19]:
# Flatten every submission into a plain dict (its list position becomes the
# "rank" field) and save the result as a single JSON array.
notes = [submission2note(submission, idx) for idx, submission in enumerate(submissions)]

# Create the output directory first so a fresh checkout without `data/` does
# not fail with FileNotFoundError when the file is opened.
output_path = 'data/iclr2025.json'
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Explicit encoding so the write does not depend on the platform default.
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(notes, f)