# OpenReview Venue Scraping

In [1]:
%load_ext autoreload
%autoreload 2

import time
import pandas as pd
import multiprocessing as mp
from multiprocessing import Pool
from tqdm import tqdm
from tqdm.notebook import tqdm, trange
import requests

## Scrape list of all submissions
Here we scrape the _notes_ , (list of all submissions) using OpenReview's API, way faster than Selenium-based scraping.


In [2]:
DATA_PATH = '../data/'
venue = 'ICLR.cc/2023/Conference'
venue_short = 'iclr2023'

In [3]:
def get_conference_notes(venue, blind_submission=False):
    """
    Get all notes of a conference (data) from OpenReview API.
    If results are not final, you should set blind_submission=True.
    """

    blind_param = '-/Blind_Submission' if blind_submission else ''
    offset = 0
    notes = []
    while True:
        print('Offset:', offset, 'Data:', len(notes))
        url = f'https://api.openreview.net/notes?invitation={venue}/{blind_param}&offset={offset}'
        response = requests.get(url)
        data = response.json()
        if len(data['notes']) == 0:
            break
        offset += 1000
        notes.extend(data['notes'])
    return notes

In [4]:
raw_notes = get_conference_notes(venue, blind_submission=True)
print("Number of submissions:", len(raw_notes))

Offset: 0 Data: 0
Offset: 1000 Data: 1000
Offset: 2000 Data: 2000
Offset: 3000 Data: 3000
Offset: 4000 Data: 4000
Offset: 5000 Data: 4875
Number of submissions: 4875


In [5]:
df_raw = pd.json_normalize(raw_notes)
# set index as first column
# df_raw.set_index(df_raw.columns[0], inplace=True)
df_raw.head()

Unnamed: 0,id,original,number,cdate,mdate,ddate,tcdate,tmdate,tddate,forum,...,content.no_acknowledgement_section,content.code_of_ethics,content.submission_guidelines,content.resubmission,content.student_author,content.Please_choose_the_closest_area_that_your_submission_falls_into,content.paperhash,content.pdf,content._bibtex,content.supplementary_material
0,kRvZ2PcsxjJj,cnlsip-X_k,6623,1663850591061,,,1663850591061,1666794126677,,kRvZ2PcsxjJj,...,I certify that there is no acknowledgement sec...,I acknowledge that I and all co-authors of thi...,Yes,,,"Reinforcement Learning (eg, decision and contr...",anonymous|quantum_reinforcement_learning,/pdf/bd1412beeb070314478ba69a52979cd9d7057106.pdf,"@inproceedings{\nanonymous2023quantum,\ntitle=...",
1,RUzSobdYy0V,pmo4AKuE4-p,6620,1663850590815,,,1663850590815,1666794126368,,RUzSobdYy0V,...,I certify that there is no acknowledgement sec...,I acknowledge that I and all co-authors of thi...,Yes,,,"Social Aspects of Machine Learning (eg, AI saf...",anonymous|quantifying_and_mitigating_the_impac...,/pdf/fa20300b4f58971f6a0663a5cb2c8efd17fe6240.pdf,"@inproceedings{\nanonymous2023quantifying,\nti...",/attachment/151652f4d981a49f9dfa81be992839a243...
2,N3kGYG3ZcTi,kVYulJycT2K,6611,1663850589829,,,1663850589829,1666794126045,,N3kGYG3ZcTi,...,I certify that there is no acknowledgement sec...,I acknowledge that I and all co-authors of thi...,Yes,,,Deep Learning and representational learning,anonymous|suppression_helps_lateral_inhibition...,/pdf/fe61792a0bdac18c97e72754f6fd250b79e65ffc.pdf,"@inproceedings{\nanonymous2023suppression,\nti...",
3,tmIiMPl4IPa,RAIF4RUF0T,6610,1663850589709,,,1663850589709,1666794125709,,tmIiMPl4IPa,...,I certify that there is no acknowledgement sec...,I acknowledge that I and all co-authors of thi...,Yes,,,"Machine Learning for Sciences (eg biology, phy...",anonymous|factorized_fourier_neural_operators,/pdf/f165fba1a61fac089a88a6f600dafa6100768f5c.pdf,"@inproceedings{\nanonymous2023factorized,\ntit...",/attachment/528ca783f12ed545d4727d9b5edcb4e4d3...
4,mhnHqRqcjYU,ix_LR-W0OM2,6603,1663850588877,,,1663850588877,1666794125400,,mhnHqRqcjYU,...,I certify that there is no acknowledgement sec...,I acknowledge that I and all co-authors of thi...,Yes,,,Deep Learning and representational learning,anonymous|dfpc_data_flow_driven_pruning_of_cou...,/pdf/55db83e926f940361b1f96359cd90eb9e5461681.pdf,"@inproceedings{\nanonymous2023dfpc,\ntitle={{D...",


## Scrape forums of each submission
Here we scrape the forums of each submissions, it can be pretty fast thanks to:
- OpenReview's API (we use requests)
- Multiprocessing to parallelize the scraping of each paper

In [7]:
# Create multiprocessing pool of requests over index of dataframe

extra = "trash=true&details=replyCount%2Cwritable%2Crevisions%2Coriginal%2Coverwriting%2Cinvitation%2Ctags"

def get_paper_data(paper_id):
    try:
        url = f"https://api.openreview.net/notes?forum={paper_id}&{extra}"
        response = requests.get(url)
        data = response.json()
        return data
    except:
        print(f"Error for paper {paper_id}")
        return None

def get_paper_data_multi(paper_ids, ratio=0.8):
    num_processes = int(ratio*mp.cpu_count())
    with Pool(num_processes) as p:
        data = list(tqdm(p.imap(get_paper_data, paper_ids), total=len(paper_ids)))
    return data

In [8]:
# filter df with only id, title, url and keywords
df_raw_filtered = df_raw[['id', 'content.title', 'content.keywords']]
df_raw_filtered.head()

Unnamed: 0,id,content.title,content.keywords
0,kRvZ2PcsxjJj,Quantum reinforcement learning,"[quantum reinforcement learning, multi-agent, ..."
1,RUzSobdYy0V,Quantifying and Mitigating the Impact of Label...,[]
2,N3kGYG3ZcTi,Suppression helps: Lateral Inhibition-inspired...,"[Lateral Inhibition, Convolutional Neural Netw..."
3,tmIiMPl4IPa,Factorized Fourier Neural Operators,"[fourier transform, fourier operators, pde, na..."
4,mhnHqRqcjYU,DFPC: Data flow driven pruning of coupled chan...,"[Pruning, Data Free, Model Compression]"


In [9]:
ids = list(df_raw_filtered['id'])
data = get_paper_data_multi(ids, ratio=1)

  0%|          | 0/4875 [00:00<?, ?it/s]

Error for paper 36g8Ept_CCjError for paper CQsmMYmlP5TError for paper LQIjzPdDt3qError for paper JmkjrlVE-DGError for paper sqPEs1wEizUError for paper 3yEIFSMwKBCError for paper YlGsTZODyjzError for paper MHgYMtHpKsCError for paper XYUaprBSDjpError for paper GVWySHBD3ClError for paper qYO0f9WnUupError for paper B4maZQLLW0_Error for paper SZYXyhE2c6f


Error for paper 0eTTKOOOQkV
Error for paper GVMwL15UrZO







Error for paper Hcq7zGgcsOg





KeyboardInterrupt: 

In [None]:
# get only notes
notes = [d['notes'] for d in data]

In [10]:
def filter_data(item, 
                review_keys=['summary_of_the_paper', 'main_review', 'summary_of_the_review']
                ):
    """Filter only ratings, confidence, withdraw status and decisions"""
    # parse each note
    withdraw = 0
    # filter meta note
    meta_note = [d for d in item if 'Paper' not in d['invitation']]
    # check withdrawn
    withdraw = 1 if 'Withdrawn_Submission' in meta_note[0]['invitation'] else 0
    # decision
    if withdraw == 0:
        decision_note = [d for d in item if 'Decision' in d['invitation']]
        decision = decision_note[0]['content']['decision']
    else:
        decision = ''
    # filter reviewer comments
    comment_notes = [d for d in item \
                     if 'Official_Review' in d['invitation'] and 'recommendation' in d['content'].keys()]
    comment_notes = sorted(comment_notes, key=lambda d: d['number'])[::-1]
    ratings = [int(note['content']['recommendation'].split(':')[0]) for note in comment_notes]
    confidences = [int(note['content']['confidence'].split(':')[0]) for note in comment_notes]
    review_lengths = [sum(len(note['content'][key].split()) for key in review_keys) for note in comment_notes] # review lengths

    return {'ratings': ratings, 'confidences': confidences, 'withdraw': withdraw, 'decision': decision, 'review_lengths': review_lengths}

In [11]:
# filter data in a pool of processes
with Pool(8) as p:
    filtered_notes = list(tqdm(p.imap(filter_data, notes), total=len(notes)))

  0%|          | 0/2628 [00:00<?, ?it/s]

In [12]:
# create dataframe
ratings = pd.DataFrame(filtered_notes)
ratings.head()

Unnamed: 0,ratings,confidences,withdraw,decision,review_lengths
0,"[8, 8, 8, 8]","[4, 4, 4, 3]",0,Accept (Spotlight),"[493, 788, 460, 460]"
1,"[8, 6, 6, 6]","[3, 3, 2, 3]",0,Accept (Poster),"[318, 551, 275, 210]"
2,"[5, 8, 5, 5]","[4, 5, 4, 4]",0,Accept (Poster),"[434, 1113, 463, 338]"
3,"[6, 6, 6, 6]","[4, 4, 3, 3]",0,Accept (Poster),"[294, 677, 604, 316]"
4,"[6, 6, 6, 6]","[3, 4, 5, 4]",0,Accept (Poster),"[262, 284, 265, 425]"


In [13]:
# Merge with df_raw_filtered
df_final = pd.concat([df_raw_filtered, ratings], axis=1)
df_final.head()

Unnamed: 0,id,content.title,content.keywords,ratings,confidences,withdraw,decision,review_lengths
0,g1SzIRLQXMM,Wiring Up Vision: Minimizing Supervised Synapt...,"[computational neuroscience, primate visual ve...","[8, 8, 8, 8]","[4, 4, 4, 3]",0,Accept (Spotlight),"[493, 788, 460, 460]"
1,HndgQudNb91,Learning to Downsample for Segmentation of Ult...,"[ultra-high resolution image segmentation, non...","[8, 6, 6, 6]","[3, 3, 2, 3]",0,Accept (Poster),"[318, 551, 275, 210]"
2,7fFO4cMBx_9,Variational Neural Cellular Automata,"[Neural Cellular Automata, Cellular Automata, ...","[5, 8, 5, 5]","[4, 5, 4, 4]",0,Accept (Poster),"[434, 1113, 463, 338]"
3,FKp8-pIRo3y,Wish you were here: Hindsight Goal Selection f...,"[goal-conditioned reinforcement learning, lear...","[6, 6, 6, 6]","[4, 4, 3, 3]",0,Accept (Poster),"[294, 677, 604, 316]"
4,KntaNRo6R48,L0-Sparse Canonical Correlation Analysis,[],"[6, 6, 6, 6]","[3, 4, 5, 4]",0,Accept (Poster),"[262, 284, 265, 425]"


## Save filtered dataset 

In [14]:
# Save dataframe as csv
# rename title
df_final.rename(columns={'content.title': 'title'}, inplace=True)
#rename keywords
df_final.rename(columns={'content.keywords': 'keywords'}, inplace=True)
df_final.to_csv(f'{DATA_PATH}{venue_short}_{time.strftime("%Y%m%d")}.csv', index=False)

## Saving full crawled dataset

Note that this dataset is raw and contains everyting; so it will be pretty large (>100 MBs)!

In [19]:
# Save dataframe as hdf5
notes_df = pd.DataFrame([n['notes'] for n in data])
count_df = pd.DataFrame({'notes_count': [n['count'] for n in data]})
df = pd.concat([df_raw, notes_df, count_df], axis=1)
df.to_hdf(f'{DATA_PATH}{venue_short}_data_full_{time.strftime("%Y%m%d")}.h5', key='df', mode='w')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed-integer,key->axis0] [items->None]

  df.to_hdf(f'{DATA_PATH}{venue_short}_data_full_{time.strftime("%Y%m%d")}.h5', key='df', mode='w')
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->Index([                            'id',                       'original',
                                'mdate',                          'ddate',
                               'tddate',                          'forum',
                              'replyto',                     'invitation',
                           'signatures',                        'readers',
                           'nonreaders',                        'writers',
                        'content.title',              'content.authorids',
                      'content.authors',               'cont