# Crawl dataset with all submissions info
OpenReview Venue Crawling

In [39]:
%load_ext autoreload
%autoreload 2

import time
import pandas as pd
import multiprocessing as mp
from multiprocessing import Pool
from tqdm import tqdm
from tqdm.notebook import tqdm
import requests

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Crawl list of all submissions
Here we scrape the _notes_ , (list of all submissions) using OpenReview's API, way faster than Selenium-based scraping.


In [40]:
DATA_PATH = '../data/'
venue = 'ICLR.cc/2023/Conference'
venue_short = 'iclr2023'

In [41]:
def get_conference_notes(venue, blind_submission=False):
    """
    Get all notes of a conference (data) from OpenReview API.
    If results are not final, you should set blind_submission=True.
    """

    blind_param = '-/Blind_Submission' if blind_submission else ''
    offset = 0
    notes = []
    while True:
        print('Offset:', offset, 'Data:', len(notes))
        url = f'https://api.openreview.net/notes?invitation={venue}/{blind_param}&offset={offset}'
        response = requests.get(url)
        data = response.json()
        if len(data['notes']) == 0:
            break
        offset += 1000
        notes.extend(data['notes'])
    return notes

In [42]:
raw_notes = get_conference_notes(venue, blind_submission=True)
print("Number of submissions:", len(raw_notes))

Offset: 0 Data: 0
Offset: 1000 Data: 1000
Offset: 2000 Data: 2000
Offset: 3000 Data: 3000
Offset: 4000 Data: 3851
Number of submissions: 3851


In [43]:
df_raw = pd.json_normalize(raw_notes)
# set index as first column
# df_raw.set_index(df_raw.columns[0], inplace=True)
df_raw.head()

Unnamed: 0,id,original,number,cdate,mdate,ddate,tcdate,tmdate,tddate,forum,...,content.resubmission,content.student_author,content.Please_choose_the_closest_area_that_your_submission_falls_into,content.paperhash,content.pdf,content.supplementary_material,content._bibtex,content.venue,content.venueid,content.TL;DR
0,RUzSobdYy0V,pmo4AKuE4-p,6620,1663850590815,,,1663850590815,1675279442105,,RUzSobdYy0V,...,,,"Social Aspects of Machine Learning (eg, AI saf...",adebayo|quantifying_and_mitigating_the_impact_...,/pdf/c91f0bb05e5ac8a99867a69b17cb3772ccf04628.pdf,/attachment/151652f4d981a49f9dfa81be992839a243...,"@inproceedings{\nadebayo2023quantifying,\ntitl...",ICLR 2023 poster,ICLR.cc/2023/Conference,
1,N3kGYG3ZcTi,kVYulJycT2K,6611,1663850589829,,,1663850589829,1675279442101,,N3kGYG3ZcTi,...,,,Deep Learning and representational learning,zhuang|suppression_helps_lateral_inhibitionins...,/pdf/bc66a3bbb804a7158ba77a4de9f91a196e8eaf9a.pdf,,"@misc{\nzhuang2023suppression,\ntitle={Suppres...",Submitted to ICLR 2023,ICLR.cc/2023/Conference,Improving feature learning with lateral inhibi...
2,tmIiMPl4IPa,RAIF4RUF0T,6610,1663850589709,,,1663850589709,1675279442055,,tmIiMPl4IPa,...,,,"Machine Learning for Sciences (eg biology, phy...",tran|factorized_fourier_neural_operators,/pdf/d80c8a332eea03754e31236157306d3491a3b832.pdf,/attachment/dc36a32060af8da1f81e6570a28b4890b1...,"@inproceedings{\ntran2023factorized,\ntitle={F...",ICLR 2023 poster,ICLR.cc/2023/Conference,An efficient and scalable neural PDE solver us...
3,mhnHqRqcjYU,ix_LR-W0OM2,6603,1663850588877,,,1663850588877,1675279442026,,mhnHqRqcjYU,...,,,Deep Learning and representational learning,narshana|dfpc_data_flow_driven_pruning_of_coup...,/pdf/491bdb807d29b7113e662ff30f32501632e3a1d3.pdf,,"@inproceedings{\nnarshana2023dfpc,\ntitle={{DF...",ICLR 2023 poster,ICLR.cc/2023/Conference,We propose a novel data-free algorithm to acce...
4,sZI1Oj9KBKy,vRziu1jJDu,6601,1663850588630,,,1663850588630,1675279441672,,sZI1Oj9KBKy,...,,,Deep Learning and representational learning,murti|tvsprune_pruning_nondiscriminative_filte...,/pdf/41f9c167358fdf50559b697f2a7ee4b4d3b0f9a7.pdf,,"@inproceedings{\nmurti2023tvsprune,\ntitle={{T...",ICLR 2023 poster,ICLR.cc/2023/Conference,We use the total variation distance between th...


## (optional) older crawled data

In [44]:
# Read data from old version 
df_old = pd.read_csv(DATA_PATH + 'iclr2023_20221120.csv')
df_old.head()

Unnamed: 0,id,title,keywords,ratings,confidences,withdraw,review_lengths
0,kRvZ2PcsxjJj,Quantum reinforcement learning,"['quantum reinforcement learning', 'multi-agen...","[1, 1, 1, 1]","[5, 5, 5, 5]",1,"[45, 49, 25, 283]"
1,RUzSobdYy0V,Quantifying and Mitigating the Impact of Label...,[],"[5, 6, 8]","[4, 3, 3]",0,"[443, 274, 401]"
2,N3kGYG3ZcTi,Suppression helps: Lateral Inhibition-inspired...,"['Lateral Inhibition', 'Convolutional Neural N...","[3, 5, 3, 1]","[5, 5, 5, 5]",0,"[333, 360, 362, 304]"
3,tmIiMPl4IPa,Factorized Fourier Neural Operators,"['fourier transform', 'fourier operators', 'pd...","[8, 6, 3, 8, 3]","[5, 4, 4, 2, 2]",0,"[203, 142, 323, 520, 635]"
4,mhnHqRqcjYU,DFPC: Data flow driven pruning of coupled chan...,"['Pruning', 'Data Free', 'Model Compression']","[8, 6, 6]","[3, 2, 3]",0,"[302, 90, 257]"


In [45]:
papers_ids = df_old['id'].values
print("Number of papers (including old):", len(papers_ids))

Number of papers (including old): 4874


## Crawl forums of each submission
Here we scrape the forums of each submissions, it can be pretty fast thanks to:
- OpenReview's API (we use requests)
- Multiprocessing to parallelize the scraping of each paper

In [46]:
# Create multiprocessing pool of requests over index of dataframe

extra = "trash=true&details=replyCount%2Cwritable%2Crevisions%2Coriginal%2Coverwriting%2Cinvitation%2Ctags"

def get_paper_data(paper_id, extra='', timeout=5):
    try:
        url = f"https://api.openreview.net/notes?forum={paper_id}&{extra}"
        response = requests.get(url, timeout=timeout)
        data = response.json()
        return data
    except requests.exceptions.Timeout:
        print(f"Error for paper {paper_id}: Request timed out")
        return None
    except:
        print(f"Error for paper {paper_id}: General error")
        return None

def retry_get_paper_data(paper_id, extra='', timeout=5, retries=10):
    for i in range(retries):
        data = get_paper_data(paper_id, extra, timeout)
        if data is not None:
            return data
    print(f"Error for paper {paper_id}: All {retries} attempts failed")
    return None

def get_paper_data_multi(paper_ids, ratio=0.8):
    num_processes = int(ratio*mp.cpu_count())
    with Pool(num_processes) as p:
        data = list(tqdm(p.imap(retry_get_paper_data, paper_ids), total=len(paper_ids)))
    return data

In [47]:
# filter df with only id, title, url and keywords
df_raw_filtered = df_raw[['id', 'content.title', 'content.keywords']]
df_raw_filtered.head()

Unnamed: 0,id,content.title,content.keywords
0,RUzSobdYy0V,Quantifying and Mitigating the Impact of Label...,[]
1,N3kGYG3ZcTi,Suppression helps: Lateral Inhibition-inspired...,"[Lateral Inhibition, Convolutional Neural Netw..."
2,tmIiMPl4IPa,Factorized Fourier Neural Operators,"[fourier transform, fourier operators, pde, na..."
3,mhnHqRqcjYU,DFPC: Data flow driven pruning of coupled chan...,"[Pruning, Data Free, Model Compression]"
4,sZI1Oj9KBKy,TVSPrune - Pruning Non-discriminative filters ...,"[Structured pruning, model compression]"


In [48]:
# ids = list(df_raw_filtered['id'])
ids = df_old['id'].values # use old ids to get data from old papers
data = get_paper_data_multi(ids, ratio=1)

  0%|          | 0/4874 [00:00<?, ?it/s]

In [50]:
# get only notes
notes = [d['notes'] for d in data]

In [58]:
def filter_data(item, 
                review_keys=['summary_of_the_paper', 'strength_and_weaknesses', 'clarity,_quality,_novelty_and_reproducibility', 'summary_of_the_review'],
                decision=True):
    """Filter only ratings, confidence, withdraw status and decisions"""
    # parse each note
    withdraw = 0
    # filter meta note
    meta_note = [d for d in item if 'Paper' not in d['invitation']]
    # check withdrawn
    withdraw = 1 if 'Withdrawn_Submission' in meta_note[0]['invitation'] else 0
    # decision
    if decision:
        try:
            if withdraw == 0:
                decision_note = [d for d in item if 'Decision' in d['invitation']]
                decision = decision_note[0]['content']['decision']
            else:
                decision = ''
        except:
            decision = ''
    # filter reviewer comments
    comment_notes = [d for d in item \
                     if 'Official_Review' in d['invitation'] and 'recommendation' in d['content'].keys()]
    comment_notes = sorted(comment_notes, key=lambda d: d['number'])[::-1]
    ratings = [int(note['content']['recommendation'].split(':')[0]) for note in comment_notes]
    confidences = [int(note['content']['confidence'].split(':')[0]) for note in comment_notes]
    review_lengths = [sum(len(note['content'][key].split()) for key in review_keys) for note in comment_notes] # review lengths

    data = {'ratings': ratings, 'confidences': confidences, 'withdraw': withdraw, 'review_lengths': review_lengths}
    if decision: data['decision'] = decision
    return data

In [59]:
# filter data in a pool of processes
with Pool(8) as p:
    filtered_notes = list(tqdm(p.imap(filter_data, notes), total=len(notes)))

  0%|          | 0/4874 [00:00<?, ?it/s]

In [60]:
# create dataframe
ratings = pd.DataFrame(filtered_notes)
ratings.head()

Unnamed: 0,ratings,confidences,withdraw,review_lengths,decision
0,"[1, 1, 1, 1]","[5, 5, 5, 5]",1,"[45, 49, 25, 283]",
1,"[5, 6, 8]","[4, 3, 3]",0,"[443, 274, 401]",Accept: poster
2,"[3, 6, 3, 1]","[5, 5, 5, 5]",0,"[333, 360, 362, 304]",Reject
3,"[8, 6, 5, 8, 6]","[5, 4, 4, 2, 3]",0,"[203, 142, 323, 520, 752]",Accept: poster
4,"[8, 6, 6]","[3, 2, 3]",0,"[302, 90, 257]",Accept: poster


In [61]:
# Merge with df_raw_filtered
df_final = pd.concat([df_raw_filtered, ratings], axis=1)
df_final.head()

Unnamed: 0,id,content.title,content.keywords,ratings,confidences,withdraw,review_lengths,decision
0,RUzSobdYy0V,Quantifying and Mitigating the Impact of Label...,[],"[1, 1, 1, 1]","[5, 5, 5, 5]",1,"[45, 49, 25, 283]",
1,N3kGYG3ZcTi,Suppression helps: Lateral Inhibition-inspired...,"[Lateral Inhibition, Convolutional Neural Netw...","[5, 6, 8]","[4, 3, 3]",0,"[443, 274, 401]",Accept: poster
2,tmIiMPl4IPa,Factorized Fourier Neural Operators,"[fourier transform, fourier operators, pde, na...","[3, 6, 3, 1]","[5, 5, 5, 5]",0,"[333, 360, 362, 304]",Reject
3,mhnHqRqcjYU,DFPC: Data flow driven pruning of coupled chan...,"[Pruning, Data Free, Model Compression]","[8, 6, 5, 8, 6]","[5, 4, 4, 2, 3]",0,"[203, 142, 323, 520, 752]",Accept: poster
4,sZI1Oj9KBKy,TVSPrune - Pruning Non-discriminative filters ...,"[Structured pruning, model compression]","[8, 6, 6]","[3, 2, 3]",0,"[302, 90, 257]",Accept: poster


## Save filtered dataset 
We will be saving a smaller version of the dataset in csv format with the data we need for our analysis - this can also be saved directly in Github

In [62]:
# Save dataframe as csv
# rename title
df_final.rename(columns={'content.title': 'title'}, inplace=True)
#rename keywords
df_final.rename(columns={'content.keywords': 'keywords'}, inplace=True)
df_final.to_csv(f'{DATA_PATH}{venue_short}_{time.strftime("%Y%m%d")}.csv', index=False)

## Saving full crawled dataset

Note that this dataset is raw and contains everyting; so it will be pretty large (>100 MBs)!

In [63]:
# Save dataframe as hdf5
notes_df = pd.DataFrame([n['notes'] for n in data])
count_df = pd.DataFrame({'notes_count': [n['count'] for n in data]})
df = pd.concat([df_raw, notes_df, count_df], axis=1)
df.to_hdf(f'{DATA_PATH}{venue_short}_data_full_{time.strftime("%Y%m%d")}.h5', key='df', mode='w')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed-integer,key->axis0] [items->None]

  df.to_hdf(f'{DATA_PATH}{venue_short}_data_full_{time.strftime("%Y%m%d")}.h5', key='df', mode='w')
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block2_values] [items->Index([                                                                    'id',
                                                                     'original',
                                                                        'mdate',
                                                                        'ddate',
                                                                       'tddate',
                                                                        'forum',
                                                                      'replyto',
                  