# Crawl dataset with all submissions info
OpenReview Venue Crawling

In [3]:
%load_ext autoreload
%autoreload 2

import time
import pandas as pd
import multiprocessing as mp
from multiprocessing import Pool
from tqdm import tqdm
from tqdm.notebook import tqdm
import requests

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Crawl list of all submissions
Here we scrape the _notes_ (the list of all submissions) using OpenReview's API, which is much faster than Selenium-based scraping.


In [4]:
# Output folder; it is prepended directly to the file names below, so keep the trailing slash.
DATA_PATH = '../data/'
# Venue id used to build the `invitation` query sent to the OpenReview API.
venue = 'NeurIPS.cc/2022/Conference'
# Short tag used as the prefix of the saved file names.
venue_short = 'neurips2022'

In [5]:
def get_conference_notes(venue, blind_submission=False, timeout=60):
    """
    Get all notes of a conference (data) from OpenReview API.
    If results are not final, you should set blind_submission=True.

    The endpoint is queried page by page until it returns an empty page.

    Args:
        venue: Venue id, e.g. 'NeurIPS.cc/2022/Conference'.
        blind_submission: If True, query the '-/Blind_Submission' invitation
            of the venue.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        List with the note dicts of every page, in the order returned.

    Raises:
        requests.HTTPError: If the API answers with an error status code.
    """

    blind_param = '-/Blind_Submission' if blind_submission else ''
    offset = 0
    notes = []
    while True:
        print('Offset:', offset, 'Data:', len(notes))
        url = f'https://api.openreview.net/notes?invitation={venue}/{blind_param}&offset={offset}'
        response = requests.get(url, timeout=timeout)
        # Fail loudly on HTTP errors instead of trying to parse an error body.
        response.raise_for_status()
        batch = response.json()['notes']
        if len(batch) == 0:
            break
        notes.extend(batch)
        # Advance by what was actually received, so no record is skipped if
        # the server's page size differs from the previously assumed 1000.
        offset += len(batch)
    return notes

In [6]:
# Fetch the notes of the venue's Blind_Submission invitation (progress is printed per page).
raw_notes = get_conference_notes(venue, blind_submission=True)
print("Number of submissions:", len(raw_notes))

Offset: 0 Data: 0
Offset: 1000 Data: 1000
Offset: 2000 Data: 2000
Offset: 3000 Data: 2824
Number of submissions: 2824


In [7]:
# Flatten the nested note dicts into a table; nested keys become dotted
# column names such as 'content.title'.
df_raw = pd.json_normalize(raw_notes)
# (disabled) set index as first column
# df_raw.set_index(df_raw.columns[0], inplace=True)
df_raw.head()

Unnamed: 0,id,original,number,cdate,mdate,ddate,tcdate,tmdate,tddate,forum,...,content.authors,content.keywords,content.TL;DR,content.abstract,content.paperhash,content.pdf,content.supplementary_material,content.venue,content.venueid,content._bibtex
0,09QFnDWPF8,aOfJ7v_eSQ,13051,1652737878213,,,1652737878213,1665609116870,,09QFnDWPF8,...,[Yuri Fonseca],"[Statistical Learning, Inverse Problems, Stoch...",An algorithm based on stochastic gradient desc...,Inverse problems are paramount in Science and ...,fonseca|statistical_learning_and_inverse_probl...,/pdf/85e17283506c1b0eb36e6605d20d0a63c353dee0.pdf,/attachment/f62b0d177da19741960fb6a0c517e24f11...,NeurIPS 2022 Accept,NeurIPS.cc/2022/Conference,"@inproceedings{\nfonseca2022statistical,\ntitl..."
1,pnSyqRXx73,A8t9ZkxKCGk,13028,1652737876923,,,1652737876923,1664811975926,,pnSyqRXx73,...,"[Jie Hu, Vishwaraj Doshi, Do Young Eun]","[Stochastic Gradient Descent, Asymptotic Analy...",We introduce the notion of efficiency ordering...,We consider the stochastic gradient descent (S...,hu|efficiency_ordering_of_stochastic_gradient_...,/pdf/46707b3f4c2c8b67443d46355b6425cdaca5433b.pdf,/attachment/9d8cb6af1a11b99f4861aa5e2810d7e434...,NeurIPS 2022 Accept,NeurIPS.cc/2022/Conference,"@inproceedings{\nhu2022efficiency,\ntitle={Eff..."
2,EqJ5_hZSqgy,lry9OSDqcjX,13014,1652737876203,,,1652737876203,1665338594988,,EqJ5_hZSqgy,...,"[Huili Chen, Jie Ding, Eric William Tramel, Sh...","[Federared Learning, Personalization]",We propose a new adaptive federated learning a...,In the context of personalized federated learn...,chen|selfaware_personalized_federated_learning,/pdf/787bf2f7c996e36550d7711ee5ac443972a1639e.pdf,,NeurIPS 2022 Accept,NeurIPS.cc/2022/Conference,"@inproceedings{\nchen2022selfaware,\ntitle={Se..."
3,xnI37HyfoP,2KaAzuA3w9,13003,1652737875652,,,1652737875652,1664985890083,,xnI37HyfoP,...,"[Caleb Xavier Bugg, Chen Chen, Anil Aswani]","[tensor completion, machine learning]",We present a new norm for nonnegative tensor c...,"Unlike matrix completion, tensor completion do...",bugg|nonnegative_tensor_completion_via_integer...,/pdf/5a742411d1edb029cca015406caf0625a44dc8a9.pdf,/attachment/d31a28f5ef619eaa012d8915f434e784f8...,NeurIPS 2022 Accept,NeurIPS.cc/2022/Conference,"@inproceedings{\nbugg2022nonnegative,\ntitle={..."
4,OoNmOfYVhEU,VSnMoS2U6n7,12999,1652737875452,,,1652737875452,1664433835404,,OoNmOfYVhEU,...,"[Felix Chern, Blake Hechtman, Andy Davis, Ruiq...","[TPU, K-nearest neighbor search, Approximate n...",Novel nearest neighbor search algorithm achiev...,This paper presents a novel nearest neighbor s...,chern|tpuknn_k_nearest_neighbor_search_at_peak...,/pdf/52539903633df821868e6fa5130d6c7d494b23e7.pdf,/attachment/423c1e265019ac611ce8399bae116029b2...,NeurIPS 2022 Accept,NeurIPS.cc/2022/Conference,"@inproceedings{\nchern2022tpuknn,\ntitle={{TPU..."


## Crawl forums of each submission
Here we scrape the forum of each submission; this can be pretty fast thanks to:
- OpenReview's API (we use requests)
- Multiprocessing to parallelize the scraping of each paper

In [8]:
# Create multiprocessing pool of requests over index of dataframe

# Query-string suffix appended to every forum request made by get_paper_data.
# '%2C' is a URL-encoded comma, so `details` is a comma-separated list of fields.
extra = "trash=true&details=replyCount%2Cwritable%2Crevisions%2Coriginal%2Coverwriting%2Cinvitation%2Ctags"

def get_paper_data(paper_id, timeout=60):
    """
    Download the notes of one submission forum from the OpenReview API.

    Args:
        paper_id: Forum id of the submission.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The decoded JSON response, or None if the request failed, returned an
        HTTP error status, or could not be decoded as JSON.
    """
    url = f"https://api.openreview.net/notes?forum={paper_id}&{extra}"
    try:
        response = requests.get(url, timeout=timeout)
        # Treat HTTP error statuses as failures rather than returning the error body.
        response.raise_for_status()
        return response.json()
    except Exception as exc:
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate, so a running crawl can still be interrupted.
        print(f"Error for paper {paper_id}: {exc}")
        return None

def get_paper_data_multi(paper_ids, ratio=0.8):
    """
    Download the forum data of many submissions using a pool of processes.

    Args:
        paper_ids: Sequence of forum ids (its length is used for the progress bar).
        ratio: Fraction of the CPU count reported by `mp.cpu_count()` to use
            as the number of worker processes.

    Returns:
        List with one `get_paper_data` result per id, in the same order as
        `paper_ids`.
    """
    # Always keep at least one worker: a small ratio or a single-core machine
    # would otherwise round down to 0, which Pool rejects.
    num_processes = max(1, int(ratio * mp.cpu_count()))
    with Pool(num_processes) as p:
        data = list(tqdm(p.imap(get_paper_data, paper_ids), total=len(paper_ids)))
    return data

In [9]:
# keep only the id, title and keywords columns
df_raw_filtered = df_raw[['id', 'content.title', 'content.keywords']]
df_raw_filtered.head()

Unnamed: 0,id,content.title,content.keywords
0,09QFnDWPF8,Statistical Learning and Inverse Problems: A S...,"[Statistical Learning, Inverse Problems, Stoch..."
1,pnSyqRXx73,Efficiency Ordering of Stochastic Gradient Des...,"[Stochastic Gradient Descent, Asymptotic Analy..."
2,EqJ5_hZSqgy,Self-Aware Personalized Federated Learning,"[Federared Learning, Personalization]"
3,xnI37HyfoP,Nonnegative Tensor Completion via Integer Opti...,"[tensor completion, machine learning]"
4,OoNmOfYVhEU,TPU-KNN: K Nearest Neighbor Search at Peak FLOP/s,"[TPU, K-nearest neighbor search, Approximate n..."


In [10]:
# Download the forum of every submission; ratio=1 asks for as many worker
# processes as mp.cpu_count() reports.
ids = list(df_raw_filtered['id'])
data = get_paper_data_multi(ids, ratio=1)

  0%|          | 0/2824 [00:00<?, ?it/s]

In [11]:
# get only notes
# NOTE(review): get_paper_data returns None for a failed request, which would
# raise a TypeError here — confirm all downloads succeeded before this cell.
notes = [d['notes'] for d in data]

In [26]:
def filter_data(item,
                review_keys=('summary', 'strengths_and_weaknesses', 'questions', 'limitations'),
                decision=True):
    """Filter only ratings, confidence, withdraw status and decisions.

    Args:
        item: List of note dicts belonging to one submission forum. Each note
            needs an 'invitation' string; review notes also need 'number' and
            a 'content' dict with 'rating' and 'confidence' strings of the
            form '<int>: <text>'.
        review_keys: Names of the review text fields whose word counts are
            summed into the review length. A field missing from a review
            counts as zero words.
        decision: If True, add a 'decision' entry to the result.

    Returns:
        Dict with 'ratings', 'confidences' and 'review_lengths' (parallel
        lists, ordered by descending review number), 'withdraw' (0 or 1) and,
        when `decision` is True, 'decision' (empty string for withdrawn papers
        or when no decision note is present).
    """
    # The submission-level note is the one whose invitation has no 'Paper' part.
    meta_notes = [d for d in item if 'Paper' not in d['invitation']]
    meta_invitation = meta_notes[0]['invitation'] if meta_notes else ''
    withdraw = 1 if 'Withdrawn_Submission' in meta_invitation else 0

    # Keep the decision text in its own variable: reusing the `decision` flag
    # for it made an empty decision look like "flag off" and dropped the key.
    decision_text = ''
    if decision and withdraw == 0:
        decision_notes = [d for d in item if 'Decision' in d['invitation']]
        if decision_notes:
            decision_text = decision_notes[0]['content']['decision']

    # Official reviews only; requiring a 'questions' field skips notes that
    # match the invitation but do not carry that review field.
    review_notes = [d for d in item
                    if 'Official_Review' in d['invitation'] and 'questions' in d['content']]
    review_notes = sorted(review_notes, key=lambda d: d['number'])[::-1]

    # Scores are stored as '<int>: <description>'; keep the integer part.
    ratings = [int(note['content']['rating'].split(':')[0]) for note in review_notes]
    confidences = [int(note['content']['confidence'].split(':')[0]) for note in review_notes]
    # Review length = total number of whitespace-separated words over review_keys.
    review_lengths = [sum(len(note['content'].get(key, '').split()) for key in review_keys)
                      for note in review_notes]

    data = {'ratings': ratings, 'confidences': confidences, 'withdraw': withdraw, 'review_lengths': review_lengths}
    if decision:
        data['decision'] = decision_text
    return data

In [27]:
# filter data in a pool of processes
# (8 workers; imap yields results in input order, so filtered_notes[i] belongs to notes[i])
with Pool(8) as p:
    filtered_notes = list(tqdm(p.imap(filter_data, notes), total=len(notes)))

  0%|          | 0/2824 [00:00<?, ?it/s]

In [28]:
# create dataframe: one row per submission, one column per key of the dicts returned by filter_data
ratings = pd.DataFrame(filtered_notes)
ratings.head()

Unnamed: 0,ratings,confidences,withdraw,review_lengths,decision
0,"[8, 5, 7]","[3, 3, 3]",0,"[875, 649, 214]",Accept
1,"[5, 6, 7, 7]","[3, 3, 4, 4]",0,"[1280, 118, 287, 777]",Accept
2,"[7, 7, 3]","[3, 3, 3]",0,"[209, 303, 221]",Accept
3,"[7, 6, 6]","[4, 3, 4]",0,"[445, 269, 1083]",Accept
4,"[6, 5, 6, 6]","[4, 5, 4, 4]",0,"[162, 204, 446, 233]",Accept


In [29]:
# Merge with df_raw_filtered
# Column-wise concat, aligned on the index; this assumes both frames share the
# same default row index and order — TODO confirm.
df_final = pd.concat([df_raw_filtered, ratings], axis=1)
df_final.head()

Unnamed: 0,id,content.title,content.keywords,ratings,confidences,withdraw,review_lengths,decision
0,09QFnDWPF8,Statistical Learning and Inverse Problems: A S...,"[Statistical Learning, Inverse Problems, Stoch...","[8, 5, 7]","[3, 3, 3]",0,"[875, 649, 214]",Accept
1,pnSyqRXx73,Efficiency Ordering of Stochastic Gradient Des...,"[Stochastic Gradient Descent, Asymptotic Analy...","[5, 6, 7, 7]","[3, 3, 4, 4]",0,"[1280, 118, 287, 777]",Accept
2,EqJ5_hZSqgy,Self-Aware Personalized Federated Learning,"[Federared Learning, Personalization]","[7, 7, 3]","[3, 3, 3]",0,"[209, 303, 221]",Accept
3,xnI37HyfoP,Nonnegative Tensor Completion via Integer Opti...,"[tensor completion, machine learning]","[7, 6, 6]","[4, 3, 4]",0,"[445, 269, 1083]",Accept
4,OoNmOfYVhEU,TPU-KNN: K Nearest Neighbor Search at Peak FLOP/s,"[TPU, K-nearest neighbor search, Approximate n...","[6, 5, 6, 6]","[4, 5, 4, 4]",0,"[162, 204, 446, 233]",Accept


## Save filtered dataset 
We save a smaller version of the dataset in CSV format with only the data we need for our analysis; this file can also be stored directly on GitHub.

In [31]:
# Save dataframe as csv, after giving the title and keywords columns shorter names
df_final.rename(
    columns={'content.title': 'title', 'content.keywords': 'keywords'},
    inplace=True,
)
# The file name carries today's date (YYYYMMDD).
df_final.to_csv(
    f'{DATA_PATH}{venue_short}_{time.strftime("%Y%m%d")}.csv',
    index=False,
)

## Saving full crawled dataset

Note that this dataset is raw and contains everything, so it will be pretty large (>100 MB)!

In [32]:
# Save dataframe as hdf5
# One row per submission; each column holds the note at that position of the forum's 'notes' list.
notes_df = pd.DataFrame([n['notes'] for n in data])
# 'count' value reported in each forum response.
count_df = pd.DataFrame({'notes_count': [n['count'] for n in data]})
# Column-wise concat of the submission metadata, the forum notes and the counts, aligned on the index.
df = pd.concat([df_raw, notes_df, count_df], axis=1)
# mode='w' overwrites any existing file of the same name; the name carries today's date (YYYYMMDD).
df.to_hdf(f'{DATA_PATH}{venue_short}_data_full_{time.strftime("%Y%m%d")}.h5', key='df', mode='w')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed-integer,key->axis0] [items->None]

  df.to_hdf(f'{DATA_PATH}{venue_short}_data_full_{time.strftime("%Y%m%d")}.h5', key='df', mode='w')
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block2_values] [items->Index([                            'id',                       'original',
                                'mdate',                          'ddate',
                               'tddate',                          'forum',
                              'replyto',                     'invitation',
                           'signatures',                        'readers',
                           'nonreaders',                        'writers',
                        'content.title',              'content.authorids',
                      'content.authors',               'cont