# Goals

# Libraries

In [1]:
import openreview
import csv
import pandas as pd
import csv
from tqdm import tqdm
import datetime
import json
import os
from datetime import datetime
import numpy as np

# Globals

In [2]:
# Project root: the parent of the notebook's working directory, kept as a
# plain string with a trailing slash so paths can be built by concatenation.
DIR = f"{os.path.dirname(os.getcwd())}/"
# Folder that receives the raw API pulls written by this notebook.
OUT = f"{DIR}00_rawData/"

In [3]:
# Display the resolved output directory as a quick sanity check.
OUT

'/home/jupyter/sandbox/valsInICLR/00_rawData/'

In [4]:
# Per-year names of the OpenReview content fields used by this notebook.
#   "decision"    : content key that marks (and holds) the final decision
#   "reviewer_id" : prefix found in a review's signature; the text after it
#                   identifies the reviewer within a forum
#   "rating"      : content key holding the reviewer's score
#   "text"        : content keys whose values are joined into the review text
ICLR_FIELDS = {
    2018: {
        "decision": "decision",
        "reviewer_id": "AnonReviewer",
        "rating": "rating",
        "text": ["review"],
    },
    2019: {
        # In 2019 the decision is read from the "recommendation" key.
        "decision": "recommendation",
        "reviewer_id": "AnonReviewer",
        "rating": "rating",
        "text": ["review"],
    },
    2020: {
        "decision": "decision",
        "reviewer_id": "AnonReviewer",
        "rating": "rating",
        "text": ["review"],
    },
    2021: {
        "decision": "decision",
        "reviewer_id": "AnonReviewer",
        "rating": "rating",
        "text": ["review"],
    },
    2022: {
        "decision": "decision",
        "reviewer_id": "Reviewer_",
        "rating": "recommendation",
        "text": [
            'summary_of_the_paper',
            'main_review',
            'summary_of_the_review',
            'correctness',
            'technical_novelty_and_significance',
            'empirical_novelty_and_significance',
        ],
    },
    2023: {
        "decision": "decision",
        "reviewer_id": "Reviewer_",
        "rating": "recommendation",
        "text": [
            'summary_of_the_paper',
            'strength_and_weaknesses',
            'clarity,_quality,_novelty_and_reproducibility',
            'summary_of_the_review',
            'correctness',
            'technical_novelty_and_significance',
            'empirical_novelty_and_significance',
        ],
    },
    2024: {
        "decision": "decision",
        "reviewer_id": "Reviewer_",
        # The 2024 extraction cell reads the score from the "rating" content
        # key, so the mapping names that key (it previously said
        # "recommendation", which the 2024 code never used).
        "rating": "rating",
        "text": ['summary', 'strengths', 'weaknesses', 'questions'],
    },
}

## Public ICLR data from API
We use the public reviews from the OpenReview API to obtain the final outcome of each manuscript across the years. A previous iteration also took the reviewers' texts and ratings from the API; this version instead uses pre-discussion data, which are not revised in response to other reviewers or ACs.

## 2018–2023

In [5]:
def get_iclr_data(out_csv_path, fields):
    """
    Download ICLR reviews and decisions through the OpenReview v1 client.

    For every year in ``fields`` the submissions posted under
    ``ICLR.cc/{year}/Conference/-/Blind_Submission`` are listed, the notes of
    each submission's forum are fetched, and one row is produced per review.

    Parameters
    ----------
    out_csv_path : str
        Destination file. Despite the parameter name, the table is written
        with ``DataFrame.to_pickle``.
    fields : dict
        Maps a year to a dict with the keys ``'decision'``, ``'reviewer_id'``,
        ``'rating'`` and ``'text'`` naming the content fields for that year.

    Returns
    -------
    pandas.DataFrame
        Columns: year, forum, reviewer, reviewer_id, review, rating, decision.
        ``decision`` is ``''`` for forums in which no decision note was found.

    Notes
    -----
    Credentials are read from ``daniels_info.txt`` in the working directory
    (first line: username, second line: password).
    """

    # Create map of API request links
    invitation_map = {year: f"ICLR.cc/{year}/Conference/-/Blind_Submission" for year in fields.keys()}

    # Read the credentials once and close the file before talking to the API.
    with open('daniels_info.txt', 'r') as credentials_file:
        credential_lines = credentials_file.readlines()

    # Initialize OpenReview client
    client = openreview.Client(
        baseurl='https://api.openreview.net',
        username=credential_lines[0].strip(),
        password=credential_lines[1].strip(),
    )

    rows = []

    # Get reviews for all manuscripts
    for year, invitation in invitation_map.items():
        year_fields = fields[year]
        decision_key = year_fields['decision']
        reviewer_prefix = year_fields['reviewer_id']
        rating_key = year_fields['rating']
        text_keys = year_fields['text']
        # A note carrying any of these keys is not a review (submission,
        # decision, comment thread, withdrawal).
        non_review_keys = {"authorids", decision_key, "comment", "withdrawal confirmation"}

        forum_notes = list(openreview.tools.iterget_notes(client, invitation=invitation))

        for forum_note in tqdm(forum_notes, desc=f"Parsing {year}"):
            forum_id = forum_note.id
            assert forum_id == forum_note.forum

            # 'notes' contain reviews, decisions, and comment threads
            notes = client.get_notes(forum=forum_id)

            # First pass: find the decision. Doing this before any row is
            # built means every review of the forum gets the decision,
            # whatever order the notes come back in.
            decision = ''
            for note in notes:
                if decision_key in note.content:
                    decision = note.content[decision_key]

            # Second pass: turn each review note into a row.
            for note in notes:
                content = note.content
                if non_review_keys.intersection(content.keys()):
                    continue
                reviewer_suffix = note.signatures[0].split(reviewer_prefix)[-1]
                reviewer = reviewer_prefix + reviewer_suffix
                reviewer_id = forum_id + "&&" + reviewer_suffix
                text = " ".join([content[field] for field in text_keys])
                # The rating string starts with the number, followed by ':'.
                public_rating = int(content[rating_key].split(":")[0])
                rows.append([year, forum_id, reviewer, reviewer_id, text, public_rating, decision])

    df = pd.DataFrame(rows, columns=['year', 'forum', 'reviewer', 'reviewer_id', 'review', 'rating', 'decision'])
    df.to_pickle(out_csv_path)
    return df

In [6]:
# get_iclr_data is used for 2018–2023 only; 2024 is pulled in its own section
# below. Passing the 2024 entry costs an extra request that yields no
# submissions (the run log shows "Parsing 2024: 0it"), so it is left out.
fields_2018_2023 = {year: spec for year, spec in ICLR_FIELDS.items() if year <= 2023}
iclr_2018_2023 = get_iclr_data(OUT+"iclr_2018_2023.pkl", fields_2018_2023)

Parsing 2018: 100%|██████████| 930/930 [00:56<00:00, 16.36it/s]
Getting Notes: 100%|█████████▉| 1417/1419 [00:00<00:00, 5261.34it/s]
Parsing 2019: 100%|██████████| 1419/1419 [01:28<00:00, 16.00it/s]
Getting Notes: 100%|█████████▉| 2210/2213 [00:00<00:00, 3863.15it/s]
Parsing 2020: 100%|██████████| 2213/2213 [02:14<00:00, 16.43it/s]
Getting Notes: 100%|█████████▉| 2591/2594 [00:00<00:00, 3777.50it/s]
Parsing 2021: 100%|██████████| 2594/2594 [02:43<00:00, 15.83it/s]
Getting Notes: 100%|█████████▉| 2616/2619 [00:00<00:00, 3850.50it/s]
Parsing 2022: 100%|██████████| 2619/2619 [02:48<00:00, 15.54it/s]
Getting Notes: 100%|█████████▉| 3794/3798 [00:01<00:00, 3243.30it/s]
Parsing 2023: 100%|██████████| 3798/3798 [04:04<00:00, 15.54it/s]
Parsing 2024: 0it [00:00, ?it/s]


## 2024

In [7]:
# Credentials file: first line is the username, second line the password.
# Read it once inside a context manager so the handle is closed.
with open('daniels_info.txt', 'r') as credentials_file:
    credential_lines = credentials_file.readlines()

client2 = openreview.api.OpenReviewClient(
    baseurl='https://api2.openreview.net',
    username=credential_lines[0].strip(),
    password=credential_lines[1].strip(),
)
# Do not keep the raw credentials around in the notebook namespace.
del credential_lines

year = 2024
venue_id = f"ICLR.cc/{year}/Conference"
# The venue group's content stores the names of the venue's invitations.
venue_group_settings = client2.get_group(venue_id).content
submission_invitation = venue_group_settings['submission_id']['value']
# details='directReplies' makes each submission carry its direct replies,
# which the next cell reads from submission.details['directReplies'].
submissions = client2.get_all_notes(
    invitation=submission_invitation,
    details='directReplies'
)
decision_invitation_name = venue_group_settings['decision_name']['value']
review_invitation_name = venue_group_settings['review_name']['value']

Getting V2 Notes: 100%|█████████▉| 7396/7404 [00:13<00:00, 554.19it/s]


In [8]:
# Invitation suffixes that identify decision replies and official reviews.
decision_suffix = f'/-/{decision_invitation_name}'
review_suffix = f'/-/{review_invitation_name}'

rows = []
for submission in submissions:
    decision = ""
    review_replies = []
    for reply in submission.details['directReplies']:

        # capture decision
        if any(invitation.endswith(decision_suffix) for invitation in reply['invitations']):
            decision = reply['content']['decision']['value']

        # keep official reviews; other comments and threads are ignored
        elif any(invitation.endswith(review_suffix) for invitation in reply['invitations']):
            review_replies.append(reply)

    # Build the rows only after every reply has been scanned, so each review
    # carries the submission's decision whatever the order of the replies.
    for reply in review_replies:
        reviewer = reply['signatures'][0].split("/")[-1]
        forum = reply['forum']
        reviewer_id = forum + "&&" + reviewer
        review = " ".join([reply['content'][field]['value'] for field in ICLR_FIELDS[2024]['text']])
        # NOTE(review): the rating is stored exactly as the API returns it,
        # whereas get_iclr_data converts 2018–2023 ratings to int — confirm
        # that downstream code handles both forms.
        rating = reply['content']['rating']['value']
        rows.append([year, forum, reviewer, reviewer_id, review, rating, decision])

iclr_2024 = pd.DataFrame(rows, columns=['year', 'forum', 'reviewer', 'reviewer_id', 'review', 'rating', 'decision'])
iclr_2024.to_pickle(OUT+"iclr_2024.pkl")

In [9]:
# Stack the 2018–2023 and 2024 pulls into one table with a fresh 0..n-1 index.
iclr = pd.concat((iclr_2018_2023, iclr_2024), ignore_index=True)

# Number of review rows per year, in chronological order.
reviews_per_year = iclr['year'].value_counts()
reviews_per_year.sort_index()

year
2018     2784
2019     4332
2020     6721
2021    10022
2022    10210
2023    14359
2024    28028
Name: count, dtype: int64

In [10]:
# Missing values per column. Empty strings are not counted as missing, and
# 'decision' starts out as '' when the rows are collected, so a zero here
# does not by itself mean every row has a decision.
iclr.isnull().sum()

year           0
forum          0
reviewer       0
reviewer_id    0
review         0
rating         0
decision       0
dtype: int64

In [11]:
# Character length of each review after trimming surrounding whitespace.
iclr['len'] = iclr['review'].map(lambda text: len(text.strip()))

# Summary statistics of the review lengths, rounded for display.
length_summary = iclr['len'].describe()
length_summary.round(3)

count    76456.000
mean      2947.703
std       1657.201
min          7.000
25%       1848.000
50%       2591.000
75%       3618.000
max      28279.000
Name: len, dtype: float64

In [32]:
# Give every review of a forum that forum's decision.
# 'decision' holds '' (never NaN) where no decision was attached when the
# rows were collected, so ffill/bfill alone would have nothing to fill:
# mask the empty strings to NaN first, fill within each forum, then restore
# '' for forums that have no decision at all.
iclr['decision'] = iclr.groupby('forum')['decision'].transform(
    lambda x: x.where(x != '').ffill().bfill().fillna('')
)
# Store ratings uniformly as strings before writing the feather file.
iclr['rating'] = iclr['rating'].astype(str)
iclr.to_feather(OUT+'iclr_2018_2024.feather')

In [None]:
# Integer code -> display label, one lookup table per category.
# Codes are the list positions (0, 1, 2, ...); presumably these are the class
# ids produced by the prediction models — confirm against the code that uses
# this map.
prediction_to_label_map = {
    "polarity": dict(enumerate(["None", "(+)", "(–)"])),
    "value": dict(enumerate([
        "None",
        "Clarity",
        "Consistency",
        "Novelty",
        "Thoroughness",
        "Accuracy",
        "Replicability",
    ])),
}