In [25]:
import pandas as pd

# Load the ICLR and NeurIPS paper tables from their CSV snapshots
iclr_df = pd.read_csv(filepath_or_buffer='papers_iclr_05092025.csv')
neurips_df = pd.read_csv(filepath_or_buffer='papers_neurips_05092025.csv')

In [2]:
def extract_id_from_url(url):
    """Extract the value of the ``id=`` parameter from a URL string.

    Args:
        url: A URL string, or a missing value (None / NaN / any non-string).

    Returns:
        The text following the last ``id=`` in ``url``, cut at the next ``&``
        or ``#`` so that trailing query parameters and fragments are not
        included. Returns None when ``url`` is not a string, contains no
        ``id=``, or the extracted id is empty.
    """
    # Missing values (None, NaN) and any other non-string cannot hold an id.
    if not isinstance(url, str):
        return None
    if 'id=' not in url:
        return None

    candidate = url.split('id=')[-1]
    # Stop at the next query separator or fragment marker so that
    # '...?id=abc&noteId=xyz' yields 'abc' rather than 'abc&noteId=xyz'.
    for separator in ('&', '#'):
        candidate = candidate.split(separator, 1)[0]

    # An empty id ('...?id=') is treated as missing.
    return candidate or None


# Fill missing IDs for ICLR df
# Prefer the id embedded in openreview_url; fall back to pdf_url whenever the
# first URL is missing OR does not yield an id (extract_id_from_url returns
# None for missing values, so no separate notna check is needed).
iclr_missing_id = iclr_df['id'].isna()
if iclr_missing_id.any():  # skip entirely when there is nothing to fill (avoids apply on an empty frame)
    iclr_df.loc[iclr_missing_id, 'id'] = iclr_df.loc[iclr_missing_id].apply(
        lambda row: extract_id_from_url(row['openreview_url'])
        or extract_id_from_url(row['pdf_url']),
        axis=1,
    )



In [3]:
# Fill missing IDs for NeurIPS df
# Prefer the id embedded in openreview_url; fall back to pdf_url whenever the
# first URL is missing OR does not yield an id (extract_id_from_url returns
# None for missing values, so no separate notna check is needed).
neurips_missing_id = neurips_df['id'].isna()
if neurips_missing_id.any():  # skip entirely when there is nothing to fill (avoids apply on an empty frame)
    neurips_df.loc[neurips_missing_id, 'id'] = neurips_df.loc[neurips_missing_id].apply(
        lambda row: extract_id_from_url(row['openreview_url'])
        or extract_id_from_url(row['pdf_url']),
        axis=1,
    )

In [4]:
# Write both updated tables back to the CSV files they were loaded from,
# leaving the row index out of the output.
iclr_df.to_csv(path_or_buf='papers_iclr_05092025.csv', index=False)
neurips_df.to_csv(path_or_buf='papers_neurips_05092025.csv', index=False)


In [3]:
import requests


def get_paper_content(row):  # gotta run a few times since rate limit
    """Fetch the ``content`` of every note in a paper's OpenReview forum.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing the keys
            'id', 'year' and 'publisher'.

    Returns:
        A list holding the 'content' value of each returned note that has
        one, or None when the id is missing, the publisher is not ICLR or
        NeurIPS, the request / response parsing fails, or no note carries
        content. Failures map to None so that callers can re-run the fetch
        over rows that are still null (e.g. after hitting the rate limit).
    """
    # Last year (inclusive) queried on api.openreview.net for each venue;
    # later years are queried on api2.openreview.net.
    last_year_on_api_v1 = {'ICLR': 2023, 'NeurIPS': 2022}

    paper_id = row['id']
    year = row['year']
    conf = row['publisher']

    if pd.isna(paper_id):
        return None

    cutoff = last_year_on_api_v1.get(conf)
    if cutoff is None:
        # Unknown venue: there is no cutoff to choose a host with. Previously
        # this path crashed with UnboundLocalError and aborted the whole apply.
        return None

    base_url = "https://api.openreview.net" if year <= cutoff else "https://api2.openreview.net"

    try:
        # Let requests build and URL-encode the query string.
        response = requests.get(f"{base_url}/notes", params={'forum': paper_id}, timeout=5)
        response.raise_for_status()
        notes = response.json()['notes']
        contents = [
            note['content']
            for note in notes
            if isinstance(note, dict) and 'content' in note
        ]
    except Exception:
        # Deliberate best-effort: any network / HTTP / parsing failure yields
        # None. Catching Exception (not a bare except) keeps KeyboardInterrupt
        # working, so a long rate-limited run can still be stopped.
        return None

    return contents or None


In [27]:
# ICLR
# First run: fetch forum content for every row. Later runs: only retry the
# rows whose forum_content is still null. The table is saved afterwards.
if 'forum_content' in iclr_df.columns:
    iclr_pending = iclr_df['forum_content'].isnull()
    iclr_df.loc[iclr_pending, 'forum_content'] = iclr_df.loc[iclr_pending].apply(
        get_paper_content, axis=1
    )
else:
    iclr_df['forum_content'] = iclr_df.apply(get_paper_content, axis=1)

iclr_df.to_csv('papers_iclr_05092025.csv', index=False)

In [28]:
# Shape of the ICLR forum_content entries that are still null
iclr_df.loc[iclr_df['forum_content'].isna(), 'forum_content'].shape

(0,)

In [5]:
# NeurIPS
# First run: fetch forum content for every row. Later runs: only retry the
# rows whose forum_content is still null. The table is saved afterwards.
if 'forum_content' in neurips_df.columns:
    neurips_pending = neurips_df['forum_content'].isnull()
    neurips_df.loc[neurips_pending, 'forum_content'] = neurips_df.loc[neurips_pending].apply(
        get_paper_content, axis=1
    )
else:
    neurips_df['forum_content'] = neurips_df.apply(get_paper_content, axis=1)
neurips_df.to_csv('papers_neurips_05092025.csv', index=False)

In [None]:
def get_forum_content(paper_id, year, conf):  # gotta run a few times since rate limit
    """Fetch the ``content`` of every note in an OpenReview forum.

    Args:
        paper_id: Forum id of the paper; a missing value (None / NaN) returns None.
        year: Publication year, compared against the venue's cutoff to pick the host.
        conf: Venue name; only 'ICLR' and 'NeurIPS' are recognised.

    Returns:
        A list holding the 'content' value of each returned note that has
        one, or None when the id is missing, the venue is not recognised,
        the request / response parsing fails, or no note carries content.
    """
    # Last year (inclusive) queried on api.openreview.net for each venue;
    # later years are queried on api2.openreview.net.
    last_year_on_api_v1 = {'ICLR': 2023, 'NeurIPS': 2022}

    if pd.isna(paper_id):
        return None

    cutoff = last_year_on_api_v1.get(conf)
    if cutoff is None:
        # Unknown venue: there is no cutoff to choose a host with. Previously
        # this path crashed with UnboundLocalError.
        return None

    base_url = "https://api.openreview.net" if year <= cutoff else "https://api2.openreview.net"

    try:
        # Let requests build and URL-encode the query string.
        response = requests.get(f"{base_url}/notes", params={'forum': paper_id}, timeout=5)
        response.raise_for_status()
        notes = response.json()['notes']
        contents = [
            note['content']
            for note in notes
            if isinstance(note, dict) and 'content' in note
        ]
    except Exception:
        # Deliberate best-effort: any network / HTTP / parsing failure yields
        # None. Catching Exception (not a bare except) keeps KeyboardInterrupt
        # working, so a long rate-limited run can still be stopped.
        return None

    return contents or None

# Manual spot check against a single known forum id.
# Alternative example on the older host (v1):
#   print(get_forum_content('mSAKhLYLSsl', 2021, 'ICLR'))
sample_forum_content = get_forum_content(paper_id='3f5PALef5B', year=2024, conf='ICLR')
print(sample_forum_content)