In [2]:
import logging
from urllib.parse import parse_qs, urlparse

import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service as FirefoxService
from selenium.webdriver.support.wait import WebDriverWait
from tqdm import tqdm

In [7]:
class OpenReviewsParser:
    """Scrape OpenReview venue pages and collect the reviews of each paper.

    Venue and workshop pages are opened in a Selenium-driven Firefox (the code
    waits for the relevant elements to appear before reading them); the notes
    of every paper are then fetched from the OpenReview REST API. Papers that
    have at least one review are accumulated in ``self.dataset``.

    All public methods are best-effort: a failure on one page, note or request
    is logged and skipped instead of aborting the whole crawl.
    """

    # REST endpoint returning every note (submission, reviews, decision, ...)
    # attached to a forum when queried with ``?forum=<paper id>``.
    NOTES_API_URL = "https://api.openreview.net/notes"

    # CSS selector for the element holding the venue links on a group page.
    # Written as an explicit CSS selector because ``By.CLASS_NAME`` is meant
    # for a single class name, not a compound one.
    VENUES_LIST_SELECTOR = ".list-unstyled.venues-list"

    # Conferences crawled by ``parse_all_conferences`` when none are given.
    DEFAULT_CONFERENCE_URLS = (
        "https://openreview.net/group?id=aclweb.org/ACL/2022/Workshop",
    )

    _log = logging.getLogger("OpenReviewsParser")

    def __init__(self, driver_path="./geckodriver.exe", timeout=10) -> None:
        """Prepare the browser configuration; no browser is started here.

        Args:
            driver_path: Path to the geckodriver executable.
            timeout: Seconds to wait for page elements to appear, also used as
                the timeout of each HTTP request to the OpenReview API.
        """
        # geckodriver must be launched through the Firefox service class.
        self.service = FirefoxService(driver_path)
        self.driver_options = Options()
        self.timeout = timeout
        self.dataset = []

    def _start_driver(self):
        """Start a new Firefox session with the configured service/options."""
        return webdriver.Firefox(
            service=self.service,
            options=self.driver_options,
        )

    def _stop_driver(self, driver):
        """End the browser session (if one was started) without raising."""
        if driver is None:
            return
        try:
            # quit() ends the whole session and its driver process, whereas
            # close() only closes the current window.
            driver.quit()
        except Exception as exc:
            self._log.warning("Could not shut the browser down: %r", exc)

    @staticmethod
    def _extract_paper_id(paper_url):
        """Return the ``id`` query parameter of ``paper_url``, or ``None``.

        Parsing the query string (rather than splitting on ``"id="``) keeps
        URLs such as ``...?id=abc&noteId=xyz`` from yielding a mangled id and
        copes with links that carry no ``id`` at all.
        """
        ids = parse_qs(urlparse(str(paper_url)).query).get("id")
        return ids[0] if ids else None

    @staticmethod
    def _select_reviews(notes):
        """Keep the notes whose content has a ``review`` field and no ``TL;DR``."""
        selected = []
        for note in notes:
            content = note.get("content") or {}
            if "TL;DR" not in content and "review" in content:
                selected.append(note)
        return selected

    def _fetch_reviews(self, paper_id):
        """Download the notes of forum ``paper_id`` and return its reviews.

        Raises:
            requests.RequestException: On network errors, timeouts or a
                non-2xx response.
            ValueError: If the response body is not valid JSON.
        """
        response = requests.get(
            self.NOTES_API_URL,
            params={"forum": paper_id},
            timeout=self.timeout,
        )
        response.raise_for_status()
        payload = response.json()
        return self._select_reviews(payload.get("notes", []))

    def get_all_page_links(self, workshop_url):
        """Collect every paper of a workshop page that has at least one review.

        Args:
            workshop_url: URL of the workshop (venue) page listing the papers.

        Returns:
            A list of dicts with the keys ``paper_url``, ``paper_id`` and
            ``reviews`` (the raw review notes returned by the API). The list is
            empty when the page could not be loaded or shows no papers.
        """
        paper_links = []
        driver = None
        try:
            driver = self._start_driver()
            driver.get(url=workshop_url)
            WebDriverWait(driver, timeout=self.timeout).until(
                lambda d: d.find_element(by=By.CLASS_NAME, value="note"),
            )
            for note in driver.find_elements(by=By.CLASS_NAME, value="note"):
                try:
                    anchor = note.find_element(by=By.TAG_NAME, value="a")
                    paper_links.append(anchor.get_property("href"))
                except Exception as exc:
                    # A single malformed note must not discard the whole page.
                    self._log.debug(
                        "Skipping a note without a link on %s: %r",
                        workshop_url,
                        exc,
                    )
        except Exception as exc:
            # `Exception` (not a bare except) so Ctrl-C / kernel interrupt
            # still stops a long crawl.
            self._log.warning("Could not list papers on %s: %r", workshop_url, exc)
        finally:
            # The hrefs are plain strings by now, so the browser can be
            # released before the (slow) API requests start.
            self._stop_driver(driver)

        if not paper_links:
            return []

        paper_urls = []
        for link in tqdm(paper_links):
            paper_id = self._extract_paper_id(link)
            if paper_id is None:
                self._log.debug("No paper id in link %r", link)
                continue
            try:
                reviews = self._fetch_reviews(paper_id)
            except Exception as exc:
                self._log.warning(
                    "Could not fetch reviews of paper %s: %r", paper_id, exc
                )
                continue
            if reviews:
                paper_urls.append(
                    {
                        "paper_url": link,
                        "paper_id": paper_id,
                        "reviews": reviews,
                    }
                )

        return paper_urls

    def get_all_subjects(self, root_url):
        """Return the URLs of all venues linked from a conference group page.

        Args:
            root_url: URL of the conference group page.

        Returns:
            The ``href`` of every link inside the venues list; whatever was
            gathered so far (possibly nothing) if the page fails to load.
        """
        subject_urls = []
        driver = None
        try:
            driver = self._start_driver()
            driver.get(url=root_url)

            # until() hands back the element it waited for, so no second
            # lookup is needed.
            venues_list = WebDriverWait(driver, timeout=self.timeout).until(
                lambda d: d.find_element(
                    by=By.CSS_SELECTOR,
                    value=self.VENUES_LIST_SELECTOR,
                ),
            )

            for anchor in tqdm(venues_list.find_elements(by=By.TAG_NAME, value="a")):
                subject_urls.append(anchor.get_property("href"))
        except Exception as exc:
            self._log.warning("Could not list venues on %s: %r", root_url, exc)
        finally:
            self._stop_driver(driver)

        return subject_urls

    def parse_all_conferences(self, conference_urls=None):
        """Crawl every venue of every conference and extend ``self.dataset``.

        Args:
            conference_urls: Iterable of conference group URLs. Defaults to
                ``DEFAULT_CONFERENCE_URLS``.
        """
        if conference_urls is None:
            conference_urls = self.DEFAULT_CONFERENCE_URLS

        for conference_url in tqdm(conference_urls):
            subject_urls = self.get_all_subjects(
                root_url=conference_url,
            )

            for workshop_url in tqdm(subject_urls):
                try:
                    papers_with_reviews = self.get_all_page_links(
                        workshop_url=workshop_url,
                    )
                except Exception as exc:
                    self._log.warning(
                        "Skipping workshop %s: %r", workshop_url, exc
                    )
                    continue
                self.dataset.extend(papers_with_reviews)


# Build the scraper and crawl every configured conference. The collected
# papers end up in ``parser.dataset``; the recorded run below took roughly
# ten minutes.
parser = OpenReviewsParser()
parser.parse_all_conferences()

100%|██████████| 33/33 [00:00<00:00, 660.02it/s]

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
100%|██████████| 15/15 [00:08<00:00,  1.85it/s]

[A
[A
[A
[A
[A
[A
100%|██████████| 6/6 [00:03<00:00,  1.83it/s]

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
100%|██████████| 22/22 [00:11<00:00,  1.84it/s]

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
100%|██████████| 41/41 [00:22<00:00,  1.84it/s]

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
100%|██████████| 20/20 [00:10<00:00,  1.86it/s]

[A
[A
[A
[A
[A
[A
[A
[A
100%|██████████| 8/8 [00:04<00:00,  1.72it/s]

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
100%|██████████| 14/14 [00:07<00:00,  1.77it/s]
100%|██████████| 33/33 [10:03<00:00, 18.28s/it]
100%|██████████| 1/1 [10:11<00:00, 611.48s/it]


In [8]:
# Number of papers for which at least one review was collected.
len(parser.dataset)

47

In [9]:
# Preview a single record instead of dumping the whole dataset: every entry
# carries the full text of its reviews, which floods the notebook output.
parser.dataset[:1]

[{'paper_url': 'https://openreview.net/forum?id=rRzl6UQxEb5',
  'paper_id': 'rRzl6UQxEb5',
  'reviews': [{'id': 'B89x5zOf0Gc',
    'original': None,
    'number': 2,
    'cdate': 1648400401863,
    'mdate': 1648400401863,
    'ddate': None,
    'tcdate': 1648400401863,
    'tmdate': 1648400401863,
    'tddate': None,
    'forum': 'rRzl6UQxEb5',
    'replyto': 'rRzl6UQxEb5',
    'invitation': 'aclweb.org/ACL/2022/Workshop/CMCL_Shared_Task/Paper6/-/Official_Review',
    'content': {'title': 'Review for Team DMG at CMCL 2022 Shared Task',
     'review': '# Summary\nFor their contribution, the authors used a pre-trained XLM-R language model and trained an adapter, inserted into the frozen pre-trained LM in order to predict eye-tracking reading measures (first fixation duration & total reading time). They test several methods for both subtask 1 (1 adapter for all languages, language-specific adapters) and subtask 2 (zero-shot, e.g. using the adapters from subtask 1, translation of training 

In [8]:
# Scratch check: the paper id is the text that follows "id=" in a forum URL.
# NOTE(review): this only holds when "id=" occurs exactly once in the URL.
"https://openreview.net/forum?id=rhz7nqYfF-q".split("id=")[1]

'rhz7nqYfF-q'

In [25]:
# Inspect the notes attached to a single forum (paper) through the OpenReview
# API, dropping every note whose content carries a "TL;DR" field
# (presumably the submission itself - verify against the API response).
import requests

review_url = "https://api.openreview.net/notes?forum=rhz7nqYfF-q"

# Without an explicit timeout `requests` waits indefinitely on a stalled
# connection; raise_for_status() surfaces HTTP errors instead of failing later
# on a missing "notes" key.
response = requests.get(review_url, timeout=10)
response.raise_for_status()

reviews = [
    item for item in response.json()["notes"] if "TL;DR" not in item["content"]
]
reviews

[{'id': 'rRzguAEfTz9',
  'original': None,
  'number': 1,
  'cdate': 1648334031679,
  'mdate': None,
  'ddate': None,
  'tcdate': 1648334031679,
  'tmdate': 1648334031679,
  'tddate': None,
  'forum': 'rhz7nqYfF-q',
  'replyto': 'rhz7nqYfF-q',
  'invitation': 'aclweb.org/ACL/2022/Workshop/FL4NLP/Paper8/-/Decision',
  'content': {'title': 'Paper Decision', 'decision': 'Accept'},
  'signatures': ['aclweb.org/ACL/2022/Workshop/FL4NLP/Program_Chairs'],
  'readers': ['everyone'],
  'nonreaders': [],
  'writers': ['aclweb.org/ACL/2022/Workshop/FL4NLP/Program_Chairs']},
 {'id': 'HN2gHFt6qGq',
  'original': None,
  'number': 5,
  'cdate': 1648183677147,
  'mdate': None,
  'ddate': None,
  'tcdate': 1648183677147,
  'tmdate': 1648183677147,
  'tddate': None,
  'forum': 'rhz7nqYfF-q',
  'replyto': 'rhz7nqYfF-q',
  'invitation': 'aclweb.org/ACL/2022/Workshop/FL4NLP/Paper8/-/Official_Review',
  'content': {'title': 'Interesting idea and reasonable experiment results',
   'review': 'This paper prop