# Crawl dataset with all submissions info
OpenReview Venue Crawling

In [1]:
%load_ext autoreload
%autoreload 2

import time
import pandas as pd
import multiprocessing as mp
from multiprocessing import Pool
from tqdm import tqdm
from tqdm.notebook import tqdm
import requests
from bs4 import BeautifulSoup
import os
import fitz
import io
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import faiss
import numpy as np
import pandas as pd
from transformers import pipeline
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from langchain.prompts import PromptTemplate
from langchain.llms import HuggingFacePipeline

  from .autonotebook import tqdm as notebook_tqdm
2024-05-17 12:22:54.567366: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Crawl list of all submissions
Here we scrape the _notes_ , (list of all submissions) using OpenReview's API, way faster than Selenium-based scraping.


In [2]:
DATA_PATH = '../data/'
venue = 'ICLR.cc/2023/Conference'
venue_short = 'iclr2023'

In [3]:
def get_conference_notes(venue, blind_submission=False):
    """
    Get all notes of a conference (data) from OpenReview API.
    If results are not final, you should set blind_submission=True.
    """

    blind_param = '-/Blind_Submission' if blind_submission else ''
    offset = 0
    notes = []
    while True:
        print('Offset:', offset, 'Data:', len(notes))
        url = f'https://api.openreview.net/notes?invitation={venue}/{blind_param}&offset={offset}'
        response = requests.get(url)
        data = response.json()
        if len(data['notes']) == 0:
            break
        offset += 1000
        notes.extend(data['notes'])
    return notes

In [4]:
raw_notes = get_conference_notes(venue, blind_submission=True)
print("Number of submissions:", len(raw_notes))

Offset: 0 Data: 0
Offset: 1000 Data: 1000
Offset: 2000 Data: 2000
Offset: 3000 Data: 3000
Offset: 4000 Data: 3798
Number of submissions: 3798


In [5]:
df_raw = pd.json_normalize(raw_notes)
df_raw.head()

Unnamed: 0,id,original,number,cdate,mdate,ddate,tcdate,tmdate,tddate,forum,...,content.student_author,content.Please_choose_the_closest_area_that_your_submission_falls_into,content.paperhash,content.pdf,content.supplementary_material,content._bibtex,content.venue,content.venueid,content.TL;DR,content.community_implementations
0,RUzSobdYy0V,pmo4AKuE4-p,6620,1663850590815,,,1663850590815,1677758485903,,RUzSobdYy0V,...,,"Social Aspects of Machine Learning (eg, AI saf...",adebayo|quantifying_and_mitigating_the_impact_...,/pdf/8fa4751c3b6bc13a0eefd3b9a9dd75dc9359f20f.pdf,/attachment/151652f4d981a49f9dfa81be992839a243...,"@inproceedings{\nadebayo2023quantifying,\ntitl...",ICLR 2023 poster,ICLR.cc/2023/Conference,,
1,N3kGYG3ZcTi,kVYulJycT2K,6611,1663850589829,,,1663850589829,1676330777348,,N3kGYG3ZcTi,...,,Deep Learning and representational learning,zhuang|suppression_helps_lateral_inhibitionins...,/pdf/bc66a3bbb804a7158ba77a4de9f91a196e8eaf9a.pdf,,"@misc{\nzhuang2023suppression,\ntitle={Suppres...",Submitted to ICLR 2023,ICLR.cc/2023/Conference,Improving feature learning with lateral inhibi...,
2,tmIiMPl4IPa,RAIF4RUF0T,6610,1663850589709,,,1663850589709,1710206488222,,tmIiMPl4IPa,...,,"Machine Learning for Sciences (eg biology, phy...",tran|factorized_fourier_neural_operators,/pdf/c381fdf1b7600bdbaba7b4a98c1679006ec61c83.pdf,,"@inproceedings{\ntran2023factorized,\ntitle={F...",ICLR 2023 poster,ICLR.cc/2023/Conference,An efficient and scalable neural PDE solver us...,[![CatalyzeX](/images/catalyzex_icon.svg) 1 co...
3,mhnHqRqcjYU,ix_LR-W0OM2,6603,1663850588877,,,1663850588877,1677757114293,,mhnHqRqcjYU,...,,Deep Learning and representational learning,narshana|dfpc_data_flow_driven_pruning_of_coup...,/pdf/a04d739740d3a54486c4a47bf7d26dd24b41732d.pdf,,"@inproceedings{\nnarshana2023dfpc,\ntitle={{DF...",ICLR 2023 poster,ICLR.cc/2023/Conference,We propose a novel data-free algorithm to acce...,
4,sZI1Oj9KBKy,vRziu1jJDu,6601,1663850588630,,,1663850588630,1677757168918,,sZI1Oj9KBKy,...,,Deep Learning and representational learning,murti|tvsprune_pruning_nondiscriminative_filte...,/pdf/54b7911797398691422146138209e69d0674e5de.pdf,,"@inproceedings{\nmurti2023tvsprune,\ntitle={{T...",ICLR 2023 poster,ICLR.cc/2023/Conference,We use the total variation distance between th...,


## (optional) older crawled data

## Crawl forums of each submission
Here we scrape the forums of each submissions, it can be pretty fast thanks to:
- OpenReview's API (we use requests)
- Multiprocessing to parallelize the scraping of each paper

In [6]:
# Create multiprocessing pool of requests over index of dataframe

extra = "trash=true&details=replyCount%2Cwritable%2Crevisions%2Coriginal%2Coverwriting%2Cinvitation%2Ctags"

def get_paper_data(paper_id, extra='', timeout=5):
    try:
        url = f"https://api.openreview.net/notes?forum={paper_id}&{extra}"
        response = requests.get(url, timeout=timeout)
        data = response.json()
        return data
    except requests.exceptions.Timeout:
        print(f"Error for paper {paper_id}: Request timed out")
        return None
    except:
        print(f"Error for paper {paper_id}: General error")
        return None

def retry_get_paper_data(paper_id, extra='', timeout=5, retries=10):
    for i in range(retries):
        data = get_paper_data(paper_id, extra, timeout)
        if data is not None:
            return data
    print(f"Error for paper {paper_id}: All {retries} attempts failed")
    return None

def get_paper_data_multi(paper_ids, ratio=0.8):
    num_processes = int(ratio*mp.cpu_count())
    with Pool(num_processes) as p:
        data = list(tqdm(p.imap(retry_get_paper_data, paper_ids), total=len(paper_ids)))
    return data

In [7]:
# filter df with only id, title, url and keywords
df_raw_filtered = df_raw[['id', 'content.title', 'content.keywords']]
df_raw_filtered.head()

Unnamed: 0,id,content.title,content.keywords
0,RUzSobdYy0V,Quantifying and Mitigating the Impact of Label...,[]
1,N3kGYG3ZcTi,Suppression helps: Lateral Inhibition-inspired...,"[Lateral Inhibition, Convolutional Neural Netw..."
2,tmIiMPl4IPa,Factorized Fourier Neural Operators,"[fourier transform, fourier operators, pde, na..."
3,mhnHqRqcjYU,DFPC: Data flow driven pruning of coupled chan...,"[Pruning, Data Free, Model Compression]"
4,sZI1Oj9KBKy,TVSPrune - Pruning Non-discriminative filters ...,"[Structured pruning, model compression]"


In [8]:
df_raw_filtered['pdf-url'] = "https://openreview.net/pdf?id=" + df_raw_filtered["id"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_raw_filtered['pdf-url'] = "https://openreview.net/pdf?id=" + df_raw_filtered["id"]


In [9]:
df_raw_filtered['pdf-url']

0       https://openreview.net/pdf?id=RUzSobdYy0V
1       https://openreview.net/pdf?id=N3kGYG3ZcTi
2       https://openreview.net/pdf?id=tmIiMPl4IPa
3       https://openreview.net/pdf?id=mhnHqRqcjYU
4       https://openreview.net/pdf?id=sZI1Oj9KBKy
                          ...                    
3793     https://openreview.net/pdf?id=P5Z-Zl9XJ7
3794     https://openreview.net/pdf?id=IJwhRE510b
3795     https://openreview.net/pdf?id=4XMAzZasId
3796    https://openreview.net/pdf?id=ED2Jjms9A4H
3797     https://openreview.net/pdf?id=jU-AXLS2bl
Name: pdf-url, Length: 3798, dtype: object

In [10]:
def extract_text_from_pdf(paper_url):
    response = requests.get(paper_url)
    pdf_content = response.content
    text = ""
    with fitz.open(stream=io.BytesIO(pdf_content)) as doc:
        for page in doc:
            text += page.get_text()
    return text

In [13]:
def preprocess_text(text):
    tagged_data = []
    return word_tokenize(text.lower())

In [15]:
import chromadb
from replicate.client import Client

replicate = Client(api_token="r8_IbZu0U3qUJ5ZC0TzzMm6xCJ6yU5UxVi16Ejo4")

client = chromadb.PersistentClient(path="./iclr-chromadb-client")
collection = client.get_or_create_collection("iclr2023")

In [16]:
tagged_data = []
i = 1
for url in df_raw_filtered['pdf-url']:
    pdf_text = extract_text_from_pdf(url)
    preprocessed_text = preprocess_text(pdf_text)
    tagged_data.append(TaggedDocument(words=preprocessed_text, tags=[url]))
    # print(preprocessed_text)
    collection.add(
                documents=pdf_text,
                metadatas={"url": url},
                ids=f"doc{i}",
            )
    i += 1

KeyboardInterrupt: 

In [None]:
results = collection.query(
    query_texts=["use cases, applications, application areas"],
    n_results=len(pdf_urls),
)

In [None]:
READER_MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
model = AutoModelForCausalLM.from_pretrained(READER_MODEL_NAME, quantization_config=bnb_config)
tokenizer = AutoTokenizer.from_pretrained(READER_MODEL_NAME)

READER_LLM = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    do_sample=True,
    temperature=0.2,
    repetition_penalty=1.1,
    return_full_text=False,
    max_new_tokens=500,
)
#

llm = HuggingFacePipeline(pipeline=READER_LLM)

In [None]:
prompt_in_chat_format = [
    {
        "role": "system",
        "content": """
                    Using the information contained in the context,
                    give a comprehensive answer to the question.
                    Respond only to the question asked, response should be concise and relevant to the question.
                    Provide response in bullet points with an appropriate title.
                    If the answer cannot be deduced from the context, do not give an answer.
                """,
    },
    {
        "role": "user",
        "content": """Context:
{context}
---
Now here is the question you need to answer.

Question: {question}""",
    },
]
RAG_PROMPT_TEMPLATE = tokenizer.apply_chat_template(
    prompt_in_chat_format, tokenize=False, add_generation_prompt=True
)
print(RAG_PROMPT_TEMPLATE)

In [None]:
user_query = "What are the top use cases or application areas of retrieval augmented generation (RAG)? Emphasize on RAG applications with vector databases as well."
k=len(pdf_urls)
print(f"\nStarting retrieval for {user_query=}...")
print("\n==================================Top k documents==================================")
topk_retrieved_filename =  results['metadatas'][0][:k]
topk_retrieved_file_id =  results['ids'][0][:k]
topk_retrieved_file =  results['documents'][0][:k]
print(f"filename: {topk_retrieved_filename}\nfile_id: {topk_retrieved_file_id}")
print("\n==================================Top document==================================")
retrieved_filename =  results['metadatas'][0][0]
retrieved_file_id =  results['ids'][0][0]
retrieved_file =  results['documents'][0][0]
# print(f"filename: {retrieved_filename['filename']}\nfile_id: {retrieved_file_id}")


In [None]:
def remove_references(text):
    words = text.split()
    try:
        references_index = words.index('References')
        words_before_references = words[:references_index]
        cleaned_text = ' '.join(words_before_references)
    except ValueError:
        return text
    return cleaned_text

In [None]:
retrieved_docs_text = topk_retrieved_file[:100]
for i in range(len(retrieved_docs_text)):
    remove_references(retrieved_docs_text[i])
context = "\nExtracted documents:\n"
context += "".join([f"Document {str(i)}:::\n" + doc for i, doc in enumerate(retrieved_docs_text)])

final_prompt = RAG_PROMPT_TEMPLATE.format(question=user_query, context=context)

# print(final_prompt)

In [None]:
from langchain_core.output_parsers import JsonOutputParser
answer = llm(final_prompt)
print(answer)

In [None]:
# model = Doc2Vec(vector_size=100, window=5, min_count=1, workers=4, epochs=20)
# model.build_vocab(tagged_data)
# model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

In [None]:
# document_vectors = {}
# for url in pdf_urls:
#     pdf_text = extract_text_from_pdf(url)
#     preprocessed_text = preprocess_text(pdf_text)
#     vector_representation = model.infer_vector(preprocessed_text)
#     document_vectors[url] = vector_representation
#     print("Added: ", url)

In [None]:
# vector_dim = len(next(iter(document_vectors.values())))
# print(vector_dim)

In [None]:
# index = faiss.IndexFlatL2(vector_dim)
# vectors_np = np.array(list(document_vectors.values())).astype('float32')
# query_text = "use cases, applications, application areas"
# query_vector = model.infer_vector(query_text.lower().split(" "))
# query_vector_np = np.array([query_vector]).astype('float32')
# k = len(pdf_urls)

# index.reset()
# index.add(vectors_np)
# start_time = time.time()
# distances, indices = index.search(query_vector_np, k=k)
# end_time = time.time()
# retrieval_time = (end_time - start_time) * 1e6
# print("Retrieval time:", retrieval_time, "microseconds")

# results = pd.DataFrame({'distances': distances[0], 'ann': indices[0]})
# print(results.head())