In [1]:
import openreview
import requests
import pandas as pd
import json
import fitz
import io
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from langchain.prompts import PromptTemplate
from langchain.llms import HuggingFacePipeline
from langchain_core.output_parsers import JsonOutputParser
from transformers import pipeline
import torch

In [2]:
# OpenReview API client; no credentials are passed explicitly here.
client = openreview.api.OpenReviewClient(
    baseurl='https://api.openreview.net',
)

# Fetch every note whose venue id matches the ICLR 2023 conference.
venue_filter = {'venueid': 'ICLR.cc/2023/Conference'}
submissions = client.get_all_notes(content=venue_filter)

Getting V2 Notes: 100%|█████████▉| 3794/3798 [00:01<00:00, 2969.40it/s]


In [3]:
# One flat record per submission: the direct PDF link plus the metadata
# fields that the filtering and prompting steps read later on.
records = [
    {
        'URL': 'https://openreview.net/pdf?id=' + note.id,
        'Title': note.content['title'],
        'Keywords': note.content['keywords'],
        'Abstract': note.content['abstract'],
    }
    for note in submissions
]

In [4]:
# Spell out the column order so the frame layout is explicit.
column_order = ['URL', 'Title', 'Keywords', 'Abstract']
df = pd.DataFrame(records, columns=column_order)

In [5]:
# Quick sanity check of the first five submissions.
df.head(n=5)

Unnamed: 0,URL,Title,Keywords,Abstract
0,https://openreview.net/pdf?id=zzqBoIFOQ1,Guiding Safe Exploration with Weakest Precondi...,"[reinforcement learning, safe learning, safe e...",In reinforcement learning for safety-critical ...
1,https://openreview.net/pdf?id=zzL_5WoI3I,An Adaptive Entropy-Regularization Framework f...,"[Multi-Agent Reinforcement Learning, Entropy R...","In this paper, we propose an adaptive entropy-..."
2,https://openreview.net/pdf?id=zyfEWkV6it,AutoSparse: Towards Automated Sparse Training,"[sparsity, sparse training, deep learning]",Sparse training is emerging as a promising ave...
3,https://openreview.net/pdf?id=zyLVMgsZ0U_,Sampling is as easy as learning the score: the...,"[diffusion models, score-based generative mode...",We provide theoretical convergence guarantees ...
4,https://openreview.net/pdf?id=zufPou5foW,RoCourseNet: Distributionally Robust Training ...,"[Counterfactual Explanation, Algorithmic Recou...",Counterfactual (CF) explanations for machine l...


In [6]:
def contains_keywords(keyword_list, keywords=None):
    """Return True if any filter keyword exactly matches one of a paper's keywords.

    Matching is case-insensitive and compares whole keywords, not substrings.

    Args:
        keyword_list: The paper's keywords. Normally a list of strings; a
            single string is treated as one keyword, and a missing value
            (``None``, ``NaN`` or any other non-iterable) yields ``False``.
        keywords: Filter terms to look for. Defaults to the module-level
            ``keywords_to_filter``, which is resolved at call time.

    Returns:
        bool: True when at least one filter term is present.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(keyword_list, str):
        keyword_list = [keyword_list]

    try:
        # Lowercase once into a set instead of rebuilding a list per filter term.
        paper_keywords = {kw.lower() for kw in keyword_list if isinstance(kw, str)}
    except TypeError:
        # Missing keywords (None / NaN) are not iterable: nothing can match.
        return False

    if keywords is None:
        keywords = keywords_to_filter

    return any(keyword.lower() in paper_keywords for keyword in keywords)

In [7]:
# Terms that select a paper when one of them appears in its keyword list.
keywords_to_filter = [
    'llm',
    'rag',
    'retrieval augmented generation',
    'chatgpt',
    'gpt',
    'retrieval',
    'vector databases',
    'vector',
    'nlp',
]

# Boolean mask over the rows of df, then keep only the matching papers.
keyword_mask = df['Keywords'].apply(contains_keywords)
filtered_df = df.loc[keyword_mask]

In [8]:
# Number of papers that survived the keyword filter.
filtered_df.shape[0]

38

In [9]:
# Preview the first matches instead of rendering the whole frame.
filtered_df.head(10)

Unnamed: 0,URL,Title,Keywords,Abstract
102,https://openreview.net/pdf?id=yKbprarjc5B,Leveraging Large Language Models for Multiple ...,"[NLP, language models, multiple choice questio...",While large language models (LLMs) like GPT-3 ...
217,https://openreview.net/pdf?id=wCFB37bzud4,Bidirectional Language Models Are Also Few-sho...,"[prompting, prompt-based learning, mt5, t5, ma...",Large language models such as GPT-3 (Brown et ...
246,https://openreview.net/pdf?id=vaxnu-Utr4l,WikiWhy: Answering and Explaining Cause-and-Ef...,"[NLP, Question Answering, LLM, Dataset, Explan...",As large language models (LLMs) grow larger an...
354,https://openreview.net/pdf?id=tcbBPnfwxS,OPTQ: Accurate Quantization for Generative Pre...,"[compression, quantization, generative pre-tra...","Generative Pre-trained Transformer models, kno..."
748,https://openreview.net/pdf?id=mumZwT6OrEV,ULF: UNSUPERVISED LABELING FUNCTION CORRECTION...,"[nlp, weak supervision, text classification, s...",A way to overcome expensive and time-consuming...
1079,https://openreview.net/pdf?id=hY6M0JHl3uL,Linear Connectivity Reveals Generalization Str...,"[loss landscapes, OOD generalization, NLI, tex...","In the mode connectivity literature, it is wid..."
1166,https://openreview.net/pdf?id=gEvzRWqFoCO,Contrastive Novelty Learning: Anticipating Out...,"[selective prediction, open-set classification...","In many task settings, text classification mod..."
1289,https://openreview.net/pdf?id=dz8i-yzXeVg,Elicitation Inference Optimization for Multi-P...,"[alignment, large language models, LLMs, NLP, ...",In multi-principal-agent alignment scenarios s...
1337,https://openreview.net/pdf?id=dGdoZds9qAs,Data Feedback Loops: Model-driven Amplificatio...,"[feedback loops, bias amplification, deep lear...",Datasets scraped from the internet have been c...
1499,https://openreview.net/pdf?id=am22IukDiKf,Learning by Distilling Context,"[language models, NLP, prompting, distillation]",Language models significantly benefit from con...


In [10]:
READER_MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"

# 4-bit NF4 quantization with double quantization; compute runs in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# The quantization config must be handed to the loader; previously it was
# built but never used, so the model was loaded without quantization.
model = AutoModelForCausalLM.from_pretrained(
    READER_MODEL_NAME,
    quantization_config=bnb_config,
)
tokenizer = AutoTokenizer.from_pretrained(READER_MODEL_NAME)

# Text-generation pipeline that returns only the newly generated text
# (return_full_text=False), capped at 500 new tokens per call.
READER_LLM = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    do_sample=True,
    temperature=0.2,
    repetition_penalty=1.1,
    return_full_text=False,
    max_new_tokens=500,
    pad_token_id=tokenizer.eos_token_id
)

# LangChain wrapper around the Hugging Face pipeline; used by the analysis class.
llm = HuggingFacePipeline(pipeline=READER_LLM)

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [11]:
class PaperAnalysisPipeline:
    """Download a paper PDF, extract key sections and ask the LLM about RAG usage.

    Section extraction is heuristic: it works on the lowercased plain text of
    the PDF and looks for well-known section titles. Extracted sections are
    therefore returned in lowercase.
    """

    # Seconds to wait for the PDF download before giving up.
    REQUEST_TIMEOUT = 30

    def extract_text_from_pdf(self, paper_url):
        """Download the PDF at ``paper_url`` and return the text of all pages.

        Args:
            paper_url: URL of the PDF to download.

        Returns:
            str: The concatenated text of every page, in page order.

        Raises:
            requests.RequestException: On connection problems, timeouts or a
                non-success HTTP status.
        """
        response = requests.get(paper_url, timeout=self.REQUEST_TIMEOUT)
        # Fail loudly on 4xx/5xx instead of handing an error page to the PDF parser.
        response.raise_for_status()
        # filetype tells PyMuPDF how to interpret the in-memory bytes.
        with fitz.open(stream=response.content, filetype="pdf") as doc:
            return "".join(page.get_text() for page in doc)

    @staticmethod
    def _is_heading_prefix(text, index):
        """Return True when only digits, dots and blanks precede ``index`` on its line."""
        line_start = text.rfind('\n', 0, index) + 1
        return all(ch.isdigit() or ch in ' \t.' for ch in text[line_start:index])

    @classmethod
    def _locate(cls, text, phrase, start=0):
        """Return the index of ``phrase`` at or after ``start``, or -1 if absent.

        A hit that looks like a section heading (it starts its line, optionally
        after a section number) is preferred; otherwise the first plain
        occurrence is returned.
        """
        first_hit = text.find(phrase, start)
        hit = first_hit
        while hit != -1:
            if cls._is_heading_prefix(text, hit):
                return hit
            hit = text.find(phrase, hit + 1)
        return first_hit

    def _extract_section(self, text, title, end_markers, max_paragraphs):
        """Return the first ``max_paragraphs`` paragraphs of the section ``title``.

        The section runs from ``title`` up to the first entry of ``end_markers``
        (tried in order) that occurs after the title's own line, or to the end
        of the text when no marker is found. Returns "" when ``title`` is absent.
        """
        text = text.lower()
        start = self._locate(text, title)
        if start == -1:
            return ""

        # Look for end markers only after the heading line, so a heading such
        # as "conclusion and future work" does not end the section immediately.
        newline = text.find('\n', start)
        body_start = len(text) if newline == -1 else newline

        end = len(text)
        for marker in end_markers:
            marker_pos = self._locate(text, marker, body_start)
            if marker_pos != -1:
                end = marker_pos
                if self._is_heading_prefix(text, marker_pos):
                    # Cut at the line start so the next section's number is excluded.
                    end = text.rfind('\n', 0, marker_pos) + 1
                break

        paragraphs = text[start:end].strip().split('\n\n')
        return '\n\n'.join(paragraphs[:max_paragraphs])

    def extract_introduction(self, text):
        """Return up to the first three paragraphs of the introduction (lowercased).

        The introduction ends at "related work", else "background", else
        "previous work". ("related works" needs no separate check because it
        contains "related work".) Returns "" when no introduction is found.
        """
        return self._extract_section(
            text,
            "introduction",
            ("related work", "background", "previous work"),
            3,
        )

    def extract_conclusion(self, text):
        """Return the first paragraph of the conclusion (lowercased).

        The conclusion ends at "future work", else "references", else at the
        end of the text. Returns "" when no conclusion is found.
        """
        return self._extract_section(
            text,
            "conclusion",
            ("future work", "references"),
            1,
        )

    def stitch_relevant_sections(self, title, abstract, conclusion):
        """Join title, abstract and conclusion into one context string.

        Parts are separated by a blank line so their words do not run together;
        empty or non-string parts are skipped.
        """
        parts = (title, abstract, conclusion)
        return "\n\n".join(
            part.strip() for part in parts
            if isinstance(part, str) and part.strip()
        )

    def ask_llm_with_context(self, context):
        """Ask the module-level ``llm`` the three RAG questions about ``context``.

        Args:
            context: Text of the paper sections to reason over.

        Returns:
            The value returned by ``llm.invoke`` for the assembled prompt.
        """
        prompt = f"""
        Using the information contained in the context,
        give a comprehensive answer to the question.
        If the answer to the first question is affirmative, answer the questions that follow.
        Context:
        {context}
        ---
        Now here is the question you need to answer.

        Question 1: Does this paper use retrieval-augmented generation (RAG)?
        Question 2: If the answer to Question 1 is yes, how and where is the LLM used to solve real-world problems?
        Question 3: If the answer to Question 1 is yes, does it talk about applying RAG in traditional natural language 
                    processing (NLP) applications? Which NLP applications does it mention?
        """
        # invoke() replaces the deprecated direct call llm(prompt).
        answer = llm.invoke(prompt)
        return answer

    def print_text(self, text):
        """Print ``text`` to standard output."""
        print(text)

In [None]:
# A single pipeline instance is enough: the class keeps no per-paper state.
analysis_pipeline = PaperAnalysisPipeline()

# Walk the filtered papers in order, numbering them from 1 for display.
for paper_number, paper in enumerate(filtered_df.itertuples(index=False), start=1):
    print(f"\n\nPAPER {paper_number}: {paper.Title}")
    try:
        text = analysis_pipeline.extract_text_from_pdf(paper.URL)
    except requests.RequestException as error:
        # One unreachable PDF should not abort the whole batch.
        print(f"Skipping paper, PDF download failed: {error}")
        continue
    conclusion = analysis_pipeline.extract_conclusion(text)
    context = analysis_pipeline.stitch_relevant_sections(paper.Title, paper.Abstract, conclusion)
    llm_response = analysis_pipeline.ask_llm_with_context(context)
    analysis_pipeline.print_text(llm_response)



PAPER 1: Leveraging Large Language Models for Multiple Choice Question Answering


  warn_deprecated(



        Answer:
        No, this paper does not use retrieval-augmented generation (RAG). The focus of this paper is on leveraging large language models (LLMs) for multiple choice question answering (MCQA), specifically exploring the effectiveness of presenting questions and answer options to LLMs jointly instead of as separate inputs. The authors argue that this "natural" approach allows the LLM to explicitly compare answer options, reducing computational costs and mitigating the effects of tokenization scheme and answer option representations on answer selection. They also introduce the concept of multiple choice symbol binding (MCSB) ability, which refers to an LLM's ability to associate answer options with symbols that represent them. The authors demonstrate that a model with high MCSB ability performs much better with the natural approach than with the traditional approach across 20 diverse datasets and largely closes the gap with the state of the art (SOTA), suggesting that the 