<a href="https://colab.research.google.com/github/g-larios/arXiv_RAG/blob/main/Website_Author_and_Time_Summary_Using_ArXiv_and_Gemini.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Query based on author information or date

For an author based query, the arXiv API needs

- full_name: the author's full name. The expected format is 'first name' + 'middle name' + 'surname', separated by spaces; the middle name is optional.

- cat: category in (astro-ph, cond-mat, gr-qc, hep-ex, hep-lat, hep-th, hep-ph, math-ph, nlin, nucl-ex, nucl-th, physics, quant-ph, math, CoRR, q-bio, q-fin, stat, eess, econ). See https://arxiv.org/category_taxonomy for details.

# Installing Packages and Importing Relevant Modules

In [None]:
# feedparser parses the Atom feed returned by the arXiv API.
%pip install -q feedparser

# LangChain and the Google Generative AI bindings used to build the Gemini prompt chains.
%pip install -q langchain
%pip install -q langchain-community
%pip install -qU google-generativeai
%pip install -qU langchain-google-genai

# Streamlit provides the web UI.
# NOTE(review): streamlit_chat is not imported by any cell visible in this notebook - confirm it is still needed.
%pip install --quiet streamlit
%pip install --quiet streamlit_chat

# pyngrok opens the public tunnel to the locally running Streamlit app.
%pip install pyngrok

  Preparing metadata (setup.py) ... [?25l[?25hdone
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/81.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━[0m [32m71.7/81.3 kB[0m [31m3.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.3/81.3 kB[0m [31m763.6 kB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m51.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.5/49.5 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.3/41.3 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

# Set up the LLM Model and Langchain Chain

Here we will use the Gemini model for inference and LangChain to create a small prompt pipeline.

In [None]:
import os
import getpass

# Read the Google AI key with a hidden prompt and publish it through the
# GOOGLE_API_KEY environment variable for the rest of the session.
os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Google AI API key: ")

from pyngrok import ngrok, conf
print("Enter your authtoken, which can be copied from https://dashboard.ngrok.com/get-started/your-authtoken")
# Read the ngrok authtoken with a hidden prompt and store it on pyngrok's
# default configuration object.
conf.get_default().auth_token = getpass.getpass()

Enter your authtoken, which can be copied from https://dashboard.ngrok.com/get-started/your-authtoken


In [None]:
%%writefile StreamlitApp.py
import urllib, urllib.request
import feedparser
import math
import datetime
from dateutil.relativedelta import relativedelta

from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import ChatPromptTemplate

# Gemini chat model shared by the author chain and the date chain below.
# NOTE(review): max_tokens=50000 may be larger than the output limit of
# gemini-1.5-flash - confirm against the Gemini model documentation.
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
    temperature=0,
    max_tokens=50000,
    timeout=None,
    max_retries=5,
    # other params..
)

# Prompt for Author Queries
# The system message fixes the analyst role and the report rules; the user
# message is filled from the "Prompt" and "Context" template variables.
# NOTE(review): the system text describes the context as ".json", while the
# retrieval helpers in this file build it with str() on Python dicts; it also
# contains typos ("a analyst", "Do not summaries") - confirm before rewording,
# since any change to this text changes what is sent to the model.
prompt_author = ChatPromptTemplate.from_messages([
    ("system", "You are a analyst that specializes in understanding the topic of interest of a given author or time period. "
    "You are given a set of papers by a certain author and you are to give a report on how the interest of this author have changed over time. The information is in a .json file format. "
    "The information contains the id of the paper, the date it was published, list of authors, and a summary of the paper. "
    "Create a citation of papers with proper bibliography to support why you think the author worked on the topic you state. "
    "Put the references at the end of and use numbers to cite through the body. "
    "Restrict the number of references to 20. "
    "Do not summaries the author's papers. "
    "Give an in-depth summary of their career but keep it concise. "
    "Focus only on the author the user asked about and disregard any papers that do not contain that author's name."),
    ("user", "{Prompt}\nContext:\n{Context}"),
    ])

# Prompt for Date Queries
# Same two-message layout as the author prompt, with rules aimed at the
# topics of a date range instead of one author's work.
prompt_date = ChatPromptTemplate.from_messages([
    ("system", "You are an analyst specializing in identifying the main topics of interest during a given time period. "
    "You are given a set of papers published within a specified date range, provided in .json file format. "
    "The information includes the id of the paper, the date it was published, the list of authors, and a summary of the paper. "
    "Your task is to determine the primary topics of interest during the specified time period based on the given data. "
    "Provide a clear summary of these topics, supported by citations of papers using their ids, and include a bibliography at the end. "
    "Use numbers to cite references throughout the body of your report. "
    "Restrict the number of references to 20. "
    "Do not summarize individual papers but focus on identifying and describing the key topics of interest during the given period. "
    "Ignore any papers published outside the specified date range."),
    ("user", "{Prompt}\nContext:\n{Context}"),
    ])

# Simple author and date chains
# Each chain pipes its prompt template into the shared model; both expect a
# dict with the keys "Prompt" and "Context".
chain_author = prompt_author | llm
chain_date = prompt_date | llm

Writing StreamlitApp.py


# Query ArXiv for Papers

We use a given author's name to pull at most `max_results` papers (1000 in the code below) from arXiv through its API.

In [None]:
%%writefile -a StreamlitApp.py
def get_arxiv_docs_author(full_name, category):
    """
    Call the ArXiv API for an author's papers in one category and join the
    matching entries (which contain the abstract) into a single string.

    Only entries whose author list contains full_name are kept. The comparison
    ignores letter case and differences in whitespace. When more than 100
    entries match, an evenly spaced subset of at most 100 is used.

    Parameters:
        full_name: author's full name, with the parts separated by spaces.
        category: ArXiv category from which the papers are retrieved.

    Return:
        output_str: String containing the paper id, publishing date,
                    author names, title and abstract of the papers.
                    Empty when full_name is blank or no entry matches.
    """
    # Local import so the function does not depend on urllib.parse having been
    # loaded as a side effect of another import.
    from urllib.parse import quote

    name_parts = full_name.split()
    if not name_parts:
        # A blank name can never match an author, so skip the request.
        return ""

    base_url = 'http://export.arxiv.org/api/query?'
    max_results = 1000
    max_papers_used = 100

    # Percent-encode every user-supplied piece so that accents, apostrophes or
    # stray characters cannot corrupt the URL. Plain ASCII names are unchanged.
    author_term = "+".join(quote(part, safe='') for part in name_parts)
    category_term = quote(category.strip(), safe='')
    search_query = (
        f'au:{author_term}+AND+cat:{category_term}'
        '&sortBy=submittedDate&sortOrder=descending'
    )
    query = f'search_query={search_query}&max_results={max_results}'

    # The context manager closes the HTTP response even if parsing fails.
    with urllib.request.urlopen(base_url + query) as response:
        feed = feedparser.parse(response.read().decode('utf-8'))

    def normalise(name):
        # Collapse whitespace and fold case so "john  smith" equals "John Smith".
        return " ".join(name.split()).casefold()

    wanted = normalise(full_name)
    output = []

    # Select and record relevant information for each entry, if the query
    # author is among the authors of the entry.
    for paper in feed.entries:
        authors = [aut['name'] for aut in paper.authors]
        if any(normalise(name) == wanted for name in authors):
            output.append({
                'id': paper.id.split('/abs/')[-1],
                'published': paper.published,
                'authors': authors,
                'title': paper.title,
                'summary': paper.summary,
            })

    # Limit the number of papers used to 100 (important for when
    # using gemini-1.5-pro on a free account) to limit the number of tokens used.
    if len(output) > max_papers_used:
        skip_size = math.ceil(len(output) / max_papers_used)
        output_used = output[::skip_size]
    else:
        output_used = output

    return "".join(f"{paper_info}\n\n" for paper_info in output_used)

Appending to StreamlitApp.py


We use a range of dates to pull at most `max_results` papers (1000 in the code below) from arXiv through its API.

In [None]:
%%writefile -a StreamlitApp.py
def _to_arxiv_day(value):
    """Return value as the YYYYMMDD text used in ArXiv submittedDate ranges."""
    if isinstance(value, datetime.date):
        return value.strftime('%Y%m%d')
    return value.replace('-', '')


def get_arxiv_docs_date(start, end, category):
    """
    Call the ArXiv API for the papers submitted to one category within a date
    range and join the entries (which contain the abstract) into a single string.

    When more than 100 entries are returned, an evenly spaced subset of at
    most 100 is used.

    Parameters:
        start: start date of the date range, as a 'YYYY-MM-DD' string or a
               datetime.date.
        end: end date of the date range, in the same form as start.
        category: ArXiv category from which the papers are retrieved.

    Return:
        output_str: String containing the paper id, publishing date,
                    author names, title and abstract of the papers.
                    Empty when start is after end or nothing is returned.
    """
    # Local import so the function does not depend on urllib.parse having been
    # loaded as a side effect of another import.
    from urllib.parse import quote

    base_url = 'http://export.arxiv.org/api/query?'
    max_results = 1000
    max_papers_used = 100

    start = _to_arxiv_day(start)
    end = _to_arxiv_day(end)
    if start > end:
        # YYYYMMDD strings sort chronologically; a reversed range has no papers.
        return ""

    # Query: the range covers 00:00 on the start day to 23:59 on the end day.
    category_term = quote(category.strip(), safe='')
    search_query = f'cat:{category_term}+AND+submittedDate:[{start}0000+TO+{end}2359]'
    query = f'search_query={search_query}&start=0&max_results={max_results}'

    # The context manager closes the HTTP response even if parsing fails.
    with urllib.request.urlopen(base_url + query) as response:
        feed = feedparser.parse(response.read().decode('utf-8'))

    output = [
        {
            'id': paper.id.split('/abs/')[-1],
            'published': paper.published,
            'authors': [aut['name'] for aut in paper.authors],
            'title': paper.title,
            'summary': paper.summary,
        }
        for paper in feed.entries
    ]

    # Limit the number of papers used to 100 (important for when
    # using gemini-1.5-pro on a free account) to limit the number of tokens used.
    if len(output) > max_papers_used:
        skip_size = math.ceil(len(output) / max_papers_used)
        output_used = output[::skip_size]
    else:
        output_used = output

    return "".join(f"{paper_info}\n\n" for paper_info in output_used)

Appending to StreamlitApp.py


## how_to_text

Write a guide for how to use the Author tab

In [None]:
%%writefile -a StreamlitApp.py
# Markdown guide shown in the right-hand column of the Author tab until a
# report has been generated. The quoted labels must match the widgets in the tab.
how_to_text_author = """
## **What is ArXiv Author Summarizer?**
This application is useful for quickly understanding the research interests and impact of an author based on their publications indexed in ArXiv. It is especially helpful for students, researchers, and collaborators looking to familiarize themselves with a particular scholar's work.

---

## **How to Use the ArXiv Author Summarizer**
1. **Enter Author Name**:
   - Start by typing the FULL NAME of the author you want to focus on in the "Author Name" input field.

2. **View the ArXiv Categories**:
   - The app provides a link to the list of categories on ArXiv: [https://arxiv.org/category_taxonomy](https://arxiv.org/category_taxonomy). You can explore this link for reference.

3. **Select Area of Expertise**:
   - Choose the broad field the author works in from the dropdown menu labeled **"What area does this researcher work in?"**
     Options include:
     - **Computer Science**
     - **Mathematics**
     - **Physics**

4. **Choose a Specific Category**:
   - Based on the area of expertise selected, a second dropdown menu labeled **"Category you wish to focus on:"** will appear with a list of specific categories (e.g., `cs.AI`, `math.AG`, or `cond-mat.dis-nn`).
   - Select the most relevant category.
   - If none of the options fit, you can choose **"other,"** which will prompt you to manually input a category ID.

5. **Click "Summarize Author!"**:
   - Once you've provided the necessary inputs, click the **Summarize Author!** button.
   - The application will retrieve relevant documents from ArXiv about the author and generate a detailed summary of their research focus.

6. **View the Output**:
   - The summary of the author’s research and contributions will appear on the right side of the application.
"""

Appending to StreamlitApp.py


Write a guide for how to use the Date tab

In [None]:
%%writefile -a StreamlitApp.py
# Markdown guide shown in the right-hand column of the Date tab until a
# report has been generated. The quoted labels must match the widgets in the tab.
how_to_text_date = """
## **What is the ArXiv Date Range Summarizer?**

This application is useful for quickly identifying the topics of interest and significant trends in research published on ArXiv within a specific date range. It is particularly helpful for students, researchers, and analysts who want to understand the focus of academic publications during a certain time period.

---

## **How to Use the ArXiv Date Range Summarizer**

1. **Specify Date Range**:
   - Start by entering the **Start Date** and **End Date** in the respective input fields to define the time period you want to analyze. The end date cannot be more than one month after the start date.

2. **View the ArXiv Categories**:
   - The app provides a link to the list of categories on ArXiv: [https://arxiv.org/category_taxonomy](https://arxiv.org/category_taxonomy). You can explore this link for reference.

3. **Select Area of Expertise**:
   - Choose the broad field of research you want to focus on from the dropdown menu labeled **"What area does this research focus on?"**
     Options include:
     - **Computer Science**
     - **Mathematics**
     - **Physics**

4. **Choose a Specific Category**:
   - Based on the area of expertise selected, a second dropdown menu labeled **"Category you wish to focus on:"** will appear with a list of specific categories (e.g., `cs.AI`, `math.AG`, or `cond-mat.dis-nn`).
   - Select the most relevant category.
   - If none of the options fit, you can choose **"other,"** which will prompt you to manually input a category ID.

5. **Click "Summarize Dates!"**:
   - Once you've provided the necessary inputs, click the **Summarize Dates!** button.
   - The application will retrieve relevant documents from ArXiv published within the specified time period and generate a detailed summary of the topics of interest.

6. **View the Output**:
   - The summary of research topics and focus areas during the specified date range will appear on the right side of the application.
"""


Appending to StreamlitApp.py


## Streamlit App

In [None]:
%%writefile -a StreamlitApp.py
import streamlit as st
# Page-level settings (browser tab title and icon) followed by the visible
# page heading.
# NOTE(review): Streamlit has traditionally required set_page_config to run
# before any other Streamlit command - keep it first unless confirmed otherwise.
st.set_page_config(page_title="ArXiv Topics Summarizer", page_icon=":robot:")
st.title("ArXiv Topics Summarizer :robot_face:")

# Create two tabs for the two types of queries.
# tab1 hosts the author query UI and tab2 the date-range query UI.
tab1, tab2 = st.tabs(["Author", "Date"])

# Helper function to generate a drop down menu to select a ArXiv category
def show_categories(group, key):
    """
    This function creates a drop down box whose options depend on the group
    chosen in the previous drop down box.

    Parameters:
        group: The group chosen in the previous drop down box
               ("Computer Science", "Mathematics" or "Physics").
        key: "auth" or "date" to keep the drop down boxes separate for the two tabs.

    Return:
        category: The category selected in the drop down box. Every list ends
                  with "other". An unrecognised group offers only "other",
                  instead of failing on an unassigned variable.
    """
    categories_by_group = {
        "Computer Science": ("cs.AI", "cs.IT", "cs.LG", "cs.NA", "other"),
        # "math.A" is not an ArXiv category. "math.AT" is inferred from the
        # alphabetical order and theme of this list - NOTE(review): confirm it
        # was not meant to be "math.AP".
        "Mathematics": ("math.AG", "math.AT", "math.CT", "math.KT", "math.MP",
                        "math.QA", "math.RT", "math.SG", "math.SP", "other"),
        "Physics": ("cond-mat.dis-nn", "cond-mat.mes-hall", "cond-mat.supr-con",
                    "cond-mat.stat-mech", "hep-ex", "hep-th", "math-ph", "other"),
    }
    options = categories_by_group.get(group, ("other",))

    # The key helps keep different selectboxes for the two tabs
    return st.selectbox("Category you wish to focus on: ", options, key="cat" + key)

Appending to StreamlitApp.py


### Tab 1: Author

In [None]:
%%writefile -a StreamlitApp.py
with tab1:
    # Create two columns. Left has text input for author's name and drop down to select the category.
    # Right column has the generated response.
    col1, col2 = st.columns([5, 8])

    with col1:
        st.session_state.author_name = st.text_input("Author Name", "Name")
        st.write("Select a category which you wish to focus on. Here is the list of categories on ArXiv:\nhttps://arxiv.org/category_taxonomy")
        group_auth = st.selectbox("What area does this researcher work in?",
                            ("Computer Science", "Mathematics", "Physics"),
                            key="group_auth", placeholder = "---")

        st.session_state.category_1 = show_categories(group_auth, "auth")

        # If the category is "other" create a text_input to manually enter the category.
        # The explicit key keeps this widget distinct from the identically
        # configured text_input of the Date tab. Without it Streamlit reports
        # a duplicate widget when both tabs have "other" selected.
        if st.session_state.category_1 == "other":
            st.session_state.category_1 = st.text_input("Please enter the category ID: ", "hep-th",
                                                        key="other_cat_auth")

        report_shown = False
        if st.button("Summarize Author!"):
            author = st.session_state.author_name.strip()
            if not author:
                st.warning("Please enter the author's full name.")
            else:
                # Once the submit button is pressed retrieve context and generate response.
                context = get_arxiv_docs_author(author, st.session_state.category_1)
                if not context:
                    # With no papers there is nothing to ground the report on,
                    # so do not call the model at all.
                    st.warning(f"No papers by {author} were found in {st.session_state.category_1}. "
                               "Check the spelling of the full name and the category.")
                else:
                    st.session_state.llm_result_author = chain_author.stream(
                        {"Prompt": f"Can you tell me about the interests of {author}.",
                         "Context": context})
                    with col2:
                        # Stream out the response in the right column
                        aut_tit = f"## {author} \n\n"
                        st.write(aut_tit)
                        sum_res = st.write_stream(st.session_state.llm_result_author)
                        # Store the final output
                        st.session_state["res_auth"] = aut_tit + sum_res
                    report_shown = True

        if not report_shown:
            # If just the options are being tinkered with, reprint the old output.
            # If there is no previous response (new run), print the how-to text for author.
            with col2:
                st.write(st.session_state.get("res_auth") or how_to_text_author)

### Tab 2: Date

In [None]:
%%writefile -a StreamlitApp.py
with tab2:
    # Create two columns. Left has text input for start and end date, and drop down to select the category.
    # Right column has the generated response.
    col1, col2 = st.columns([5, 8])

    with col1:
        # "today" is the documented value for starting the picker on the current date.
        st.session_state.start_date = st.date_input(label = "Start Date", value="today", format="YYYY/MM/DD")
        # The end date may not precede the start date nor lie more than one month after it.
        st.session_state.end_date = st.date_input(label = "End Date", value=st.session_state.start_date, min_value=st.session_state.start_date,
                                                  max_value=st.session_state.start_date + relativedelta(months=1) , format="YYYY/MM/DD")
        st.write("Select a category which you wish to focus on. Here is the list of categories on ArXiv:\nhttps://arxiv.org/category_taxonomy")
        group_date = st.selectbox("What area does this research focus on?",
                            ("Computer Science", "Mathematics", "Physics"),
                            key="group_date", placeholder = "---")

        st.session_state.category_2 = show_categories(group_date, "date")

        # If the category is "other" create a text_input to manually enter the category.
        # The explicit key keeps this widget distinct from the identically
        # configured text_input of the Author tab. Without it Streamlit reports
        # a duplicate widget when both tabs have "other" selected.
        if st.session_state.category_2 == "other":
            st.session_state.category_2 = st.text_input("Please enter the category ID: ", "hep-th",
                                                        key="other_cat_date")

        report_shown = False
        if st.button("Summarize Dates!"):
            start_text = str(st.session_state.start_date)
            end_text = str(st.session_state.end_date)
            # Once the submit button is pressed retrieve context and generate response.
            context = get_arxiv_docs_date(start_text, end_text, st.session_state.category_2)
            if not context:
                # With no papers there is nothing to ground the report on,
                # so do not call the model at all.
                st.warning(f"No papers were found in {st.session_state.category_2} "
                           f"between {start_text} and {end_text}.")
            else:
                st.session_state.llm_result_date = chain_date.stream(
                    {"Prompt": f"Can you tell me about the interests between {start_text} to {end_text}.",
                     "Context": context})
                with col2:
                    # Stream out the response in the right column
                    date_tit = f"## {start_text} to {end_text}\n\n"
                    st.write(date_tit)
                    sum_res = st.write_stream(st.session_state.llm_result_date)
                    # Store the final output
                    st.session_state["res_date"] = date_tit + sum_res
                report_shown = True

        if not report_shown:
            # If just the options are being tinkered with, reprint the old output.
            # If there is no previous response (new run), print the how-to text for date.
            with col2:
                st.write(st.session_state.get("res_date") or how_to_text_date)

Appending to StreamlitApp.py


# Host the Web App Using ngrok

In [None]:
import getpass
from pyngrok import ngrok, conf

# Put the free custom_domain provided by ngrok here
custom_domain = ""

# Open a tunnel to local port 8501 under that domain, then report its public address.
tunnel = ngrok.connect('8501', hostname=custom_domain)
public_url = tunnel.public_url
print("Here is your website link:\n", public_url)

Here is your website link:
 https://osprey-fit-loosely.ngrok-free.app


In [None]:
# Launch the Streamlit app written to StreamlitApp.py on port 8501, the same
# port the ngrok tunnel above was connected to. Standard output of the
# command is discarded by the >/dev/null redirection.
!streamlit run --server.port 8501 StreamlitApp.py >/dev/null