In [1]:
import arxiv
import json
import os
from typing import List
from dotenv import load_dotenv
from google import genai

In [2]:
# Root directory for saved search results; each topic gets its own
# sub-directory containing a papers_info.json file.
PAPER_DIR = "papers"

In [3]:
def search_papers(topic: str, max_results: int = 5) -> List[str]:
    """
    Search for papers on arXiv based on a topic and store their information.

    The metadata of every hit is merged into
    ``<PAPER_DIR>/<topic directory>/papers_info.json`` (existing entries for
    other papers are kept, entries with the same ID are overwritten).

    The topic directory name is the lower-cased topic with spaces replaced by
    underscores. Path separators and characters that are illegal in Windows
    file names are also replaced by underscores so that a topic always maps to
    exactly one directory directly below ``PAPER_DIR`` (which is the only
    level ``extract_info`` scans).

    Args:
        topic: The topic to search for
        max_results: Maximum number of results to retrieve (default: 5)

    Returns:
        List of paper IDs found in the search
    """

    # Use arxiv to find the papers
    client = arxiv.Client()

    # Search for the most relevant articles matching the queried topic
    search = arxiv.Search(
        query=topic,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.Relevance
    )

    papers = client.results(search)

    # Build a filesystem-safe, single-level directory name for this topic.
    # Without this, "a/b" would create nested directories, ".." would escape
    # PAPER_DIR, and characters such as "?" or ":" would fail on Windows.
    unsafe_chars = str.maketrans({ch: "_" for ch in '<>:"/\\|?*'})
    topic_dir = topic.lower().replace(" ", "_").translate(unsafe_chars)
    # Trailing dots are not allowed on Windows; this also neutralises "." / "..".
    topic_dir = topic_dir.rstrip(".") or "_"

    # Create directory for this topic
    path = os.path.join(PAPER_DIR, topic_dir)
    os.makedirs(path, exist_ok=True)

    file_path = os.path.join(path, "papers_info.json")

    # Try to load existing papers info
    try:
        with open(file_path, "r") as json_file:
            papers_info = json.load(json_file)
    except (FileNotFoundError, json.JSONDecodeError):
        papers_info = {}
    if not isinstance(papers_info, dict):
        # A valid-JSON file with an unexpected shape cannot be merged into.
        papers_info = {}

    # Process each paper and add to papers_info
    paper_ids = []
    for paper in papers:
        short_id = paper.get_short_id()
        paper_ids.append(short_id)
        papers_info[short_id] = {
            'title': paper.title,
            'authors': [author.name for author in paper.authors],
            'summary': paper.summary,
            'pdf_url': paper.pdf_url,
            'published': str(paper.published.date())
        }

    # Save updated papers_info to json file
    with open(file_path, "w") as json_file:
        json.dump(papers_info, json_file, indent=2)

    print(f"Results are saved in: {file_path}")

    return paper_ids

In [5]:
# Example: search arXiv for "computers" using the default max_results and
# save the metadata; the returned paper IDs are shown as the cell output.
search_papers("computers")

Results are saved in: papers\computers\papers_info.json


['1310.7911v2',
 'math/9711204v1',
 '2208.00733v1',
 '2504.07020v1',
 '2403.03925v1']

In [6]:
def extract_info(paper_id: str) -> str:
    """
    Search for information about a specific paper across all topic directories.

    Every sub-directory of ``PAPER_DIR`` is checked (in sorted order, so the
    result is deterministic when the same ID was saved under several topics)
    for a ``papers_info.json`` file containing ``paper_id``.

    Args:
        paper_id: The ID of the paper to look for

    Returns:
        JSON string with paper information if found, error message if not found
    """
    not_found = f"There's no saved information related to paper {paper_id}."

    # Nothing has been saved yet (e.g. fresh kernel, no search run so far):
    # report "not found" instead of letting os.listdir raise.
    if not os.path.isdir(PAPER_DIR):
        return not_found

    for item in sorted(os.listdir(PAPER_DIR)):
        file_path = os.path.join(PAPER_DIR, item, "papers_info.json")
        # Also skips plain files in PAPER_DIR: nothing can exist below them.
        if not os.path.isfile(file_path):
            continue

        try:
            with open(file_path, "r") as json_file:
                papers_info = json.load(json_file)
        except (OSError, json.JSONDecodeError) as e:
            # Best effort: an unreadable/corrupt topic file must not hide others.
            print(f"Error reading {file_path}: {str(e)}")
            continue

        if isinstance(papers_info, dict) and paper_id in papers_info:
            return json.dumps(papers_info[paper_id], indent=2)

    return not_found

In [7]:
# Example: look up the stored metadata for one of the paper IDs returned by
# the search above; the result is a JSON-formatted string.
extract_info('1310.7911v2')

'{\n  "title": "Compact manifolds with computable boundaries",\n  "authors": [\n    "Zvonko Iljazovic"\n  ],\n  "summary": "We investigate conditions under which a co-computably enumerable closed set\\nin a computable metric space is computable and prove that in each locally\\ncomputable computable metric space each co-computably enumerable compact\\nmanifold with computable boundary is computable. In fact, we examine the notion\\nof a semi-computable compact set and we prove a more general result: in any\\ncomputable metric space each semi-computable compact manifold with computable\\nboundary is computable. In particular, each semi-computable compact\\n(boundaryless) manifold is computable.",\n  "pdf_url": "http://arxiv.org/pdf/1310.7911v2",\n  "published": "2013-10-29"\n}'

### Tool Schema

Here is the schema for each tool that we will provide to the LLM.
We are going to pass these tools to the LLM (Claude/Gemini).
Then we will build a chatbot that takes in these tools and knows when to call them to return the data.
Each tool we define has a "name", a "description", and a schema that its input must follow.
The LLM will not call the functions itself; we are going to write the code that calls those functions and passes the data back to the model.
These tools allow the model to extend its functionality, so instead of saying 'I don't know' or hallucinating, it can give us the answer that we want.


In [8]:
# Tool definitions in the format used by the Anthropic library for Claude
# models: each entry has a "name", a "description" and a JSON-Schema style
# "input_schema" describing its arguments. The names and arguments mirror the
# search_papers and extract_info functions defined in this notebook.
tools_anthropic = [
    {
        "name": "search_papers",
        "description": "Search for papers on arXiv based on a topic and store their information.",
        "input_schema": {
            "type": "object",
            "properties": {
                "topic": {
                    "type": "string",
                    "description": "The topic to search for"
                }, 
                "max_results": {
                    "type": "integer",
                    "description": "Maximum number of results to retrieve",
                    "default": 5
                }
            },
            "required": ["topic"]
        }
    },
    {
        "name": "extract_info",
        "description": "Search for information about a specific paper across all topic directories.",
        "input_schema": {
            "type": "object",
            "properties": {
                "paper_id": {
                    "type": "string",
                    "description": "The ID of the paper to look for"
                }
            },
            "required": ["paper_id"]
        }
    }
]

Tools for Google genai library with Gemini model 

In [9]:
from google.genai import types
def _to_gemini_tool(tool):
    """Wrap one Anthropic-style tool definition in a Gemini ``types.Tool``."""
    # Copy the input schema, leaving out the keys that are filtered for Gemini.
    parameters = {}
    for key, value in tool.get("input_schema").items():
        if key in ("additionalProperties", "$schema"):
            continue
        parameters[key] = value

    declaration = {
        "name": tool.get("name"),
        "description": tool.get("description"),
        "parameters": parameters,
    }
    # One Tool object per function declaration.
    return types.Tool(function_declarations=[declaration])


# Gemini counterparts of the Anthropic tool definitions, in the same order.
tools_google = list(map(_to_gemini_tool, tools_anthropic))

In [10]:
# Dispatch table: tool name (as exposed to the model) -> Python function.
mapping_tool_function = {
    tool.__name__: tool
    for tool in (search_papers, extract_info)
}

def execute_tool(tool_name, tool_args):
    """
    Run the tool requested by the model and return its result as text.

    Args:
        tool_name: Key into ``mapping_tool_function``.
        tool_args: Mapping of keyword arguments for the tool; ``None`` or an
            empty mapping means "no arguments".

    Returns:
        A string that can be sent back to the model:
        - a notice listing the available tools if ``tool_name`` is unknown
          (the model may request a tool that does not exist, and a readable
          reply lets it recover instead of aborting the query with KeyError),
        - a fixed notice if the tool returned ``None``,
        - list items joined with ", " (items are converted with ``str``),
        - dictionaries as indented JSON,
        - ``str(result)`` for anything else.
    """
    tool = mapping_tool_function.get(tool_name)
    if tool is None:
        available = ', '.join(mapping_tool_function)
        return f"Unknown tool '{tool_name}'. Available tools: {available}"

    result = tool(**(tool_args or {}))
    print(f"execute_tools result:{result}")

    if result is None:
        return "The operation completed but didn't return any results."

    if isinstance(result, list):
        # str() each item so lists of non-strings do not break the join.
        return ', '.join(str(item) for item in result)

    if isinstance(result, dict):
        # Convert dictionaries to formatted JSON strings
        return json.dumps(result, indent=2)

    # For any other type, convert using str()
    return str(result)

In [11]:
# Load variables from a .env file into the environment; the Gemini API key
# is read from the environment when the client is created.
load_dotenv()

True

In [12]:

# Gemini API client; the key comes from the GEMINI_KEY environment variable
# (os.getenv returns None if it is not set).
client = genai.Client(api_key=os.getenv("GEMINI_KEY"))

In [16]:
import json
def process_query(query):
    """
    Send a user query to Gemini and resolve any tool calls it requests.

    The conversation is kept as a list of role/content dictionaries and sent
    to the model as a JSON string on every turn. Each round works as follows:

    1. Any text in the response is printed.
    2. If the response contains function calls, all of them are executed via
       ``execute_tool``, their results are appended to the conversation as a
       single user message, and the model is queried again once.
    3. If there are no function calls, the function returns. A response with
       neither text nor function calls prints a short notice instead of
       looping forever.

    Args:
        query: The user's question as plain text.

    Returns:
        None. Model text and tool activity are printed.
    """
    model_name = 'gemini-2.5-flash-preview-04-17'
    messages = [{'role': 'user', 'content': query}]

    def ask_model():
        # The whole conversation so far is serialised into the prompt.
        return client.models.generate_content(
            model=model_name,
            contents=json.dumps(messages),
            config={'tools': tools_google})

    response = ask_model()

    while True:
        text = response.text
        if text:
            print(text)

        function_calls = response.function_calls
        if not function_calls:
            if not text:
                # Empty response: stop instead of spinning on it.
                print("The model returned neither text nor a tool call.")
            return

        # Record what the model asked for, once per round.
        messages.append({
            'role': 'assistant',
            'content': [json.dumps(call.to_json_dict()) for call in function_calls]
        })

        # Run every requested tool before going back to the model, so that
        # no intermediate response is requested and then thrown away.
        tool_results = []
        for call in function_calls:
            tool_name = call.name
            tool_args = call.args or {}

            print(f"Calling tool {tool_name} with args {tool_args}")

            result = execute_tool(tool_name, tool_args)
            tool_results.append({
                "type": "tool_result",
                "tool_name": tool_name,
                "content": json.dumps(result)
            })

        messages.append({"role": "user", "content": tool_results})
        response = ask_model()
                

In [14]:
def chat_loop():
    """
    Run an interactive prompt that forwards each query to ``process_query``.

    The loop ends when the user types 'quit' (case-insensitive), or when input
    ends or is interrupted (EOF / Ctrl-C). Blank input is ignored. Errors
    raised while processing a query are printed and the loop continues with
    the next prompt.

    Reading the query is deliberately kept outside the error-swallowing block:
    if ``input()`` itself fails, retrying would fail the same way on every
    iteration, so such errors end the loop (or propagate) instead of being
    printed forever.
    """
    print("Type your queries or 'quit' to exit.")
    while True:
        try:
            query = input("\nQuery: ").strip()
        except (EOFError, KeyboardInterrupt):
            # No more input is coming; leave the loop cleanly.
            print()
            break

        if query.lower() == 'quit':
            break
        if not query:
            # Nothing to ask; don't spend a model call on an empty prompt.
            continue

        try:
            process_query(query)
            print("\n")
        except Exception as e:
            print(f"\nError: {str(e)}")

In [17]:
# Start the interactive session; type 'quit' at the prompt to stop.
chat_loop()

Type your queries or 'quit' to exit.
Hello! How can I help you today?







Calling tool search_papers with args {'topic': 'trigonometry'}
Results are saved in: papers\trigonometry\papers_info.json
execute_tools result:['1206.1761v1', '2503.11678v1', 'math-ph/9910041v1', 'math-ph/0112030v1', '0911.1025v1']

Error: 500 INTERNAL. {'error': {'code': 500, 'message': 'An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting', 'status': 'INTERNAL'}}




Calling tool extract_info with args {'paper_id': '1206.1761v1'}
execute_tools result:{
  "title": "Trigonometry of The Gold-Bug",
  "authors": [
    "Erik Talvila"
  ],
  "summary": "The classic Edgar Allan Poe story The Gold-Bug involves digging for pirate\ntreasure. Locating the digging sites requires some simple trigonometry.",
  "pdf_url": "http://arxiv.org/pdf/1206.1761v1",
  "published": "2012-05-31"
}
Calling tool extract_info with args {'paper_id': '2503.11678v1'}
execute_tools result:{
  "title": "A New Approach to Learn Trigonometry",
  "authors": [
    "Marcia Ann Surya",
    "Yohanes Surya"
  ],
  "summary": "We introduce the Primary Gasing Triangle, a right triangle with a hypotenuse\nof 1 unit, to define the primary trigonometric functions: sine and cosine. This\ntriangle serves as the foundational element in a new approach to learning\ntrigonometry, enabling us to derive the Derived Gasing Triangle, where the\nother four trigonometric functions (tangent, secant, cotangen