In [1]:
import re
import os
import arxiv
import requests
import random
from time import sleep
from scholarly import scholarly
from scrapeless import ScrapelessClient
from serpapi.google_search import GoogleSearch
from dotenv import load_dotenv
# Load variables from a .env file (python-dotenv) before reading the keys below.
load_dotenv()

# Scrapeless client used by the Scrapeless-based helpers in this notebook.
scrapeless_token = os.environ.get("SCRAPELESS_API_KEY")
scrapeless = ScrapelessClient(api_key=scrapeless_token)

# SerpAPI key, read from the SERPAPI_KEY environment variable.
serpapi_token = os.environ.get("SERPAPI_KEY")

def extract_venue(citation_snippet):
    """
    Extract the publication venue from an MLA-style citation snippet.

    The venue is taken to be the text that follows the double-quoted paper
    title, up to (but not including) the first digit (for example a volume
    number, an arXiv identifier, or a year).

    Args:
        citation_snippet (str): MLA citation text, e.g.
            'Doe, J. "A title." Some Journal 12 (2020).'
    Returns:
        str | None: The venue with surrounding whitespace and trailing
        separators removed, 'arXiv' for any venue mentioning arXiv, or None
        when the snippet is empty, no quoted title followed by a venue is
        found, or the venue is shorter than 3 characters.
    """
    if not citation_snippet:
        return None

    # This regex looks for:
    # - A paper title enclosed in quotes (non-greedily with ".*?")
    # - Then optional whitespace
    # - Then captures one or more non-digit characters (the venue)
    #   until it sees a digit, possibly after some spaces or colons.
    pattern = r'".*?"\s*([^0-9]+?)(?=[\s:]*\d)'
    match = re.search(pattern, citation_snippet)
    if match is None:
        return None

    # The lazy capture stops right before the first digit, so it can end with
    # separators such as "." or " (" (from ". 2016" or " (2020)"); drop them.
    venue = match.group(1).strip().rstrip(" .,;:([")

    # Collapse every arXiv variant (e.g. "arXiv preprint arXiv") to 'arXiv'.
    if 'arxiv' in venue.lower():
        return 'arXiv'
    if len(venue) < 3:
        return None
    return venue

def get_author_info_by_gs_id(author_id):
    """
    Fetches author information from Google Scholar using the author ID.

    Sleeps for a random 1-3 seconds before each lookup (presumably to avoid
    hammering Google Scholar), then queries `scholarly.search_author_id`.

    Args:
        author_id (str): The Google Scholar author ID.
    Returns:
        dict | None: {'name': ..., 'affiliation': ...}. When the affiliation
        contains commas, only the last non-empty comma-separated part is kept,
        with surrounding whitespace removed. Returns None when nothing is
        found or the lookup raised.
    """
    sleep(random.randint(1, 3))
    try:
        author_info = scholarly.search_author_id(author_id)
        if not author_info:
            print("No author information found.")
            return None

        author_name = author_info.get('name', None)
        author_affiliation = author_info.get('affiliation', None)
        if author_affiliation and "," in author_affiliation:
            # Keep the last part of e.g. "Professor, Some University". Strip it
            # so the result has no leading space, and skip empty parts so a
            # trailing comma does not produce an empty affiliation.
            parts = [part.strip() for part in author_affiliation.split(",")]
            parts = [part for part in parts if part]
            author_affiliation = parts[-1] if parts else author_affiliation.strip()
        return {
            'name': author_name,
            'affiliation': author_affiliation
        }
    except Exception as e:
        print(f"Error fetching author info: {e}")
        return None

def get_author_info_serpapi(author_id):
    """
    Look up an author by Google Scholar ID through SerpAPI.

    Args:
        author_id (str): The Google Scholar author ID.
    Returns:
        dict: The response of the `google_scholar_author` engine, as returned
        by `GoogleSearch.get_dict()`.
    """
    query = dict(
        api_key=serpapi_token,
        engine="google_scholar_author",
        author_id=author_id,
    )
    return GoogleSearch(query).get_dict()

def get_paper_info_serpapi(paper_title):
    """
    Look up a paper on Google Scholar through SerpAPI, using its title as the query.

    Args:
        paper_title (str): The title to search for.
    Returns:
        dict | None: The first organic result, or None when the response has
        no organic results or the request raised.
    """
    search = GoogleSearch({
        "api_key": serpapi_token,
        "engine": "google_scholar",
        "q": paper_title,
    })

    try:
        response = search.get_dict()
        # The results may be stored under either key; the plural form wins.
        for key in ('organic_results', 'organic_result'):
            if key in response:
                hits = response[key]
                if len(hits) > 0:
                    return hits[0]
                break
    except Exception as e:
        print(f"Error fetching {paper_title}:\n{e}")
        return None

    print("No organic results found.")
    return None

def get_paper_info_scrapeless(paper_title, max_retries=3):
  """
  Look up a paper on Google Scholar through the Scrapeless API.

  The request is repeated (with a 3 second pause in between) while the
  response carries no organic-results key, up to `max_retries` attempts.
  Any exception ends the lookup immediately.

  Args:
    paper_title (str): The title of the paper to search for.
    max_retries (int): Maximum number of API requests to make.
  Returns:
    dict | None: The first organic result, or None when there is none,
    the attempts are exhausted, or an exception was raised.
  """
  payload = {"q": paper_title}
  try:
    for attempt in range(max_retries):
      response = scrapeless.scraper("scraper.google.scholar", input=payload)

      if 'organic_results' in response:
        hits = response['organic_results']
      elif 'organic_result' in response:
        hits = response['organic_result']
      else:
        print(f"No organic results found in the response at iteration {attempt + 1}.")
        if attempt < max_retries - 1:
          print("Retrying...")
          sleep(3)
        continue

      # A results key was present: an empty list is final, no retry.
      if len(hits) > 0:
        return hits[0]
      print("No organic results found.")
      return None
  except Exception as e:
    print(f"Error fetching {paper_title}:\n{e}")
    return None
  return None

def get_paper_citations_scrapeless(result_id, debug = False):
  """
  Fetches the venue and BibTeX link of a paper from Google Scholar's cite
  data using the Scrapeless API.
  Args:
    result_id (str): The ID of the paper result.
    debug (bool): When True, print every citation and link entry received.
  Returns:
    dict | None: {'venue': ..., 'bibtex_link': ...}, where either value may
    be None if it was not found; None if the request or parsing raised.
  """
  actor = "scraper.google.scholar.cite"
  input_data = {
    "q": result_id,
  }
  venue = None
  bibtex_link = None
  try:
    # Make the API request
    result = scrapeless.scraper(actor, input=input_data)

    # The venue is parsed out of the MLA-formatted citation snippet.
    citations = result['citations'] if 'citations' in result else None
    for c in citations or []:
      if debug:
        print(c)
      if c.get("title") == "MLA":
        citation_snippet = c.get("snippet", None)
        if citation_snippet:
          venue = extract_venue(citation_snippet)

    bib_links = result['links'] if 'links' in result else None
    for b in bib_links or []:
      if debug:
        print(b)
      # A link entry without a name must not abort the whole lookup.
      link_name = b.get("name") or ""
      if link_name.lower() == "bibtex":
        bibtex_link = b.get("link")

  except Exception as e:
    print(f"Error fetching citations:\n{e}")
    return None

  # Normalise arXiv venues case-insensitively, using the same 'arXiv'
  # spelling that extract_venue returns.
  if venue and 'arxiv' in venue.lower():
    venue = 'arXiv'

  return {
    'venue': venue,
    'bibtex_link': bibtex_link
  }
  
def extract_paper_info(scrapeless_result):
  """
  Pick the title, authors and abstract fields out of a Scrapeless result.
  Args:
    scrapeless_result (dict): The result from Scrapeless API.
  Returns:
    dict: A dict with the keys 'title', 'authors' and 'abstract'; a key
    missing from the input maps to None.
  """
  wanted_fields = ('title', 'authors', 'abstract')
  return {field: scrapeless_result.get(field) for field in wanted_fields}



def get_abstract_from_openview(forum_id, timeout=30):
    """
    Fetch the abstract from OpenReview using the forum ID.

    Args:
        forum_id (str): The OpenReview forum ID.
        timeout (float): Seconds to wait for the HTTP request before giving up.
    Returns:
        str | None: The abstract of the first note returned for the forum, or
        None when the request fails, no notes are returned, or the note has
        no abstract.
    """
    # OpenReview API endpoint to get note details by forum id.
    # Passing the id via `params` lets requests URL-encode it.
    url = "https://api2.openreview.net/notes"
    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, params={"forum": forum_id}, timeout=timeout)
    except requests.RequestException as e:
        print(f"Failed to retrieve data from OpenReview API: {e}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve data from OpenReview API. Status code: {response.status_code}")
        return None

    try:
        data = response.json()
    except ValueError as e:
        print(f"Failed to decode OpenReview API response: {e}")
        return None

    notes = data.get('notes', [])
    if not notes:
        print("No notes found in the response.")
        return None

    # NOTE(review): this takes the first note of the forum; confirm that it is
    # always the submission itself rather than a review or comment.
    content = notes[0].get('content') or {}
    abstract = content.get('abstract', None)
    # The original code reads abstract['value']; also accept a plain string.
    if isinstance(abstract, dict):
        abstract = abstract.get('value')
    if not abstract:
        print("Abstract not found in the note content.")
        return None
    return abstract

def get_abstract_from_arxiv_link(arxiv_link):
    """
    Fetch a paper's abstract from arXiv given a link to the paper.

    Args:
        arxiv_link (str): Link such as https://arxiv.org/abs/XXXX.XXXXX or
            https://arxiv.org/pdf/XXXX.XXXXX(.pdf); a query string, fragment
            or trailing slash is ignored.
    Returns:
        str | None: The paper summary, or None when no ID can be extracted,
        arXiv returns no result, or the lookup raised.
    """
    # Extract the arXiv ID: the last path segment of the link, without any
    # query string, fragment, trailing slash or ".pdf" suffix.
    path = arxiv_link.split("?")[0].split("#")[0].rstrip("/")
    arxiv_id = path.split("/")[-1]
    if arxiv_id.lower().endswith(".pdf"):
        arxiv_id = arxiv_id[:-4]
    if not arxiv_id:
        print(f"Could not extract an arXiv ID from {arxiv_link}")
        return None

    try:
        # Create a client to interact with the arXiv API and search by ID
        client = arxiv.Client()
        search = arxiv.Search(id_list=[arxiv_id])
        # A default for next() avoids StopIteration when nothing is returned.
        paper = next(client.results(search), None)
    except Exception as e:
        print(f"Error fetching arXiv abstract for {arxiv_link}:\n{e}")
        return None

    if paper is None:
        print(f"No arXiv paper found for ID: {arxiv_id}")
        return None
    return paper.summary


def _normalize_paper_title(paper_title):
    """Turn a space-less title (snake_case, kebab-case or CamelCase) into space-separated words."""
    if " " in paper_title:
        return paper_title
    # Any underscore at all is treated as a word separator.
    if "_" in paper_title:
        return paper_title.replace("_", " ")
    if paper_title.count("-") > 2:
        return paper_title.replace("-", " ")
    # If the title looks like "CommunicativeAgentForSoftwareDevelopement", add a
    # space before each capital so it becomes "Communicative Agent For Software
    # Developement". The length check avoids an IndexError on empty or
    # one-character titles.
    if len(paper_title) > 1 and paper_title[0].isupper() and paper_title[1].islower():
        paper_title = re.sub(r"(?<!^)(?=[A-Z])", " ", paper_title)
        print(f"Reformated paper title: {paper_title}")
    return paper_title


def _openreview_forum_id(pub_url):
    """Return the `id` query parameter of an OpenReview URL, else the last path segment."""
    # Local import: keeps this block self-contained without touching the import cell.
    from urllib.parse import parse_qs, urlparse

    ids = parse_qs(urlparse(pub_url).query).get("id")
    if ids:
        # e.g. https://openreview.net/forum?id=XXXX -> "XXXX"
        return ids[0]
    return pub_url.split("/")[-1]


def _extract_code_url(abstract):
    """Return the first github.com URL mentioned in the abstract, or None."""
    if not abstract:
        return None
    match = re.search(r"github\.com/(\S+)", abstract)
    if match is None:
        return None
    # \S+ stops at any whitespace (abstracts may contain newlines); trailing
    # sentence punctuation and closing brackets are not part of the URL.
    path = match.group(1).rstrip(".,;:)]}>'\"")
    if not path:
        return None
    return "https://github.com/" + path


def get_paper_metadata(paper_title):
    """
    Search for a paper by title and retrieve its details.

    The title is first normalised (underscores, dashes or CamelCase turned
    into spaces), then searched with the Scrapeless API, falling back to
    SerpAPI. Venue / BibTeX link, first-author details and the abstract (for
    arXiv and OpenReview links) are looked up on a best-effort basis.

    Args:
        paper_title (str): Paper title, possibly without spaces.
    Returns:
        dict | None: A dict with the keys title, author_names, author_ids,
        public_url, code_url, venue, result_id, resource_link, bibtex_link,
        first_author_name, first_author_affiliation and abstract (values are
        None when unavailable), or None when the paper cannot be found.
    """
    # add a random delay between 1 second and 3 seconds
    sleep(random.randint(1, 3))
    paper_title = _normalize_paper_title(paper_title)

    try:
        paper_info = get_paper_info_scrapeless(paper_title)
    except Exception as e:
        print(f"Searching {paper_title} got an error: {e}")
        return None

    # Example paper_info:
    # {'position': 1,
    #  'title': 'Data efficient evaluation of large language models and text-to-image models via adaptive sampling',
    #  'result_id': 'X8cSSWkY13EJ',
    #  'link': 'https://arxiv.org/abs/2406.15527',
    #  'publication_info': {'summary': 'C Xu, G Saranathan, MP Alam, A Shah, J Lim… - arXiv preprint arXiv …, 2024 - arxiv.org',
    #   'authors': [{'name': 'C Xu',
    #     'link': 'http://https://scholar.google.com/citations?user=B8WA2XsAAAAJ&hl=en&oi=sra',
    #     'author_id': 'B8WA2XsAAAAJ'},
    #    ...]},
    #  'resources': [{'title': 'arxiv.org',
    #    'file_format': '[PDF]',
    #    'link': 'https://arxiv.org/pdf/2406.15527'}],
    #  'inline_links': {'cited_by': {'total': 6,
    #    'link': 'https://scholar.google.com',
    #    'cites_id': ''}}}
    if not paper_info:
        print("Paper not found with Scrapeless API. Retry with SerpAPI.")
        paper_info = get_paper_info_serpapi(paper_title)
    if not paper_info:
        print("Paper not found with SerpAPI.")
        return None

    title = paper_info.get('title', None)
    result_id = paper_info.get('result_id', None)
    pub_url = paper_info.get('link', None)

    authors = (paper_info.get('publication_info') or {}).get('authors') or []
    author_names = None
    author_ids = None
    if authors:
        # .get() so that an author entry lacking a field yields None instead
        # of a KeyError.
        author_names = [author.get('name') for author in authors]
        author_ids = [author.get('author_id') for author in authors]

    # Link of the first attached resource, if any (an empty list is tolerated).
    resource_link = None
    resources = paper_info.get('resources') or []
    if resources:
        resource_link = resources[0].get('link')

    venue = None
    bibtex_link = None
    if result_id:
        citations = get_paper_citations_scrapeless(result_id)
        if citations:
            venue = citations.get('venue', None)
            bibtex_link = citations.get('bibtex_link', None)
    else:
        print("No bibliographic information available")

    first_author_name = None
    first_author_affiliation = None
    first_author_id = author_ids[0] if author_ids else None
    # Only IDs longer than 5 characters are looked up.
    if first_author_id and len(first_author_id) > 5:
        author_info = get_author_info_by_gs_id(first_author_id)
        if author_info:
            first_author_name = author_info.get('name', None)
            first_author_affiliation = author_info.get('affiliation', None)
        else:
            print(f"No author information found for ID: {first_author_id}")

    abstract = None
    if pub_url:
        try:
            if 'arxiv' in pub_url:
                abstract = get_abstract_from_arxiv_link(pub_url)
            elif 'openreview' in pub_url:
                abstract = get_abstract_from_openview(_openreview_forum_id(pub_url))
        except Exception as e:
            # The abstract is optional: a failed lookup (network error, no
            # arXiv result, ...) must not discard the metadata gathered so far.
            print(f"Error fetching abstract from {pub_url}: {e}")

    code_url = _extract_code_url(abstract)

    results_json = {
        "title": title,
        "author_names": author_names,
        "author_ids": author_ids,
        "public_url": pub_url,
        "code_url": code_url,
        "venue": venue,
        "result_id": result_id,
        "resource_link": resource_link,
        "bibtex_link": bibtex_link,
        "first_author_name": first_author_name,
        "first_author_affiliation": first_author_affiliation,
        "abstract": abstract
    }
    return results_json
        


In [2]:
# Example: look up a single paper by title and pretty-print the returned metadata.
# NOTE(review): this import would normally live with the other imports in the first cell.
from pprint import pprint
title = "Data efficient evaluation of large language models and text-to-image models via adaptive sampling"
result = get_paper_metadata(title)
pprint(result)

No organic results found in the response at iteration 1.
Retrying...
No organic results found in the response at iteration 2.
Retrying...
No organic results found in the response at iteration 3.
Paper not found with Scrapeless API. Retry with SerpAPI.
{'abstract': 'Evaluating LLMs and text-to-image models is a computationally '
             'intensive task\n'
             'often overlooked. Efficient evaluation is crucial for '
             'understanding the diverse\n'
             'capabilities of these models and enabling comparisons across a '
             'growing number\n'
             'of new models and benchmarks. To address this, we introduce '
             'SubLIME, a\n'
             'data-efficient evaluation framework that employs adaptive '
             'sampling techniques,\n'
             'such as clustering and quality-based methods, to create '
             'representative subsets\n'
             'of benchmarks. Our approach ensures statistically aligned model '
      