In [21]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from urllib.parse import urlparse, parse_qs, urlunparse

from bs4 import BeautifulSoup

import re
import csv
import time
import math

import arxiv

In [19]:
# Useful functions

def prepare_base_url(full_url):
    """Return ``full_url`` with every ``page`` query parameter removed.

    All other query parameters are kept in their original order, including
    repeated keys and keys with blank values. Keys and values are
    percent-encoded again when the query string is rebuilt (spaces become
    ``%20``), so characters such as ``&``, ``=``, ``#`` or ``+`` inside a
    value cannot corrupt the resulting URL.

    Args:
        full_url (str): URL whose query string may contain ``page``.

    Returns:
        str: The same URL without the ``page`` parameter.
    """
    # Local import: these helpers are not part of the notebook's import cell.
    from urllib.parse import parse_qsl, quote, urlencode

    parsed_url = urlparse(full_url)

    # parse_qsl (unlike parse_qs) yields one pair per occurrence, so repeated
    # parameters survive; keep_blank_values keeps "key=" pairs as well.
    query_pairs = [
        (key, value)
        for key, value in parse_qsl(parsed_url.query, keep_blank_values=True)
        if key != 'page'
    ]

    # quote_via=quote encodes spaces as %20 (the default quote_plus would
    # emit '+') and escapes every other reserved character too.
    query_string = urlencode(query_pairs, quote_via=quote)

    return urlunparse(parsed_url._replace(query=query_string))

def get_total_pages_from_inspire(driver, full_query_url):
    """Return the number of result pages for an INSPIRE-HEP literature query.

    Loads page 1 of the query in ``driver``, reads the "N results" label from
    the rendered HTML and divides it by the page size taken from the ``size``
    query parameter of ``full_query_url``.

    Args:
        driver: Selenium WebDriver used to load and scroll the page.
        full_query_url (str): Full query URL; any ``page`` parameter is
            stripped before the first page is requested.

    Returns:
        int: ``ceil(total_results / results_per_page)``; 0 when no
        "N results" label is found in the page source.
    """
    # Prepare the base URL by removing the 'page' parameter
    base_url = prepare_base_url(full_query_url)
    print(base_url)

    # Fetch the first page to get the total number of results. Use '?' when
    # no query string is left, otherwise "&page=1" would be glued to the path.
    separator = "&" if urlparse(base_url).query else "?"
    driver.get(f"{base_url}{separator}page=1")
    # Scroll to the bottom of the page
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # Wait for the page to load
    time.sleep(2)  # Adjust this time based on your network speed

    html_content = driver.page_source
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find the total number of results
    results_span = soup.find('span', string=re.compile(r'\d+\sresults'))
    total_results = int(results_span.text.split()[0].replace(',', '')) if results_span else 0

    # Extract the results per page from the URL. Parsing the query string
    # finds 'size' wherever it sits (first, middle or last parameter).
    size_values = parse_qs(urlparse(full_query_url).query).get('size', [])
    if size_values and size_values[0].isdecimal() and int(size_values[0]) > 0:
        results_per_page = int(size_values[0])
    else:
        # Default to 25 if missing, non-numeric or zero (avoids dividing by 0)
        results_per_page = 25

    # Calculate the total number of pages
    return math.ceil(total_results / results_per_page)

# Function to extract DOI
def extract_doi(entry):
    """Return the DOI found in the ``href`` of an anchor inside ``entry``.

    Args:
        entry: Parsed HTML element exposing ``find`` (the notebook passes
            BeautifulSoup tags).

    Returns:
        str | None: The DOI substring of the matching link's ``href``, or
        None when no anchor with a DOI-looking ``href`` exists.
    """
    doi_regex = re.compile(r'(10\.\d{4,9}/[-._;()/:A-Za-z0-9]+)')

    # Passing the compiled pattern as href selects anchors whose href matches.
    link = entry.find('a', href=doi_regex)
    if not link:
        return None

    found = doi_regex.search(link['href'])
    return found.group(0) if found else None

# Function to extract ePrint number
def extract_eprint_number(entry):
    """Return the stripped link text of the arXiv anchor inside ``entry``.

    An anchor qualifies when its ``href`` contains ``arxiv.org``.

    Returns:
        str | None: The anchor text, or None when no such anchor exists.
    """
    def is_arxiv_anchor(tag):
        return tag.name == 'a' and 'arxiv.org' in tag.get('href', '')

    arxiv_link = entry.find(is_arxiv_anchor)
    return arxiv_link.get_text(strip=True) if arxiv_link else None

# Function to extract arXiv category
def extract_arxiv_category_v2(entry):
    """Return the arXiv category shown next to the arXiv link in ``entry``.

    The category is read from the ``span`` that follows the arXiv anchor as
    a sibling, with surrounding whitespace and square brackets removed.

    Returns:
        str | None: The category text, or None when the arXiv anchor or its
        sibling span is missing.
    """
    def is_arxiv_anchor(tag):
        return tag.name == 'a' and 'arxiv.org' in tag.get('href', '')

    arxiv_link = entry.find(is_arxiv_anchor)
    if not arxiv_link:
        return None

    sibling_span = arxiv_link.find_next_sibling('span')
    if not sibling_span:
        return None

    return sibling_span.get_text(strip=True).strip('[]')

# Revised function to extract citation counts based on the demonstrated algorithm
def extract_citation_count_v2(entry):
    """Return the citation count shown for ``entry``, or 0 when none is found.

    Looks (via ``entry.find_next``) for a ``span`` whose text mentions
    "citation" and starts with a number, e.g. ``"12 citations"`` or
    ``"1,234 citations"``. Thousands separators are removed before the
    number is converted. Spans that mention "citation" without a leading
    number are skipped instead of raising ``ValueError``.

    NOTE(review): ``find_next`` is not limited to descendants of ``entry``;
    presumably an entry without its own citation span could pick up a later
    one in the document. Confirm against the page structure.

    Returns:
        int: The parsed citation count, or 0 when no matching span exists.
    """
    # Leading number (commas allowed) that ends at whitespace or end of text.
    count_pattern = re.compile(r'\s*(\d[\d,]*)(?!\S)')

    def is_citation_span(tag):
        return (
            tag.name == 'span'
            and 'citation' in tag.text  # also covers "citations"
            and count_pattern.match(tag.text) is not None
        )

    citation_tag = entry.find_next(is_citation_span)
    if not citation_tag:
        return 0

    leading_number = count_pattern.match(citation_tag.text).group(1)
    return int(leading_number.replace(',', ''))

# Function to extract paper details including citation counts
def extract_paper_details_v9(entry):
    """Collect the metadata of one literature entry into a dictionary.

    Args:
        entry: Parsed HTML element for a single search result.

    Returns:
        dict: Keys ``title`` (str or None), ``authors`` (comma-separated
        string, empty when no author links exist), ``eprint_number``,
        ``arXiv_category``, ``doi`` and ``citation_count``, the last four
        being whatever the dedicated ``extract_*`` helpers return.
    """
    title_span = entry.find('span', {'data-test-id': 'literature-detail-title'})
    author_links = entry.find_all('a', {'data-test-id': 'author-link'})

    details = {
        'title': title_span.get_text(strip=True) if title_span else None,
        'authors': ', '.join(link.get_text(strip=True) for link in author_links),
        'eprint_number': extract_eprint_number(entry),
        'arXiv_category': extract_arxiv_category_v2(entry),
        'doi': extract_doi(entry),
        'citation_count': extract_citation_count_v2(entry),
    }
    return details

# Function to filter out duplicate and invalid entries
def filter_valid_entries(entries):
    """Keep entries that have a title and authors, dropping repeated titles.

    An entry is kept when it contains a title span and at least one author
    link, and its stripped title text has not been seen on an earlier kept
    entry. Input order is preserved.

    Args:
        entries: Iterable of parsed HTML elements.

    Returns:
        list: The entries that passed the filter.
    """
    kept = []
    known_titles = set()

    for candidate in entries:
        title_tag = candidate.find('span', {'data-test-id': 'literature-detail-title'})
        author_tags = candidate.find_all('a', {'data-test-id': 'author-link'})

        # Skip anything that is not a complete literature record.
        if not title_tag or not author_tags:
            continue

        title_text = title_tag.get_text(strip=True)
        if title_text in known_titles:
            continue

        known_titles.add(title_text)
        kept.append(candidate)

    return kept

In [2]:
# Set up the WebDriver for Safari.
# `driver` is the browser session reused by the following cells and passed to
# get_total_pages_from_inspire().
driver = webdriver.Safari()


In [18]:
# URL to scrape
url = "https://inspirehep.net/literature?sort=mostrecent&size=250&page=1&q=a%20quevedo&ui-citation-summary=true"

# Open the URL
driver.get(url)

# Regular expression to find the size parameter. It accepts `size` as the
# first ("?size=") or last parameter as well, and falls back to 25 (the same
# default get_total_pages_from_inspire uses) instead of failing on `None`
# when the parameter is absent.
match = re.search(r"[?&]size=(\d+)(?:&|#|$)", url)
results_per_page = int(match.group(1)) if match else 25
results_per_page

250

In [13]:
# Scroll to the bottom of the page (runs in the browser session held by `driver`)
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

# Wait for the page to load: a fixed pause, not a check that loading finished
time.sleep(10)  # Adjust this time based on your network speed

# Now fetch the HTML content; `html_content` is what the later cells save and parse
html_content = driver.page_source

In [14]:
# Specify the filename (relative path, so it resolves against the working directory)
filename = "webpage_content.txt"

# Write the HTML content to a text file, overwriting any existing file ("w" mode)
with open(filename, "w", encoding="utf-8") as file:
    file.write(html_content)

In [15]:
# Parse the page source captured in `html_content` with Python's built-in
# 'html.parser' backend; `soup` is queried by the following cells.
soup = BeautifulSoup(html_content, 'html.parser')

In [16]:
# Find the span element whose text contains digits, one whitespace character
# and the word "results" (e.g. "451 results"); `results_span` is None when
# no such span exists.
results_span = soup.find('span', string=re.compile(r'\d+\sresults'))

In [17]:
# Take the leading number of the "N results" label, dropping thousands
# separators. Fall back to 0 when the label was not found, matching the
# guard in get_total_pages_from_inspire() instead of failing on `None`.
total_results = int(results_span.text.split()[0].replace(',', '')) if results_span else 0
total_results

451

In [22]:
# Number of pages needed to list every result: round up so a partially
# filled last page is counted.
total_pages = math.ceil(total_results / results_per_page)
total_pages

2

In [35]:
# Example usage:
# Paste the full URL from INSPIRE-HEP here (a `page` parameter is fine; the
# helper strips it and requests page 1 itself).
full_query_url = "https://inspirehep.net/literature?sort=mostrecent&size=250&page=1&q=a%20quevedo&ui-citation-summary=true"
# Prints the page-less base URL, loads it in `driver`, and returns the page count.
total_pages = get_total_pages_from_inspire(driver, full_query_url)
total_pages

https://inspirehep.net/literature?sort=mostrecent&size=250&q=a%20quevedo&ui-citation-summary=true


2

In [116]:


# Find all entries and apply the filter: candidates are <div> elements whose
# 'data-test-id' attribute contains "literature"; filter_valid_entries() then
# keeps those with a title and authors and drops repeated titles.
entries = soup.find_all(lambda tag: tag.name == 'div' and 'literature' in tag.attrs.get('data-test-id', ''))
filtered_entries = filter_valid_entries(entries)

# Extract details from filtered entries (one dict per paper)
papers_details = [extract_paper_details_v9(entry) for entry in filtered_entries]


In [117]:
# Display every extracted paper record.
papers_details

[{'title': 'Searching for High Frequency Gravitational Waves with Phonons',
  'authors': 'Yonatan Kahn, Jan Schütte-Engel, Tanner Trickle',
  'eprint_number': '2311.17147',
  'arXiv_category': 'hep-ph',
  'doi': None,
  'citation_count': 0},
 {'title': 'Probing Reheating with Graviton Bremsstrahlung',
  'authors': 'Nicolás Bernal, Simon Cléry, Yann Mambrini, Yong Xu',
  'eprint_number': '2311.12694',
  'arXiv_category': 'hep-ph',
  'doi': None,
  'citation_count': 0},
 {'title': 'Quantum technologies for fundamental (HE) physics',
  'authors': 'D. Blas',
  'eprint_number': '2311.10187',
  'arXiv_category': 'hep-ph',
  'doi': None,
  'citation_count': 0},
 {'title': 'Constraints on Non-Gaussian primordial curvature perturbation from the LIGO-Virgo-KAGRA third observing run',
  'authors': 'Ryoto Inui, Santiago Jaraba, Sachiko Kuroyanagi, Shuichiro Yokoyama',
  'eprint_number': '2311.05423',
  'arXiv_category': 'astro-ph.CO',
  'doi': None,
  'citation_count': 1},
 {'title': 'Search for h

In [118]:
# Number of paper records extracted from the parsed page.
len(papers_details)

181

In [119]:
# Function to fetch abstract from arXiv
def fetch_abstract(eprint_number, client=None):
    """Return the abstract of an arXiv paper.

    Uses ``Client.results(search)`` rather than the deprecated
    ``Search.results()``.

    Args:
        eprint_number (str): arXiv identifier, e.g. ``"2311.17147"``.
        client (arxiv.Client, optional): Client to run the query with. A new
            one is created when omitted; pass a shared client when fetching
            many abstracts.

    Returns:
        str: The paper's summary, or ``"Abstract not found"`` when the query
        yields no result.
    """
    if client is None:
        client = arxiv.Client()
    search = arxiv.Search(id_list=[eprint_number])
    # Only the first result is needed for a single-id query.
    result = next(client.results(search), None)
    return result.summary if result is not None else "Abstract not found"

# Update each paper's dictionary with its abstract. One client is shared by
# all requests; papers without an ePrint number get no 'abstract' key.
arxiv_client = arxiv.Client()
for paper in papers_details:
    if paper['eprint_number']:
        paper['abstract'] = fetch_abstract(paper['eprint_number'], client=arxiv_client)

# Now, papers_details contains each paper with its abstract


  for result in search.results():


In [112]:
# Preview the first ten records, which now include the 'abstract' field.
papers_details[0:10]

[{'title': 'Searching for High Frequency Gravitational Waves with Phonons',
  'authors': 'Yonatan Kahn, Jan Schütte-Engel, Tanner Trickle',
  'eprint_number': '2311.17147',
  'arXiv_category': 'hep-ph',
  'doi': None,
  'citation_count': 0,
  'abstract': 'The gravitational wave (GW) spectrum at frequencies above a kHz is a largely\nunexplored frontier. We show that detectors with sensitivity to single-phonon\nexcitations in crystal targets can search for GWs with frequencies,\n$\\mathrm{THz} \\lesssim f \\lesssim 100 \\, \\mathrm{THz}$, corresponding to the\nrange of optical phonon energies, $\\mathrm{meV} \\lesssim \\omega \\lesssim 100 \\,\n\\mathrm{meV}$. Such detectors are already being built to search for light dark\nmatter (DM), and therefore sensitivity to high-frequency GWs will be achieved\nas a byproduct. We begin by deriving the absorption rate of a general GW signal\ninto single phonons. We then focus on carefully defining the detector\nsensitivity to monochromatic and chir

In [120]:
# Replace newline characters in abstracts with spaces and drop carriage
# returns. This updates the dicts in `papers_details` in place; records
# without an 'abstract' key are left untouched.
for paper in papers_details:
    if 'abstract' in paper:
        paper['abstract'] = paper['abstract'].replace('\n', ' ').replace('\r', '')

In [121]:
# Normalise every record before export (in place):
#  - make sure a present arXiv number is stored as text
#  - replace newline characters in abstracts with spaces
for paper in papers_details:
    if 'eprint_number' in paper and paper['eprint_number']:
        paper['eprint_number'] = str(paper['eprint_number'])
    if 'abstract' in paper:
        paper['abstract'] = paper['abstract'].replace('\n', ' ').replace('\r', '')

# Specify the filename for the CSV
csv_filename = 'CsvOutput.csv'  # Replace with your desired path

# Column order of the CSV. Defined here so the cell runs on a fresh kernel.
# It lists the keys produced by extract_paper_details_v9() plus the
# 'abstract' added afterwards.
# NOTE(review): the original column order is not visible in this notebook;
# confirm it if downstream consumers depend on it.
fieldnames = [
    'title',
    'authors',
    'eprint_number',
    'arXiv_category',
    'doi',
    'citation_count',
    'abstract',
]

# Writing to the CSV file; records without an abstract get an empty cell
# (DictWriter's default for missing keys).
with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    # Writing the header
    writer.writeheader()

    # Writing the rows
    writer.writerows(papers_details)

print(f"Data written to {csv_filename}")

Data written to CsvOutput.csv


In [101]:
# Print the current working directory, against which the relative output
# paths used in this notebook resolve.
# NOTE(review): the saved output exposes an absolute local path; consider
# clearing it before sharing the notebook.
!pwd

/Users/fmuia/Library/CloudStorage/Dropbox/HFGW/Workshops/2023/Data/NewFolder
