In [9]:
import requests
import io
import pdfplumber
import re
import pandas as pd 
from PyPDF2 import PdfReader
import fitz

In [3]:
# Read the scraped arXiv CSV (the file holding the paper URLs) into a dataframe.
# NOTE(review): this is an absolute, machine-specific path — consider making it
# relative to the notebook or configurable so the notebook runs elsewhere.
csv_path = r"C:\Users\Philippa\Documents\GitHub\arxivscraper\ArxivNanoScrape.csv"
df = pd.read_csv(csv_path)

# preview the first few rows
df.head()

Unnamed: 0,arxiv id,title,published,author(s),abs link,pdf link,journal ref,comment,abstract
0,2104.10944v1,Self-immolative chemistry in nanomedicine,2021-04-22T09:15:17Z,"M Gisbert-Garzaran, M Manzano, M Vallet-Regi",http://arxiv.org/abs/2104.10944v1,http://arxiv.org/pdf/2104.10944v1,"Chem. Eng. J. 340, 24-31 (2018)","31 pages, 10 figures",Self-Immolative Chemistry is based on the casc...
1,2401.10365v1,Recent Nanoninformatics Approaches for Develop...,2024-01-18T20:14:14Z,"Francisco Mariano-Neto, Thiago de Castro Pereira",http://arxiv.org/abs/2401.10365v1,http://arxiv.org/pdf/2401.10365v1,No journal ref found,No comment found,"Nanoinformatics is a novel, rapidly growing ar..."
2,0906.5022v1,Chemical Power for Microscopic Robots in Capil...,2009-06-26T23:54:08Z,"Tad Hogg, Robert A. Freitas Jr",http://arxiv.org/abs/0906.5022v1,http://arxiv.org/pdf/0906.5022v1,"Nanomedicine: Nanotechnology, Biology, and Med...","28 pages, 7 figures",The power available to microscopic robots (nan...
3,1811.01418v1,Magnetic Nanoparticles in Nanomedicine,2018-11-04T19:02:13Z,"Kai Wu, Diqing Su, Jinming Liu, Renata Saha, J...",http://arxiv.org/abs/1811.01418v1,http://arxiv.org/pdf/1811.01418v1,No journal ref found,"67 pages, 24 figures","Nanomaterials, in addition to their small size..."
4,physics/0504007v1,Societal and ethical interactions with nanotec...,2005-04-01T15:06:33Z,"Davis Baird, Tom Vogt",http://arxiv.org/abs/physics/0504007v1,http://arxiv.org/pdf/physics/0504007v1,No journal ref found,6 pages published in Nanotechnology Law & Busi...,We identify 6 important issues tied to the con...


In [4]:
# Take the "pdf link" column of the dataframe and turn it into a plain Python list
pdf_link_column = df["pdf link"]
pdf_urls_list = pdf_link_column.tolist()
print(pdf_urls_list)

['http://arxiv.org/pdf/2104.10944v1', 'http://arxiv.org/pdf/2401.10365v1', 'http://arxiv.org/pdf/0906.5022v1', 'http://arxiv.org/pdf/1811.01418v1', 'http://arxiv.org/pdf/physics/0504007v1', 'http://arxiv.org/pdf/2101.11195v1', 'http://arxiv.org/pdf/2102.00879v1', 'http://arxiv.org/pdf/2012.12561v2', 'http://arxiv.org/pdf/1704.06959v1', 'http://arxiv.org/pdf/0811.1520v1', 'http://arxiv.org/pdf/0903.1153v1', 'http://arxiv.org/pdf/1004.3448v2', 'http://arxiv.org/pdf/1101.0857v1', 'http://arxiv.org/pdf/1108.4769v1', 'http://arxiv.org/pdf/1201.3034v1', 'http://arxiv.org/pdf/1211.0912v1', 'http://arxiv.org/pdf/1506.01740v1', 'http://arxiv.org/pdf/1612.09455v1', 'http://arxiv.org/pdf/1701.03352v1', 'http://arxiv.org/pdf/1705.06817v1']


In [None]:
# DOI pattern: an optional "doi:" or "https://doi.org/" prefix followed by the
# bare DOI (10.xxxx/xxx). Group 1 captures the bare DOI without the prefix.
doi = re.compile(r"\b(?:https?://doi\.org/|doi:\s*)?(10\.\d{4,9}/[^\s]+)\b", re.IGNORECASE)

# one dict per PDF: {"url", "title", "dois"}
rows = []

# Download every PDF, then extract its title and DOIs: first from the PDF
# metadata, falling back to the full text when either one is missing.
for url in pdf_urls_list:
    print(f"Fetching {url}...")
    # a timeout keeps one stalled download from hanging the whole cell
    request = requests.get(url, timeout=60)
    request.raise_for_status()  # raises HTTPError on a 4xx/5xx response

    # wrap the downloaded bytes in a file-like object for PdfReader
    reader = PdfReader(io.BytesIO(request.content))
    meta = reader.metadata or {}

    # get title from metadata if present
    title = ""
    for key in ("/Title", "Title"):
        if meta.get(key):
            title = meta[key]
            break

    # collect every DOI mentioned in any string-valued metadata field
    found_dois = set()
    for v in meta.values():
        if not isinstance(v, str):
            continue
        found_dois.update(m.group(1) for m in doi.finditer(v))

    # if title or DOI cannot be found, search through full-text
    need_text = (not title) or (not found_dois)
    full_text = ""
    if need_text:
        # the context manager closes the document once the text is extracted
        with fitz.open(stream=request.content, filetype="pdf") as doc:
            # pages must be separated by a real newline so that "^" in the
            # title regex can match at page starts and a DOI at the end of one
            # page is not glued to the first word of the next page
            full_text = "\n".join(page.get_text() or "" for page in doc)

        if not title:
            # look for an explicit "Title: ..." line anywhere in the text
            tm = re.search(r"^Title:\s*(.+)$", full_text, re.IGNORECASE | re.MULTILINE)
            if tm:
                title = tm.group(1).strip()

        # scan text for DOIs
        for m in doi.finditer(full_text):
            found_dois.add(m.group(1))

    rows.append({
        "url": url,
        "title": title,
        "dois": ";".join(sorted(found_dois)),  # sorted for a stable, reproducible order
    })

# convert to dataframe
df_parsed = pd.DataFrame(rows)
df_parsed.head()

Fetching http://arxiv.org/pdf/2104.10944v1...
Fetching http://arxiv.org/pdf/2401.10365v1...
Fetching http://arxiv.org/pdf/0906.5022v1...
Fetching http://arxiv.org/pdf/1811.01418v1...
Fetching http://arxiv.org/pdf/physics/0504007v1...
Fetching http://arxiv.org/pdf/2101.11195v1...
Fetching http://arxiv.org/pdf/2102.00879v1...
Fetching http://arxiv.org/pdf/2012.12561v2...
Fetching http://arxiv.org/pdf/1704.06959v1...
Fetching http://arxiv.org/pdf/0811.1520v1...
Fetching http://arxiv.org/pdf/0903.1153v1...
Fetching http://arxiv.org/pdf/1004.3448v2...
Fetching http://arxiv.org/pdf/1101.0857v1...
Fetching http://arxiv.org/pdf/1108.4769v1...
Fetching http://arxiv.org/pdf/1201.3034v1...
Fetching http://arxiv.org/pdf/1211.0912v1...
Fetching http://arxiv.org/pdf/1506.01740v1...
Fetching http://arxiv.org/pdf/1612.09455v1...
Fetching http://arxiv.org/pdf/1701.03352v1...
Fetching http://arxiv.org/pdf/1705.06817v1...


Unnamed: 0,url,title,dois
0,http://arxiv.org/pdf/2104.10944v1,,
1,http://arxiv.org/pdf/2401.10365v1,Recent Nanoninformatics Approaches for Develop...,10.1002/smll.201906588;10.1006/jbin.2002.1042;...
2,http://arxiv.org/pdf/0906.5022v1,,
3,http://arxiv.org/pdf/1811.01418v1,,
4,http://arxiv.org/pdf/physics/0504007v1,BUILDING A BETTER MOUSETRAP: PATENTING BIOTEC...,


In [12]:
# Write the extracted title/DOI table to a CSV file, leaving out the dataframe index
df_parsed.to_csv(path_or_buf="extracted_paper_info.csv", index=False)