# Crawl all PDF files for text and statistics
Here we download every submission PDF and extract its text and basic statistics (page count, character count and file size).

In [1]:
%load_ext autoreload
%autoreload 2

import os
import time
import pandas as pd
import multiprocessing as mp
from multiprocessing import Pool
from tqdm import tqdm
from tqdm.notebook import tqdm
import requests
from pathlib import Path
from PyPDF2 import PdfReader


## Read submission list
Here we load the _notes_ (the list of all submissions), which are scraped using OpenReview's API — much faster than Selenium-based scraping.


In [2]:
# Notebook-wide configuration; the later cells read these names directly.
DATA_PATH = '../data/'  # directory prefix (with trailing slash) prepended to data file names
venue = 'ICLR.cc/2023/Conference'
venue_short = 'iclr2023'  # prefix used in the data file names below
date = time.strftime("%Y%m%d")  # today's date as YYYYMMDD

# Load the submission list from the HDF5 file stamped with today's date.
# NOTE(review): because `date` is computed at run time, this only finds the file
# if it was written on the same day -- confirm that is intended.
df = pd.read_hdf(DATA_PATH + f'{venue_short}_data_full_{date}.h5', key='df')

## Download PDFs and crawl data

In [3]:
# Download a single PDF to disk
def download_pdf(url, filename, timeout=10):
    """Download ``url`` and save the response body to ``filename``.

    Args:
        url: Address of the PDF to fetch.
        filename: Destination path. It is opened for writing only after the
            request succeeded and returned a non-error status.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        True if the file was downloaded and written, False otherwise.
        Request and file-system failures are reported with ``print`` instead
        of being raised.
    """
    try:
        r = requests.get(url, allow_redirects=True, timeout=timeout)
        r.raise_for_status()
        # The context manager closes the handle even if the write fails.
        with open(filename, 'wb') as f:
            f.write(r.content)
        return True
    except requests.exceptions.Timeout:
        print(f"Error downloading {filename}: Request timed out")
    except (requests.exceptions.RequestException, OSError) as e:
        print(f"Error downloading {filename}: {e}")
    return False


def retry_get_pdf_data(url, filename, extra='', timeout=5, retries=10, delay=1):
    """Try to download ``url`` to ``filename`` up to ``retries`` times.

    Each attempt calls ``download_pdf(url, filename, timeout=timeout)`` and
    treats a truthy return value as success.

    Args:
        url: Address of the PDF to fetch.
        filename: Destination path handed to ``download_pdf``.
        extra: Free-form text appended to the failure message.
        timeout: Forwarded to ``download_pdf`` as its ``timeout`` keyword.
        retries: Maximum number of attempts.
        delay: Seconds to wait between two attempts.

    Returns:
        True as soon as one attempt succeeds, False if every attempt failed
        (which includes ``retries`` being 0).
    """
    for attempt in range(1, retries + 1):
        if download_pdf(url, filename, timeout=timeout):
            return True
        print(f"Error downloading {filename}: {extra}")
        # Waiting is pointless once the last attempt has failed.
        if attempt < retries:
            time.sleep(delay)
    return False
    

def get_pdf_data(id, save_dir='temp/'):
    """Download one submission PDF and extract its text and basic statistics.

    The PDF is fetched from ``https://openreview.net/pdf?id=<id>`` into
    ``save_dir``, parsed with ``PdfReader`` and removed again afterwards,
    whether or not parsing succeeded.

    Args:
        id: Submission id appended to the OpenReview PDF URL; expected to be
            a string.
        save_dir: Directory for the temporary PDF; created if missing.

    Returns:
        dict with the keys ``id``, ``text`` (text of all pages concatenated),
        ``num_characters``, ``num_pages`` and ``file_size`` (bytes on disk).
        If anything fails (download, parsing, a non-string ``id``) the same
        keys are returned with an empty text and zero counts.
    """
    BASE_URL = 'https://openreview.net/pdf?id='
    filename = None  # stays None if we fail before the path is built
    try:
        # Make temp directory if it does not exist
        save_dir = Path(save_dir)
        save_dir.mkdir(parents=True, exist_ok=True)
        filename = save_dir / (id + '.pdf')
        download_pdf(BASE_URL + id, filename)

        # Read pdf
        reader = PdfReader(filename)
        number_of_pages = len(reader.pages)
        file_size = os.stat(filename).st_size
        # `or ''` guards against a page yielding no text object, so a single
        # odd page cannot discard the whole paper.
        text_all = ''.join(page.extract_text() or '' for page in reader.pages)
        return {'id': id, 'text': text_all, 'num_characters': len(text_all), 'num_pages': number_of_pages, 'file_size': file_size}
    except Exception as e:
        # Best effort: report the problem and return an empty record so one bad
        # paper does not abort the whole crawl. Unlike a bare `except`, this
        # lets KeyboardInterrupt/SystemExit propagate.
        print(f"Error processing {id}: {e}")
        return {'id': id, 'text': '', 'num_characters': 0, 'num_pages': 0, 'file_size': 0}
    finally:
        # Delete pdf, also on failure, so broken downloads do not pile up.
        if filename is not None:
            try:
                os.remove(filename)
            except OSError:
                pass  # nothing was written, or it is already gone


# Use multiprocessing to download pdfs
def get_pdf_multi(ids, ratio=0.8):
    """Run ``get_pdf_data`` over ``ids`` in a pool of worker processes.

    Args:
        ids: Sequence of submission ids; each one is passed to ``get_pdf_data``.
        ratio: Fraction of ``mp.cpu_count()`` to use as the number of worker
            processes. The result is clamped to at least one process.

    Returns:
        list with one ``get_pdf_data`` result per id, in the order of ``ids``.
    """
    if len(ids) == 0:
        # Nothing to do; avoid spawning a pool for no work.
        return []
    # int() truncates, so a small ratio or a single-core machine would give 0,
    # which Pool rejects.
    num_processes = max(1, int(ratio * mp.cpu_count()))
    with Pool(num_processes) as p:
        # imap yields results in input order, so tqdm can show live progress.
        data = list(tqdm(p.imap(get_pdf_data, ids), total=len(ids)))
    return data

In [4]:
# Crawl the PDF of every submission id in `df`, in parallel.
# The result is a list with one dict per id (see get_pdf_data).
data = get_pdf_multi(df['id'].values.tolist(), ratio=.8)

  0%|          | 0/4874 [00:00<?, ?it/s]

unknown widths : 
[0, IndirectObject(317, 0, 140366911357040)]
unknown widths : 
[0, IndirectObject(319, 0, 140366911357040)]
FloatObject (b'0.00-86730235') invalid; use 0.0 instead
FloatObject (b'0.000000000000-15543122') invalid; use 0.0 instead
FloatObject (b'0.000000000000-19539925') invalid; use 0.0 instead
FloatObject (b'0.0000000000000-35527137') invalid; use 0.0 instead
FloatObject (b'0.0000000000000-71054274') invalid; use 0.0 instead
FloatObject (b'0.000000000000-14210855') invalid; use 0.0 instead
FloatObject (b'0.000000000000-38339703') invalid; use 0.0 instead
FloatObject (b'0.0000000000000000000000000000-4256562') invalid; use 0.0 instead
FloatObject (b'0.00000000000-11368684') invalid; use 0.0 instead
FloatObject (b'0.00-50') invalid; use 0.0 instead
FloatObject (b'0.00-50') invalid; use 0.0 instead
FloatObject (b'0.00-50') invalid; use 0.0 instead
FloatObject (b'0.00-50') invalid; use 0.0 instead
FloatObject (b'0.00-27322833') invalid; use 0.0 instead
FloatObject (b'0.0

In [7]:
# Turn the crawl results into a DataFrame.
# Note: this rebinds `data` from the crawl result to a DataFrame; the save cell
# below relies on the DataFrame form.
data = pd.DataFrame(data)
data.tail()

Unnamed: 0,id,text,num_characters,num_pages,file_size
4869,,,0,0,0
4870,,,0,0,0
4871,,,0,0,0
4872,,,0,0,0
4873,,,0,0,0


## Save dataset as `hdf5` file

In [35]:
# Persist the extracted PDF data under key 'df' in a date-stamped HDF5 file.
# The PerformanceWarning shown below is PyTables pickling the object-typed
# 'id' and 'text' columns.
data.to_hdf(DATA_PATH + f'{venue_short}_pdf_data_{time.strftime("%Y%m%d")}.h5', key='df', mode='a')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['id', 'text'], dtype='object')]

  data.to_hdf(DATA_PATH + f'{venue_short}_pdf_data_{time.strftime("%Y%m%d")}.h5', key='df', mode='a')


## (optional) get older data parsed before results

In [42]:
# Reload today's freshly written PDF data and an older snapshot (20221120).
new_df = pd.read_hdf(DATA_PATH + f'{venue_short}_pdf_data_{time.strftime("%Y%m%d")}.h5', key='df')
# NOTE(review): unlike new_df, this path has no DATA_PATH prefix, so the file is
# looked up relative to the working directory -- confirm that is intended.
old_df = pd.read_hdf(f'{venue_short}_pdf_data_20221120.h5', key='df')

In [54]:
# Preview the first rows of the freshly parsed data.
new_df.head()

Unnamed: 0,id,text,num_characters,num_pages,file_size
0,RUzSobdYy0V,QUANTIFYING AND MITIGATING THE IMPACT OF LA-\n...,78841,27,1959287
1,N3kGYG3ZcTi,Under review as a conference paper at ICLR 202...,24857,8,199010
2,tmIiMPl4IPa,Under review as a conference paper at ICLR 202...,53125,17,5864879
3,mhnHqRqcjYU,Under review as a conference paper at ICLR 202...,77689,25,2192779
4,sZI1Oj9KBKy,Under review as a conference paper at ICLR 202...,81745,27,4586711


In [67]:
# Merge the two snapshots: walk over the ids of old_df in their original order
# and, for each id, keep the rows from new_df when new_df has that id,
# otherwise fall back to the rows from old_df.
#
# Rows are indexed by id once up front, so each lookup is a dict access instead
# of a boolean-mask scan over the whole frame for every id. groupby leaves out
# null ids, which matches `==` never matching NaN.

ids = old_df['id'].values.tolist()

new_rows_by_id = {key: rows for key, rows in new_df.groupby('id', sort=False)}
old_rows_by_id = {key: rows for key, rows in old_df.groupby('id', sort=False)}
no_rows = old_df.iloc[:0]  # what a lookup of a null id in old_df yields

papers = []
missing = 0
for paper_id in ids:
    paper_new = new_rows_by_id.get(paper_id)

    # if no paper new, then keep old
    if paper_new is None:
        papers.append(old_rows_by_id.get(paper_id, no_rows))
        missing += 1
    else:
        papers.append(paper_new)

# NOTE: this rebinds `df`, which earlier held the submission list.
df = pd.concat(papers)
df = df.reset_index(drop=True)
print(f"Missing {missing} papers")

Missing 1023 papers


In [70]:
# Inspect the last rows of the merged frame.
df.tail()

Unnamed: 0,id,text,num_characters,num_pages,file_size
4869,IJwhRE510b,Under review as a conference paper at ICLR 202...,65682,18,1817942
4870,4XMAzZasId,Under review as a conference paper at ICLR 202...,102838,29,822566
4871,KjKZaJ5Gbv,Under review as a conference paper at ICLR 202...,56608,14,5518446
4872,ED2Jjms9A4H,Under review as a conference paper at ICLR 202...,70902,20,11203876
4873,jU-AXLS2bl,Under review as a conference paper at ICLR 202...,47722,15,1766995


In [69]:
# Save the merged frame. This uses the same file-name pattern and key ('df') as
# the earlier save of `data`.
# NOTE(review): when run on the same day it targets the very same file -- confirm
# that replacing the earlier contents is intended.
df.to_hdf(DATA_PATH + f'{venue_short}_pdf_data_{time.strftime("%Y%m%d")}.h5', key='df', mode='a')