# Crawl all PDF files for text and statistics
Here we download every submission PDF and extract its text and basic statistics (page count, character count, file size).

In [1]:
%load_ext autoreload
%autoreload 2

import os
import time
import pandas as pd
import multiprocessing as mp
from multiprocessing import Pool
from tqdm import tqdm
from tqdm.notebook import tqdm
import requests
from pathlib import Path
from PyPDF2 import PdfReader


## Read submission list
Here we load the _notes_ (the list of all submissions), which were scraped using OpenReview's API; this is much faster than Selenium-based scraping.


In [6]:
# Where the datasets live and which venue is being processed.
DATA_PATH = "../data/"
venue = "ICLR.cc/2023/Conference"
venue_short = "iclr2023"

# Today's date stamp (YYYYMMDD); it is part of the input file name below.
date = time.strftime("%Y%m%d")

# Load the DataFrame stored under key 'df' in the HDF5 file.
df = pd.read_hdf(f"{DATA_PATH}{venue_short}_data_full_{date}.h5", key="df")

## Download PDFs and crawl data

In [11]:
# Download pdfs from df['content.pdf']
def download_pdf(url, filename, timeout=60):
    """Download ``url`` and write the response body to ``filename``.

    Args:
        url: Address to fetch; redirects are followed.
        filename: Destination path. It is opened in binary write mode, so an
            existing file is overwritten.
        timeout: Timeout in seconds handed to ``requests.get`` so that a
            stalled server cannot block the caller forever.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``). No file is written in that case.
    """
    r = requests.get(url, allow_redirects=True, timeout=timeout)
    # Fail loudly on an error status instead of saving an error page as a PDF.
    r.raise_for_status()
    # The context manager guarantees the handle is flushed and closed.
    with open(filename, 'wb') as f:
        f.write(r.content)


def get_pdf_data(id, save_dir='temp/'):
    """Download one PDF from OpenReview and extract its text and statistics.

    The PDF is fetched from ``https://openreview.net/pdf?id=<id>`` into
    ``save_dir``, parsed with ``PdfReader``, and deleted again before the
    function returns, whether or not processing succeeded.

    Args:
        id: Identifier appended to the download URL and used as the file
            stem. (The name shadows the builtin but is kept for callers.)
        save_dir: Directory for the temporary PDF; created if missing.

    Returns:
        dict with keys ``'id'``, ``'text'`` (text of all pages concatenated),
        ``'num_characters'``, ``'num_pages'`` and ``'file_size'`` (size of the
        downloaded file in bytes). If anything fails, the same keys are
        returned with an empty text and zero counts, so one bad PDF does not
        abort a batch run.
    """
    import logging

    filename = None
    try:
        # Make temp directory if it does not exist
        save_dir = Path(save_dir)
        save_dir.mkdir(parents=True, exist_ok=True)
        BASE_URL = 'https://openreview.net/pdf?id='
        filename = save_dir / (id + '.pdf')
        download_pdf(BASE_URL + id, filename)

        # Read pdf
        reader = PdfReader(filename)
        number_of_pages = len(reader.pages)
        file_size = os.stat(filename).st_size
        # Join once at the end instead of growing a string page by page.
        text_all = ''.join([page.extract_text() for page in reader.pages])

        return {'id': id, 'text': text_all, 'num_characters': len(text_all), 'num_pages': number_of_pages, 'file_size': file_size}
    except Exception as exc:
        # Best effort: report the failure and return an empty record.
        # ``Exception`` (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate so a running crawl can still be interrupted.
        logging.getLogger(__name__).warning('Failed to process PDF %s: %r', id, exc)
        return {'id': id, 'text': '', 'num_characters': 0, 'num_pages': 0, 'file_size': 0}
    finally:
        # Delete pdf on success and on failure so temp files do not pile up.
        if filename is not None:
            try:
                os.remove(filename)
            except OSError:
                # Nothing to clean up (e.g. the download never created it).
                pass


# Use multiprocessing to download pdfs
def get_pdf_multi(ids, ratio=0.8):
    """Run ``get_pdf_data`` over ``ids`` in a pool of worker processes.

    Args:
        ids: Sequence of identifiers; each one is passed to ``get_pdf_data``.
            Its length is used as the progress-bar total.
        ratio: Fraction of ``mp.cpu_count()`` to use as the number of worker
            processes.

    Returns:
        list of the values returned by ``get_pdf_data``, in the same order
        as ``ids``.
    """
    # int() truncates, so a small ratio or a single-core machine would give 0
    # and Pool(0) raises ValueError; always keep at least one worker.
    num_processes = max(1, int(ratio * mp.cpu_count()))
    with Pool(num_processes) as p:
        # imap yields results in input order while tqdm reports progress.
        data = list(tqdm(p.imap(get_pdf_data, ids), total=len(ids)))
    return data

In [12]:
# Get pdf data
# Crawl every submission listed in df, using 80% of the available cores.
submission_ids = df['id'].values.tolist()
data = get_pdf_multi(submission_ids, ratio=0.8)

  0%|          | 0/4874 [00:00<?, ?it/s]

unknown widths : 
[0, IndirectObject(317, 0, 140410485629440)]
unknown widths : 
[0, IndirectObject(319, 0, 140410485629440)]
FloatObject (b'0.00-86730235') invalid; use 0.0 instead
FloatObject (b'0.000000000000-5684342') invalid; use 0.0 instead
FloatObject (b'0.000000000000-5684342') invalid; use 0.0 instead
FloatObject (b'0.00-50') invalid; use 0.0 instead
FloatObject (b'0.00-50') invalid; use 0.0 instead
FloatObject (b'0.00-50') invalid; use 0.0 instead
FloatObject (b'0.00-50') invalid; use 0.0 instead
unknown widths : 
[0, IndirectObject(941, 0, 140410486189856)]
unknown widths : 
[0, IndirectObject(943, 0, 140410486189856)]
unknown widths : 
[0, IndirectObject(945, 0, 140410486189856)]
unknown widths : 
[0, IndirectObject(947, 0, 140410486189856)]
unknown widths : 
[0, IndirectObject(949, 0, 140410486189856)]
unknown widths : 
[0, IndirectObject(951, 0, 140410486189856)]
unknown widths : 
[0, IndirectObject(953, 0, 140410486189856)]
unknown widths : 
[0, IndirectObject(941, 0, 14

In [13]:
# Turn the per-PDF records into a table and preview its last rows.
data = pd.DataFrame(data=data)
data.tail(5)

Unnamed: 0,id,text,num_characters,num_pages,file_size
4869,IJwhRE510b,Under review as a conference paper at ICLR 202...,64384,18,1817114
4870,4XMAzZasId,Under review as a conference paper at ICLR 202...,64238,19,530259
4871,KjKZaJ5Gbv,Under review as a conference paper at ICLR 202...,56608,14,5518446
4872,ED2Jjms9A4H,Under review as a conference paper at ICLR 202...,61944,18,8040706
4873,jU-AXLS2bl,Under review as a conference paper at ICLR 202...,47722,15,1766995


## Save dataset as `hdf5` file

In [14]:
# The output file name carries the date stamp (YYYYMMDD) taken at save time.
pdf_data_file = DATA_PATH + f'{venue_short}_pdf_data_{time.strftime("%Y%m%d")}.h5'
data.to_hdf(pdf_data_file, key='df', mode='a')