# Crawl all PDF files for text and statistics
Here we download all the submission PDFs and extract their text and basic statistics (character count, page count, and file size).

In [3]:
%load_ext autoreload
%autoreload 2

import logging
import multiprocessing as mp
import os
import time
from multiprocessing import Pool
from pathlib import Path

import pandas as pd
import requests
from PyPDF2 import PdfReader
from tqdm import tqdm
from tqdm.notebook import tqdm


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Read submission list
Here we load the _notes_ (the list of all submissions) that were previously scraped with OpenReview's API, which is much faster than Selenium-based scraping.


In [4]:
# Configuration for the venue being processed
DATA_PATH = '../data/'
venue = 'NeurIPS.cc/2022/Conference'
venue_short = 'neurips2022'
# Today's date as YYYYMMDD; it is part of the input file name built below
date = time.strftime("%Y%m%d")

# Load the submission table from the date-stamped HDF5 file
submissions_file = DATA_PATH + f'{venue_short}_data_full_{date}.h5'
df = pd.read_hdf(submissions_file, key='df')

## Download PDFs and crawl data

In [5]:
# Download PDFs from OpenReview; URLs are built from the submission id (df['id']) rather than df['content.pdf']
def download_pdf(url, filename, timeout=60):
    """Download the resource at ``url`` and save its raw bytes to ``filename``.

    Args:
        url: Address to fetch; redirects are followed.
        filename: Destination path; an existing file is overwritten.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller indefinitely.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        OSError: If ``filename`` cannot be written.
    """
    r = requests.get(url, allow_redirects=True, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of saving an error page as if it were a PDF.
    r.raise_for_status()
    # The context manager guarantees the handle is flushed and closed.
    with open(filename, 'wb') as f:
        f.write(r.content)


def get_pdf_data(id, save_dir='temp/'):
    """Download one submission's PDF and extract its text and basic statistics.

    The PDF is fetched from OpenReview into ``save_dir``, parsed, and then
    deleted again whether or not parsing succeeded. Processing is best-effort:
    any failure is logged and an all-empty record is returned, so a single bad
    PDF does not abort a long crawl.

    Args:
        id: OpenReview submission id (string). It is appended to the PDF base
            URL and used as the temporary file name. The name shadows the
            ``id`` builtin but is kept for backward compatibility.
        save_dir: Directory for the temporary download; created if missing.

    Returns:
        dict with keys ``id``, ``text`` (text of all pages concatenated),
        ``num_characters`` (length of ``text``), ``num_pages`` and
        ``file_size`` (size of the downloaded file in bytes). On failure
        ``text`` is ``''`` and the numeric fields are ``0``.
    """
    BASE_URL = 'https://openreview.net/pdf?id='
    filename = None
    try:
        # Make temp directory if it does not exist
        save_dir = Path(save_dir)
        save_dir.mkdir(parents=True, exist_ok=True)
        filename = save_dir / (id + '.pdf')
        download_pdf(BASE_URL + id, filename)

        # Read pdf
        reader = PdfReader(filename)
        number_of_pages = len(reader.pages)
        file_size = os.stat(filename).st_size
        # Join once instead of growing a string page by page; `or ''` keeps a
        # page that yields no text from discarding the whole document.
        text_all = ''.join(page.extract_text() or '' for page in reader.pages)
        return {'id': id, 'text': text_all, 'num_characters': len(text_all), 'num_pages': number_of_pages, 'file_size': file_size}
    except Exception as exc:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt/SystemExit still stop the crawl, and record why
        # this submission produced an empty row.
        logging.getLogger(__name__).warning('Failed to process PDF %s: %r', id, exc)
        return {'id': id, 'text': '', 'num_characters': 0, 'num_pages': 0, 'file_size': 0}
    finally:
        # Delete pdf, also when the download or the parsing failed
        if filename is not None and filename.exists():
            try:
                os.remove(filename)
            except OSError as exc:
                logging.getLogger(__name__).warning('Could not delete %s: %r', filename, exc)


# Use multiprocessing to download pdfs
def get_pdf_multi(ids, ratio=0.8):
    """Run ``get_pdf_data`` over ``ids`` in a pool of worker processes.

    Args:
        ids: Sized sequence of submission ids (its length drives the progress
            bar).
        ratio: Fraction of the machine's CPU cores to use as workers.

    Returns:
        list with one ``get_pdf_data`` result per id, in the same order as
        ``ids``.
    """
    if len(ids) == 0:
        # Nothing to do; avoid spawning worker processes for an empty crawl.
        return []
    # int() truncates, so a single-core machine or a small ratio would yield 0
    # workers, which Pool rejects with a ValueError. Always keep at least one.
    num_processes = max(1, int(ratio * mp.cpu_count()))
    with Pool(num_processes) as p:
        # imap yields results lazily and in input order, so the progress bar
        # advances as each PDF finishes.
        data = list(tqdm(p.imap(get_pdf_data, ids), total=len(ids)))
    return data

In [6]:
# Crawl every submission's PDF, using 80% of the available CPU cores
submission_ids = df['id'].values.tolist()
data = get_pdf_multi(submission_ids, ratio=0.8)

  0%|          | 0/2824 [00:00<?, ?it/s]

FloatObject (b'0.00-28544242') invalid; use 0.0 instead
FloatObject (b'0.00-17013554') invalid; use 0.0 instead
FloatObject (b'0.000000000-2910383') invalid; use 0.0 instead
 impossible to decode XFormObject /Im9
 impossible to decode XFormObject /Im14
FloatObject (b'0.00-5377789') invalid; use 0.0 instead
FloatObject (b'0.00-5377789') invalid; use 0.0 instead
FloatObject (b'0.00-5377789') invalid; use 0.0 instead
unknown widths : 
[0, IndirectObject(1146, 0, 140246656300128)]
unknown widths : 
[0, IndirectObject(1148, 0, 140246656300128)]
unknown widths : 
[0, IndirectObject(1150, 0, 140246656300128)]
unknown widths : 
[0, IndirectObject(1152, 0, 140246656300128)]
FloatObject (b'0.00-6361322') invalid; use 0.0 instead
FloatObject (b'0.00-6361322') invalid; use 0.0 instead
FloatObject (b'0.00-82034426') invalid; use 0.0 instead
FloatObject (b'0.00-5291005') invalid; use 0.0 instead
FloatObject (b'0.00-82034426') invalid; use 0.0 instead
FloatObject (b'0.00-5291005') invalid; use 0.0 in

In [9]:
# Turn the list of per-PDF records into a table, one row per submission.
# Note: `data` is rebound here from a list of dicts to a DataFrame.
data = pd.DataFrame(data)
# Show the last rows as a quick sanity check of the crawl
data.tail()

Unnamed: 0,id,text,num_characters,num_pages,file_size
2819,PCQyUvAmKs,Don’t Pour Cereal into Coffee: Differentiable\...,72867,22,1779352
2820,tIqzLFf3kk,Rank Diminishing in Deep Neural Networks\nRuil...,79795,28,3910141
2821,k7xZKpYebXL,A Lower Bound of Hash Codes’ Performance\nXiao...,43848,13,2032842
2822,yam42JWePu,Fine-Grained Semantically Aligned\nVision-Lang...,84088,23,6930310
2823,FurHLDnmC5v,Sample Complexity of Learning Heuristic Functi...,49050,13,464242


## Save dataset as `hdf5` file

In [10]:
# Persist the crawl results under key 'df' in a file stamped with the current date
pdf_data_file = DATA_PATH + f'{venue_short}_pdf_data_{time.strftime("%Y%m%d")}.h5'
data.to_hdf(pdf_data_file, key='df', mode='a')