# Splitting Pdfs

The following code takes a PDF from JSTOR, checks for a cover page, and, if one exists, creates a copy of the PDF without the cover page, named according to the conventions specified below. The check is needed because documents that were taken from Sci-Hub do not have a JSTOR cover page.

We also use this opportunity to count the number of pages in each document. Although we do have start and end pages, counting the pages of the PDF is a much more accurate measure, especially if the article starts or ends in roman numerals or uses an alternative page-numbering system. For example, pages in special issues are usually prefixed by arbitrary letters such as S or I.

Next, each pdf is split into different pages with page 1 indicated by suffix 0.

In [1]:
import pandas as pd
import numpy as np
import os
import re
from unidecode import unidecode
import fitz #for opening pdfs

# for OCR using PyTesseract
import cv2                              # pre-processing images
import pytesseract                      # extracting text from images
import numpy as np
import matplotlib.pyplot as plt         # displaying output images
from PIL import Image
import regex

In [2]:
# Machine-specific absolute folders used by this notebook.
# data_base_path is the parent of "Combined/", where the master-list pickles
# are read from and written to; pdf_base_path holds the per-journal PDF folders.
# base_path and nets_path are not referenced by the cells shown here.
data_base_path = "/Users/sijiawu/Work/Thesis/Data/"
base_path = data_base_path + "Affiliations/"
nets_path = "/Users/sijiawu/Work/80YearsEconomicResearch/032_auth_graph_gen/networks/"
pdf_base_path = "/Users/sijiawu/Dropbox/80YearsEconomicResearch/Data/0_PDF/"

In [3]:
# Journal codes handled by this notebook.
JOURNALS = ['AER', 'JPE', 'ECTA', 'RES', 'QJE']

# Read in the processed master list, keep only the first occurrence of any
# fully duplicated row, and renumber the rows from 0.
All = pd.read_pickle(data_base_path + "Combined/011_merged_proc_scopus_inception_2020.pkl")
All = All.loc[~All.duplicated()].reset_index(drop=True)

In [4]:
All.shape

(62277, 61)

In [5]:
All.columns

Index(['issue_url', 'author', 'title', 'journal', 'volume', 'number', 'pages',
       'year', 'ISSN', 'abstract', 'URL', 'publisher', 'content_type', 'type',
       'jid', 'author_split', 'urldate', 'reviewed-author', 'uploaded',
       'title_10', 'URL_og', 'number_og', 'title_og', 'author_og', 'pages_og',
       'j_fix', 'scopus_jid', 'scopus_id', 'scopus_authorgroup',
       'scopus_authors', 'scopus_affiliations', 'scopus_references',
       'scopus_author_full_names', 'scopus_title', 'scopus_year',
       'scopus_source_title', 'scopus_volume', 'scopus_issue', 'scopus_art_no',
       'scopus_page_start', 'scopus_page_end', 'scopus_page_count',
       'scopus_cited_by', 'scopus_doi', 'scopus_abstract', 'scopus_publisher',
       'scopus_document_type', 'scopus_publication_stage',
       'scopus_open_access', 'scopus_source', 'scopus_eid', 'scopus_title_og',
       'scopus_volume_og', 'scopus_issue_og', 'scopus_page_start_og',
       'scopus_page_end_og', 'scopus_year_og', 's_fix', 

In [6]:
# The article id is the last '/'-separated segment of its URL.
All['id'] = All['URL'].str.rsplit('/', n=1).str[-1]
# Lower-cased version of the author string, assigned positionally (as an array).
All.loc[:, 'authors_lower'] = All['author'].str.lower().to_numpy()

## Reduce the set to exclude reviews and miscellaneous content

In [7]:
# Content types that are left out of the working set ...
ex_content = [
    'MISC',
    'Errata',
    'Discussion',
    'Review',
    'Review2',
]
# ... and the content types that are kept (used with .isin() further down).
content = [
    'Article',
    'Comment',
    'Reply',
    'Rejoinder',
]

All.columns

Index(['issue_url', 'author', 'title', 'journal', 'volume', 'number', 'pages',
       'year', 'ISSN', 'abstract', 'URL', 'publisher', 'content_type', 'type',
       'jid', 'author_split', 'urldate', 'reviewed-author', 'uploaded',
       'title_10', 'URL_og', 'number_og', 'title_og', 'author_og', 'pages_og',
       'j_fix', 'scopus_jid', 'scopus_id', 'scopus_authorgroup',
       'scopus_authors', 'scopus_affiliations', 'scopus_references',
       'scopus_author_full_names', 'scopus_title', 'scopus_year',
       'scopus_source_title', 'scopus_volume', 'scopus_issue', 'scopus_art_no',
       'scopus_page_start', 'scopus_page_end', 'scopus_page_count',
       'scopus_cited_by', 'scopus_doi', 'scopus_abstract', 'scopus_publisher',
       'scopus_document_type', 'scopus_publication_stage',
       'scopus_open_access', 'scopus_source', 'scopus_eid', 'scopus_title_og',
       'scopus_volume_og', 'scopus_issue_og', 'scopus_page_start_og',
       'scopus_page_end_og', 'scopus_year_og', 's_fix', 

In [8]:
pd.unique(All.content_type)

array(['Article', 'MISC', 'Comment', 'Reply', 'Errata', 'Rejoinder',
       'Discussion', 'Review', 'Review2'], dtype=object)

In [9]:
# Keep only the JSTOR bibliographic columns plus the derived 'id';
# the scopus_* and *_og columns are not needed for the PDF pass.
Reduced = All.loc[:, [
    'author', 'title', 'journal', 'volume', 'number', 'pages',
    'year', 'ISSN', 'abstract', 'URL', 'publisher', 'content_type', 'type',
    'jid', 'id',
]]

In [10]:
# Only papers that have an author recorded and were published in 1940 or later.
R_1940 = Reduced.loc[
    Reduced['author'].notna() & (Reduced['year'] >= 1940)
].reset_index(drop=True)
# Keep the content types listed in `content`; this drops miscellaneous items,
# errata, discussions and reviews.
R_1940_NMR = R_1940.loc[R_1940['content_type'].isin(content)].reset_index(drop=True)

In [11]:
Reduced.shape

(62277, 15)

## Set path

In [12]:
# Working set for the PDF pass. This binds a second name to the same
# DataFrame object; no copy is made.
Merged = R_1940_NMR
# Sanity check: only the retained content types should be listed.
print(pd.unique(Merged['content_type']))

['Article' 'Comment' 'Reply' 'Rejoinder']


In [13]:
# Zoom factors (horizontal, vertical) for rendering a page to an image;
# `mat` is the matrix handed to page.get_pixmap() in coverpage_removal.
zoom_x = zoom_y = 2.0
mat = fitz.Matrix(zoom_x, zoom_y)

## Set functions

In [14]:
def make_new_pdf(doc1, doc2_name, from_pg):
    """Copy the pages of ``doc1`` from ``from_pg`` onwards into a new PDF file.

    Args:
        doc1: Open ``fitz`` document to copy pages from.
        doc2_name: Path the new PDF is saved to.
        from_pg: 0-based index of the first page to copy (inclusive).

    Returns:
        The number of pages in the newly written PDF.
    """
    doc2 = fitz.open()  # new empty PDF
    try:
        doc2.insert_pdf(doc1, from_page=from_pg)
        pg_count = doc2.page_count
        doc2.save(doc2_name)
    finally:
        # Release the output document even when the insert or the save fails.
        doc2.close()
    return pg_count

In [15]:
def make_new_pdf2(doc1, doc2_name, from_pg, to_pg):
    """Copy pages ``from_pg`` through ``to_pg`` of ``doc1`` into a new PDF file.

    Args:
        doc1: Open ``fitz`` document to copy pages from.
        doc2_name: Path the new PDF is saved to.
        from_pg: 0-based index of the first page to copy (inclusive).
        to_pg: 0-based index of the last page to copy (inclusive).

    Returns:
        The number of pages in the newly written PDF.
    """
    doc2 = fitz.open()  # new empty PDF
    try:
        doc2.insert_pdf(doc1, from_page=from_pg, to_page=to_pg)
        pg_count = doc2.page_count
        doc2.save(doc2_name)
    finally:
        # Release the output document even when the insert or the save fails.
        doc2.close()
    return pg_count

In [16]:
def coverpage_removal(og_path, woc_path, shardpath, id):
    """Write a copy of a PDF without its JSTOR cover page, if it has one.

    The first page is rendered to a PNG (an existing PNG is reused), run
    through Tesseract OCR, and searched for the markers ``AUTHOR(S)`` or
    ``PUBLISHED BY:``. When either is found the output PDF starts at the
    second page; otherwise every page is copied. No PDF is written when
    ``woc_path`` already exists.

    Args:
        og_path: Path of the original PDF.
        woc_path: Path the cover-less PDF is written to.
        shardpath: Path prefix (ending in a separator) under which the PNG of
            the first page is stored.
        id: Article identifier used in the PNG file name. The name shadows
            the builtin but is kept so existing callers keep working.

    Returns:
        Always 1.

    Raises:
        Exception: "this file is corrupt" when the PDF cannot be opened or
            its first page cannot be accessed.
    """
    doc = None
    try:
        doc = fitz.open(og_path)
        page = doc[0]
    except Exception as err:
        # fitz.open can fail before `doc` is bound; calling close() on None
        # would raise AttributeError and hide the real problem.
        if doc is not None:
            doc.close()
        raise Exception("this file is corrupt") from err

    try:
        png = shardpath + id + '_page-%i.png' % page.number
        if not os.path.exists(png):
            # Uses the module-level zoom matrix `mat`.
            pix = page.get_pixmap(matrix=mat)
            print(png)
            pix.save(png)

        if not os.path.exists(woc_path):
            original_image = cv2.imread(png)
            # NOTE(review): lang='lat' presumably selects Tesseract's Latin
            # model -- confirm this is intended rather than 'eng'.
            text = pytesseract.image_to_string(original_image, lang='lat', config='--oem 3 --psm 6')
            ocr_upper = text.upper()

            has_cover = (
                re.search(r'AUTHOR\(S\)', ocr_upper) is not None
                or re.search(r'PUBLISHED BY:', ocr_upper) is not None
            )
            if has_cover:
                make_new_pdf(doc, woc_path, 1)  # skip the cover page
                print('found')
            else:
                make_new_pdf(doc, woc_path, 0)  # keep every page
                print('not found')
    finally:
        # Close the source document even when rendering, OCR or saving fails.
        doc.close()

    return 1

In [17]:
def shard(SCANNED_FILE, id, shard_path, year, bucket):
    """Split a PDF into one single-page PDF per page.

    Page files are named ``<id>_wo_cover_page-<n>.pdf`` with ``n`` starting
    at 0, and are only written when they do not exist yet.

    Args:
        SCANNED_FILE: Path of the PDF to split.
        id: Article identifier used in the page file names. The name shadows
            the builtin but is kept so existing callers keep working.
        shard_path: Path prefix (ending in a separator) under which the page
            files are written.
        year: Value stored in the ``year`` column of every returned row.
        bucket: URL prefix put in front of each page file name to build
            ``pdf_url``.

    Returns:
        A tuple ``(frame, pg_count)``: ``frame`` has one row per page with the
        columns ``pdf_url`` (the URL wrapped in single quotes) and ``year``;
        ``pg_count`` is the number of pages in ``SCANNED_FILE``.

    Raises:
        Exception: "this file is corrupt" when the PDF cannot be opened.
    """
    try:
        doc = fitz.open(SCANNED_FILE)
    except Exception as err:
        # Nothing to close here: `doc` was never bound when open() failed.
        raise Exception("this file is corrupt") from err

    rows = []
    try:
        pg_count = doc.page_count
        for page in doc:
            page_file = id + '_wo_cover_page-%i.pdf' % page.number
            doc2_name = shard_path + page_file

            if not os.path.exists(doc2_name):
                make_new_pdf2(doc, doc2_name, page.number, page.number)
            rows.append(
                {
                    'pdf_url': '\'' + bucket + page_file + '\'',
                    'year': year
                }
            )
    finally:
        # Close the source document even when writing a page fails.
        doc.close()
    return (pd.DataFrame(rows), pg_count)

## Going through the full data set

In [18]:
# Placeholders for the per-journal lists of single-page PDF URLs.
# NOTE: the loop below rebinds ranges[journal][2] to new DataFrames, so the
# results must be read from `ranges`; these five names keep pointing at the
# initial empty frames.
JPE_refs=pd.DataFrame()
ECTA_refs=pd.DataFrame()
QJE_refs=pd.DataFrame()
AER_refs=pd.DataFrame()
RES_refs=pd.DataFrame()

# article id -> page count of its cover-less PDF
pd_df={}

# journal -> [first year, last year, accumulated page list]
ranges={
    "AER": [1940, 2020, AER_refs], # I usually run these one at a time
    "ECTA": [1940, 2020, ECTA_refs],
    "JPE": [1940, 2020, JPE_refs],
    "QJE": [1940, 2020, QJE_refs],
    "RES": [1940, 2020, RES_refs]
}
for journal in ranges:
    bucket='https://myawsbucket-1231.s3.eu-west-3.amazonaws.com/'+journal+'_shards/'
    print(journal)
    first_year, last_year = ranges[journal][0], ranges[journal][1]
    subset=Merged[(Merged['year']<=last_year) & (Merged['year']>=first_year) & (Merged['jid'].str.upper()==journal)].drop_duplicates()

    # Collect the per-article frames and concatenate once per journal;
    # concatenating onto a growing frame for every article is quadratic.
    page_frames=[]
    try:
        for i in subset.index:
            jid=Merged.loc[i,'jid']
            art_id=Merged.loc[i,'id']

            # make the file paths
            journal_dir=pdf_base_path+jid.upper()+'/'+jid
            og=journal_dir+'_og/'
            woc=journal_dir+'_wo_cover/'
            sha=journal_dir+'_shards/'
            # NOTE(review): `png` is never used; coverpage_removal is handed
            # `sha`, so the first-page PNGs land in the shards folder -- confirm.
            png=journal_dir+'_shards_png/'
            path_woc=woc+art_id+'_wo_cover.pdf'
            path_og=og+art_id+'.pdf'

            if not os.path.exists(path_og):
                print(path_og+' missing')
                continue

            if not os.path.exists(path_woc):
                try:
                    coverpage_removal(path_og, path_woc, sha, art_id) #remove cover page
                except Exception as e:
                    print(str(e)+ " issue generating coverless pdf " +art_id)
                    continue

            # coverpage_removal may have returned without producing the file
            if not os.path.exists(path_woc):
                continue

            try:
                page_frame, n_pages=shard(path_woc, art_id, sha, int(Merged.loc[i,'year']), bucket) #shard
            except Exception as e:
                print(str(e)+ " issue sharding woc pdf " +art_id + "potentially malformed")
                continue

            page_frames.append(page_frame)
            pd_df[art_id]=n_pages
    finally:
        # Runs on interruption too, so a partially processed journal keeps
        # the pages gathered so far.
        if page_frames:
            ranges[journal][2]=pd.concat([ranges[journal][2], *page_frames], ignore_index=True)

AER
ECTA
JPE
QJE
RES


## Save the lists

In [19]:
# Write the AER page list to a CSV in the current working directory
# (no index column), then display its (rows, columns).
ranges['AER'][2].to_csv(path_or_buf="aer_refs_all.csv", index=False)
ranges['AER'][2].shape

(153202, 2)

In [20]:
# Write the ECTA page list to a CSV in the current working directory
# (no index column), then display its (rows, columns).
ranges['ECTA'][2].to_csv(path_or_buf="ecta_refs_all.csv", index=False)
ranges['ECTA'][2].shape

(95467, 2)

In [21]:
# Write the JPE page list to a CSV in the current working directory
# (no index column), then display its (rows, columns).
ranges['JPE'][2].to_csv(path_or_buf="jpe_refs_all.csv", index=False)
ranges['JPE'][2].shape

(92739, 2)

In [22]:
# Write the QJE page list to a CSV in the current working directory
# (no index column), then display its (rows, columns).
ranges['QJE'][2].to_csv(path_or_buf="qje_refs_all.csv", index=False)
ranges['QJE'][2].shape

(87187, 2)

In [23]:
# Write the RES page list to a CSV in the current working directory
# (no index column), then display its (rows, columns).
ranges['RES'][2].to_csv(path_or_buf="res_refs_all.csv", index=False)
ranges['RES'][2].shape

(60903, 2)

In [24]:
# One record per sharded article: its id and the page count stored in pd_df.
exp = [
    {"id": art_id, "page_count": n_pages}
    for art_id, n_pages in pd_df.items()
]

In [25]:
# Pin the columns so the frame still has an 'id' column when no PDF was
# sharded (empty `exp`); otherwise the merge on "id" below raises KeyError.
pg_count=pd.DataFrame(exp, columns=["id", "page_count"])

In [26]:
# Attach the page counts to the master list; the left join keeps every
# article, and those without a sharded PDF get a missing page_count.
All = All.merge(pg_count, how="left", on="id")

In [27]:
# Save the master list, now including page_count, next to the input pickle.
All.to_pickle(f"{data_base_path}Combined/011_merged_proc_scopus_inception_2020_w_counts.pkl")
