In [None]:
#CHRISTOPHER M. CHURCH
#ASSISTANT PROFESSOR
#UNIVERSITY OF NEVADA, RENO

In [30]:
#SPLIT THE PDF FROM 2-PAGE PER PAGE (COLUMNED) TO 1-PAGE PER PAGE (NO COLUMNS)
#
import copy
import sys
import math
import pyPdf

def split_pages(src, dst1,dst2):
    '''Split every two-up page of a PDF into its two halves, written to two separate PDFs.

       Each page's media box is cut at its midpoint. Pages that are wider than
       tall are cut into a left half (written to dst1) and a right half (written
       to dst2); all other pages are cut into a top half (dst1) and a bottom
       half (dst2).

       src  -- path of the PDF to read
       dst1 -- path of the PDF that receives the first (left/top) half of each page
       dst2 -- path of the PDF that receives the second (right/bottom) half of each page

       Source http://stackoverflow.com/a/15741856/1301753'''
    # The source is only read, so open it read-only; the with-block guarantees the
    # handle is released even if pyPdf raises on a malformed file.
    with open(src, 'rb') as src_f:
        reader = pyPdf.PdfFileReader(src_f)
        output1 = pyPdf.PdfFileWriter()
        output2 = pyPdf.PdfFileWriter()

        for i in range(reader.getNumPages()):
            p = reader.getPage(i)
            # Shallow copy of the page with its own media box, so the two halves
            # can be cropped independently.
            q = copy.copy(p)
            q.mediaBox = copy.copy(p.mediaBox)

            left, bottom = p.mediaBox.lowerLeft
            right, top = p.mediaBox.upperRight

            left, bottom = math.floor(left), math.floor(bottom)
            right, top = math.floor(right), math.floor(top)

            # Midpoints are measured from the lower-left corner, so a media box
            # whose origin is not (0, 0) is still cut in the middle.
            mid_x = math.floor((left + right) / 2.0)
            mid_y = math.floor((bottom + top) / 2.0)

            if (right - left) > (top - bottom):
                # horizontal: p keeps the left half, q the right half
                p.mediaBox.upperRight = (mid_x, top)
                p.mediaBox.lowerLeft = (left, bottom)

                q.mediaBox.upperRight = (right, top)
                q.mediaBox.lowerLeft = (mid_x, bottom)
            else:
                # vertical: p keeps the top half, q the bottom half
                p.mediaBox.upperRight = (right, top)
                p.mediaBox.lowerLeft = (left, mid_y)

                q.mediaBox.upperRight = (right, mid_y)
                q.mediaBox.lowerLeft = (left, bottom)

            output1.addPage(p)
            output2.addPage(q)

        # The writers are flushed while the source file is still open, as in the
        # original ordering. Destinations are opened only now, so an unreadable
        # source does not leave empty output files behind.
        with open(dst1, 'wb') as dst_f1:
            output1.write(dst_f1)
        with open(dst2, 'wb') as dst_f2:
            output2.write(dst_f2)

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO

def convert_pdf_to_txt(path):
    '''Extract the text contained in a PDF file with PDFMINER.

       path -- path of the PDF to read

       Returns the contents of the buffer the TextConverter wrote to (the
       converter is created with codec 'utf-8'). Pages are requested with an
       empty page-number set, maxpages=0 and an empty password.

       The PDF file, the converter and the buffer are released even when
       pdfminer raises while processing a page.'''
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True, check_extractable=True):
                interpreter.process_page(page)
        # Read the buffer before it is closed in the finally clause.
        return retstr.getvalue()
    finally:
        device.close()
        retstr.close()

In [39]:
#READ THE INPUT DIRECTORY AND SPLIT ALL THE PDF FILES
import os
from shutil import copyfile
read_dir="C:/Users/Christopher/Doten/"

for input_filename in os.listdir(read_dir):
    path = os.path.join(read_dir, input_filename)
    if os.path.isdir(path): continue
    pages = input_filename.split(".")
    pages = pages[0].split("pg")[1]
    if "-" in pages:
        pages = pages.split('-')
        out1=read_dir + "split/"+pages[0].zfill(5)+".pdf"
        out2=read_dir + "split/"+pages[1].zfill(5)+".pdf"
        split_pages(read_dir+input_filename,out1,out2)        
    else:
        out_filename = read_dir+"split/"+input_filename.split('pg')[1].split(".")[0].zfill(5)+".pdf"
        copyfile(read_dir+input_filename, out_filename)
    print "\r",input_filename + " split.",

pg98-99.pdf split.


In [40]:
#GRAB AND EXPORT PDF OCR LAYER

import os
read_dir="C:/Users/Christopher/Doten/split/"
for filename in os.listdir(read_dir):
    filepath = os.path.join(read_dir, filename)
    if os.path.isdir(filepath): continue
    output = filename.split('.')[0] + ".txt"
    text = convert_pdf_to_txt(filepath)
    with open(read_dir+"txt/"+output,"w") as out:
        out.write(text)
    print "\r","Output: " + output,

Output: 00101.txt


In [167]:
#PROCESS THE TEXT
import pandas as pd
import os
import re

#read_dir="C:/Users/Christopher/Doten/split/txt/"
# Folder holding the per-page .txt files; the CSV is written into this same folder.
read_dir="I:/Dropbox/NDAD/Doten/split/txt/"
# year and place are only updated from the first line of odd-numbered pages, so
# they are kept at module level and carried over to the pages that follow.
year=""
place=""
# Reset to "<p>" for every file processed below.
text=""

def process_text(line,kwords,days):
    '''Clean one line of page text and, if it starts a diary entry, wrap its date in HTML.

       line   -- one raw line of text
       kwords -- day/month names; the line is treated as an entry heading when
                 any of them occurs inside the line's first space-separated token
       days   -- unused; kept so existing callers keep working

       Every line has "(cid:173)" removed and its trailing newline replaced by
       a single space.

       For a heading line the date runs from the first token up to the first
       token that parses as an integer and is not followed by "through" (so
       "July 12 through 26" is kept as one date). The result is
       '</p><p><div class="date">DATE</div>REST'.

       A line with no such number is returned as cleaned text; it used to be
       replaced by a bare, unclosed opening tag, which dropped the text.'''
    line = line.replace("(cid:173)","").rstrip("\n") + " "
    tokens = line.split(" ")

    # NOTE(review): this is a substring test, so "Maybe" matches "May"; kept
    # as-is because tightening it would change which lines count as headings.
    if not any(kword in tokens[0] for kword in kwords):
        return line

    for idx,token in enumerate(tokens):
        try:
            int(token)
        except ValueError:
            continue
        # "12 through 26": the range end is the number that closes the date.
        if idx + 1 < len(tokens) and tokens[idx+1] == "through":
            continue
        return ('</p><p><div class="date">' + " ".join(tokens[0:idx+1]) +
                "</div>" + " ".join(tokens[idx+1:]))

    # Keyword matched but no day number: not a date heading, keep the text.
    return line

days = ["Sunday","Monday","Tuesday","Wednesday","Thursday","Friday","Saturday"]
months = ["January","February","March","April","May","June","July","August","September","October","November","December",
         "Jan","Feb","Mar","Apr","Jun","Jul","Aug","Sept","Oct","Nov","Dec"]
# Words that mark the start of a dated entry when found in a line's first token.
kwords = days+months

# One [page, place, year, text] record per page; the DataFrame is built once at
# the end instead of being grown row by row.
rows = []

# place/year carry over from the page that last set them, so pages must be
# visited in page order. os.listdir gives no ordering guarantee; the zero-padded
# file names sort correctly as strings.
for filename in sorted(os.listdir(read_dir)):
    if not filename.endswith('.txt'): continue
    filepath = os.path.join(read_dir, filename)
    page = int(filename.split(".")[0])
    is_odd_page = page % 2 != 0
    parts = ["<p>"]
    with open(filepath,"r") as f:
        for idx,line in enumerate(f):
            if idx == 0 and is_odd_page:
                # First line of an odd page is read as "<place>: <year>".
                chunks = line.split(":")
                place = chunks[0].strip()
                # Without a colon the previous year is kept.
                if len(chunks) > 1:
                    year = chunks[1].strip()
            elif idx > 3:
                # Lines 0-3 are never added to the body text.
                parts.append(process_text(line,kwords,days))
    parts.append("</p>")
    text = "".join(parts)
    rows.append([str(page),place,year,text])

df = pd.DataFrame(rows, columns=['page', 'place','year', 'text'])
df.to_csv(read_dir+"doten-diaries.csv")
print("CSV outputted")

CSV outputted


In [168]:
#text = df.get_value(90,'text')
#text

'<p>some pretty girls, very prettily dressed, with short trousers which became them much, I thought -  frocks and turkish  </p><p><div class="date">July 11</div>... Afternoon I went to see Dr D D Cogswell and had three of my double teeth filled - 10 dollars apiece - A Mr Steward was hung for murder by the people, down at the end of long wharf - He was arrested yesterday -  </p><p><div class="date">July 12 through 26</div>[AD visits, sees the sights, gets three more teeth filled, goes back to Stockton on the Weber, to Chinn\'s by stagecoach and out to his diggings on foot - Of interest:]  .  </p><p><div class="date">July 14</div>. . Morning I went down on long wharf. There I met Nelson Pierce, who has just arrived from the Sandwich Islands with a load of vegetables - He has got Ellis B Barnes with him as mate. I went on board and Ellis set me into the watermelons and bananas .  .  .  </p><p><div class="date">July 15</div>... Steamer Pacific sailed for Panama - I sent home two letters, a