In [2]:
# Extract table content from given PDF
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.layout import LAParams, LTRect
from pdfminer.converter import PDFPageAggregator
from itertools import islice
from sh import pdftotext

def extractTableDataFromPdf(pdf, password="pass"):
    """Print the cell text of ruled tables found in a PDF, one row per line.

    For every page, the rectangles (``LTRect``) reported by pdfminer are
    collected and their edges are used as a grid: x coordinates become column
    boundaries and y coordinates become row boundaries. Each grid cell that is
    more than 10 units wide and tall is passed to ``get_region`` and the
    whitespace-normalised text is gathered into a list per row, which is
    printed.

    Args:
        pdf: Path of the PDF file. It is opened here for pdfminer and also
            handed unchanged to ``get_region``.
        password: Password given to ``PDFDocument``. Defaults to ``"pass"``,
            the value this function has always used.

    Returns:
        None. Rows are written to stdout with ``print``.
    """
    # Cells narrower/shorter than this are treated as rule thickness or
    # padding rather than content.
    min_cell_size = 10

    # `with` guarantees the handle is released even when parsing or the
    # pdftotext call raises part-way through.
    with open(pdf, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
        parser.set_document(doc)

        rsrcmgr = PDFResourceManager()
        # Default layout-analysis parameters.
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for pageno, page in enumerate(PDFPage.create_pages(doc)):
            interpreter.process_page(page)
            layout = device.get_result()

            x_edges = set()
            y_edges = set()
            for item in layout:
                if not isinstance(item, LTRect):
                    continue
                # Rectangles whose x0 is 0 (falsy) are ignored.
                # NOTE(review): presumably these are page-wide background
                # boxes rather than table rules -- confirm.
                if not item.x0:
                    continue
                x_edges.update((int(item.x0), int(item.x1)))
                # Measure y from the top of the page (layout.height - y);
                # these values are what get_region receives as y1/y2.
                y_edges.update((int(layout.height - item.y0),
                                int(layout.height - item.y1)))

            # Sorted, de-duplicated boundaries with near-coincident ones merged.
            col_edges = filterclose(sorted(x_edges))
            row_edges = filterclose(sorted(y_edges))

            for top, bottom in zip(row_edges, row_edges[1:]):
                # Skip bands that are too thin to hold a row of text.
                if bottom - top <= min_cell_size:
                    continue
                row = []
                for left, right in zip(col_edges, col_edges[1:]):
                    # Skip strips that are too narrow to hold a column.
                    if right - left <= min_cell_size:
                        continue
                    # Shrink the cell by one unit on the left and right so the
                    # vertical rules themselves are not part of the region.
                    cell_text = get_region(pdf,
                                           pageno + 1,  # get_region pages are 1-based
                                           left + 1,
                                           top,
                                           right - 1,
                                           bottom)
                    row.append(' '.join(cell_text.split()))
                print(row)
# Collapse line coordinates that lie within 2 units of each other (rules with no content between them)
def filterclose(lst):
    """Drop values that are too close to the previously kept value.

    Walks ``lst`` in order, always keeping the first element and keeping each
    later element only when it exceeds the most recently kept one by more
    than 2.

    Args:
        lst: Indexable sequence of numbers; callers pass it sorted ascending.

    Returns:
        A new list with the kept values. An empty input yields an empty list
        (previously an empty string, which made the return type inconsistent).
    """
    if not lst:
        return []
    kept = [lst[0]]
    for value in islice(lst, 1, None):
        # Compare against the last *kept* value, not the previous element.
        if value - kept[-1] > 2:
            kept.append(value)
    return kept

# Fetch the text inside one rectangular region of a page via pdftotext
def get_region(pdf, page, x1,y1,x2,y2):
    """Run ``pdftotext`` on a single page, limited to one rectangle.

    The rectangle is described to pdftotext by its ``-x``/``-y`` origin
    (``x1``, ``y1``) and its ``-W``/``-H`` size (the absolute differences of
    the x and y pairs). ``page`` is used for both ``-f`` and ``-l``; the final
    ``'-'`` argument follows the input file name.

    Returns whatever the ``pdftotext`` command object imported from ``sh``
    returns.
    """
    region_height = abs(y2 - y1)
    region_width = abs(x2 - x1)
    options = [
        '-nopgbrk',
        '-f', page,
        '-l', page,
        '-x', x1,
        '-y', y1,
        '-H', region_height,
        '-W', region_width,
    ]
    return pdftotext(*options, pdf, '-')

# Script entry point: extract and print the tables of sample_doc.pdf.
if __name__ == "__main__":
    extractTableDataFromPdf("sample_doc.pdf")

['SrNo', 'City', 'Observation', 'Count']
['1', 'Mumbai', 'This columns has information about the rainfall', '23']
['2', 'Pune', 'Information about the traffic', '55']
['3', 'Delhi', 'Population Data', '55 lakh']
['4', 'Chennai', 'Traffic data', '100']
['5', 'Mumbai', 'This columns has information about the rainfall', '23']
['6', 'Pune', 'Information about the traffic', '56']
