In [6]:
from __future__ import division
import cv2
import numpy as np
import matplotlib.pyplot as plt
import csv

try:
    from PIL import Image
except ImportError:
    import Image
import pytesseract
import string

# Whitelist of characters that ConvertNonASCII keeps; anything outside
# string.printable is dropped from the OCR output.
printable = set(string.printable)

# Single-character substitution table (key -> replacement).
# NOTE(review): presumably OCR misreads mapped to the intended character;
# nothing in the visible code references this table yet -- confirm usage.
KnownMistakes = {
    '+': '-',
    '~': '-',
    'i': '1',
    'I': '1',
    'S': '5',
}

def GetTables(contours):
    """Group cell bounding boxes into table rows, splitting merged cells.

    Each contour is reduced to its bounding box ``[x, y, w, h]``. Boxes that
    are at most 40 px high or at least 1000 px wide are discarded. The
    remaining boxes are walked in input order: a box whose ``y`` lies more
    than 10 px below the row currently being collected starts a new row.
    A box wider than 1.5 times the estimated cell width is cut into equally
    wide cells.

    Parameters
    ----------
    contours : iterable
        Contours accepted by ``cv2.boundingRect``.
        NOTE(review): the row grouping only makes sense if the contours are
        already ordered top-to-bottom -- confirm against the caller.

    Returns
    -------
    list
        One list of ``[x, y, w, h]`` boxes per detected row; empty when no
        contour passes the size filter.
    """
    # Keep only boxes that look like table cells.
    c_filtered = []
    for c in contours:
        x, y, w, h = cv2.boundingRect(c)
        if h > 40 and w < 1000:
            c_filtered.append([x, y, w, h])
    if not c_filtered:
        # Nothing to group; the original crashed with IndexError here.
        return []

    # Estimated cell width: one tenth of the spread between the leftmost and
    # the rightmost box origin.
    x_positions = [box[0] for box in c_filtered]
    width = (max(x_positions) - min(x_positions)) / 10

    tables = []
    table = []
    y_scan = 0
    h_scan = None
    for i in c_filtered:
        # The very first box always opens a row, so h_scan is defined before
        # it is compared against (previously a NameError when y <= 10).
        if not table or (i[1] - y_scan) > 10:
            if table:
                tables.append(table)
            # NOTE(review): the box that opens a row is never split, even if
            # it is wide -- kept as in the original.
            table = [i]
            y_scan = i[1]
            h_scan = i[3]
            continue

        if abs(i[3] - h_scan) > 10:
            print('TROUBLE WITH CELL DETECTION')

        # width == 0 means every box shares one x; no ratio can be formed.
        if width > 0 and (i[2] / width) > 1.5:
            cutNumber = int(round(i[2] / width))
            w_new = int(i[2] / cutNumber)
            for j in range(cutNumber):
                x_new = i[0] + w_new * j
                # Split cells keep the height (i[3]) of the merged box.
                table.append([x_new, i[1], w_new, i[3]])
        else:
            table.append(i)

    # Flush the row still being collected; it used to be dropped.
    tables.append(table)
    return tables

def ConvertNonASCII(TesOut):
    """Return ``TesOut`` with every character outside ``printable`` removed.

    Despite the name nothing is converted: characters that are not members
    of the module-level ``printable`` set are simply dropped.

    Parameters
    ----------
    TesOut : str
        Text to clean (the caller passes pytesseract output).

    Returns
    -------
    str
        The filtered text. Built with ``join`` because ``filter`` only
        returns a string on Python 2; on Python 3 it returns an iterator.
    """
    return ''.join(ch for ch in TesOut if ch in printable)

def GetNumberCoord(stripe):
    """Locate the row ranges of a stripe that hold a number.

    The mean of every row of ``stripe`` is taken. A range opens at the first
    row whose mean drops below 250 and closes at the next row whose mean
    rises above 250. Only ranges that are 13 to 15 rows high are reported;
    a range still open at the last row is ignored.

    Parameters
    ----------
    stripe : numpy.ndarray
        Array indexed as ``stripe[row, :]``.

    Returns
    -------
    list
        ``[first_row, closing_row]`` pairs, in top-to-bottom order.
    """
    row_count = stripe.shape[0]
    row_means = [stripe[r, :].mean() for r in range(row_count)]

    # Record every row index at which the "inside a number" state flips.
    transitions = []
    inside = False
    for row_idx, level in enumerate(row_means):
        if inside:
            if level > 250:
                inside = False
                transitions.append(row_idx)
        elif level < 250:
            inside = True
            transitions.append(row_idx)

    # Pair up (open, close) transitions and keep the plausible heights.
    edges = []
    for start, stop in zip(transitions[0::2], transitions[1::2]):
        if 12 < stop - start < 16:
            edges.append([start, stop])
    return edges

def ExtractTables(name, save_debug_images=False):
    """OCR the 10-column tables of a scanned page and save each one as CSV.

    The page ``name + '.jpg'`` is read as grayscale. Ruling lines are
    isolated by eroding and then dilating the inverted binary image with a
    vertical and a horizontal kernel; the boxes enclosed by those lines are
    found as contours. Boxes that are 71-79 px wide and more than 40 px high
    are taken in top-to-bottom order in groups of ten, each group being one
    table (a trailing group of fewer than ten boxes is ignored). Every table
    is cropped, deskewed and cut into column stripes; the number regions of
    each stripe are passed to Tesseract and the cleaned text is written to
    ``name + '_table_<i>.txt'``, one CSV row per column stripe.

    Parameters
    ----------
    name : str
        Path of the page image without the ``.jpg`` extension; also used as
        the prefix of every file written.
    save_debug_images : bool, optional
        When true, additionally write ``<name>_cells.jpg`` (detected boxes)
        and, per table, ``<name>_table_<i>_full.jpg`` and
        ``<name>_table_<i>_deskew.jpg``. Defaults to False.

    Raises
    ------
    IOError
        If the page image cannot be read.
    """
    CELLS_PER_TABLE = 10   # boxes consumed per table
    AddY = 30              # extra rows kept below the boxes ("solution frame")
    CutStripe = 10         # px trimmed left/right of a box to drop the ruling lines
    DetectionBorder = 2    # px of margin added above/below a detected number

    # get the image (grayscale)
    loadname = name + '.jpg'
    img = cv2.imread(loadname, 0)
    if img is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise IOError('Could not read image: ' + loadname)

    # thresholding the image to a binary image, then inverting it
    img_bin = cv2.threshold(img, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
    img_bin = 255 - img_bin

    # kernel length as 100th of the total width (at least 1 px)
    kernel_len = max(1, img.shape[1] // 100)
    ver_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, kernel_len))
    hor_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernel_len, 1))
    small_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))

    # Erode + dilate with a line-shaped kernel keeps only lines of that direction.
    vertical_lines = cv2.dilate(cv2.erode(img_bin, ver_kernel, iterations=3),
                                ver_kernel, iterations=3)
    horizontal_lines = cv2.dilate(cv2.erode(img_bin, hor_kernel, iterations=3),
                                  hor_kernel, iterations=3)

    # Combine horizontal and vertical lines with equal weight, then erode
    # and threshold the inverted result.
    img_vh = cv2.addWeighted(vertical_lines, 0.5, horizontal_lines, 0.5, 0.0)
    img_vh = cv2.erode(~img_vh, small_kernel, iterations=2)
    img_vh = cv2.threshold(img_vh, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]

    # findContours returns (contours, hierarchy) or (image, contours,
    # hierarchy) depending on the OpenCV version; [-2] is the contour list in both.
    contours = cv2.findContours(img_vh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)[-2]
    if len(contours) == 0:
        return

    # Sort all the contours from top to bottom and keep the cell-sized boxes.
    contours, boundingBoxes = sort_contours(contours, method="top-to-bottom")
    c_filtered = []
    for (x, y, w, h) in boundingBoxes:
        if 70 < w < 80 and h > 40:
            c_filtered.append([x, y, w, h])

    if save_debug_images:
        cell_image = np.copy(img)
        for (x, y, w, h) in c_filtered:
            cv2.rectangle(cell_image, (x, y), (x + w, y + h), (0, 255, 0), 2)
        cv2.imwrite(name + '_cells.jpg', cell_image)

    # Kernel used to clean up each number crop before OCR.
    ocr_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 1))

    ######################### EXTRACT TABLES (10 SUMS EACH) #################################
    for i in range(len(c_filtered) // CELLS_PER_TABLE):
        group = c_filtered[i * CELLS_PER_TABLE:(i + 1) * CELLS_PER_TABLE]

        # Order the boxes of this table from left to right.
        table = sorted(group, key=lambda cell: cell[0])

        # Bounding frame of the whole table.
        x1 = min(cell[0] for cell in table)
        y1 = min(cell[1] for cell in table)
        x2 = max(cell[0] + cell[2] for cell in table)
        y2 = max(cell[1] + cell[3] for cell in table)
        Tframe = img[y1:y2 + AddY, x1:x2]

        if save_debug_images:
            cv2.imwrite(name + '_table_' + str(i) + '_full.jpg', Tframe)

        ############################### DESKEW THE TABLE ######################
        # Angle (degrees) of the line through the first and last box origin.
        # arctan2 gives the true angle and tolerates a zero horizontal span;
        # the raw slope used before only approximates it for small angles.
        dy = table[-1][1] - table[0][1]
        dx = table[-1][0] - table[0][0]
        skew_angle = float(np.degrees(np.arctan2(dy, dx)))

        # rotate the table to deskew it
        (h, w) = Tframe.shape[:2]
        center = (w // 2, h // 2)
        M = cv2.getRotationMatrix2D(center, skew_angle, 1.0)
        Tframe_deskew = cv2.warpAffine(Tframe, M, (w, h),
                                       flags=cv2.INTER_CUBIC,
                                       borderMode=cv2.BORDER_REPLICATE)

        if save_debug_images:
            cv2.imwrite(name + '_table_' + str(i) + '_deskew.jpg', Tframe_deskew)

        #################### OCR EVERY COLUMN STRIPE ###################
        tableScript = []
        for stripe in table:
            # Column limits relative to the table frame, ruling lines trimmed.
            l = int(stripe[0] + CutStripe - x1)
            r = int(stripe[0] + stripe[2] - CutStripe - x1)
            stripeFrame = Tframe_deskew[:, l:r]
            stripe_bin = cv2.threshold(stripeFrame, 128, 255,
                                       cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]

            primer = []
            for (top, bottom) in GetNumberCoord(stripe_bin):
                # Clamp at 0: a negative start would wrap around and slice
                # from the bottom of the stripe.
                t = max(top - DetectionBorder, 0)
                b = bottom + DetectionBorder
                NumberFrame = stripe_bin[t:b, :]

                # White border, 2x upscaling and a light dilate/erode pass
                # before handing the crop to Tesseract.
                border = cv2.copyMakeBorder(NumberFrame, 2, 2, 2, 2,
                                            cv2.BORDER_CONSTANT, value=[255, 255])
                resizing = cv2.resize(border, None, fx=2, fy=2,
                                      interpolation=cv2.INTER_CUBIC)
                dilation = cv2.dilate(resizing, ocr_kernel, iterations=1)
                erosion = cv2.erode(dilation, ocr_kernel, iterations=2)

                out = pytesseract.image_to_string(erosion, config='--psm 6')
                primer.append(ConvertNonASCII(out))
            tableScript.append(primer)

        # One CSV row per column stripe.
        fileName = name + '_table_' + str(i) + '.txt'
        with open(fileName, 'w') as out_file:
            csv.writer(out_file).writerows(tableScript)
        
# Sorting of contours and generation of bounding boxes
def sort_contours(cnts, method="left-to-right"):
    """Sort contours by the position of their bounding boxes.

    Parameters
    ----------
    cnts : sequence
        Contours accepted by ``cv2.boundingRect``.
    method : str, optional
        ``"left-to-right"`` (default) or ``"right-to-left"`` sort on the
        box's x coordinate; ``"top-to-bottom"`` or ``"bottom-to-top"`` sort
        on its y coordinate. Any other value behaves like the default.

    Returns
    -------
    tuple
        ``(contours, boundingBoxes)``: two tuples in matching sorted order.
        Both are empty when ``cnts`` is empty.
    """
    # zip(*...) over nothing cannot be unpacked into two names.
    if len(cnts) == 0:
        return ((), ())

    # Descending order for the two "reverse" directions.
    reverse = method in ("right-to-left", "bottom-to-top")
    # Index into (x, y, w, h): 1 selects y for the vertical orderings.
    axis = 1 if method in ("top-to-bottom", "bottom-to-top") else 0

    boundingBoxes = [cv2.boundingRect(c) for c in cnts]
    ordered = sorted(zip(cnts, boundingBoxes),
                     key=lambda pair: pair[1][axis], reverse=reverse)
    (cnts, boundingBoxes) = zip(*ordered)
    return (cnts, boundingBoxes)




In [7]:
# Run the extraction on a single page. The path is given without extension:
# ExtractTables appends '.jpg' when loading and uses the same prefix for the
# '_table_<i>.txt' files it writes.
fileName = 'DeskewedPages/16'
ExtractTables(fileName)