In [9]:
from __future__ import print_function
import numpy as np
import cv2
from PIL import Image, ImageEnhance, ImageFilter, ImageOps
import glob
import os

from ShapeDetect import ShapeDetect as sd
from ArrowDetect import ArrowDetect as ad
from TextDetect_OPENCV import TextDetectAll as tda
from Diag2Graph_v2 import Diag2Graph as tgv2
import pytesseract
from ParseJSON import ParseJSON as pj
from FigTypeDetect import FigTypeDetect as ftd
import subprocess
from subprocess import TimeoutExpired
# On Linux, comment out the line below; on Windows, update the path to your Tesseract install.
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

In [10]:
def preprocessImage(image_path, resize):
    """Load a figure image and derive the arrays used by the detectors.

    The file is read twice: once with OpenCV (kept as loaded by
    ``cv2.imread``) and once with Pillow (desaturated, then converted to
    8-bit greyscale). Both copies get a 10-pixel white border and, when
    requested, are scaled by 1.5x so that they stay the same size.

    Args:
        image_path: Path of the image file to load.
        resize: Pass 1 to scale both dimensions by 1.5; any other value
            keeps the bordered size.

    Returns:
        A tuple ``(image_resize, thresh_im, gray_imcv)``:
            image_resize: bordered (and optionally scaled) OpenCV image.
            thresh_im: inverse binary threshold of ``gray_imcv`` - pixels
                brighter than 240 become 0, all others become 255.
            gray_imcv: uint8 greyscale array of the bordered image.

    Raises:
        IOError: if OpenCV cannot read ``image_path``.
    """
    border_px = 10
    scale = 1.5 if resize == 1 else 1.0

    # cv2.imread signals failure by returning None instead of raising.
    image = cv2.imread(image_path)
    if image is None:
        raise IOError("Could not read image: %s" % image_path)

    # add white border in the original image
    image = cv2.copyMakeBorder(
        image, border_px, border_px, border_px, border_px,
        cv2.BORDER_CONSTANT, value=[255, 255, 255],
    )
    new_size = (int(image.shape[1] * scale), int(image.shape[0] * scale))
    image_resize = cv2.resize(image, new_size)

    # Context manager closes the underlying file once the pixels are copied.
    with Image.open(image_path) as source:
        bordered = ImageOps.expand(source, border=border_px, fill='white')
        # Image.ANTIALIAS was an alias of LANCZOS and was removed in Pillow 10.
        scaled = bordered.resize(new_size, Image.LANCZOS)
        desaturated = ImageEnhance.Color(scaled).enhance(0)
        gray_im = desaturated.convert('L')

    gray_imcv = np.array(gray_im, dtype=np.uint8)
    _, thresh_im = cv2.threshold(gray_imcv, 240, 255, cv2.THRESH_BINARY_INV)

    return image_resize, thresh_im, gray_imcv

In [11]:
# Input folder is expected to contain the publication PDFs that will be analyzed.
input_path = "Input"

# Output folder will contain one folder per paper, with a 'Figures/' subfolder
# for the extracted figures and a 'diag2graph' subfolder for the annotated
# figures and RDF triple files.
op_path = "Output"

# Built with os.path.join so the relative path uses the separator of the
# current platform (the previous literal only worked on Windows).
model_Dir = os.path.join(
    os.pardir, os.pardir, os.pardir, os.pardir,
    "demo", "run_all_modalities", "image2graph_models",
)

In [12]:
# pdffigures2 writes every extracted figure (and its JSON metadata) into one
# shared folder before the figures are sorted into per-paper folders.
op_path_all = op_path + "/All"
os.makedirs(op_path_all, exist_ok=True)

# Alternative invocation through sbt:
# command = "sbt \"runMain org.allenai.pdffigures2.FigureExtractorBatchCli "+ input_path +"/ -s stat_file.json -m " + op_path_all + "/ -d " + op_path_all + "/\""

# The classpath separator is ';' on Windows and ':' elsewhere; os.pathsep
# picks the right one. Passing an argument list (no shell) avoids quoting
# issues, e.g. ';' being treated as a command separator by POSIX shells.
pdffigures_jars = [
    "pdffigures2_2.12-0.1.0.jar",
    "pdffigures2-assembly-0.1.0-deps.jar",
    "scala-library.jar",
]
command = [
    "java",
    "-cp", os.pathsep.join(pdffigures_jars),
    "org.allenai.pdffigures2.FigureExtractorBatchCli",
    input_path + "/",
    "-s", "stat_file.json",
    # -m and -d take path prefixes, so the trailing separator is required.
    "-m", op_path_all + "/",
    "-d", op_path_all + "/",
]
print(" ".join(command))
print("[INFO] extracting figures from pdf files ...")

# stderr is captured so it can be reported if the extractor fails.
process = subprocess.Popen(command, stderr=subprocess.PIPE)
stdout, stderr = process.communicate()

print("[INFO] loading trained models ...")

figtypedetector = ftd(model_Dir)
figtypedetector.loadFigClassModels("vgg16")


if process.returncode == 0:
    print("[INFO] loading images ...")

    # sorted() makes the processing order deterministic across platforms.
    for filename in sorted(glob.glob(os.path.join(op_path_all, '*png'))):
        print(filename)
        figure_file = os.path.basename(filename)

        # Only extracted figures are analyzed (tables are skipped). The check
        # is done on the file name so the folder name cannot match by accident.
        if 'Figure' not in figure_file:
            continue

        parsejson = pj()
        paper_title, paper_file_name, paper_conf, paper_year, fig_caption, fig_text = parsejson.getCaption(filename)

        figTypeResult = parsejson.isResult(fig_caption)
        figTypeDiag = parsejson.isDiag(fig_caption)

        # Keep figures whose caption is flagged as a diagram and not as a result.
        if figTypeResult or not figTypeDiag:
            continue

        im, thresh_im, gray_imcv = preprocessImage(filename, 0)
        binType, mcType = figtypedetector.detectFigType(im)

        # NOTE(review): presumably class ids below 3 are the diagram types
        # this pipeline supports - confirm against FigTypeDetect.
        if mcType >= 3:
            continue

        # Create the per-paper folders individually so a partially created
        # tree (parent present, subfolder missing) is repaired as well.
        paper_dir = os.path.join(op_path, paper_file_name)
        for subfolder in ("diag2graph", "Figures"):
            os.makedirs(os.path.join(paper_dir, subfolder), exist_ok=True)

        cv2.imwrite(os.path.join(paper_dir, "Figures", figure_file), im)

        shapedetector = sd()
        component, flow_dir = shapedetector.find_component(filename, op_path, im, thresh_im, gray_imcv)

        textdetector = tda()
        text_list = textdetector.combinedTextDetect(filename, im, component, fig_text)

        arrowdetector = ad()
        line_connect = arrowdetector.detectLines(im, thresh_im, gray_imcv, component, text_list)

        graphcreator = tgv2()
        graphcreator.createDiag2Graph(op_path, filename, im, thresh_im, component, flow_dir, text_list, line_connect, paper_title, paper_file_name, paper_conf, paper_year, fig_caption)

else:
    print("Pdf2Fig Terminated with Status %d. Exiting." % process.returncode)
    # Show what the extractor reported instead of silently discarding it.
    if stderr:
        print(stderr.decode(errors="replace"))
        


java -cp "pdffigures2_2.12-0.1.0.jar;pdffigures2-assembly-0.1.0-deps.jar;scala-library.jar" org.allenai.pdffigures2.FigureExtractorBatchCli Input/ -s stat_file.json -m Output/All/ -d Output/All\
[INFO] extracting figures from pdf files ...
[INFO] loading trained models ...
Loaded binary classifier model from disk
Loaded multiclass classifier model from disk
[INFO] loading images ...
Output/All\1802.06006v2-Figure1-1.png
Output/All\1802.06006v2-Figure10-1.png
Output/All\1802.06006v2-Figure11-1.png
Output/All\1802.06006v2-Figure12-1.png
Output/All\1802.06006v2-Figure13-1.png
Output/All\1802.06006v2-Figure14-1.png
Output/All\1802.06006v2-Figure15-1.png
Output/All\1802.06006v2-Figure2-1.png
Output/All\1802.06006v2-Figure3-1.png
Output/All\1802.06006v2-Figure4-1.png
Output/All\1802.06006v2-Figure5-1.png
Output/All\1802.06006v2-Figure6-1.png
Output/All\1802.06006v2-Figure7-1.png
Output/All\1802.06006v2-Figure8-1.png
Output/All\1802.06006v2-Figure9-1.png
Output/All\1802.06006v2-Table1-1.png
O