# Config

In [5]:
import os
import types
from typing import Union
from PIL import Image
import string
import fitz

In [6]:
# import conversion tools
from conv_tools import *
from conv_tools import tools as ctools
from conv_tools import ter
ter

{'pdf2img': 'pdf', 'pytesseract_ocr': 'png', 'PyPDF4_ocr': 'pdf'}

# Load Files

In [7]:
data_path = os.path.join('data')

input_path = os.path.join(data_path, 'input')
output_path = os.path.join(data_path, 'output')

# Doc and Converter Classes

A `Doc` class, instantiated once for each document, holding its path, and original/converted data in various formats.

`{format: {tool-name: converted-text}}`

`e.g. {'jpg': {'pdf2txt': <jpg-object>}}`

by using the `to_target_format` method, the document can be converted and stored in another format.


In [8]:
"""global selected_tool_name
selected_tool_name = 'pdf2img'"""

"global selected_tool_name\nselected_tool_name = 'pdf2img'"

In [9]:
class Doc:
   def __init__(self, input_filepaths, tools):
      self.tools = tools

      # input_filepath is a LIST of filepaths (in case more files belong to the same document)
      self.input_filepaths = input_filepaths
      self.main_input_filepath = input_filepaths[0] # only used to detect filename and extension
      
      self.input_extension = os.path.splitext(self.main_input_filepath)[-1].lower().translate(str.maketrans('', '', string.punctuation))
      self.input_filename = os.path.splitext(self.main_input_filepath)[-2].lower()

      self.txt_conversions = dict() # dict of {tool-name: converted-text} pairs

      self.data = dict() # dict of {target_format: {tool-name: converted-text}} e.g. {'jpg': {'pdf2txt': <jpg-object>}}

   ## doc conversion img<>pdf
   def to_target_format(self, target_format, tool_name = None):
      print("LLAMADAAAAAAAA to_target_format")
      """target_format can be pdf, png"""
      #selected_tool_name = tool_name
      global selected_tool_name
      
      data_selection = self.data.get(target_format)      
      if data_selection:                           # if at least an instance of (converted/original) data is already stored for given target_format...
         if not tool_name:
            converted_data = data_selection[0]     # ...take the first instance available...
         else:
            converted_data = data_selection[tool_name]  #...unless a preferred conversion tool was selected
         return converted_data
      else: # if the requested extension has no instances, add an empty instance target_format:{}
         self.data[target_format] = dict()
         
      # if requested target_format matches filepath extension, open the document and store it as an object
      if target_format == 'pdf':
         print('ENTERED TARGET_FORMAT == PDF IF')
         if self.data['pdf']:
            return
         if self.is_pdf():
            print('self.is_pdf() is True')
            # open all files for the given document...
            pdf_objects = []
            for pdf_path in self.input_filepaths:
               with open(pdf_path, 'rb') as file:
                  pdf_objects.append(file)
            converted_data = pdf_objects
            
            self.data[target_format]['original'] = converted_data # ...and store original data in self.data
            print('self.data:',self.data)
         elif self.is_png(): # if requested target_format does NOT match filepath extension, convert it
            print('self.is_png() is True')
            tool_selection = self.tools[('png','pdf')] # this gives a dict of suitable tools
            
            if tool_name is None:
               selected_tool_name = list(tool_selection)[0]
               conversion_tool = tool_selection[selected_tool_name] # take first available tool if no tool was specified
            else:
               selected_tool_name=tool_name
               conversion_tool = tool_selection[selected_tool_name] # assign tool based on tool name
                        
            png_objects = None # TODO open files...
            
            converted_data = conversion_tool( # ...apply conversion tool on them...
               png_objects
            )

            self.data[target_format][selected_tool_name] = converted_data # ...and store the converted data
         else:
            print('Error: source format {0} is nor accepted'.format(self.input_extension))
            return
      
      # if requested target_format matches filepath extension, open the document and store it as an object
      elif target_format == "png":
         if self.data['png']:
            print('target == png and self.data[{0}] is {1}'.format(target_format, self.data[target_format]))
         if self.is_png():            
            # open all files for the given document...
            converted_data = [
               Image.open(image_path) for image_path in self.input_filepaths
               ]
               
            self.data[target_format]['original'] = converted_data # ... and store original data in self.data

         elif self.is_pdf():
            tool_selection = self.tools[('pdf','png')] # this gives a dict of suitable tools
            if tool_name is None:
               selected_tool_name = list(tool_selection)[0]
               conversion_tool = tool_selection[selected_tool_name] # take first available tool if no tool was specified
            else:
               selected_tool_name=tool_name
               conversion_tool = tool_selection[selected_tool_name] # assign tool based on tool name
                        
            pdf_objects = [ # open files...
               fitz.open(pdf_path) for pdf_path in self.input_filepaths
            ]

            converted_data = conversion_tool( # ...apply conversion tool on them...
               pdf_objects
            )
            self.data[target_format][selected_tool_name] = converted_data # ...and store the converted data
            print('target_format:',target_format)
            #print('self.data[{0}][{1}]:{2}'.format(target_format,selected_tool_name,self.data[target_format][selected_tool_name]))
            print('self.data.keys:',self.data.keys)
         else:
            print('Error: source format {0} is nor accepted'.format(self.input_extension))
            return
      else:
         print('Error: target_format {0} is nor accepted'.format(target_format))
         return
      
      #print('self.data[{0}][{1}]:{2}'.format(target_format,selected_tool_name,self.data[target_format][selected_tool_name]))
      return 
      
   ## check extension
   def is_pdf(self):
      return self.input_extension.lower() == 'pdf'
   def is_png(self):
      return self.input_extension.lower() == 'png'

   ## save output txt
   def save_txt_conversion(self, tool_name, output_dir_path):
      # define output filepath
      output_file_path = os.path.join(output_dir_path,tool_name,str(self.filename)+'.txt')

      # save txt in chosen dir
      with open(output_file_path, 'w') as f:
         f.write(self.txt_conversions[tool_name])
   
   def save_all_txt_conversions(self, output_dir_path):
      for key in self.txt_conversions.keys():
         self.save_txt_conversion(key, output_dir_path)

In [10]:
class TxtConverter:
   def __init__(self, tools):
      #self.available_functions = {str(f):f for f in globals().values() if type(f) == types.FunctionType}
      self.tools = tools
      
   # select tool and convert
   def convert_to_txt(self, doc, tool_names = None):

      input_format = doc.input_extension
      output_format = 'txt'
      
      tool_selection = self.tools[(input_format, output_format)] # this gives a dict of suitable tools
      print('input_format',input_format)
      if isinstance(tool_names, str):
         tool_names = [tool_names]

      if tool_names is None: # use all the conversion tools
         tool_names = tool_selection.keys()
         
      selected_tools = {
         tool_name: tool_selection[tool_name] for tool_name in tool_names
      }

      extracted_texts = dict()
      for t_name in tool_names:
         req_ext = ter[t_name]      # extract input format of tool
         doc_ext = doc.data[req_ext]
         first_tool_name = list(doc_ext.keys())[0]
         obj = doc_ext[first_tool_name]        # get object from doc
      
         extracted_texts[t_name]= selected_tools[t_name](obj)  # pass it to tool

      #store in doc.txt_conversions
      doc.txt_conversions.update(extracted_texts) # join the dictionaires
      
      # store in doc.data
      if not doc.data.get('txt'):
         doc.data['txt'] = dict()
      
      doc.data['txt'].update(extracted_texts) # join the dictionaires

# Conversion and Evaluation

In [11]:
import fastwer

In [12]:
# create convertor instance from class
converter = TxtConverter(tools=ctools)

In [13]:
# for doc in data (each doc has its own directory)
print(input_path)
print(output_path)
cers = {} # CER scores

for dir in os.listdir(input_path):
    dir_path = os.path.join(input_path,dir)
    doc_paths = [os.path.join(dir_path, filepath) for filepath in os.listdir(os.path.join(input_path,dir))]
    # create Doc instance with doc path
    doc_object = Doc(doc_paths, tools=ctools)

    ##############
    # CONVERSION #
    ##############
      
    # convert to jpg/pdf
    doc_object.to_target_format('png', tool_name='pdf2img')
    doc_object.to_target_format('pdf')
    
    # convert to txt it with all tools
    converter.convert_to_txt(doc_object, tool_names=[pytesseract_ocr])

    # save all
    doc_object.save_all_txt_conversions(output_path)

    ##############
    # EVALUATION #
    ##############

    name = doc_object.input_filename
    cers[name] = {}

    for tool_name,text in doc_object.data['txt'].items():
        ref = [doc_path for doc_path in doc_paths if doc_paths.endswith('ref.txt')][0]
        output = text
        score = fastwer.score_sent(output, ref, char_level=True)
        cers[name][tool_name] = score

print(cers)

data/input
data/output
LLAMADAAAAAAAA to_target_format
target_format: png
self.data.keys: <built-in method keys of dict object at 0x7f910abd1900>
LLAMADAAAAAAAA to_target_format
ENTERED TARGET_FORMAT == PDF IF
self.is_pdf() is True
self.data: {'png': {'pdf2img': [<PIL.Image.Image image mode=RGB size=1191x1684 at 0x7F910ABB73D0>, <PIL.Image.Image image mode=RGB size=1191x1684 at 0x7F910ABB7A60>]}, 'pdf': {'original': [<_io.BufferedReader name='data/input/flyer_coleottero_giapponese/flyer_coleottero_giappone_zone_misure.pdf'>]}}
input_format pdf


KeyError: <function pytesseract_ocr at 0x7f910ab9d280>

In [None]:

"""
for dir in os.listdir(input_path):
    dir_path = os.path.join(input_path,dir)
    doc_paths = [os.path.join(dir_path, filepath) for filepath in os.listdir(os.path.join(input_path,dir))]

    txt_paths = [doc_path for doc_path in doc_paths if doc_paths.endswith('.txt')]

    
    doc_object = Doc(txt_paths, tools=ctools)"""