# Config

In [None]:
import os
import types
from typing import Union
try:
 from PIL import Image
except ImportError:
 import Image

import fitz


In [None]:
# import conversion tools
import utils
from conversion_tools import *
from conversion_tools import tools_for_conversion

# Load Files

In [None]:
# Root folder holding the project data (presumably a Colab-mounted Google Drive; verify for other environments).
data_path = os.path.join('/content', 'drive', 'MyDrive', '6_work', 'PHIM', 'data')

# Sub-folders for the documents to convert and for the conversion results.
# NOTE: the original spelled this `input-path`, which Python parses as a subtraction
# and rejects as an assignment target; identifiers must use underscores.
input_path = os.path.join(data_path, 'input')
output_path = os.path.join(data_path, 'output')

#pdf_path = os.path.join(input_path, 'pdf')
#img_path = os.path.join(input_path, 'img')

# Converter Classes

In [None]:
class Doc:
   """A document backed by one or more input files, plus its cached conversions.

   Attributes:
      input_filepaths: list of file paths that together make up the document.
      main_input_filepath: first path of the list; only used to derive the
         extension and the output file name.
      input_extension: lower-cased extension of the main file WITHOUT the leading
         dot (e.g. 'pdf'), so it can be compared with the format names used by
         `to_format` and by the keys of `tools_for_conversion`.
      input_filename: lower-cased main path without its extension.
      txt_conversions: dict of {tool-name: converted-text} pairs.
      data: dict of {format: {tool-name: data}}; the unconverted objects of the
         document's own format are stored under the tool-name 'original'.
   """

   # formats that `to_format` can load or produce
   SUPPORTED_FORMATS = ('pdf', 'png')

   def __init__(self, input_filepaths):
      """Register the files of a document.

      Args:
         input_filepaths: a LIST of file paths (in case more files belong to the
            same document). A single path is accepted too and wrapped in a list.

      Raises:
         ValueError: if no file path is given.
      """
      # a bare path would otherwise be indexed character by character
      if isinstance(input_filepaths, (str, os.PathLike)):
         input_filepaths = [input_filepaths]
      if not input_filepaths:
         raise ValueError("Doc needs at least one input file path")

      self.input_filepaths = input_filepaths
      self.main_input_filepath = input_filepaths[0] # only used to detect filename and extension

      root, extension = os.path.splitext(self.main_input_filepath)
      # splitext keeps the dot ('.pdf'); strip it so that 'pdf'/'png' comparisons work
      self.input_extension = extension.lower().lstrip('.')
      self.input_filename = root.lower()

      self.txt_conversions = dict() # dict of {tool-name: converted-text} pairs

      self.data = dict() # dict of {format: {tool-name: data}} e.g. {'png': {'original': [<image>, ...]}}

   # doc conversion img<>pdf
   def to_format(self, format, tool_name = None):
      """Return the document in the requested format, loading or converting on demand.

      Results are cached in `self.data[format]`, keyed by tool name ('original'
      for the unconverted files).

      Args:
         format: 'pdf' or 'png'. Any other format is only served if it is
            already cached in `self.data`.
         tool_name: name of the conversion tool to use. When omitted, the first
            cached instance is returned if there is one, otherwise the first
            tool registered for this conversion is applied.

      Returns:
         For the document's own format: a list with one object per input file
         (binary file handles left OPEN for pdf, `Image.open` results for png).
         Otherwise: whatever the selected conversion tool returns.

      Raises:
         ValueError: if the format (or the document's own format) is not
            supported, or no tool is registered for the conversion.
         KeyError: if the conversion or the named tool is not present in
            `tools_for_conversion`.
      """
      cached = self.data.get(format)
      if cached:                                   # at least one instance is already stored for this format...
         if tool_name is None:
            return next(iter(cached.values()))     # ...take the first instance available...
         if tool_name in cached:
            return cached[tool_name]               # ...unless a preferred conversion tool was selected
         # the preferred tool has not produced anything yet: fall through and run it

      if format not in self.SUPPORTED_FORMATS:
         raise ValueError(
            "unsupported format %r (expected one of %r)" % (format, self.SUPPORTED_FORMATS)
         )
      if self.input_extension not in self.SUPPORTED_FORMATS:
         raise ValueError(
            "cannot load or convert input files with extension %r" % (self.input_extension,)
         )

      # requested format matches the file extension: open the files and store them as objects
      if format == self.input_extension:
         originals = self._load_originals()
         self.data.setdefault(format, dict())['original'] = originals
         return originals

      # requested format does NOT match the file extension: convert
      # registry is keyed by (input format, output format) and gives a dict of suitable tools
      tool_selection = tools_for_conversion[(self.input_extension, format)]

      if tool_name is None:
         if not tool_selection:
            raise ValueError(
               "no tool registered to convert %r to %r" % (self.input_extension, format)
            )
         tool_name = next(iter(tool_selection)) # take first available tool if no tool was specified

      conversion_tool = tool_selection[tool_name] # assign tool based on tool name

      # open files...
      if self.is_pdf():
         # NOTE(review): conversion tools receive fitz documents, while the 'original'
         # pdf instances are plain file handles -- kept as in the original design, confirm.
         source_objects = [fitz.open(pdf_path) for pdf_path in self.input_filepaths]
      else:
         source_objects = [Image.open(image_path) for image_path in self.input_filepaths]

      converted_data = conversion_tool(source_objects) # ...apply conversion tool on them...

      self.data.setdefault(format, dict())[tool_name] = converted_data # ...and store the converted data
      return converted_data

   def _load_originals(self):
      """Open every input file in the document's own format (pdf or png)."""
      if self.is_png():
         return [Image.open(image_path) for image_path in self.input_filepaths]

      # pdf: hand out OPEN binary handles (a handle closed by `with` would be unreadable);
      # whoever consumes them is responsible for closing them
      handles = []
      try:
         for pdf_path in self.input_filepaths:
            handles.append(open(pdf_path, 'rb'))
      except OSError:
         for handle in handles: # do not leak the files opened so far
            handle.close()
         raise
      return handles

   # check format
   def is_pdf(self):
      """Return True if the main input file has a pdf extension."""
      return self.input_extension.lower().lstrip('.') == 'pdf'

   def is_png(self):
      """Return True if the main input file has a png extension."""
      return self.input_extension.lower().lstrip('.') == 'png'

   # save output txt
   def save_txt_conversion(self, tool_name, output_dir_path):
      """Write the text produced by `tool_name` to <output_dir_path>/<tool_name>/<name>.txt.

      Raises:
         KeyError: if no text conversion is stored for `tool_name`.
      """
      text = self.txt_conversions[tool_name] # fail before touching the filesystem

      # define output filepath; only the base name is used, because joining a full
      # (possibly absolute) input path would discard the output directory
      tool_dir_path = os.path.join(output_dir_path, tool_name)
      output_file_path = os.path.join(tool_dir_path, os.path.basename(self.input_filename) + '.txt')

      # save txt in chosen dir (created on demand, one sub-folder per tool)
      os.makedirs(tool_dir_path, exist_ok=True)
      with open(output_file_path, 'w', encoding='utf-8') as f:
         f.write(text)

   def save_all_txt_conversions(self, output_dir_path):
      """Save every stored text conversion, one file per tool."""
      for tool_name in self.txt_conversions:
         self.save_txt_conversion(tool_name, output_dir_path)
            
      

In [None]:
class TxtConverter:
   """Runs text-extraction tools on a document and stores the results on it.

   The tool registry is expected to map (input format, output format) tuples to
   dicts of {tool-name: callable}; each callable receives the list of objects
   returned by `doc.to_format(...)`.
   """

   def __init__(self, tools=None):
      """Create a converter.

      Args:
         tools: tool registry to use. Defaults to the module-level
            `tools_for_conversion`, resolved when the converter is created.
      """
      #self.available_functions = {str(f):f for f in globals().values() if type(f) == types.FunctionType}
      self.tools = tools_for_conversion if tools is None else tools

   # select tool and convert
   def convert_to_txt(self, doc, tool_names = None):
      """Extract text from `doc` with the selected tools.

      The extracted texts are merged into `doc.txt_conversions` and into
      `doc.data['txt']`, keyed by tool name.

      Args:
         doc: object exposing `input_extension`, `to_format(format)`,
            `txt_conversions` and `data`.
         tool_names: a tool name, an iterable of tool names, or None to run
            every tool registered for this input format.

      Raises:
         KeyError: if no txt tools are registered for the input format, or a
            requested tool name is unknown.
      """
      # registry keys carry no leading dot, so normalise '.pdf' / 'PDF' to 'pdf'
      input_format = doc.input_extension.lower().lstrip('.')
      output_format = 'txt'

      tool_selection = self.tools[(input_format, output_format)] # this gives a dict of suitable tools

      if tool_names is None: # use all the conversion tools
         tool_names = list(tool_selection)
      elif isinstance(tool_names, str):
         tool_names = [tool_names]
      else:
         tool_names = list(tool_names) # allow one-shot iterables, we only walk it once

      # resolve every tool up front so an unknown name fails before any work is done
      selected_tools = {
         tool_name: tool_selection[tool_name] for tool_name in tool_names
      }

      objects = doc.to_format(input_format) # this is a list

      # this is where the heavy lifting happens
      extracted_texts = {
         tool_name: tool(objects) for tool_name, tool in selected_tools.items()
      }

      # store in doc.txt_conversions
      doc.txt_conversions.update(extracted_texts) # join the dictionaries

      # store in doc.data
      if not doc.data.get('txt'):
         doc.data['txt'] = dict()

      doc.data['txt'].update(extracted_texts) # join the dictionaries

# Conversion

# Evaluation