In [None]:
import pytesseract as tess
import PIL
import os
import glob
from wand.color import Color
from wand.image import Image
from datetime import datetime
import cv2
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np

In [None]:
# The following cells use relative glob patterns ("*.pdf", "*.png", ...),
# so make the input folder the current working directory first.
os.chdir(r'E:\temporary_flownform_directory\input')

### converting all pdfs to images

In [None]:
import os
from wand.image import Image


def convert_pdf(filename, output_path, resolution=150):
    """Render every page of a PDF to its own PNG file.

    Each page is saved in ``output_path`` as
    ``{pdf_filename}-{page_number}.png`` where the page number starts at 0.
    Any '-' in the PDF's base name is replaced by '_' first, so the only
    '-' in the resulting file name is the one before the page number.

    The alpha channel is removed from each page and replaced with a white
    background.

    Args:
        filename: Path of the PDF to convert.
        output_path: Directory the PNG files are written to.
        resolution: Resolution handed to wand when the PDF is read.
    """
    # The base name is identical for every page, so build it once.
    base_name = os.path.splitext(os.path.basename(filename))[0]
    base_name = base_name.replace('-', '_')

    # Open the whole document in a `with` block so its resources are
    # released even when saving one of the pages fails.
    with Image(filename=filename, resolution=resolution) as all_pages:
        for i, page in enumerate(all_pages.sequence):
            with Image(page) as img:
                img.format = 'png'
                img.background_color = Color('white')
                img.alpha_channel = 'remove'

                image_filename = '{}-{}.png'.format(base_name, i)
                image_filename = os.path.join(output_path, image_filename)

                img.save(filename=image_filename)
            

# apply the function to every PDF in the current working directory
for file in glob.glob("*.pdf"):
    try:
        print('Processing file ', file)
        convert_pdf(file, os.getcwd())
    except Exception as e:
        # Keep going with the remaining files, but say what actually failed
        # instead of discarding the exception.
        print("File {} was not processed corectly due to some error: {!r}".format(file, e))

### convert to grayscale with Wand

In [None]:
# import os
# from wand.image import Image


# def convert_to_gray_scale(filename, output_path, resolution=300):

#     with Image(filename=filename) as img:
#         img.type = 'grayscale'
#         image_filename = os.path.splitext(os.path.basename(filename))[0]
#         image_filename = 'grayscale_' + image_filename  + '.png'
#         img.save(filename=image_filename)
            

# for file in glob.glob("*.png"):
#     print('Processing file ', file)
#     convert_to_gray_scale(file, os.getcwd())



# def convert_pdf(filename, output_path, resolution=150):
#     """ Convert a PDF into images.

#         All the pages will give a single png file with format:
#         {pdf_filename}-{page_number}.png

#         The function removes the alpha channel from the image and
#         replace it with a white background.
#     """
#     all_pages = Image(filename=filename, resolution=resolution)
#     for i, page in enumerate(all_pages.sequence):
#         with Image(page) as img:
#             img.type = 'grayscale'
#             img.background_color = Color('white')
#             img.alpha_channel = 'remove'

#             image_filename = os.path.splitext(os.path.basename(filename))[0]
#             image_filename = image_filename.replace('-', '_')
#             image_filename = '{}-{}_grayscale.png'.format(image_filename, i)
#             image_filename = os.path.join(output_path, image_filename)

#             img.save(filename=image_filename)
            

# for file in glob.glob("*.pdf"):
#     print('Processing file ', file)
#     convert_pdf(file, os.getcwd())


### convert to grayscale with cv2

In [None]:
# Convert every PNG / JPG / JPEG in the working directory to a grayscale PNG
# named 'grayscale_<original name>.png' (with '-' replaced by '_').
eror_counter = 0

# The three extensions are handled identically, so loop over the patterns
# instead of repeating the same code once per extension.
for pattern in (r"*.png", r"*.jpg", r"*.jpeg"):
    for file in glob.glob(pattern):

        try:
            print("processing file:", file)
            normal = cv2.imread(file)
            # cv2.imread signals an unreadable file by returning None
            # rather than raising, so turn that into an explicit error.
            if normal is None:
                raise ValueError("cv2.imread could not read the image")
            gray = cv2.cvtColor(normal, cv2.COLOR_BGR2GRAY)

            image_filename = os.path.splitext(os.path.basename(file))[0]
            image_filename = 'grayscale_' + image_filename + '.png'
            image_filename = image_filename.replace('-', '_')

            # cv2.imwrite reports failure through its return value.
            if not cv2.imwrite(image_filename, gray):
                raise IOError("cv2.imwrite could not write {}".format(image_filename))

        except Exception as e:
            # Count the failure and continue, but keep the reason visible.
            print("File {} was not processed corectly due to some error: {!r}".format(file, e))
            eror_counter += 1


print("TOTAL NUMBER OF FILES WITH ERROR WAS {}".format(eror_counter))

In [None]:
# Delete every PNG whose name does not contain "grayscale", so the
# following steps only see the grayscale versions.
non_grayscale_pngs = [name for name in glob.glob("*.png") if "grayscale" not in name]

for file in non_grayscale_pngs:
    os.remove(file)
    print("this file is not grayscale", file)


### converting images to txt

In [None]:
# OCR every PNG in the working directory and store the recognised text as
# 'psm11_<image name>.txt' in the input folder.
eror_counter = 0

# if having trouble with windows recognizing tesseract as environmental variable do it manually with codeline below
tess.pytesseract.tesseract_cmd = r'E:\sporedni programi\TESSERACT_binary_installation\tesseract.exe'

for file in glob.glob("*.png"):

    try:
        print('Processing file ', file)

        # Open the image in a `with` block so the file handle is released
        # as soon as tesseract has produced the text.
        with PIL.Image.open(file) as img:
            # using tesseract to OCR text
            text = tess.image_to_string(img, config='--psm 11')

        # saving/writing to the file
        # NOTE(review): the file is written with the platform default
        # encoding; the merge cell reads it back with the same default, so
        # change both together if an explicit encoding is introduced.
        text_path = "{}{}.txt".format(r'E:\temporary_flownform_directory\input\\',
                                      "psm11_" + os.path.splitext(file)[0])
        with open(text_path, "w") as text_file:
            text_file.write(text)

    except Exception as e:
        # Count the failure and continue, but keep the reason visible.
        print("File {} was not processed corectly due to some  eror: {!r}".format(file, e))
        eror_counter += 1


print("TOTAL NUMBER OF FILES WITH ERROR WAS {}".format(eror_counter))

### processing and cleaning text files

In [None]:
# getting a list of all txt files
all_txt_files = glob.glob("*.txt")

# getting unique original files names: cut at '.txt', then keep only the
# part before the first '-'
# NOTE(review): the cv2 grayscale cell replaces '-' with '_' in the names it
# writes, so names coming from that step contain no '-' and every page ends
# up as its own prefix - confirm this grouping is what is intended.
stripped_txt_files = {x.split('.txt')[0].split('-')[0] for x in all_txt_files}
print("Expected number of unique outputs is {}".format(len(stripped_txt_files)))


# merging the files based on their original name
# (sorted so the processing order is the same on every run)
for file_prefix in sorted(stripped_txt_files):
    print('processing file {}'.format(file_prefix))

    # glob.escape keeps characters such as '[', ']', '*' and '?' in a file
    # name from being interpreted as wildcards; sorting makes the order in
    # which the parts are concatenated deterministic.
    # NOTE(review): the surrounding '*' also match any file whose name merely
    # contains the prefix (e.g. '..._1' also matches '..._10').
    filenames = sorted(glob.glob("*{}*.txt".format(glob.escape(file_prefix))))

    # written with the platform default encoding, matching how the per-image
    # text files were written
    with open('{}{}.txt'.format(r'E:\temporary_flownform_directory\output\\', file_prefix),
              'w'
              ) as f:
        for file in filenames:
            with open(file) as infile:
                f.write(infile.read() + '\n')


### preparing txt for doccano - requires single multiline txt file

In [None]:
# Collapse every merged text file into one line and write all of them to a
# single multi-line text file, one document per line.
os.chdir(r'E:\temporary_flownform_directory\output')

date = str(datetime.now().date()).replace('-','_')

# Both output files are referenced more than once, so name them up front.
raw_output_path = r'E:\temporary_flownform_directory\output_clean\flyers_txt_{}.txt'.format(date)
final_output_path = r'E:\temporary_flownform_directory\output_clean\flyers_txt_{}_final_output.txt'.format(date)

list_of_singleline_txt = []


# reading the merged files (pages of the same document were joined in the previous step)
final_list_of_files = glob.glob("*.txt")
for file in final_list_of_files:

    print('processing file {}'.format(file))

    # removing blank lines from text and converting multiline to singleline
    # (a separate name is used for the handle so the loop variable `file`
    # is not overwritten while iterating)
    with open(file, 'r') as infile:
        lines = infile.readlines()

    lines = [line for line in lines if not line.isspace()]
    lines = " ".join(lines)
    lines = "".join(lines.splitlines())

    # appending singleline txts to appender list
    list_of_singleline_txt.append(lines)

# writing the flyers txt as a single file multiline output; `with` closes the
# file even if writing fails part-way
with open(raw_output_path, 'w', encoding='utf-8') as output:
    output.writelines(x + "\n" for x in list_of_singleline_txt)


# additional check on the output - remove blank lines if any of the OCRed files was a completely blank file
with open(raw_output_path, encoding="utf8") as infile, \
        open(final_output_path, 'w', encoding="utf8") as outfile:

    for line in infile:
        if line.strip():  # skip the empty line
            outfile.write(line)

# Record the order in which the files were read, numbered from 1.
# NOTE(review): this lists every file, while the final output above drops
# blank lines - if any file was blank, row numbers here no longer match the
# line numbers of the final output. Confirm which mapping is needed.
list_df = pd.DataFrame(final_list_of_files)
list_df.index = np.arange(1, len(list_df)+1)
list_df.to_excel(r'E:\temporary_flownform_directory\output_clean\file_order.xlsx')