## Import libraries

In [1]:
from matplotlib import pyplot as plt
from tqdm import tqdm
import pandas as pd
import numpy as np
import pytesseract
import shutil
import cv2
import os

# psm 6 = Assume a single uniform block of text.
config_tesseract = '--tessdata-dir ./ttesseract_langs --psm 6'

## Load list of input image files

In [2]:
def read_resize_data(input_file, size=160):
    img = cv2.imread(input_file)
    
    width = size
    height = size
    dim = (width, height)

    # resize image
    resized = cv2.resize(img, dim, interpolation = cv2.INTER_AREA)

    return resized


In [3]:
print(os.getcwd())

d:\MTU\_Project_Grunt_Work\Seperate_Folders\sipa_03


In [4]:
def read_input_file_list():
    # input_data = pd.read_csv(r'labelled_images_just_sipa2.txt', names=["file_name","seen_value","ncol2","ncol3"], sep="\t", header=None)
    input_data = pd.read_csv(r'./labelled_images_sipa3.txt', names=["file_name","seen_value","ncol2","ncol3"], sep="\t", header=None)
    input_data = input_data.reset_index()

    print(len(input_data), "input images")
    return input_data

In [5]:
def read_input_data(input_file):
    return cv2.imread(input_file)

In [6]:
# Image Dimensions
def image_dimension_details(input_file):

    # Get image data
    img = read_input_data(input_file)
    
    hght = img.shape[0]
    wdth = img.shape[1]
    dims = img.shape[2]

    return hght, wdth, dims


In [7]:
def get_text(image):

    # Read text from image using Seven Segment training data
    text_ssd = pytesseract.image_to_string(image, lang="ssd", config=config_tesseract)

    # Read text from image using English character training data
    text_eng = pytesseract.image_to_string(image, lang="eng", config=config_tesseract)
    
    # Clean text
    text_ssd = ''.join(c for c in text_ssd if c.isdigit() or c == '.')
    text_eng = ''.join(c for c in text_eng if c.isdigit() or c == '.')

    return text_ssd, text_eng


In [8]:
def process_file_closing(input_file):

    # Get image data
    # img = read_input_data(input_file)
    img = read_resize_data(input_file)

    # Convert to RGB (three dimensions)
    nimRGB = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    # Convert to gray (one dimension)
    nimGray = cv2.cvtColor(nimRGB, cv2.COLOR_BGR2GRAY)

    # Perform Dilation using a 5x5 matrix
    cdilation = cv2.dilate(nimGray, np.ones((5,5), np.uint8))

    # PErform Erod using a 5x5 matrix
    nimClosing = cv2.erode(cdilation, np.ones((5,5), np.uint8))

    # Get Text for Seven Segment and English
    text_ssd, text_eng = get_text(nimClosing)
    
    return text_ssd, text_eng
    



In [9]:
def mask_red(img):

    # img=cv2.imread('./sipaimages/2/22319_20220321100409.jpg')
    img_hsv=cv2.cvtColor(img, cv2.COLOR_BGR2HSV)

    # lower mask (0-10)
    lower_red = np.array([0,50,50])
    upper_red = np.array([40,255,255])
    mask0 = cv2.inRange(img_hsv, lower_red, upper_red)

    # upper mask (170-180)
    lower_red = np.array([170,50,50])
    upper_red = np.array([180,255,255])
    mask1 = cv2.inRange(img_hsv, lower_red, upper_red)

    # join my masks
    mask = mask0+mask1

    # set my output img to zero everywhere except my mask
    output_img = img.copy()
    output_img[np.where(mask==0)] = 0

    return output_img


In [10]:
def process_mask_otsu(input_file):

    # Get image data
    # img = read_input_data(input_file)
    img = read_resize_data(input_file)

    imgMask = mask_red(img)
    
    # Convert to RGB (three dimensions)
    nimRGB = cv2.cvtColor(imgMask, cv2.COLOR_BGR2RGB)

    # Convert to gray (one dimension)
    nimGray = cv2.cvtColor(nimRGB, cv2.COLOR_BGR2GRAY)

    # 0 means threshold level which actually is omitted because we used CV_THRESH_OTSU flag
    min_threshold = 0

    # 255 is a value that is going to be assigned to respectively pixels in the result 
    # (namely, to all pixels which value in the source is greater then computed threshold level)
    max_threshold = 255

    # THRESH_BINARY | THRESH_OTSU is a required flag to perform Otsu thresholding. Because in fact we would like to perform binary thresholding,
    # so we use CV_THRESH_BINARY (you can use any of 5 flags opencv provides) combined with CV_THRESH_OTSU
    value, nimOTSU = cv2.threshold(nimGray, min_threshold, max_threshold, cv2.THRESH_BINARY | cv2.THRESH_OTSU)

    # Get Text for Seven Segment and English
    text_ssd, text_eng = get_text(nimOTSU)
    
    return text_ssd, text_eng

## Control Loop

In [12]:

input_data = read_input_file_list()
input_array = []
for row in tqdm(input_data.iterrows()):
   
    input_file = row[1][1]
    # print("input_file", input_file)
    
    # Get image Dimensions
    w,h,d  = image_dimension_details(input_file)
    
    # Dialation
    closing_ssd, closing_eng = process_file_closing(input_file)

    # Mask Red OTSU
    mro_ssd, mro_eng = process_mask_otsu(input_file)

    folder_split = input_file.split("\\")
    if "sipaimages" in folder_split:
        sub_folder = "sipaimages_" + folder_split[2]
    else:
        sub_folder = folder_split[1]

    # Add text to array
    new_row = [input_file, w, h, d, closing_ssd, closing_eng, mro_ssd, mro_eng, sub_folder]

    input_array.append(new_row)


# Alter input_array to numpy array
input_array = np.array(input_array)


# Add fields to the input_data Data Frame
input_data['width']               = input_array[:, 1]
input_data['height']              = input_array[:, 2]
input_data['dimensions']          = input_array[:, 3]
input_data['closing_ssd']         = input_array[:, 4]
input_data['closing_eng']         = input_array[:, 5]
input_data['mro_ssd']             = input_array[:, 6]
input_data['mro_eng']             = input_array[:, 7]
input_data['sub_folder']          = input_array[:, 8]



# Create csv
input_data.to_csv("Images_Tesseract_Analysis_sepa3.csv", encoding='utf-8')

26 input images


26it [00:10,  2.48it/s]


In [13]:
input_data

Unnamed: 0,index,file_name,seen_value,ncol2,ncol3,width,height,dimensions,closing_ssd,closing_eng,mro_ssd,mro_eng,sub_folder
0,0,.\sipaimages\3\1.jpg,30.31,,,490,817,3,31.0,31.0,,,sipaimages_3
1,1,.\sipaimages\3\10.jpg,13.05,,,490,817,3,135.0,1305.0,,,sipaimages_3
2,2,.\sipaimages\3\11.jpg,20.59,,,490,817,3,11.0,5.0,,,sipaimages_3
3,3,.\sipaimages\3\12.jpg,13.05,,,490,817,3,1808.0,1305.0,,,sipaimages_3
4,4,.\sipaimages\3\13.jpg,30.31,,,490,817,3,4.0,303.0,,,sipaimages_3
5,5,.\sipaimages\3\14.jpg,13.04,,,490,817,3,13.4,1304.0,11.0,1.0,sipaimages_3
6,6,.\sipaimages\3\15.jpg,30.24,,,816,1468,3,0.0,,1.0,,sipaimages_3
7,7,.\sipaimages\3\16.jpg,13.04,,,490,817,3,184.0,1304.0,,,sipaimages_3
8,8,.\sipaimages\3\17.jpg,13.03,,,490,817,3,13.9,1303.0,11.0,,sipaimages_3
9,9,.\sipaimages\3\18.jpg,30.23,,,816,1468,3,14.0,30.23,11.1,2.0,sipaimages_3


# Analysis Section

In [15]:
df = pd.read_csv(r'Images_Tesseract_Analysis_sepa3.csv')
df = df.reset_index()
df

Unnamed: 0.1,level_0,Unnamed: 0,index,file_name,seen_value,ncol2,ncol3,width,height,dimensions,closing_ssd,closing_eng,mro_ssd,mro_eng,sub_folder
0,0,0,0,.\sipaimages\3\1.jpg,30.31,,,490,817,3,31.0,31.0,,,sipaimages_3
1,1,1,1,.\sipaimages\3\10.jpg,13.05,,,490,817,3,135.0,1305.0,,,sipaimages_3
2,2,2,2,.\sipaimages\3\11.jpg,20.59,,,490,817,3,11.0,5.0,,,sipaimages_3
3,3,3,3,.\sipaimages\3\12.jpg,13.05,,,490,817,3,1808.0,1305.0,,,sipaimages_3
4,4,4,4,.\sipaimages\3\13.jpg,30.31,,,490,817,3,4.0,303.0,,,sipaimages_3
5,5,5,5,.\sipaimages\3\14.jpg,13.04,,,490,817,3,13.4,1304.0,11.0,1.0,sipaimages_3
6,6,6,6,.\sipaimages\3\15.jpg,30.24,,,816,1468,3,0.0,,1.0,,sipaimages_3
7,7,7,7,.\sipaimages\3\16.jpg,13.04,,,490,817,3,184.0,1304.0,,,sipaimages_3
8,8,8,8,.\sipaimages\3\17.jpg,13.03,,,490,817,3,13.9,1303.0,11.0,,sipaimages_3
9,9,9,9,.\sipaimages\3\18.jpg,30.23,,,816,1468,3,14.0,30.23,11.1,2.0,sipaimages_3


In [16]:
def check_index(index, index_array):
    if index not in index_array:
        index_array.append(index)
    

In [19]:
def copy_found_input_file(input_file):
    folder_split = input_file.split("\\")

    if "sipaimages" in folder_split:
        sub_folder = "found\sipa\\" + folder_split[2]
    else:
        sub_folder = "found\idoc"

    if not os.path.exists(sub_folder):
        os.makedirs(sub_folder)

    shutil.copy2(input_file, sub_folder)

    
    return False

In [18]:
def copy_not_found_input_file(input_file):
    folder_split = input_file.split("\\")

    if "sipaimages" in folder_split:
        sub_folder = "not_found\sipa\\" + folder_split[2]
    else:
        sub_folder = "not_found\idoc"

    if not os.path.exists(sub_folder):
        os.makedirs(sub_folder)

    shutil.copy2(input_file, sub_folder)

    
    return True

In [20]:
folder_dic = {}
closing_sfa = []
mrOTSU_sfa = []

index_array = []
t_closing_ssd = 0
t_closing_eng = 0
t_mro_ssd = 0
t_mro_eng = 0

for index, row in df.iterrows():

    # print(row['file_name'])
    # newpath = r'C:\Program Files\arbitrary' 
    # if not os.path.exists(newpath):
        # os.makedirs(newpath)

    t_found = False

    t_float_seen_value = 0
    try:
        float(row['seen_value'])
        t_float_seen_value = float(row['seen_value'])

    except:
        t_float_seen_value = row['seen_value']



    if t_float_seen_value==row['mro_ssd']:
        t_mro_ssd += 1
        check_index(index, index_array)
        t_found = copy_found_input_file(row['file_name'])
        if row['sub_folder'] not in mrOTSU_sfa:
            mrOTSU_sfa.append(row['sub_folder'])
        continue

    if t_float_seen_value==row['mro_eng']:
        t_mro_eng += 1
        check_index(index, index_array)
        t_found = copy_found_input_file(row['file_name'])
        if row['sub_folder'] not in mrOTSU_sfa:
            mrOTSU_sfa.append(row['sub_folder'])
        continue

    if t_float_seen_value==row['closing_ssd']:
        t_closing_ssd += 1
        check_index(index, index_array)
        t_found = copy_found_input_file(row['file_name'])
        if row['sub_folder'] not in closing_sfa:
            closing_sfa.append(row['sub_folder'])
        continue

    if t_float_seen_value==row['closing_eng']:
        t_closing_eng += 1
        check_index(index, index_array)
        t_found = copy_found_input_file(row['file_name'])
        if row['sub_folder'] not in closing_sfa:
            closing_sfa.append(row['sub_folder'])
        continue


        
    if not t_found:
        copy_not_found_input_file(row['file_name'])


print("closing:", "eng", t_closing_eng, "ssd", t_closing_ssd, "sub_folders:", closing_sfa)
print("mask red OTSU:", "eng", t_mro_eng, "ssd", t_mro_ssd, "sub_folders:", mrOTSU_sfa)

len(index_array)

closing: eng 1 ssd 2 sub_folders: ['sipaimages_3']
mask red OTSU: eng 0 ssd 0 sub_folders: []


3

In [21]:
for index, row in df.iterrows():
    # print(type(row['seen_value']),type(row['mro_ssd']))
    print(float(row['seen_value'])==row['mro_ssd'])

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False


In [22]:
for loop_counter in range(1,11):
    print(loop_counter)

1
2
3
4
5
6
7
8
9
10
