# Предобработка данных

## Импорт библиотек

In [1]:
import pandas as pd
import cv2
import os
import re
import numpy as np
import datetime as dt

from PIL import Image as PILImage

from img2table.document import Image

from pytesseract import Output
import pytesseract
# Tell pytesseract where the Tesseract executable lives (a Windows-style path).
pytesseract.pytesseract.tesseract_cmd = r'C:/Program Files/Tesseract-OCR/tesseract.exe'
# The same path kept in a module-level variable.
# NOTE(review): not referenced anywhere in the visible code — confirm it is still needed.
tesseract_cmd = r'C:/Program Files/Tesseract-OCR/tesseract.exe'

#import pyocr
#pyocr.tesseract.TESSERACT_CMD = r'C:/Program Files/Tesseract-OCR/tesseract.exe'
#from IPython.display import display_html
#from openpyxl import load_workbook
#from io import BytesIO
#from img2table.ocr import TesseractOCR
#TesseractOCR.tesseract_cmd = r'C:/Program Files/Tesseract-OCR/tesseract.exe'

## Папка с исходниками

In [2]:
# Collect the input files from SOURCE_DIR: scanned images go to IMAGE_NAMES,
# CSV targets go to CSV_NAMES. Paths are stored as SOURCE_DIR + file name.
IMAGE_NAMES = []
CSV_NAMES = []
SOURCE_DIR = '405/'

try:
    for filename in os.listdir(SOURCE_DIR):
        lowered = filename.lower()
        # Match on the real extension: a substring test such as
        # `'jpg' in filename` also accepts names like "jpg_notes.csv" and
        # could put the same file in both lists.
        if lowered.endswith(('.jpg', '.jpeg', '.png')):
            IMAGE_NAMES.append(SOURCE_DIR + filename)
        elif lowered.endswith('.csv'):
            CSV_NAMES.append(SOURCE_DIR + filename)
except OSError:
    # SOURCE_DIR is missing or unreadable: fall back to a single image whose
    # name (without extension) is typed in by the user. Only filesystem errors
    # trigger the fallback, so Ctrl+C and programming errors are not swallowed.
    p = input()
    IMAGE_NAMES = [f"app/405/{p}.jpg"]

## Загрузка и осмотр данных

In [3]:
# Load every target CSV and count the files that look malformed.
columns = 10          # expected number of columns in a target CSV
df_list = []          # loaded frames, in CSV_NAMES order
filename_list = []    # file names (second '/'-separated path component), parallel to df_list
shape_counter = 0     # files whose column count differs from `columns`
nan_counter = 0       # files containing at least one missing value

for filename in CSV_NAMES:
    df = pd.read_csv(filename)
    df_list.append(df)
    filename_list.append(filename.split('/')[1])
    if df.shape[1] != columns:
        shape_counter += 1
    if df.isna().to_numpy().any():
        nan_counter += 1

print('\n', 'Число файлов где количество столбцов отличается от 10---', shape_counter)
print('\n', 'Число файлов с пропусками---', nan_counter)


 Число файлов где количество столбцов отличается от 10--- 0

 Число файлов с пропусками--- 0


## Добавляем город, преобразуем дату и убираем лишние столбцы

In [4]:
def region_to_city(st):
    """Return the part of *st* after its last ', ' separator.

    If *st* contains no ', ' the whole string is returned unchanged.
    """
    parts = st.rsplit(', ', 1)
    return parts[-1]

def string_to_date(st):
    """Parse a 'DD.MM.YYYY' string into a ``datetime.datetime``.

    Returns the integer ``0`` when *st* cannot be parsed: a malformed string
    raises ValueError and a non-string value (e.g. NaN or None) raises
    TypeError inside ``strptime``. Only these two parsing errors are handled,
    so unrelated failures are no longer silently hidden.
    """
    try:
        return dt.datetime.strptime(st, '%d.%m.%Y')
    except (ValueError, TypeError):
        return 0

In [5]:
# Columns that are removed from every target frame once the city has been
# derived from 'Регион' and the donation date has been parsed.
trash_columns = ['ID', 'ID пользователя', 'Дата добавления донации', 'Место стадчи', 'Статус донации', 'Есть справка', 'Регион']

for i, frame in enumerate(df_list):
    frame['Город'] = frame['Регион'].apply(region_to_city)
    frame['Дата донации'] = frame['Дата донации'].apply(string_to_date)
    # drop() returns a new frame, so store it back into the list.
    df_list[i] = frame.drop(columns=trash_columns)

## Запись обработанных csv-таргетов

In [6]:
# Write each processed target frame to result/targets/ under its original file name.
os.makedirs('result/targets', exist_ok=True)
for frame, target_name in zip(df_list, filename_list):
    frame.to_csv('result/targets/' + str(target_name))

## Функции для работы

### Препроцессинг

**Обрезка.** Для этой задачи обрезаем изображение пополам, потому что у справки формы 405 таблица всегда находится в нижней половине.

**Оттенки серого**

**Удаление шумов**

**Приведение к пороговому значению** — адаптивная бинаризация (метод Gaussian)

In [7]:
def pre(img):
    """Prepare an image for table detection and OCR.

    Steps: keep the rows from the vertical midpoint to the bottom, convert
    from BGR to grayscale, apply a median blur, then binarise with an
    adaptive Gaussian threshold. Returns the thresholded image.
    """
    rows = int(img.shape[0])
    lower_half = img[int(img.shape[0] / 2):rows, :]
    grayscale = cv2.cvtColor(lower_half, cv2.COLOR_BGR2GRAY)
    # Median blur with aperture size 1.
    blurred = cv2.medianBlur(grayscale, 1)
    # Arguments after the image: max value 255, Gaussian adaptive method,
    # binary output, block size 11, constant C = 11.
    return cv2.adaptiveThreshold(
        blurred,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        11,
        11,
    )

#ret,thresh3 = cv.threshold(img,127,255,cv.THRESH_TRUNC)
#ret,thresh4 = cv.threshold(img,127,255,cv.THRESH_TOZERO)

### Поиск координат ячеек таблиц с помощью img2table

In [8]:
def search(path):
    """Find table cells in the image at *path* using img2table.

    Returns a list of ``[y1, y2, x1, x2]`` boxes, one per detected cell, or
    the string 'PREPROCESSING ERROR TRY ANOTHER IMAGE' when fewer than five
    cells were found.
    """
    document = Image(src=path)
    tables = document.extract_tables()
    # Copy of the image on which the detected cells are outlined.
    # Debug aid: PILImage.fromarray(canvas) displays it.
    canvas = cv2.imread(path)
    boxes = []
    for tbl in tables:
        for row_cells in tbl.content.values():
            for item in row_cells:
                box = item.bbox
                cv2.rectangle(canvas, (box.x1, box.y1), (box.x2, box.y2), (255, 0, 0), 2)
                # Stored as rows first, then columns, ready for array slicing.
                boxes.append([box.y1, box.y2, box.x1, box.x2])
    if len(boxes) >= 5:
        return boxes
    return 'PREPROCESSING ERROR TRY ANOTHER IMAGE'

### Распознавание числовых значений и дат

In [9]:
def numbers(cells, path):
    """OCR every cell of the image at *path* with Tesseract's ``digits`` config.

    *cells* is a sequence of ``[y1, y2, x1, x2]`` boxes. Returns one
    recognised string per box, in the same order.
    """
    digits_cfg = r'--oem 3 --psm 7 outputbase digits'
    image = cv2.imread(path)

    def read_box(box):
        top, bottom, left, right = box[:4]
        fragment = image[top:bottom, left:right]
        return pytesseract.image_to_string(fragment, lang='rus', config=digits_cfg)

    return [read_box(box) for box in cells]

### Распознавание всего остального

In [10]:
def words(cells, path):
    """OCR every cell of the image at *path* as general Russian text.

    *cells* is a sequence of ``[y1, y2, x1, x2]`` boxes. Returns one
    recognised string per box, in the same order.
    """
    text_cfg = r'--oem 3 --psm 7'
    image = cv2.imread(path)

    def read_box(box):
        top, bottom, left, right = box[:4]
        fragment = image[top:bottom, left:right]
        return pytesseract.image_to_string(fragment, lang='rus', config=text_cfg)

    return [read_box(box) for box in cells]

### Объединение распознанных данных

In [11]:
def join_data(n, w):
    """Fill the empty strings of *n* with the value at the same position in *w*.

    *n* is modified in place and also returned.
    """
    for position, value in enumerate(n):
        if value == '':
            n[position] = w[position]
    return n

### Очистка выходных значений: оставляем только данные в формате "дата - тип донации - объем", удаляем пробельные символы и точки из конца строки

In [12]:
def clean(list):
    """Trim OCR output to the data rows and normalise every remaining cell.

    The parameter keeps its original name ``list`` for backward
    compatibility; the input list itself is not modified.

    Steps:
      1. Drop everything before the first item that contains a date-like
         token (two digits, any character, two digits, any character, four
         digits). If no such item exists nothing is dropped.
      2. Drop everything after the last item (searching positions
         ``len - 1`` down to ``1``) that contains a run of two or more
         digits. If none is found, the first two items are kept.
      3. Replace every item that is neither date-like, nor numeric, nor a
         token of at most two word characters followed by one whitespace
         character with ``'unknown'``.
      4. Remove whitespace and the characters ``-``, ``,``, space and ``=``,
         then strip one trailing and one leading dot.

    Returns the new list. An empty or single-item input is handled (it used
    to raise NameError), and an item that becomes empty after step 4, such
    as a lone form feed, stays ``''`` instead of raising IndexError.
    """
    date_pattern = re.compile(r'\d{2}.\d{2}.\d{4}')
    number_pattern = re.compile(r'\d{2,}')
    short_pattern = re.compile(r'\w{,2}\s')

    # Step 1: locate the first date-like item (default: keep from the start).
    stop_head = next(
        (i for i, item in enumerate(list) if date_pattern.search(item)),
        0,
    )
    cut_head = list[stop_head:]

    # Step 2: locate the last numeric item. The default reproduces the
    # historical fallback (keep the first two items) while staying valid for
    # lists shorter than two items.
    stop_tail = min(1, len(cut_head) - 1)
    for j in range(len(cut_head) - 1, 0, -1):
        if number_pattern.search(cut_head[j]):
            stop_tail = j
            break
    clean_list = cut_head[:stop_tail + 1]

    # Steps 3 and 4: classify and normalise each remaining item.
    for k, item in enumerate(clean_list):
        recognised = (
            date_pattern.search(item)
            or number_pattern.search(item)
            or short_pattern.fullmatch(item)
        )
        if not recognised:
            item = 'unknown'
        item = re.sub(r'\s', '', item)
        item = re.sub(r'[-, =]', '', item)
        # startswith/endswith are safe on an empty string, unlike item[-1].
        if item.endswith('.'):
            item = item[:-1]
        if item.startswith('.'):
            item = item[1:]
        clean_list[k] = item
    return clean_list

### Формирование таблицы из распознанных значений и сортировка их по дате

In [13]:
def table(list):
    """Build a donations table from cleaned OCR tokens.

    The parameter keeps its original name ``list`` for backward
    compatibility. Each token is classified as:
      * a date: it fully matches the date-like pattern and parses as
        'DD.MM.YYYY' (a token that matches but does not parse contributes
        ``None``, which is dropped later);
      * a volume: 3-4 digits with a value between 50 and 750;
      * a donation type: shorter than 3 characters and containing one of
        ``8``, ``б``, ``в``, ``6``; recorded as 'Безвозмездно'.
    Any other token is ignored.

    Returns the string 'RECOGNITION ERROR TRY ANOTHER IMAGE' when no date
    could be parsed. Otherwise returns a DataFrame with the columns
    'Дата донации', 'Объем, мл' and 'Тип донации', with missing values
    replaced by 0, duplicate dates removed and rows sorted by date.

    NOTE(review): dates, volumes and types are collected into three
    independent lists and aligned purely by position, so one unrecognised
    cell shifts the values of every following row.
    """
    date = []
    volume = []
    don = []
    rec_counter = 0
    for item in list:
        if re.fullmatch(r'\d{2}.\d{2}.\d{4}', item):
            try:
                d = dt.datetime.strptime(item, '%d.%m.%Y').date()
                rec_counter += 1
            except ValueError:
                # Looks like a date but is not a valid 'DD.MM.YYYY' value.
                d = None
            date.append(d)
        elif re.fullmatch(r'\d{3,4}', item) and 50 <= int(item) <= 750:
            volume.append(int(item))
        # The character class lists exactly the four accepted symbols; the
        # previous '[8, б, в, 6]' also matched commas and spaces.
        elif len(item) < 3 and re.search(r'[8бв6]', item):
            don.append('Безвозмездно')

    if rec_counter == 0:
        return 'RECOGNITION ERROR TRY ANOTHER IMAGE'

    result = pd.concat([pd.Series(date), pd.Series(volume), pd.Series(don)], axis=1)
    result = result.rename(columns={0: "Дата донации", 1: "Объем, мл", 2: "Тип донации"})
    result = result.dropna(subset=["Дата донации"]).reset_index(drop=True)
    result = result.fillna(0)
    result = result.drop_duplicates(subset=["Дата донации"]).reset_index(drop=True)
    result = result.sort_values(by='Дата донации')
    return result

# Распознавание изображений

## Загрузка и предобработка изображений

In [14]:
# Preprocess every source image and save the result as
# result/preprocessed/pre_<file name>; the saved paths are collected in
# img_path_list for the recognition step.
os.makedirs('result/preprocessed/', exist_ok=True)
img_path_list = []

for path in IMAGE_NAMES:
    input_image = cv2.imread(path)
    if input_image is None:
        # cv2.imread returns None instead of raising when the file is
        # missing or cannot be decoded; skip it rather than crash in pre().
        print(path, '---', 'IMAGE READ ERROR TRY ANOTHER IMAGE')
        continue
    preprocessed_image = pre(input_image)
    # Use the real file name: path.split('/')[1] returned the directory
    # ('405') for the fallback path 'app/405/<name>.jpg', producing an output
    # path without an extension that cv2.imwrite cannot handle.
    path_1 = 'result/preprocessed/' + 'pre_' + re.sub(r'\s', '', os.path.basename(path))
    img_path_list.append(path_1)
    cv2.imwrite(path_1, preprocessed_image)

## Распознавание

In [15]:
# Run the full recognition pipeline on every preprocessed image and store the
# resulting table as result/recognized/<digits of the image path>.csv.
os.makedirs('result/recognized/', exist_ok=True)

for pth in img_path_list:
    cell = search(pth)
    if isinstance(cell, str):
        # search() reports failure with a message string instead of a cell list.
        print(pth, '---', cell)
        continue

    n = numbers(cell, pth)
    w = words(cell, pth)
    raw_out = join_data(n, w)
    out_data = table(clean(raw_out))
    if isinstance(out_data, str):
        # table() reports failure with a message string instead of a DataFrame.
        print(pth, '---', out_data)
        continue

    # The output name is every digit found in the image path.
    digits_only = re.sub(r'\D', '', pth)
    out_data.to_csv('result/recognized/' + digits_only + '.csv')
    print(pth, '---', 'IMAGE RECOGNIZED')

result/preprocessed/pre_141899.jpg --- RECOGNITION ERROR TRY ANOTHER IMAGE
result/preprocessed/pre_204119.jpg --- IMAGE RECOGNIZED
result/preprocessed/pre_213950.jpg --- IMAGE RECOGNIZED
result/preprocessed/pre_225629.jpg --- IMAGE RECOGNIZED
result/preprocessed/pre_227414.jpg --- RECOGNITION ERROR TRY ANOTHER IMAGE
result/preprocessed/pre_228963.jpg --- RECOGNITION ERROR TRY ANOTHER IMAGE
result/preprocessed/pre_231820.jpg --- IMAGE RECOGNIZED
result/preprocessed/pre_233749.jpg --- IMAGE RECOGNIZED
result/preprocessed/pre_236000.jpg --- IMAGE RECOGNIZED
result/preprocessed/pre_238716.jpg --- IMAGE RECOGNIZED
result/preprocessed/pre_240493.jpg --- PREPROCESSING ERROR TRY ANOTHER IMAGE
result/preprocessed/pre_243478.jpg --- RECOGNITION ERROR TRY ANOTHER IMAGE
result/preprocessed/pre_245365.jpg --- IMAGE RECOGNIZED
result/preprocessed/pre_254586.jpg --- IMAGE RECOGNIZED
result/preprocessed/pre_256578.jpg --- RECOGNITION ERROR TRY ANOTHER IMAGE
result/preprocessed/pre_256838.jpg --- IMAGE

# Расчет accuracy

Методика расчета — «по определению»: количество правильных ответов, деленное на общее количество ответов. Общее количество ответов равно количеству ячеек в целевых столбцах.

In [16]:
def acc_score(pred, true):
    """Return the share of target cells that were recognised correctly.

    *true* is a target frame; its 'Unnamed: 0', 'Город' and 'Класс крови'
    columns are dropped and every remaining cell counts as one expected
    answer. For each target row, one point is given when its date occurs in
    *pred*, and one more when a predicted row with that date also has the
    same donation type. The result is rounded to two decimals.

    Returns 0.0 when the target has no cells (this used to raise
    ZeroDivisionError). Each target row earns at most two points, so
    duplicate dates in *pred* can no longer push the score above 1. Rows are
    compared by position, so *pred* does not need a default index.
    """
    df_true = true.drop(columns=['Unnamed: 0', 'Город', 'Класс крови'])
    answers = df_true.shape[0] * df_true.shape[1]
    if answers == 0:
        return 0.0

    pred_dates = pred['Дата донации'].tolist()
    pred_types = pred['Тип донации'].tolist()
    true_rows = zip(df_true['Дата донации'].tolist(), df_true['Тип донации'].tolist())

    correct_answers = 0
    for true_date, true_type in true_rows:
        # Donation types of all predicted rows carrying this date.
        matched_types = [t for d, t in zip(pred_dates, pred_types) if d == true_date]
        if not matched_types:
            continue
        correct_answers += 1
        if any(t == true_type for t in matched_types):
            correct_answers += 1
    return round(correct_answers / answers, 2)

In [17]:
# Score every recognised table against the target CSV with the same file name.
for filename in os.listdir('result/recognized/'):
    pred_path = f'result/recognized/{filename}'
    true_path = f'result/targets/{filename}'
    pred = pd.read_csv(pred_path)
    true = pd.read_csv(true_path)
    score = acc_score(pred, true)
    print(pred_path, '---ACCURACY VALUE---', score)

result/recognized/204119.csv ---ACCURACY VALUE--- 0.33
result/recognized/213950.csv ---ACCURACY VALUE--- 0.34
result/recognized/225629.csv ---ACCURACY VALUE--- 0.44
result/recognized/231820.csv ---ACCURACY VALUE--- 0.62
result/recognized/233749.csv ---ACCURACY VALUE--- 0.43
result/recognized/236000.csv ---ACCURACY VALUE--- 1.0
result/recognized/238716.csv ---ACCURACY VALUE--- 0.74
result/recognized/245365.csv ---ACCURACY VALUE--- 0.85
result/recognized/254586.csv ---ACCURACY VALUE--- 0.36
result/recognized/256838.csv ---ACCURACY VALUE--- 0.05
