In [None]:
import numpy as np
from skimage import io, transform, color, filters
from matplotlib import pyplot as plt
from skimage.morphology import disk, ball
from skimage.filters import threshold_otsu, median

import cv2
import glob
from tqdm.notebook import tqdm

from numba import jit
import pandas as pd
from scipy.signal import argrelextrema
from statsmodels.tsa.api import SimpleExpSmoothing
from scipy.signal import find_peaks

from PIL import Image
from skimage import measure
import os

## Header crop

In [None]:
# Crop the header region out of every aligned blank and save it under the
# same file name in `new_folder`.
images_path='data/msk22/blanks_aligned_jpg/'
new_folder='data/msk22/headers_jpg/'

# Header region inside the aligned blank, in pixels: rows top:bottom, columns left:right.
# msk22
top, bottom, left, right = 230, 748, 75, 1390

# nov23
# top, bottom, left, right = 185, 725, 90, 1400

# msk 23
# top, bottom, left, right = 212, 740, 70, 1350

# io.imsave does not create missing folders.
os.makedirs(new_folder, exist_ok=True)

# File stems (name without folder and extension). os.path handles both '/' and
# '\\' separators, unlike splitting on a literal backslash.
files_paths=sorted(glob.glob(os.path.join(images_path, '*.jpg')))
files_paths=[os.path.splitext(os.path.basename(path))[0] for path in files_paths]

for doc_id in tqdm(files_paths):
    img_path=os.path.join(images_path, doc_id+'.jpg')
    img=io.imread(img_path)
    img=img[top:bottom, left:right]

    new_img_path=os.path.join(new_folder, doc_id+'.jpg')
    io.imsave(new_img_path, img)

## Update metadata

In [None]:
# Keep only the nov23 rows of grades.csv that have an aligned header image on
# disk, mark them with aligned_doc=True and write them to grades_nov23.csv.
df=pd.read_csv('grades.csv')

# File stems of the aligned header images. os.path handles both '/' and '\\'
# separators, unlike splitting on a literal backslash.
files_paths=glob.glob('data/nov23/headers_aligned_jpg/*.jpg')
files_paths=[os.path.splitext(os.path.basename(path))[0] for path in files_paths]

tmp_df=df[df['type']=='nov23']

# File stems are always strings, so compare ids as strings as well; otherwise
# numeric ids parsed by read_csv could never match.
has_header=tmp_df['id'].astype(str).isin(files_paths)

# .copy() so that adding a column does not write into a view of `df`.
new_df=tmp_df[has_header].copy()
new_df['aligned_doc']=True
new_df.to_csv('grades_nov23.csv',index=False)

In [None]:
# Merge the per-dataset grade tables back into a single grades.csv.
# NOTE: this overwrites grades.csv, which other cells of this notebook read.
df1=pd.read_csv('grades_nov23.csv')
df2=pd.read_csv('grades_msk23.csv')
df3=pd.read_csv('grades_msk22.csv')

# ignore_index gives the merged table a unique 0..n-1 index instead of three
# overlapping ranges.
df=pd.concat([df1,df2,df3], ignore_index=True)

# index=False: otherwise the index is written as an unnamed column and every
# read_csv -> to_csv round trip adds another 'Unnamed: 0' column.
df.to_csv('grades.csv', index=False)

In [None]:
# Display the current `df` as the cell output (a bare last expression is rendered by the notebook).
df

## Crop chars

In [None]:
@jit(nopython=True)
def frame_image(img, frame_width):
    """Surround a 2-D image with a constant border of ones.

    Parameters
    ----------
    img : array
        Image whose first two dimensions are (rows, columns). It is written
        into a 2-D canvas, so it must be 2-D itself.
    frame_width : int
        Border thickness in pixels, added on every side. May be 0.

    Returns
    -------
    numpy.ndarray
        Float array of shape (rows + 2*frame_width, cols + 2*frame_width)
        filled with 1.0 in the border and `img` in the centre.
    """
    b = frame_width # border size in pixel
    ny, nx = img.shape[0], img.shape[1] # resolution / number of pixels in x and y
    framed_img = np.ones((b+ny+b, b+nx+b))
    # Explicit end indices: the former `b:-b` slice is empty when b == 0,
    # which made a zero-width frame fail.
    framed_img[b:b+ny, b:b+nx] = img
    return framed_img

In [None]:
def rgb2smyk(bgr):
    """Split a 3-channel 8-bit image into C, M, Y and K planes.

    Parameters
    ----------
    bgr : numpy.ndarray
        Array of shape (H, W, 3) with values in 0..255. Channel 2 is used for
        cyan, channel 1 for magenta and channel 0 for yellow, i.e. the input
        is treated as BGR-ordered.
        NOTE(review): callers in this notebook pass images loaded with
        skimage `io.imread`; if those are RGB-ordered, the returned C and Y
        planes are swapped relative to their names - confirm before relying
        on the names.

    Returns
    -------
    tuple of numpy.ndarray
        (C, M, Y, K), each a uint8 array of shape (H, W) scaled to 0..255.
        For pure-black pixels (K == 255) C, M and Y are defined as 0.
    """
    bgrdash = bgr.astype(np.float32)/255.

    # K is (1 - whatever is biggest out of the three normalised channels).
    key = 1 - np.max(bgrdash, axis=2)

    # 1 - K is zero exactly for pure-black pixels; dividing there would give
    # 0/0 = NaN and an undefined uint8 value after the cast.
    denom = 1 - key
    not_black = denom > 0

    def _ink(channel):
        # (1 - channel - K) / (1 - K), evaluated only where it is defined;
        # the remaining pixels keep the 0 from the pre-filled output.
        ink = np.zeros_like(key)
        np.divide(1 - bgrdash[..., channel] - key, denom, out=ink, where=not_black)
        return (ink*255).astype(np.uint8)

    C = _ink(2)
    M = _ink(1)
    Y = _ink(0)
    K = (key*255).astype(np.uint8)

    return C,M,Y,K

In [None]:
def crop(gra_f):
    """Locate the start and end of the content of a 2-D image along both axes.

    For each direction the image is reduced to a 1-D profile of mean pixel
    values (per column first, then per row). The profile's gradient is
    smoothed with simple exponential smoothing (smoothing_level=0.2); the
    first local maximum of the smoothed gradient is taken as the start and
    the last local minimum as the end.

    Parameters
    ----------
    gra_f : numpy.ndarray
        2-D image, indexed as [row, column].

    Returns
    -------
    x_min : tuple
        Start indices, in the order (column index, row index). Contains an
        entry only for the directions in which extrema were found.
    x_max : tuple
        End indices, same order and same caveat as `x_min`.
    flag : bool
        True only if both directions produced a start and an end; callers
        should not unpack `x_min` / `x_max` into two values otherwise.
    """
    x_max=[]
    x_min=[]
    flag=True

    # axis=0 averages over rows -> one value per column (horizontal profile);
    # axis=1 averages over columns -> one value per row (vertical profile).
    for axis in (0, 1):
        mean_pixel_profile=np.mean(gra_f, axis=axis)

        if mean_pixel_profile.size<2:
            # np.gradient needs at least two samples; report failure through
            # the flag instead of raising.
            flag=False
            continue

        deriv=np.gradient(mean_pixel_profile)

        fit1 = SimpleExpSmoothing(deriv).fit(smoothing_level=0.2,optimized=False).fittedvalues

        peaks=find_peaks(fit1)[0]
        min_extremax=argrelextrema(fit1, np.less)[0]

        if len(peaks)>0 and len(min_extremax)>0:
            x_min.append(peaks[0])
            x_max.append(min_extremax[-1])
        else:
            flag=False

    return tuple(x_min), tuple(x_max), flag


In [None]:
# Demo on a single header image: cut out one text line, binarise it, drop small
# connected components, split the line into per-character tiles and show them.

# img=io.imread('msk22_ref_header_62263_5.jpg')
img=io.imread('62263_10.jpg')
chars_num=9

# [row, col] of the top-left corner of the first character cell of each line.
coords={'surname':[82,46], 'name':[215, 46], 'code':[348, 46]}
width=45          # character cell width, px
height=55         # character cell height, px
step=6            # gap between neighbouring cells, px
boundary_adj=3    # extra margin taken around the whole line, px
N=5               # padding kept around the bounds returned by crop(), px

line_type='surname'
x,y=coords[line_type]
img=img[x-boundary_adj:x+boundary_adj+height, y-boundary_adj:y+boundary_adj+width*chars_num+step*(chars_num-1)]

(C, M, Y, K) = rgb2smyk(img)

thresh, binaryImage = cv2.threshold(Y, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

label_image = measure.label(binaryImage)

# Pixel count of every connected component; entry i belongs to label i+1
# (label 0 is the background and is skipped).
labels_num=np.bincount(label_image.ravel())[1:]

# Keep components larger than half of the mean component size.
chars_labels=np.flatnonzero(labels_num>np.mean(labels_num)/2)+1

tmp_img=np.where(np.isin(label_image, chars_labels), binaryImage, 0).astype(np.float64)

imgs=[]

img_width=tmp_img.shape[1]//chars_num
for i in range(chars_num):
    char_img=tmp_img[:,i*img_width:(i+1)*img_width]
    if np.sum(char_img)!=0:
        # crop() returns (starts, ends, flag); the bounds are only complete
        # (two entries each) when flag is True.
        y_min, y_max, flag=crop(char_img)

        if flag:
            y_min_h,y_min_v=y_min
            y_max_h,y_max_v=y_max

            y_min_h=max(y_min_h-N, 0)
            y_min_v=max(y_min_v-N, 0)

            char_img=char_img[y_min_v:y_max_v+N, y_min_h:y_max_h+N]
    char_img=transform.resize(char_img, (32,32,1))
    imgs.append(char_img)


fig,axes=plt.subplots(1,3, figsize=(25,6))

axes[0].imshow(img)
axes[0].set_title('cropped line')
axes[1].imshow(Y,cmap='gray')
axes[1].set_title('Y channel')
axes[2].imshow(tmp_img,cmap='gray')
axes[2].set_title('binarised, small components removed')
plt.show()

fig,axes=plt.subplots(1,chars_num, figsize=(25,6))
for i in range(chars_num):
    axes[i].imshow(imgs[i],cmap='gray')

plt.show()


In [None]:
# For every document of one dataset: cut the surname / name / code lines out of
# the aligned header, split each line into per-character 32x32 tiles and save
# every tile as data/<type>/chars/<id>/<line>/<k>_<translit>_<correct|error>.jpg.

# Maps a character of the expected text to the ASCII token used in file names.
translit_dict = {'а' : 'a', 'б' : 'b', 'в' : 'v', 'г' : 'g', 'д' : 'd', 'е' : 'ye', 'ё' : 'yo', 'ж' : 'zh', 'з' : 'z', 'и' : 'i', 'й' : 'j', 'к' : 'k', 'л' : 'l', 'м' : 'm', 'н' : 'n', 'о' : 'o', 'п' : 'p', 'р' : 'r', 'с' : 's', 'т' : 't', 'у' : 'u', 'ф' : 'f', 'х' : 'kh', 'ц' : 'ts', 'ч' : 'ch', 'ш' : 'sh', 'щ' : 'sch', 'ъ' : 'tzn', 'ы' : 'y', 'ь' : 'mzn', 'э' : 'e', 'ю' : 'yu', 'я' : 'ya', '0':'0', '1':'1', '2':'2', '3':'3', '4':'4', '5':'5', '6':'6', '7':'7', '8':'8', '9':'9'}

# [row, col] of the top-left corner of the first character cell of each line.
# NOTE(review): the msk23 and nov23 values were kept as commented-out plain
# lists; they are assumed here to be in surname / name / code order - confirm.
coords_by_type={
    'msk22': {'surname':[82,46], 'name':[215, 46], 'code':[348, 46]},
    'msk23': {'surname':[87,49], 'name':[220, 49], 'code':[353, 49]},
    'nov23': {'surname':[96,39], 'name':[228, 39], 'code':[363, 39]},
}

width=45          # character cell width, px
height=55         # character cell height, px

step=6            # gap between neighbouring cells, px
boundary_adj=3    # extra margin taken around the whole line, px

# NOTE: `type` shadows the builtin; the name is kept so that anything else in
# the notebook reading this variable keeps working.
type='msk22'
coords=coords_by_type[type]

df=pd.read_csv('grades.csv')
# .copy(): the new columns below must not be written into a view of `df`.
df_tmp=df[df['type']==type].copy()

for doc_col in ('surname_doc', 'name_doc', 'code_doc'):
    df_tmp[doc_col]=None

N=5               # padding kept around the bounds returned by crop(), px

for j in tqdm(range(len(df_tmp))):
    row=df_tmp.iloc[j]
    doc_id=row['id']
    img=io.imread(f'data/{type}/headers_aligned_jpg/{doc_id}.jpg')

    for line_type in ['surname', 'name', 'code']:
        # makedirs creates missing parents and tolerates re-runs, unlike os.mkdir.
        line_dir=f'data/{type}/chars/{doc_id}/{line_type}'
        os.makedirs(line_dir, exist_ok=True)

        text=row[line_type]
        # Skip missing values (NaN is not a str), empty strings (zero
        # characters cannot be split) and values containing a comma.
        if not isinstance(text, str) or not text or ',' in text:
            continue

        chars_num=len(text)

        x,y=coords[line_type]
        img_t=img[x-boundary_adj:x+boundary_adj+height, y-boundary_adj:y+boundary_adj+width*chars_num+step*(chars_num-1)]

        (C, M, Y, K) = rgb2smyk(img_t)

        thresh, binaryImage = cv2.threshold(Y, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

        label_image = measure.label(binaryImage)

        # Pixel count of every connected component; entry i belongs to
        # label i+1 (label 0 is the background and is skipped).
        labels_num=np.bincount(label_image.ravel())[1:]

        # Keep components larger than half of the mean component size.
        chars_labels=np.flatnonzero(labels_num>np.mean(labels_num)/2)+1

        tmp_img=np.where(np.isin(label_image, chars_labels), binaryImage, 0).astype(np.float64)

        imgs=[]
        bad_indices=[]   # characters for which crop() found no bounds

        img_width=tmp_img.shape[1]//chars_num
        for i in range(chars_num):
            char_img=tmp_img[:,i*img_width:(i+1)*img_width]

            (y_min,y_max, flag)=crop(char_img)

            if flag:
                y_min_h,y_min_v=y_min
                y_max_h,y_max_v=y_max

                y_min_h=max(y_min_h-N, 0)
                y_min_v=max(y_min_v-N, 0)

                char_img_cropped=char_img[y_min_v:y_max_v+N, y_min_h:y_max_h+N]
            else:
                bad_indices.append(i)
                char_img_cropped=char_img

            char_img_cropped=transform.resize(char_img_cropped, (32,32,1))
            imgs.append(char_img_cropped)

        flag_col=df_tmp.columns.get_loc(line_type+'_doc')
        for k, char_tile in enumerate(imgs):
            char=translit_dict.get(text[k])
            if char is None:
                # No transliteration for this character: nothing is saved.
                continue

            cropped_ok=k not in bad_indices
            suffix='correct' if cropped_ok else 'error'
            name=f'{line_dir}/{k}_{char}_{suffix}.jpg'

            # NOTE(review): the flag is overwritten for every saved character,
            # so it ends up describing only the last one - confirm whether it
            # should instead be False as soon as any character failed.
            df_tmp.iloc[j, flag_col] = cropped_ok
            io.imsave(name, char_tile.astype(np.uint8))

# new_df=pd.DataFrame(new_df, columns=tmp_df.columns)