# Neural network approach to writer identification

This notebook lets you run a photo of handwritten text through a neural network that identifies the writer.

Skip past all the function definitions to the end, where there is an example of how to call them.

Make sure your photo is in PNG format and contains as little non-text border as possible.

#### Functions to siamese network

In [78]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import cv2
from os.path import join


def fetch(img_dir, name):
    """Load an image from disk and return it as a 3-channel RGB array.

    :param img_dir: directory that contains the image
    :param name: file name of the image inside ``img_dir``
    :return: image array converted to RGB channel order
    :raises IOError: if OpenCV cannot read the file
    """
    path = join(img_dir, name)
    img = cv2.imread(path)
    if img is None:
        # cv2.imread reports an unreadable file by returning None, not by raising.
        raise IOError('Cannot read image: ' + str(path))

    # The number of dimensions (not the shape tuple itself) distinguishes a
    # grayscale array from a colour one.
    # NOTE(review): the previous checks compared ``img.shape`` to an int, were
    # always False, and so images were returned in OpenCV's BGR order. Weights
    # trained with that loader saw BGR input - confirm before reusing them.
    if img.ndim == 2:
        img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
    elif img.ndim == 3:
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    return img


def resize(img, size=(1024, 768)):
    """Rescale ``img`` to ``size`` using bicubic interpolation.

    :param img: image array accepted by ``cv2.resize``
    :param size: 2-element target size, forwarded to ``cv2.resize`` as ``dsize``
    :return: the resized image
    """
    assert len(size) == 2
    resized = cv2.resize(img, size, interpolation=cv2.INTER_CUBIC)
    return resized


def pad(img, size=(1024, 768)):
    """Centre ``img`` on a white canvas of the requested size.

    :param img: 3-D image array of shape (height, width, channels)
    :param size: target ``(width, height)`` of the canvas
    :return: ``uint8`` array of shape ``(size[1], size[0], 3)`` filled with 255
        outside the pasted image
    :raises ValueError: if the image is larger than the canvas
    """
    assert len(img.shape) == 3
    assert len(size) == 2
    h, w, _ = img.shape
    target_w, target_h = size
    if h > target_h or w > target_w:
        # A negative offset used to wrap around through an unsigned cast and
        # surface as an unrelated slicing error; fail with a clear message.
        raise ValueError(
            'image of size %dx%d does not fit into %dx%d canvas' % (w, h, target_w, target_h))

    # Integer ceil-division: the extra pixel of an odd margin goes on the top/left.
    pad_vert = (target_h - h + 1) // 2
    pad_hor = (target_w - w + 1) // 2

    padded = np.full((target_h, target_w, 3), 255, dtype=np.uint8)
    padded[pad_vert:pad_vert + h, pad_hor:pad_hor + w, :] = img
    return padded

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import sys 
sys.path.insert(0, '../')

import numpy as np
import pandas as pd

from keras.utils import Sequence
from keras.preprocessing.image import ImageDataGenerator


class WordsSequence(Sequence):
    """Keras ``Sequence`` that serves batches of preprocessed word images.

    Without labels (``y_set is None``) the batches are consecutive slices of
    ``x_set`` and only the image array is returned. With labels, each batch is
    a random sample of rows not yet served in the current epoch and a
    ``(images, labels)`` pair is returned.

    :param img_dir: directory the image file names in ``x_set`` are relative to
    :param input_shape: shape whose first two entries are used as the target
        height and width of every image
    :param x_set: sequence of image file names
    :param y_set: optional sequence of labels aligned with ``x_set``
    :param batch_size: number of samples per batch
    """

    def __init__(self, img_dir, input_shape, x_set, y_set=None, batch_size=16):
        self.x, self.y = x_set, y_set
        self.img_dir = img_dir
        self.input_shape = input_shape
        self.batch_size = batch_size
        if self.y is not None:
            self._reset_dataset()

    def _reset_dataset(self):
        """(Re)build the bookkeeping frame with every sample marked unused."""
        self.dataset = pd.DataFrame(
            data={'x': self.x, 'y': self.y, 'used': np.zeros(len(self.y), dtype=int)})
        self.dataset['class_count'] = self.dataset.groupby('y')['y'].transform('count')

    def __len__(self):
        """Number of batches per epoch (the last one may be partial or resampled)."""
        return int(np.ceil(len(self.x) / float(self.batch_size)))

    def __getitem__(self, idx):
        """Return batch ``idx``: images only, or ``(images, labels)`` when labelled."""
        if self.y is None:
            batch_x = self.x[idx * self.batch_size:(idx + 1) * self.batch_size]
            return np.array([self.preprocess(fetch(self.img_dir, name)) for name in batch_x])

        unused = self.dataset.loc[self.dataset['used'] == 0]
        if len(unused) == 0:
            # Every sample was already served (e.g. extra calls within one
            # epoch); sampling an empty frame would raise, so reuse all rows.
            unused = self.dataset

        # Sample with replacement only when the pool cannot fill a batch.
        with_replacement = len(unused) < self.batch_size
        batch_indices = unused.sample(n=self.batch_size, replace=with_replacement).index

        self.dataset.loc[batch_indices, 'used'] = 1
        # ``batch_indices`` holds index labels, so select by label, not position.
        batch = self.dataset.loc[batch_indices]
        batch_x = batch['x'].values
        batch_y = batch['y'].values
        return np.array([self.preprocess(fetch(self.img_dir, name)) for name in batch_x]), np.array(batch_y)

    def preprocess(self, img):
        """Scale ``img`` to fit the input size (keeping aspect ratio), pad, normalise.

        :param img: 3-D image array (height, width, channels)
        :return: float array scaled to the range [0, 1]
        """
        assert len(img.shape) == 3

        target_h, target_w = self.input_shape[0], self.input_shape[1]
        h, w, _ = img.shape
        if h / w <= target_h / target_w:
            # Relatively wider than the target: width is the limiting side.
            new_size = (target_w, max(1, int(target_w * h / w)))
        else:
            # Relatively taller than the target: height is the limiting side.
            new_size = (max(1, int(target_h * w / h)), target_h)
        # max(1, ...) keeps extremely thin images from collapsing to a zero size.
        img = resize(img, new_size)

        img = pad(img, (target_w, target_h))
        return img / 255.

    def on_epoch_end(self):
        """Mark every sample as unused again for the next epoch."""
        if self.y is not None:
            self._reset_dataset()

import tensorflow as tf


def valid_triplets_mask(labels):
    """Build the boolean mask of valid (anchor, positive, negative) triplets.

    ``mask[a, p, n]`` is True when the three indices are pairwise distinct,
    ``labels[a] == labels[p]`` and ``labels[a] != labels[n]``.

    :param labels: tensor of shape (batch_size,)
    :return mask: tf.bool tensor of shape (batch_size, batch_size, batch_size)
    """
    # Index constraints: a != p, a != n, p != n.
    batch = tf.shape(labels)[0]
    same_index = tf.cast(tf.eye(batch), tf.bool)
    different_index = tf.logical_not(same_index)
    a_ne_p = tf.expand_dims(different_index, 2)
    a_ne_n = tf.expand_dims(different_index, 1)
    p_ne_n = tf.expand_dims(different_index, 0)
    all_distinct = tf.logical_and(tf.logical_and(a_ne_p, a_ne_n), p_ne_n)

    # Label constraints: anchor/positive share a label, anchor/negative do not.
    same_label = tf.equal(tf.expand_dims(labels, 0), tf.expand_dims(labels, 1))
    a_eq_p = tf.expand_dims(same_label, 2)
    a_eq_n = tf.expand_dims(same_label, 1)
    labels_ok = tf.logical_and(a_eq_p, tf.logical_not(a_eq_n))

    return tf.logical_and(all_distinct, labels_ok)


def euclidean_distance(embeddings, squared=False):
    """Pairwise euclidean distance matrix, computed with numerical safeguards.

    ``output[i, j] = || embeddings[i, :] - embeddings[j, :] ||_2``

    :param embeddings: 2-D Tensor of size [number of data, feature dimension].
    :param squared: Boolean, return squared distances instead of distances.
    :return dist: 2-D Tensor of size [number of data, number of data].
    """
    # ||a - b||^2 = ||a||^2 + ||b||^2 - 2 <a, b>
    row_norms = tf.reduce_sum(tf.square(embeddings), axis=1, keepdims=True)
    col_norms = tf.reduce_sum(tf.square(tf.transpose(embeddings)), axis=0, keepdims=True)
    gram = tf.matmul(embeddings, tf.transpose(embeddings))
    dist_squared = tf.add(row_norms, col_norms) - 2.0 * gram

    # Rounding can push tiny values below zero; clamp them.
    dist_squared = tf.maximum(dist_squared, 0.0)
    # Remember which entries are exactly zero after clamping.
    is_zero = tf.less_equal(dist_squared, 0.0)

    if squared:
        dist = dist_squared
    else:
        # A small epsilon is added at the zero entries before the square root.
        dist = tf.sqrt(dist_squared + tf.cast(is_zero, dtype=tf.float32) * 1e-16)
    # Zero entries are forced back to exactly zero (removes the epsilon).
    dist = tf.multiply(dist, tf.cast(tf.logical_not(is_zero), dtype=tf.float32))

    # Force the diagonal (distance of a sample to itself) to zero.
    n_data = tf.shape(embeddings)[0]
    off_diagonal = tf.ones_like(dist) - tf.linalg.diag(tf.ones([n_data]))
    return tf.multiply(dist, off_diagonal)


def masked_maximum(data, mask, dim=1):
    """Maximum along ``dim`` restricted to the elements selected by ``mask``.

    The data is shifted by its per-axis minimum so that masked-out entries
    (multiplied by zero) cannot exceed any selected entry.

    :param data: 2-D float `Tensor` of size [n, m].
    :param mask: 2-D `Tensor` of size [n, m]; it is multiplied with ``data``,
        and the callers in this file pass it already cast to float32.
    :param dim: The dimension over which to compute the maximum.
    :return masked_maximums: `Tensor` whose reduced dimension has size 1.
    """
    floor = tf.reduce_min(data, axis=dim, keepdims=True)
    shifted = tf.multiply(data - floor, mask)
    return tf.reduce_max(shifted, axis=dim, keepdims=True) + floor


def masked_minimum(data, mask, dim=1):
    """Minimum along ``dim`` restricted to the elements selected by ``mask``.

    The data is shifted by its per-axis maximum so that masked-out entries
    (multiplied by zero) cannot fall below any selected entry.

    :param data: 2-D float `Tensor` of size [n, m].
    :param mask: 2-D `Tensor` of size [n, m]; it is multiplied with ``data``,
        and the callers in this file pass it already cast to float32.
    :param dim: The dimension over which to compute the minimum.
    :return masked_minimums: `Tensor` whose reduced dimension has size 1.
    """
    ceiling = tf.reduce_max(data, axis=dim, keepdims=True)
    shifted = tf.multiply(data - ceiling, mask)
    return tf.reduce_min(shifted, axis=dim, keepdims=True) + ceiling


def triplet_loss(margin=1.0, strategy='batch_semi_hard'):
    """Build a triplet loss function over a batch of embeddings. tf contrib inspired:
    https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/losses/python/metric_learning/metric_loss_ops.py

    :param margin: margin enforced by the loss, or the string ``'soft'`` to use
        a softplus instead of a hinge
    :param strategy: mining strategy: ``'batch_semi_hard'``, ``'batch_hard'`` or
        ``'batch_all'`` (the same names written with spaces are accepted too);
        any other value falls back to ``batch_all``
    :return: a function ``loss(labels, embeddings)`` for the chosen strategy
    """
    def get_loss_tensor(positive_dists, negative_dists):
        """Combine positive and negative distances using the configured margin.

        :param positive_dists: positive distances tensor
        :param negative_dists: negative distances tensor
        :return: resulting triplet loss tensor
        """
        if margin == 'soft':
            return tf.nn.softplus(positive_dists - negative_dists)

        return tf.maximum(positive_dists - negative_dists + margin, 0.0)

    def batch_semi_hard(labels, embeddings):
        """Computes the triplet loss with semi-hard negative mining.
        The loss encourages the positive distances (between a pair of embeddings with
        the same labels) to be smaller than the minimum negative distance among
        which are at least greater than the positive distance plus the margin constant
        (called semi-hard negative) in the mini-batch. If no such negative exists,
        uses the largest negative distance instead.
        See: https://arxiv.org/abs/1503.03832.

        :param labels: 1-D tf.int32 `Tensor` with shape [batch_size] of multiclass integer labels.
        :param embeddings: 2-D float `Tensor` of embedding vectors. Embeddings should be l2 normalized.
        :return loss: tf.float32 scalar.
        """
        labels = tf.reshape(labels, [-1, 1])
        batch_size = tf.size(labels)
        # Build pairwise squared distance matrix.
        dist = euclidean_distance(embeddings, squared=True)
        # Build pairwise binary adjacency matrix (equal label mask).
        adjacency = tf.equal(labels, tf.transpose(labels))
        # Invert so we can select negatives only.
        adjacency_not = tf.logical_not(adjacency)

        # Compute the mask.
        dist_tile = tf.tile(dist, [batch_size, 1])  # stack dist matrix batch_size times, axis=0
        mask = tf.logical_and(tf.tile(adjacency_not, [batch_size, 1]), tf.greater(dist_tile, tf.reshape(dist, [-1, 1])))
        mask = tf.cast(mask, dtype=tf.float32)
        is_negatives_outside = tf.reshape(tf.greater(tf.reduce_sum(mask, axis=1, keepdims=True), 0.0), [batch_size, batch_size])
        is_negatives_outside = tf.transpose(is_negatives_outside)

        # negatives_outside: smallest D_an where D_an > D_ap.
        negatives_outside = tf.reshape(masked_minimum(dist_tile, mask), [batch_size, batch_size])
        negatives_outside = tf.transpose(negatives_outside)

        # negatives_inside: largest D_an.
        adjacency_not = tf.cast(adjacency_not, dtype=tf.float32)
        negatives_inside = tf.tile(masked_maximum(dist, adjacency_not), [1, batch_size])

        semi_hard_negatives = tf.where(is_negatives_outside, negatives_outside, negatives_inside)

        # In lifted-struct, the authors multiply 0.5 for upper triangular
        #   in semihard, they take all positive pairs except the diagonal.
        mask_positives = tf.cast(adjacency, dtype=tf.float32) - tf.linalg.diag(tf.ones([batch_size]))
        n_positives = tf.reduce_sum(mask_positives)

        loss_mat = get_loss_tensor(dist, semi_hard_negatives)
        loss = tf.math.divide_no_nan(tf.reduce_sum(tf.multiply(loss_mat, mask_positives)), n_positives)
        return loss

    def batch_all(labels, embeddings):
        """Compute the loss by generating all the valid triplets and averaging over the positive ones

        :param labels: 1-D tf.int32 `Tensor` with shape [batch_size] of multiclass integer labels.
        :param embeddings: 2-D float `Tensor` of embedding vectors. Embeddings should be l2 normalized.
        :return loss: tf.float32 scalar.
        """
        # Flatten so a (batch, 1) label tensor yields a (batch, batch, batch)
        # mask, as the other strategies already do with their own reshape.
        labels = tf.reshape(labels, [-1])
        dist = euclidean_distance(embeddings, squared=True)
        mask = tf.cast(valid_triplets_mask(labels), dtype=tf.float32)

        anchor_positive_dist = tf.expand_dims(dist, 2)
        anchor_negative_dist = tf.expand_dims(dist, 1)

        loss_tensor = get_loss_tensor(anchor_positive_dist, anchor_negative_dist)
        loss_tensor = tf.multiply(loss_tensor, mask)

        # Average only over triplets that still contribute a non-zero loss.
        num_non_easy_triplets = tf.reduce_sum(tf.cast(tf.greater(loss_tensor, 1e-16), dtype=tf.float32))
        loss = tf.math.divide_no_nan(tf.reduce_sum(loss_tensor), num_non_easy_triplets)
        return loss

    def batch_hard(labels, embeddings):
        """Compute the loss by generating only hardest valid triplets and averaging over the positive ones.
        One triplet per embedding, i.e. per anchor

        :param labels: 1-D tf.int32 `Tensor` with shape [batch_size] of multiclass integer labels.
        :param embeddings: 2-D float `Tensor` of embedding vectors. Embeddings should be l2 normalized.
        :return loss: tf.float32 scalar.
        """
        dist = euclidean_distance(embeddings, squared=True)
        adjacency = tf.cast(tf.equal(tf.reshape(labels, (-1, 1)), tf.reshape(labels, (1, -1))), tf.float32)

        # Hardest positive: the farthest sample with the same label.
        pos_dist = tf.reduce_max(adjacency * dist, axis=1)
        # Hardest negative: the closest sample with a different label; same-label
        # entries are pushed out of the way by adding a large constant.
        inf = tf.constant(1e+9, tf.float32)
        neg_dist = tf.reduce_min((adjacency * inf) + dist, axis=1)

        loss_mat = get_loss_tensor(pos_dist, neg_dist)

        # Same ops as the other strategies (tf.cast / tf.math.divide_no_nan)
        # instead of the TF1-only tf.to_float / tf.div_no_nan aliases.
        num_non_easy_triplets = tf.reduce_sum(tf.cast(tf.greater(loss_mat, 1e-16), dtype=tf.float32))
        loss = tf.math.divide_no_nan(tf.reduce_sum(loss_mat), num_non_easy_triplets)
        return loss

    # Accept both 'batch_hard' and 'batch hard' style spellings.
    normalized = str(strategy).replace(' ', '_')
    if normalized == 'batch_semi_hard':
        return batch_semi_hard
    elif normalized == 'batch_hard':
        return batch_hard
    else:
        return batch_all

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import random
import numpy as np
import pandas as pd
from random import  sample
from collections import Counter
from itertools import combinations


from keras.models import load_model
from keras.models import Model, Sequential
from keras import backend as K
from keras.optimizers import RMSprop, Adam
from keras.applications.mobilenet import MobileNet
from keras.layers import Input, Lambda, Dense, Flatten
from keras.callbacks import ModelCheckpoint
from sklearn.neighbors import KNeighborsClassifier
from keras.utils.generic_utils import CustomObjectScope


from keras.callbacks import Callback
from tensorflow.python.keras.utils.generic_utils import Progbar


def get_str2numb_numb2dict(vect):
    """Assign consecutive integer ids to the distinct values of ``vect``.

    Ids are handed out in order of first appearance, starting at 0.

    :param vect: iterable of hashable values
    :return: tuple ``(value -> id dict, id -> value dict)``
    """
    forward = {}
    for item in vect:
        # len(forward) is evaluated before insertion, i.e. it is the next free id.
        forward.setdefault(item, len(forward))
    backward = dict((idx, item) for item, idx in forward.items())
    return forward, backward

def apply_dict(dict_keys, X):
    """Translate every element of ``X`` through the mapping ``dict_keys``.

    :param dict_keys: mapping used for the lookup
    :param X: iterable of keys
    :return: list of mapped values, in the order of ``X``
    :raises KeyError: if an element of ``X`` is not a key of ``dict_keys``
    """
    return [dict_keys[key] for key in X]

class ProgbarLossLogger(Callback):
    """Keras callback that draws a per-epoch progress bar with the logged metrics."""

    def __init__(self):
        super(ProgbarLossLogger, self).__init__()

    def on_train_begin(self, logs=None):
        """Remember the total number of epochs from the training parameters."""
        self.epochs = self.params['epochs']

    def on_epoch_begin(self, epoch, logs=None):
        """Reset the step counter and create a fresh progress bar."""
        self.seen = 0
        self.target = self.params['steps']

        multi_epoch = self.epochs > 1
        if multi_epoch:
            print('Epoch %d/%d' % (epoch + 1, self.epochs))
        self.progbar = Progbar(target=self.target, verbose=True, stateful_metrics=['loss'])

    def on_batch_begin(self, batch, logs=None):
        """Start a new list of logged values while the epoch is not finished."""
        if self.seen >= self.target:
            return
        self.log_values = []

    def on_batch_end(self, batch, logs=None):
        """Advance the bar by the processed steps and show the reported metrics."""
        logs = logs or {}
        self.seen += logs.get('num_steps', 1)

        self.log_values.extend(
            (metric, logs[metric]) for metric in self.params['metrics'] if metric in logs)
        self.progbar.update(self.seen, self.log_values)
        
class TripletModel:
    """MobileNet-based embedding network trained with a triplet loss.

    Writers are identified by a 1-nearest-neighbour classifier fitted on
    reference embeddings (see ``make_embeddings``) and queried in ``predict``.

    :param alpha: MobileNet width multiplier
    :param input_shape: shape of the network input
    :param cache_dir: directory for training checkpoints (created if missing)
    """

    def __init__(self, alpha, input_shape, cache_dir):
        self.alpha = alpha
        self.input_shape = input_shape
        self.cache_dir = cache_dir
        if not os.path.isdir(self.cache_dir):
            os.makedirs(self.cache_dir)
        self.model = self.build_model()
        self.embeddings = None
        # Fitted by make_embeddings(); predict() refuses to run without it.
        self.clf = None

    def build_model(self):
        """Build MobileNet + Dense(128) + L2 normalisation and print its summary."""
        base_network = MobileNet(input_shape=self.input_shape, alpha=self.alpha, weights='imagenet', include_top=False,
                                 pooling='avg')
        x = Dense(128)(base_network.output)
        x = Lambda(lambda x: K.l2_normalize(x, axis=1))(x)
        model = Model(inputs=base_network.input, outputs=x)
        model.summary()
        return model

    def train(self, train_dir, train_csv, validation_dir, validation_csv, epochs, batch_size=32, learning_rate=0.001, margin=0.5):
        """Train the embedding network on the words listed in ``train_csv``.

        The CSV must provide ``file_name`` and ``label`` columns. A checkpoint
        is written to ``cache_dir`` after every epoch; the final model and
        weights go to ``final_model.h5`` / ``final_weights.h5``.

        NOTE(review): ``validation_dir``, ``validation_csv``, ``learning_rate``
        and ``margin`` are accepted but not used - the optimizer uses a fixed
        learning rate of 1e-5 and the loss a fixed margin of 1.0. Kept as is
        so existing training runs stay reproducible.
        """
        train = pd.read_csv(train_csv)
        # ``.values`` instead of ``.as_matrix()``, which was removed in pandas 1.0.
        x_train, y_train = train['file_name'].values, train['label'].values

        str2ind_train_dict, ind2str_train_dict = get_str2numb_numb2dict(y_train)
        y_train = np.array(apply_dict(str2ind_train_dict, y_train))

        self.num_classes = len(np.unique(y_train))
        train_generator = WordsSequence(train_dir, input_shape=self.input_shape, x_set=x_train, y_set=y_train, batch_size=batch_size)

        optimize = Adam(lr=0.00001)
        self.model.summary()
        self.model.compile(loss=triplet_loss(margin=1.0, strategy="batch_semi_hard"), optimizer=optimize)

        self.model.fit_generator(train_generator, shuffle=True, epochs=epochs, verbose=1,
        callbacks=[ModelCheckpoint(filepath=os.path.join(self.cache_dir, 'checkpoint-{epoch:02d}.h5'), save_weights_only=True)])

        self.model.save('final_model.h5')
        self.save_weights('final_weights.h5')

    def save_embeddings(self, filename):
        """Pickle the stored reference embeddings to ``filename``."""
        self.embeddings.to_pickle(filename)

    def load_embeddings(self, filename):
        """Load reference embeddings previously written by ``save_embeddings``.

        NOTE(review): this unpickles the file - only load files you trust.
        """
        self.embeddings = pd.read_pickle(filename)

    def save_weights(self, filename):
        """Save the network weights to ``filename``."""
        self.model.save_weights(filename)

    def load_weights(self, filename):
        """Load weights by layer name, skipping layers whose shapes do not match."""
        self.model.load_weights(filename, by_name=True, skip_mismatch=True)

    def make_embeddings(self, img_dir, csv, path_to_save, batch_size=32):
        """Fit the nearest-neighbour classifier on reference embeddings.

        If embeddings are already loaded they are reused; otherwise they are
        computed for the images listed in ``csv`` (columns ``file_name`` and
        ``label``), stored on the instance and pickled to ``path_to_save``.
        """
        if self.embeddings is not None:
            print(self.embeddings[0][0])
            self.clf = KNeighborsClassifier(n_neighbors=1, metric='euclidean')
            self.clf.fit(self.embeddings[0][0], self.embeddings[0][1])
        else:
            data = pd.read_csv(csv)
            x, y = data['file_name'].values, data['label'].values

            self.str2ind_test_dict, self.ind2str_test_dict = get_str2numb_numb2dict(y)
            y = np.array(apply_dict(self.str2ind_test_dict, y))

            words = WordsSequence(img_dir, input_shape=self.input_shape, x_set=x, batch_size=batch_size)
            pred = self.model.predict_generator(words, verbose=1)

            self.clf = KNeighborsClassifier(n_neighbors=1, metric='euclidean')
            self.clf.fit(pred, y)

            self.embeddings =  pd.DataFrame(data=[pred, y])
            self.save_embeddings(path_to_save)

    def predict(self, img_dir, test_csv, author_tested = '', batch_size=32):
        """Classify the words listed in ``test_csv`` and report or return the result.

        :param img_dir: directory with the word images
        :param test_csv: CSV with ``file_name`` and ``label`` columns
        :param author_tested: when empty, word/top-1/top-5 accuracies are
            printed and nothing is returned; otherwise the most frequently
            predicted author for the words labelled ``author_tested`` is returned
        :param batch_size: batch size used to compute the embeddings
        :return: predicted author for ``author_tested`` (``None`` if that label
            does not occur in ``test_csv``), or ``None`` in evaluation mode
        :raises RuntimeError: if ``make_embeddings`` has not been called yet
        """
        if self.clf is None:
            raise RuntimeError('make_embeddings() must be called before predict()')

        self.model.summary()
        test = pd.read_csv(test_csv)
        x_test, y_test = test['file_name'].values, test['label'].values

        str2ind_test_dict, ind2str_test_dict = get_str2numb_numb2dict(y_test)

        words = WordsSequence(img_dir, input_shape=self.input_shape, x_set=x_test, batch_size=batch_size)
        test_embeddings = self.model.predict_generator(words, verbose=1)

        res = self.clf.predict(test_embeddings)
        # The classifier outputs the integer ids it was fitted with, so decode
        # them with the mapping built in make_embeddings() when it is available.
        # The mapping derived from the test labels is only a fallback (e.g. when
        # the embeddings were loaded from a pickle and no mapping is stored).
        decode = getattr(self, 'ind2str_test_dict', ind2str_test_dict)
        predicted = np.array(apply_dict(decode, res))

        autors = np.unique(y_test)
        autor_ind = [np.argwhere(y_test == a) for a in autors]

        if author_tested == '':

            count = 0
            for i,j in zip(predicted, y_test):
                if i == j:
                    count += 1
            print('word accuracy: ', count / len(y_test))

            count = 0
            for i,inds in enumerate(autor_ind):
                p = Counter(np.ravel(predicted[inds])).most_common(1)[0][0]
                if p == autors[i]:
                    print('совпал автор №  ' + str(autors[i]))
                    count += 1
                else:
                     print('автор №   ' + str(autors[i]) + ' не совпал с ' + str(p))

            print('top-1 autor accuracy: ', count / len(autors))

            count = 0
            for i,inds in enumerate(autor_ind):
                p = [pair[0] for pair in Counter(np.ravel(predicted[inds])).most_common(5)]
                if autors[i] in p:
                    count += 1

            print('top-5 autor accuracy: ', count / len(autors))

        else:
            # Stays None when ``author_tested`` is not among the test labels.
            p = None
            for i,inds in enumerate(autor_ind):
                # Compare with the requested author (was an undefined name).
                if autors[i] == author_tested:
                    p = Counter(np.ravel(predicted[inds])).most_common(100)[0][0]
            print('Words identified with these authors: ')
            return p


#### Functions to word selection

In [9]:
import os
import sys
import cv2
import random
import math
import itertools
import numpy as np
import matplotlib.pylab as plt
from matplotlib.pyplot import plot
import statistics   
from statistics import mean
from collections import namedtuple
import pandas as pd

# Colour tuples handed to the cv2 drawing calls below.
# NOTE(review): the values read as OpenCV BGR order (RED is (0, 0, 255)), which
# would make (255, 0, 0) blue - `BLUR` looks like a typo of BLUE; the name is
# left unchanged because code outside this view may reference it.
GREEN = (0, 255, 0)
BLUR = (255, 0, 0)
RED = (0, 0, 255)

def show(img):
    """Display a colour image; it is converted from BGR to RGB before plotting."""
    axes = plt.axes([0,0,4,4], frameon=False)
    axes.set_axis_off()
    rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    plt.imshow(rgb)
    plt.show()
    
def show_gray(img):
    """Display a single-channel image with a gray colour map fixed to 0..255."""
    axes = plt.axes([0,0,4,4], frameon=False)
    axes.set_axis_off()
    plt.imshow(img, cmap='gray', vmin=0, vmax=255)
    plt.show()
    
def save_words(path, words):
    """Write every image of ``words`` to ``path`` as ``word_<index>.png``.

    :param path: target directory
    :param words: iterable of images accepted by ``cv2.imwrite``
    """
    for number, word in enumerate(words):
        target = os.path.join(path , 'word_' + str(number) + '.png')
        cv2.imwrite(target, word)
        
def sort_w(word):
    """Sort key: the size of the second axis of ``word`` (``word.shape[1]``)."""
    return word.shape[1]

class RECT:
    """Axis-aligned rectangle used to test whether one contour box lies inside another.

    Note the argument order: ``x``, ``y``, then height ``h`` before width ``w``.
    """
    def __init__(self, x, y, h,  w):
        self.x, self.y = x, y
        self.h, self.w = h, w

def is_rectangle_internal(R1,  R2):
    """Return True when ``R2`` lies strictly inside ``R1`` (touching edges do not count)."""
    inside_horizontally = ((R2.x + R2.w) < (R1.x + R1.w)) and (R2.x > R1.x)
    inside_vertically = (R2.y > R1.y) and ((R2.y + R2.h) < (R1.y + R1.h))
    return bool(inside_horizontally and inside_vertically)

def get_rectangles_from_contours(contours, h_img, w_img):
    """Turn contours into bounding rectangles and drop enclosing ones.

    A bounding box is kept as a candidate only when its width is between 1/50
    and 1/2 of ``w_img`` and its height between 1/50 and 1/10 of ``h_img``.
    A candidate is then discarded if any candidate that comes later in the
    list lies strictly inside it.

    :param contours: iterable of contours accepted by ``cv2.boundingRect``
    :param h_img: image height used for the size limits
    :param w_img: image width used for the size limits
    :return: list of ``RECT`` objects
    """
    candidates = []
    for ctr in contours:
        x, y, w, h = cv2.boundingRect(ctr)
        large_enough = w >= w_img/50 and h >= h_img/50
        small_enough = w <= w_img/2 and h <= h_img/10
        if large_enough and small_enough:
            candidates.append(RECT(x, y, h, w))

    # Only rectangles after the current one are checked, as in the original scan.
    kept = []
    for position, outer in enumerate(candidates):
        encloses_later = any(
            is_rectangle_internal(outer, later) for later in candidates[position + 1:])
        if not encloses_later:
            kept.append(outer)
    return kept

def percent_of_white_pixels_word(thresh_image):
    """Return the share of pixels equal to 255 in a 2-D thresholded image.

    :param thresh_image: 2-D array (one value per pixel)
    :return: fraction of pixels with value 255, rounded to two decimals;
        0.0 for an image without pixels
    """
    h, w = thresh_image.shape
    all_pixels = h * w
    if all_pixels == 0:
        # Nothing to count; avoids a division by zero.
        return 0.0
    # Vectorised count instead of a Python loop over every pixel.
    white_pixels = int(np.count_nonzero(thresh_image == 255))
    return round(white_pixels / all_pixels, 2)

def percent_of_white_pixels(img, thresh_index, border=40):
    """Return the share of white pixels after inverse binarisation of ``img``.

    The outer ``border`` pixels on every side are cut off first, the rest is
    converted to grayscale and thresholded with ``cv2.THRESH_BINARY_INV``, so
    pixels at or below ``thresh_index`` become white (255).

    :param img: 3-channel image
    :param thresh_index: threshold value passed to ``cv2.threshold``
    :param border: number of pixels removed from each side before counting
    :return: fraction of white pixels, rounded to two decimals
    """
    print("thresh_index" + str(thresh_index))
    h, w, _ = img.shape
    # delete boundaries
    image = img[border:h - border, border:w - border]
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    ret, thresh = cv2.threshold(gray, thresh_index, 255, cv2.THRESH_BINARY_INV)
    # The thresholded image has a single channel, so it must be displayed with
    # show_gray(); show() would attempt a BGR -> RGB conversion on it.
    show_gray(thresh)

    h, w = thresh.shape
    all_pixels = h * w
    # Vectorised count instead of a Python loop over every pixel.
    white_pixels = int(np.count_nonzero(thresh == 255))
    return round(white_pixels / all_pixels, 2)

        
def compare_thresh_indexes(img):
    """Pick the binarisation threshold that gives the most plausible ink share.

    Every candidate threshold is scored with ``percent_of_white_pixels``.
    Candidates whose white share lies in [0.01, 0.15] are preferred and the one
    closest to 0.07 wins; if there is none, the candidate with the smallest
    white share is used.

    :param img: image passed on to ``percent_of_white_pixels``
    :return: the selected threshold value
    """
    # values to check which index is optimal for binarization
    thresh_indexes = [107, 127, 147, 167, 187, 207, 227 ]
    dict_with_norm_white_percent = {}
    dict_with_big_white_percent = {}
    best_percent = 100; best_index = 0; best_val = 100

    # Score every candidate and sort it into the "plausible" or "other" bucket.
    for index in thresh_indexes:
        percent = percent_of_white_pixels(img, index)
        if percent >= 0.01 and percent <= 0.15:
            dict_with_norm_white_percent[index] = percent
            print("index " + str(index) + " - " + str(percent))
        else:
            dict_with_big_white_percent[index] = percent

    # Select best index of binarization
    if len(dict_with_norm_white_percent) == 0:
        for index, percent in dict_with_big_white_percent.items():
            if percent <= best_percent:
                best_percent = percent
                best_index = index
    else:
        for index, percent in dict_with_norm_white_percent.items():
            new_value = math.fabs(percent - 0.07)
            if new_value <= best_val:
                best_val = new_value
                best_percent = percent
                best_index = index

    # Report and return the winner, not the last candidate that was looped over.
    print("best index = " + str(best_index) + " - best_percent = " + str(best_percent))
    return best_index

def get_thresh_image(img, index):
    """Binarise ``img`` with the given threshold and display the intermediate images.

    :param img: 3-channel colour image or an already single-channel image
    :param index: threshold value passed to ``cv2.threshold``
    :return: thresholded single-channel image (``cv2.THRESH_BINARY``)
    """
    if len(img.shape) == 3:
        binary = cv2.cvtColor(img.copy(), cv2.COLOR_RGB2GRAY)
    else:
        # Already single-channel: a colour conversion would be rejected.
        binary = img.copy()
    show_gray(binary)
    ret, thresh = cv2.threshold(binary, index, 255, cv2.THRESH_BINARY)
    # Single-channel result, so it is displayed with show_gray() rather than show().
    show_gray(thresh)
    return thresh

def prepare_binary_contour(contour):
    """Run the whole line-extraction pipeline on a binary image.

    Steps: per-chunk histograms -> smoothing -> valleys -> candidate separator
    traits -> filtering -> connected lines sorted by the y of their first
    point -> cut-out line images.

    :param contour: binary image to split into text lines
    :return: result of ``cut_lines`` or ``[]`` when no separators were found
    """
    hists, average_black_height, average_white_height, _ = build_hists(contour)

    valleys = find_valleys(smooth_hists(hists))
    lines, _ = get_lines(contour, valleys)
    if lines == []:
        return []

    lines = filter_chunks(lines, average_black_height, average_white_height)
    created_lines = get_first_approach_lines(lines, average_black_height)
    ordered_lines = sorted(created_lines, key = lambda line: line.data[0].y)
    return cut_lines(contour, ordered_lines)


def build_hists(image):
    """Build a row histogram for every vertical chunk of ``image``.

    For each chunk the histogram holds, per row, the sum of ``1 - pixel / 255``.
    While scanning a histogram the lengths of consecutive runs of zero rows
    and of non-zero rows are collected over all chunks.

    :param image: 2-D image array
    :return: tuple ``(hists, mean zero-run length, mean non-zero-run length,
        lines_count)`` where ``lines_count`` holds, per chunk, the number of
        transitions from a zero run to a non-zero row
    """
    hists = []
    lines_count = []
    # NOTE(review): the original called these black_height / white_height;
    # they measure runs of rows whose histogram is zero / non-zero.
    zero_runs, nonzero_runs = [], []

    chunks = get_chunks(image)
    for left, right in zip(chunks[:-1], chunks[1:]):
        hist = np.sum(1 - (image[:, left:right] / 255), axis=1)

        # Seed the counters from the first row.
        if hist[0] == 0:
            zero_len, nonzero_len = 1, 0
        else:
            zero_len, nonzero_len = 0, 1

        transitions = 0
        for row_value in hist[1:]:
            if row_value == 0:
                if zero_len == 0:
                    # A non-zero run just ended: record it.
                    nonzero_runs.append(nonzero_len)
                    nonzero_len = 0
                zero_len += 1
            else:
                if nonzero_len == 0:
                    # A zero run just ended: record it.
                    zero_runs.append(zero_len)
                    transitions += 1
                    zero_len = 0
                nonzero_len += 1

        lines_count.append(transitions)
        hists.append(hist)
    return hists, mean(zero_runs), mean(nonzero_runs), lines_count

def smooth_hists(hists):
    """Apply ``median_smooth`` to every histogram produced by ``build_hists``."""
    return [median_smooth(hist) for hist in hists]

def median_smooth(signal, kernel_size = 5):
    """Smooth ``signal`` with a moving average over ``kernel_size`` samples.

    Despite the name, the window mean (not the median) is used. One window is
    produced for every element of ``signal[:-kernel_size]``, so the result is
    ``kernel_size`` samples shorter than the input.

    :param signal: 1-D sequence of numbers
    :param kernel_size: window length
    :return: numpy array with the smoothed values
    """
    window_count = len(signal[:-kernel_size])
    averaged = [
        sum(signal[start: start + kernel_size]) / kernel_size
        for start in range(window_count)
    ]
    return np.array(averaged)

def show_hists(hists):
    """Plot all histograms side by side and save the figure to ``temp.png``.

    Each histogram is drawn reversed and shifted right by 35 units per index.
    """
    fig = plt.figure()
    rows = np.arange(len(hists[0]))
    for number, hist in enumerate(hists):
        shifted = hist[::-1] + number*35
        plt.plot(shifted, rows)
    axes = plt.axes([0,0,1,1], frameon=False)
    axes.set_axis_off()
    plt.show()
    fig.savefig('temp.png', dpi=fig.dpi)
    
def find_valleys(sm_hists, thresh = 1):
    """Locate the runs of zeros (valleys) in every smoothed histogram.

    Values below ``thresh`` are set to zero in place, i.e. the arrays in
    ``sm_hists`` are modified. The last element of a histogram is never
    examined as a valley position.

    :param sm_hists: iterable of 1-D numpy arrays
    :param thresh: values below this are treated as zero
    :return: one list per histogram containing ``[start, end]`` index pairs
    """
    all_valleys = []
    for hist in sm_hists:
        hist[hist < thresh] = 0
        found = []
        pending = []
        final_position = len(hist) - 2
        previous = 1
        for position in range(len(hist) - 1):
            value = hist[position]
            at_zero = value == 0
            # Entering a valley: zero preceded by a non-zero value.
            if at_zero and previous != 0:
                pending.append(position)
            # Leaving a valley: zero followed by a non-zero value.
            if at_zero and hist[position + 1] != 0:
                pending.append(position)
            if len(pending) == 2:
                found.append(pending)
                pending = []
            # A valley still open at the last examined position is closed there.
            if len(pending) == 1 and position == final_position:
                pending.append(position)
                found.append(pending)
            previous = value
        all_valleys.append(found)
    return all_valleys

def show_valleys(image, valleys, channels = 2):
    """Draw the start (green) and end (red) of every valley and display the result.

    :param image: image the valleys were computed for (a copy is drawn on)
    :param valleys: per-chunk lists of ``(start, end)`` row pairs
    :param channels: 3 displays with ``show``, anything else with ``show_gray``
    """
    bounds = get_chunks(image, channels)
    canvas = image.copy()

    for number, left in enumerate(bounds[:-1]):
        for (start_row, end_row) in valleys[number]:
            right = bounds[number + 1]
            cv2.line(canvas, (left, start_row), (right, start_row),GREEN,3)
            cv2.line(canvas, (left, end_row), (right, end_row),RED,3)

    if channels == 3:
        show(canvas)
    else:
        show_gray(canvas)
        
def get_lines(image, valleys, channels = 2):
    """Create one separator trait per valley and measure the average spacing.

    Every valley becomes a ``Trait`` spanning its chunk horizontally and placed
    at the integer midpoint of the valley.

    :param image: image the valleys were computed for
    :param valleys: per-chunk lists of valley index pairs
    :param channels: forwarded to ``get_chunks``
    :return: tuple ``(traits per chunk, average vertical distance between
        consecutive traits of a chunk)``; the average is 0 when no chunk holds
        two traits
    """
    chunks = get_chunks(image, channels)
    total_lines = []
    for i, chunk in enumerate(chunks[:-1]):
        total_lines.append(
            [Trait(chunk, chunks[i + 1], sum(val) // len(val)) for val in valleys[i]])

    gaps = [
        abs(upper.y_1 - lower.y_1)
        for line in total_lines
        for upper, lower in zip(line, line[1:])
    ]
    # Without any pair of consecutive traits there is nothing to average;
    # return 0 instead of dividing by zero.
    height = sum(gaps) / len(gaps) if gaps else 0
    return total_lines, height

def show_lines(image, lines):
    """Draw every trait of ``lines`` as a green horizontal segment and display the image.

    :param image: image to draw on (a copy is used)
    :param lines: per-chunk lists of traits with ``x_1``, ``x_2`` and ``y_1``
    """
    canvas = image.copy()
    for chunk in lines:
        for trait in chunk:
            start = (trait.x_1, trait.y_1)
            end = (trait.x_2, trait.y_1)
            cv2.line(canvas, start, end,GREEN,3)
    show(canvas)
    
def get_chunks(image, channels = 1):
    """
    Split the image width into vertical chunks of about 10 percent each.

    image : 2-D (grayscale) or 3-D (colour) array; only its width is used
    channels : unused, kept so existing callers keep working

    return: array with the x-coordinates (column indices) of the chunk borders,
            starting at 0 with step width // 10

    Raises ZeroDivisionError (from np.arange) when the image is narrower than
    10 pixels, because the step becomes 0.
    """
    # Index the shape instead of unpacking it, so colour images of shape
    # (rows, cols, channels) are accepted as well as 2-D ones.
    m = image.shape[1]
    return np.arange(0, m + 1, m // 10)

# Immutable (x, y) pixel coordinate used for the vertices of a Line.
Point = namedtuple('Point', 'x y')

class Line:
    """Polyline of Points that is extended one Trait at a time."""

    def __init__(self):
        # Trait most recently attached by the code that builds the line
        self.last_trait = None
        # vertices in the order they were added
        self.data = []

    def continue_line(self, trait):
        """Append the left and right end points of the horizontal trait."""
        row = trait.y_1
        left = Point(trait.x_1, row)
        self.data.append(left)
        right = Point(trait.x_2, row)
        self.data.append(right)

class Trait:
    """Horizontal segment from x_1 to x_2 located at row y_1."""

    def __init__(self, x_1, x_2, y_1):
        self.x_1, self.x_2, self.y_1 = x_1, x_2, y_1

    def dist(self, other):
        """Euclidean distance from this trait's right end to the other trait's left end."""
        dx = self.x_2 - other.x_1
        dy = self.y_1 - other.y_1
        return (dx ** 2 + dy ** 2) ** 0.5

    def print_(self):
        """Print the three coordinates of the trait."""
        print('x_1: {}  x_2: {}  y: {}'.format(self.x_1, self.x_2, self.y_1))

        
def filter_chunk(chunk, avg_black_height, average_white_height):
    """
    Drop traits of one chunk that lie too close to their neighbours.

    The first and the last trait are always kept. An inner trait is kept only
    when its row differs by more than
    avg_black_height + average_white_height / 4 from both the previously kept
    trait and the trait that follows it in the chunk.

    chunk : list of objects with a y_1 attribute
    return: new list with the kept traits (the input list is not modified)
    """
    if len(chunk) == 0:
        return []
    if len(chunk) == 1:
        # A single trait is both the first and the last element; without this
        # guard it would be returned twice.
        return [chunk[0]]

    min_gap = avg_black_height + average_white_height / 4
    prev = chunk[0]
    new_traits = [prev]
    for trait, next_trait in zip(chunk[1:-1], chunk[2:]):
        far_from_prev = abs(prev.y_1 - trait.y_1) > min_gap
        far_from_next = abs(next_trait.y_1 - trait.y_1) > min_gap
        if far_from_prev and far_from_next:
            new_traits.append(trait)
            prev = trait
    new_traits.append(chunk[-1])
    return new_traits

def filter_chunks(chunks, avg_black_height, average_white_height):
    """Apply filter_chunk to every chunk and return the filtered chunks as a new list."""
    return [
        filter_chunk(traits, avg_black_height, average_white_height)
        for traits in chunks
    ]

def get_first_approach_lines(chunk_with_traits, avg_height):
    """
    Build the first approximation of the lines that split the text rows.

    The first two chunks seed the lines, the third chunk may still open new
    lines, and every later chunk only extends the lines that already exist.

    chunk_with_traits : list with one list of traits per chunk, or None
    return: list of Line objects ([] when chunk_with_traits is None)
    """
    if chunk_with_traits is None:
        return []

    lines = connect_two_chunks(chunk_with_traits[0], chunk_with_traits[1], avg_height)
    connect_start_lines_with_next_chunks(lines, chunk_with_traits[2], avg_height)

    later_chunks = chunk_with_traits[3:]
    for later_chunk in later_chunks:
        connect_lines_with_chunk(lines, later_chunk, avg_height)
    return lines
                
def connect_two_chunks(chunk_1, chunk_2, avg_height):
    """
    Start one Line per trait of chunk_1 and extend it into chunk_2.

    Each trait of chunk_1 is joined with the chunk_2 trait at the smallest
    `dist`. When that distance is at least 2 * avg_height / 3, the line is
    instead continued horizontally: a new Trait with the candidate's x-range
    and the chunk_1 trait's row is used. Traits of chunk_2 that were never
    recorded as used each open an additional Line.

    return: list of created Line objects; [] when either chunk equals []
    """
    if chunk_1 == [] or chunk_2 == []:
        return []
    all_inds_from_chunk2 = [i for i in range(len(chunk_2))]
    used_traits_from_chunk2 = []
    created_lines = []
    for trait_1 in chunk_1:
        minimum = sys.maxsize
        # Placeholder; replaced in the first inner iteration as soon as a
        # distance below sys.maxsize is seen.
        min_trait = Trait(trait_1.x_1, trait_1.x_2, 100)
        for j, trait_2 in enumerate(chunk_2):
            # keep the closest chunk_2 trait seen so far (first one wins ties)
            if trait_1.dist(trait_2) <  minimum:
                minimum = trait_1.dist(trait_2)
                min_trait = trait_2
            # best candidate so far is too far away: continue horizontally
            if minimum >= 2 * avg_height / 3:
                min_trait = Trait(min_trait.x_1, min_trait.x_2, trait_1.y_1)
                
            # NOTE(review): this check runs on every iteration, so a trait that
            # was only temporarily the best candidate is also recorded as used
            # and will not open its own line below -- confirm this is intended.
            if (min_trait.x_1 == trait_2.x_1) and (min_trait.x_2 == trait_2.x_2) \
                and (min_trait.y_1 == trait_2.y_1):
                used_traits_from_chunk2.append(j)
        new_line = Line()
        new_line.continue_line(trait_1)
        new_line.continue_line(min_trait)
        new_line.last_trait = min_trait
        created_lines.append(new_line)
    unused_traits = list(set(all_inds_from_chunk2) - set(used_traits_from_chunk2))

    # Every chunk_2 trait that was not recorded as used opens a new line that
    # starts at the left border of chunk_1, on the trait's own row.
    for trait in unused_traits:
        chunk = chunk_2[trait]
        new_line = Line()
        start_trait = Trait(chunk_1[0].x_1, chunk.x_1, chunk.y_1)
        new_line.continue_line(start_trait)
        new_line.continue_line(chunk)
        new_line.last_trait = chunk
        created_lines.append(new_line)
    return created_lines

def connect_start_lines_with_next_chunks(lines, chunk, avg_height):
    """
    Extend the existing lines into `chunk` and open new lines for its unused traits.

    Each line is joined with the chunk trait at the smallest `dist` from the
    line's last trait. When that distance is at least avg_height / 2, the line
    is continued horizontally instead (new Trait with the candidate's x-range
    and the last trait's row). Chunk traits that were never recorded as used
    open a new line starting at x = 0 on their own row.

    lines : list of Line objects, modified in place (extended and appended to)
    chunk : list of Trait objects; an empty chunk leaves `lines` untouched
    """
    # An empty chunk has nothing to connect to. Without this guard every line
    # was extended with a placeholder trait at row 100.
    if len(chunk) == 0:
        return

    used_traits_from_chunk = set()
    for line in lines:
        trait_1 = line.last_trait
        min_trait = None
        minimum = sys.maxsize
        for j, trait_2 in enumerate(chunk):
            distance = trait_1.dist(trait_2)
            # keep the closest trait seen so far (first one wins ties)
            if distance < minimum:
                minimum = distance
                min_trait = trait_2
            # best candidate so far is too far away: continue horizontally
            if minimum >= avg_height / 2.0:
                min_trait = Trait(min_trait.x_1, min_trait.x_2, trait_1.y_1)
            # NOTE(review): evaluated on every iteration, so a trait that was
            # only temporarily the best candidate is also recorded as used --
            # kept as in the original, confirm this is intended.
            if (min_trait.x_1 == trait_2.x_1) and (min_trait.x_2 == trait_2.x_2) \
                    and (min_trait.y_1 == trait_2.y_1):
                used_traits_from_chunk.add(j)
        line.continue_line(min_trait)
        line.last_trait = min_trait

    # Remaining traits start their own line from the left image border.
    for i, trait in enumerate(chunk):
        if i in used_traits_from_chunk:
            continue
        start_trait = Trait(0, trait.x_1, trait.y_1)
        new_line = Line()
        new_line.continue_line(start_trait)
        new_line.continue_line(trait)
        new_line.last_trait = trait
        lines.append(new_line)
        
def connect_lines_with_chunk(lines, chunk, avg_height):
    """
    Extend every existing line into the given chunk.

    Each line is joined with the chunk trait closest to its last trait. When
    even that trait is at least avg_height away, the line is continued
    horizontally across the chunk instead. Nothing happens for an empty chunk.
    """
    for line in lines:
        tail = line.last_trait
        if len(chunk) == 0:
            return
        # first trait with the smallest distance wins
        nearest = min(chunk, key=tail.dist)
        if tail.dist(nearest) >= avg_height / 1:
            nearest = Trait(nearest.x_1, nearest.x_2, tail.y_1)
        line.continue_line(nearest)
        line.last_trait = nearest
        
def cut_line(image, line_1, line_2):
    """
    Cut the text row enclosed between two separator lines out of the image.

    Everything above line_1 and everything below line_2 is painted white on a
    copy of the image; the result is cropped to the bounding box of the
    remaining pixels equal to 0. The uncropped row is also displayed with
    show_gray().

    image : 2-D array (the code treats value 0 as ink and writes 255 as blank)
    line_1 : upper separator, object whose `data` is a list of points with x, y
    line_2 : lower separator, same structure
    return: cropped copy of the row, or None when no pixel equal to 0 is left
    """
    n, m = image.shape
    img = image.copy()
    x_list = []
    # blank everything above the upper separator, segment by segment
    for point, next_point in zip(line_1.data, line_1.data[1:]):
        img[:point.y, point.x:next_point.x] = 255
        x_list.append(point.x)
    # blank everything below the lower separator
    for point, next_point in zip(line_2.data, line_2.data[1:]):
        img[point.y:n, point.x:next_point.x] = 255
        x_list.append(point.x)
    # NOTE(review): x_list holds segment *start* coordinates only, so this
    # blanks the image from the start of the right-most segment onwards, i.e.
    # including that last segment. Looks unintended -- confirm before changing.
    max_x = max(x_list)
    img[:, max_x:m] = 255

    inds = np.argwhere(img == 0)
    if len(inds) == 0:
        return None
    row_min, col_min = inds.min(axis=0)
    row_max, col_max = inds.max(axis=0)
    # Slice ends are exclusive: +1 keeps the last row / column that holds ink
    # (previously it was cut off).
    show_gray(img[row_min:row_max + 1, :])
    return img[row_min:row_max + 1, col_min:col_max + 1]

def cut_lines(image, created_lines):
    """Cut out the text row between every pair of consecutive separator lines."""
    return [
        cut_line(image, upper, lower)
        for upper, lower in zip(created_lines, created_lines[1:])
    ]

def draw_line(image, line):
    """Return a copy of the image with the polyline drawn in a random colour."""
    canvas = image.copy()
    color = [random.randint(0, 255) for _ in range(3)]
    vertices = line.data
    for start, end in zip(vertices, vertices[1:]):
        cv2.line(canvas, (start.x, start.y), (end.x, end.y), color ,3)
    return canvas
    
def draw_lines(image, lines, channels = 2):
    """
    Draw all splitting lines on a copy of the image and display it.

    With channels == 3 the result is also written to "12345.png" and shown
    with show(); otherwise it is shown with show_gray().
    """
    canvas = image.copy()
    for separator in lines:
        canvas = draw_line(canvas, separator)

    if channels != 3:
        show_gray(canvas)
        return
    cv2.imwrite("12345.png", canvas)
    show(canvas)
        
def get_words_from_line(line, min_width = 10, thresh =  100): #400000
    """
    Split one text row into words at sufficiently wide runs of blank columns.

    line : grayscale line (2-D array), or None
    min_width : min space length
    thresh : a piece is kept only when the sum of (1 - pixel / 255) over it
             exceeds this value
    return: list of column slices of `line` ([] when line is None)
    """
    if line is None:
        return []
    n, m = line.shape
    inverted = cv2.bitwise_not(line)
    ink_per_column = np.sum(inverted // 255, axis = 0)

    # indices of the columns whose ink count is zero
    blank_cols = np.where(ink_per_column == 0)[0]

    start = 0  # 0 doubles as "no run of blank columns in progress"
    spaces = []
    for ind, following in zip(blank_cols[:-1], blank_cols[1:]):
        if (ind + 1 == following) and (start == 0):
            start = ind
        elif (ind + 1 < following) and (start != 0):
            # the run ends here; remember it only when it is wide enough
            if (ind - start) >= min_width:
                spaces.append([int(start), int(ind)])
            start = 0

    # [s1, e1, s2, e2, ...] -> [0, s1, e1, s2, ..., m] -> (left, right) pairs
    bounds = np.ravel(spaces)
    bounds = np.insert(bounds, [0, len(bounds)], [0, m])
    bounds = bounds.reshape(len(bounds) // 2, 2)

    words = []
    for left, right in bounds:
        word = line[:, int(left): int(right)]
        print("sum", np.sum(word))
        if np.sum(1 - (word / 255)) > thresh:
            words.append(word)
    return words

def extract_words(lines):
    """Collect the words of all text rows into one flat list, in row order."""
    return [word for line in lines for word in get_words_from_line(line)]

def contours_extraction(img_path, thresh_index):
    """
    Find candidate text regions in a page image and return them as crops.

    A 40-pixel border is removed from every side of the image, the rest is
    binarised (inverted) at `thresh_index`, dilated with a 5x5 kernel and
    searched for contours. get_rectangles_from_contours() reduces the contours
    (sorted by the x of their bounding box) to rectangles; these are drawn on
    the binarised image, which is displayed, and cut out of the colour image.

    img_path : path of the image file
    thresh_index : threshold passed to cv2.threshold
    return: list of colour crops, one independent copy per rectangle
    raises IOError when the image cannot be read
    """
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread signals failure by returning None, not by raising
        raise IOError('Cannot read image: ' + str(img_path))
    h_img, w_img, _ = img.shape
    image = img[40:h_img-40, 40:w_img-40]

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, thresh = cv2.threshold(gray, thresh_index, 255, cv2.THRESH_BINARY_INV)
    kernel = np.ones((5, 5), np.uint8)
    img_dilation = cv2.dilate(thresh, kernel, iterations=1)
    # findContours returns 3 values in OpenCV 3.x and 2 otherwise; the last
    # two are always (contours, hierarchy).
    ctrs, _ = cv2.findContours(img_dilation.copy(), cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)[-2:]
    # sort contours
    sorted_ctrs = sorted(ctrs, key=lambda ctr: cv2.boundingRect(ctr)[0])
    # NOTE(review): the uncropped height/width are passed although the
    # contours come from the cropped image -- kept as in the original.
    resulted_rectangles = get_rectangles_from_contours(sorted_ctrs, h_img, w_img)

    # Show how resulted contours will be selected at image
    for rect in resulted_rectangles:
        cv2.rectangle(thresh, (rect.x, rect.y), (rect.x + rect.w, rect.y + rect.h), (255, 255, 255), 2)
    show(thresh)

    # `image` was not drawn on (rectangles went onto `thresh`), so it can be
    # cut directly; copying only the crop avoids one full-image copy per word.
    return [
        image[rect.y:rect.y + rect.h, rect.x:rect.x + rect.w].copy()
        for rect in resulted_rectangles
    ]
    
def resulted_function(img_path, author, path_to_save):
    """
    Extract word images from a page photo and save them as PNG files.

    Every region returned by contours_extraction() is binarised; regions whose
    white-pixel share is within [0.7, 0.95] are processed further. When
    h >= w / 5 or the region is at most 1/15 of the page width, it is passed
    to prepare_binary_contour() and each returned piece at least a quarter as
    high as the region is saved as `<author>_word_<i>_<k>.png`. In every other
    case, and whenever splitting fails, the whole binarised region is saved as
    `<author>_word_<i>.png`.

    img_path : path of the page photo
    author : label used as file-name prefix
    path_to_save : existing directory that receives the PNG files
    return: number of distinct files written
    """
    img = cv2.imread(img_path)
    h_img, w_img, _ = img.shape
    # Get optimal binarization index
    res_index = compare_thresh_indexes(img)
    print('res_index ' + str(res_index))

    prefix = path_to_save + '/' + author + '_' + 'word_'
    written_paths = set()

    def save(path, image):
        # single place that writes a file and remembers what was written
        cv2.imwrite(path, image)
        written_paths.add(path)

    contours = contours_extraction(img_path, res_index)
    for i, contour in enumerate(contours):
        h, w, _ = contour.shape
        binary_contour = get_thresh_image(contour, res_index)
        percent = percent_of_white_pixels_word(binary_contour)
        if not 0.7 <= percent <= 0.95:
            continue

        print('contour № ' + str(i))
        show(binary_contour)
        contour_path = prefix + str(i) + '.png'
        try:
            if h >= w / 5 or w <= w_img / 15:
                words = prepare_binary_contour(binary_contour)
                for k, word in enumerate(words):
                    try:
                        h_lw, w_lw = word.shape
                        if h_lw >= h / 4:
                            # '_' between the indices keeps names unique: with
                            # plain concatenation (1, 12) and (11, 2) -- and the
                            # whole-contour file of region 112 -- all collided.
                            save(prefix + str(i) + '_' + str(k) + '.png', word)
                    except Exception:
                        # piece could not be handled: keep the whole region
                        save(contour_path, binary_contour)
            else:
                save(contour_path, binary_contour)
        except (ZeroDivisionError, statistics.StatisticsError):
            # splitting found too little structure: keep the whole region
            save(contour_path, binary_contour)

    print(len(contours))
    return len(written_paths)

def create_csv_file(words_path, words_csv):
    """
    Write a CSV that lists the word images of a directory with their labels.

    The label of a file is the part of its name in front of the first
    '_word' (more precisely: everything up to one character before the first
    'word'). Directory entries without such a prefix are skipped. Rows are
    sorted by file name so the CSV does not depend on the listing order of
    the file system.

    words_path : directory with files named `<label>_word_<...>`
    words_csv : path of the CSV to write (columns: file_name, label)
    """
    names = []
    authors = []
    for word_file in sorted(os.listdir(words_path)):
        marker = word_file.find('word')
        # marker == -1: no 'word' in the name; marker == 0: nothing in front
        # of it. Both used to produce a meaningless label.
        if marker < 1:
            continue
        names.append(word_file)
        authors.append(word_file[:marker - 1])
    pd.DataFrame({"file_name": names, "label": authors}) \
            .to_csv(words_csv, index=False, header=True, columns = ["file_name", "label"])

### This function should be used to classify:

In [76]:
def classify_photo(path_to_photo, train_words, test_words, train_csv, test_csv, cache_dir, embeddings, 
                  weights, names_xlsx):
    """
    Identify the writer of a handwriting photo and print the matching author names.

    path_to_photo : image to classify
    train_words, test_words : directories with word images; the words cut
        from the photo are written into test_words
    train_csv, test_csv : CSV files (re)generated from the two directories
    cache_dir : directory handed to TripletModel, created when missing
    embeddings : embeddings file, loaded and then regenerated from train_words
    weights : model weights file
    names_xlsx : Excel workbook whose sheet 'Лист1' has the columns
        'Identifier' and 'Author'
    """
    img_name = 'author' + str(random.randint(0, 100000))
    # The extracted words must land in the directory that is indexed below
    # (previously an undefined name was used here).
    resulted_function(path_to_photo, img_name, test_words)

    create_csv_file(test_words, test_csv)
    create_csv_file(train_words, train_csv)

    try:
        os.mkdir(cache_dir)
    except OSError:
        # Usually the directory already exists; report anything else but keep
        # going, as before.
        if not os.path.isdir(cache_dir):
            print('Could not create cache directory: ' + str(cache_dir))

    model = TripletModel(alpha=0.75, input_shape=(160, 160, 3), cache_dir=cache_dir)
    model.load_weights(weights)
    model.load_embeddings(embeddings)
    model.make_embeddings(train_words, train_csv, embeddings, batch_size=1)

    # NOTE(review): `author_tested` is not a parameter and is not defined in
    # the visible part of the file; it must exist as a global. Presumably it
    # should be `img_name` -- confirm against TripletModel.predict.
    result = model.predict(test_words, test_csv, author_tested, batch_size=1)

    # Use the parameter (the body used to read a global with a similar name).
    sheet = pd.read_excel(names_xlsx, sheet_name=None)['Лист1']
    d = {}
    for identifier, author_name in zip(sheet['Identifier'], sheet['Author']):
        if identifier in result:
            d[identifier] = author_name

    for author_name in d.values():
        print(author_name)

In [57]:
# Example configuration: every path lives below the 'Test photo' directory.

# Photo whose writer should be identified
path_to_photo = 'Test photo/test_photo.png'

# Directories with word images; the words cut from the photo go to test_words
train_words = 'Test photo/train_words'
test_words = 'Test photo/test_words'

# CSV indexes that classify_photo generates from the two directories
train_csv = 'Test photo/train.csv'
test_csv = 'Test photo/test.csv'

# Directory passed to TripletModel as cache_dir (created when missing)
cache_dir = 'Test photo/triplet_cache'

# Embeddings file and model weights loaded by classify_photo
embeddings = 'Test photo/embeddings.pkl'
weights = 'Test photo/final_weights.h5'

# Excel workbook read for the 'Identifier' and 'Author' columns
names_xls = 'Test photo/Autors_and_numbers.xlsx'

classify_photo(path_to_photo, train_words, test_words, train_csv, test_csv, cache_dir, embeddings, 
                  weights, names_xls)