# Writing Style Feature Extraction

Derived from  https://github.com/Malemm/ml-graphology

In [82]:
%matplotlib inline 
import numpy as np
import cv2
import math
import glob
import pandas as pd
from matplotlib import pyplot as plt
import os


In [None]:
IAM_DATA_LOCATION = '/data/iam_data'
path_data = f'{IAM_DATA_LOCATION}/lines'
path_result = f'{IAM_DATA_LOCATION}/iam_data/'
path_image = 'f'{IAM_DATA_LOCATION}/lines'
iam_xml_data = 'f'{IAM_DATA_LOCATION}/xml'
ontology_data = '../data/transcripts'

In [None]:
ANCHOR_POINT = 6000
IMG_THRESHOLD = 200
MIDZONE_THRESHOLD = 15000
MIN_HANDWRITING_HEIGHT_PIXEL = 20

In [None]:
''' The lines have shaded boxes around the letters.  Remove them '''
def removeNoise(img):
  if len(img.shape) == 3:
     gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
  else:
     gray = img
  th, threshed = cv2.threshold(gray, 70, 255, cv2.THRESH_BINARY_INV|cv2.THRESH_OTSU)
  if len(img.shape) == 3:
     threshed = cv2.cvtColor(threshed, cv2.COLOR_GRAY2BGR)
     threshed_marker = threshed>0
     return (img*threshed_marker) + (255-threshed)
  threshed_marker = threshed>0
  return (img*threshed_marker) + (255-threshed)

''' function for median filtering '''
def medianFilter(image, d):
  image = cv2.medianBlur(image,d)
  return image

''' function for INVERTED binary threshold '''  
def threshold(image, t=IMG_THRESHOLD):
  image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
  ret,image = cv2.threshold(image,t,255,cv2.THRESH_BINARY_INV)
  return image

''' function for dilation of objects in the image '''
def dilate(image, kernalSize):
  kernel = np.ones(kernalSize, np.uint8)
  image = cv2.dilate(image, kernel, iterations=1)
  return image
  
''' function for erosion of objects in the image '''
def erode(image, kernalSize):
  kernel = np.ones(kernalSize, np.uint8)
  image = cv2.erode(image, kernel, iterations=1)
  return image

def widthandheight(img):
    a = np.where(img != 0)
    if len(a[0]) == 0:
        return 0, 0, 0, 0
    bbox = np.min(a[0]), np.max(a[0]), np.min(a[1]), np.max(a[1])
    h, w = bbox[1] - bbox[0], bbox[3] - bbox[2]
    return bbox[2], bbox[0], w, h

def boundingRegion(img):
    x, y, w, h = widthandheight(img)
    return (x, y), (x + w, y + h)

def show_file(test_imd):
    img = cv2.imread(test_imd)
    plt.imshow(img, interpolation='nearest')
    plt.show()

In [None]:
def deskew(img):
    SZ = img.shape[0]
    new_img = np.copy(img)
    angles = []
    for i in range(int(img.shape[1]/SZ)):
        sel = img[:,i*SZ:(i+1)*SZ]
        m = cv2.moments(sel)
        if abs(m['mu02']) < 1e-2:
            angles.append(0)
        skew = m['mu11']/m['mu02']
        M = np.float32([[1, skew, -0.5*SZ*skew], [0, 1, 0]])
        angles.append(skew)
        new_img[:,i*SZ:(i+1)*SZ] = cv2.warpAffine(sel, M, (sel.shape[1],sel.shape[0]), flags=cv2.WARP_INVERSE_MAP | cv2.INTER_LINEAR)
    
    return new_img, 1.0 - np.mean(angles)

In [None]:

  
''' function for finding contours and straightening them horizontally. Straightening a line ( or lines)
    yields better results with horizontal projections. '''
def straighten(image):
  
  angle = 0.0
  angle_sum = 0.0
  contour_count = 0
  
  # these four variables are not being used, please ignore
  positive_angle_sum = 0.0 #downward
  negative_angle_sum = 0.0 #upward
  positive_count = 0
  negative_count = 0
  
  image = np.copy(image)

  filtered = removeNoise(image)

  # convert to grayscale and binarize the image by INVERTED binary thresholding
  thresh = threshold(filtered)
  #cv2.imshow('thresh',thresh)
  
  # dilate the handwritten lines in image with a suitable kernel for contour operation
  dilated = dilate(thresh, (5 ,100))
  
  ctrs,hier = cv2.findContours(dilated.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
  
  for i, ctr in enumerate(ctrs):
    x, y, w, h = cv2.boundingRect(ctr)
    
    # insure the contour is not a line if height > width or height is < 20 pixels. Here 20 is arbitrary.
    if h>w or h<MIN_HANDWRITING_HEIGHT_PIXEL:
      continue
    
    #  extract the region of interest/contour to be straightened.
    roi = image[y:y+h, x:x+w]

    # minAreaRect is necessary for straightening
    rect = cv2.minAreaRect(ctr)
    center = rect[0]
    angle = rect[2]

    # Author's original comment was not useful
    if angle < -45.0:
      angle += 90.0;
    rot = cv2.getRotationMatrix2D(((x+w)/2,(y+h)/2), angle, 1)
    try:
       extract = cv2.warpAffine(roi, rot, (w,h), borderMode=cv2.BORDER_CONSTANT, borderValue=(255,255,255))
    except Exception as ex:
        print(ex)
        continue

    # image is overwritten with the straightened contour
    image[y:y+h, x:x+w] = extract

    angle_sum += angle
    contour_count += 1
  
  # mean angle of the contours (not lines) is found
  mean_angle = angle_sum / contour_count
  #print ("Average baseline angle: "+str(mean_angle))
  return image,mean_angle

''' function to calculate horizontal projection of the image pixel rows and return it '''
def horizontalProjection(img):
    # Return a list containing the sum of the pixels in each row
     return np.sum(img,axis=1)/255
  
''' function to calculate vertical projection of the image pixel columns and return it '''
def verticalProjection(img):
    # Return a list containing the sum of the pixels in each column
    (h, w) = img.shape[:2]
    return np.sum(img,axis=0)/255


In [None]:

''' function to boundary of line of handwritten text '''
def extractLineAverageLetterSize(img):
    
  filtered = removeNoise(img)
  
  # convert to grayscale and binarize the image by INVERTED binary thresholding
  # it's better to clear unwanted dark areas at the document left edge and use a high threshold value to preserve more text pixels
  thresh = threshold(filtered)

  # extract a list containing values of the vertical projection of the image 
  vp = verticalProjection(thresh)

  boundaries = boundingRegion(thresh)
    
  lines = [(boundaries[0][0],boundaries[1][1])]

  low_value = np.quantile(vp,0.2)

  average_letter_size = np.mean(vp[vp>low_value]) 

  squished = thresh[:,vp>low_value]

  tops  = [np.where(squished[:,i])[0][0] for i in range(squished.shape[1])]

  relative_top_margin =  np.mean(tops)/squished.shape[0]
     
  return lines, relative_top_margin, average_letter_size, squished


In [None]:
  
''' function to extract words from the lines using vertical projection '''
def extractWords(image, lines, letter_size):
  
  # removed any 'boxes' an other shades of noise around letters
  filtered = removeNoise(image)
  
  # convert to grayscale and binarize the image by INVERTED binary thresholding
  thresh = threshold(filtered)
  #cv2.imshow('thresh', wthresh)
  
  # Width of the whole document is found once.
  width = thresh.shape[1]
  height = thresh.shape[0]
  words = [] # a 2D list storing the coordinates of each word: y1, y2, x1, x2

  # Isolated words or components will be extacted from each line by looking at occurance of 0's in its vertical projection.
  for i, line in enumerate(lines):
    extract = thresh[line[0]:line[1], 0:width] # y1:y2, x1:x2
    vp = verticalProjection(extract)
    #print(vp.shape)

    wordStart = 0
    wordEnd = 0
    setWordStart = True

    space_threshold = np.quantile(vp,0.03)
   # print(space_threshold)
    # we are scanning the vertical projection
    for j, sum in enumerate(vp):
      if(sum<=space_threshold):
        if not setWordStart:
            setWordStart = True
            words.append([line[0], line[1], wordStart, j-1])
      elif setWordStart:
        wordStart = j
        setWordStart = False
    
    
  new_words = []
  combine_words = []
  for i in range(len(words)):
        word = words[i]
        next_space  = words[i+1][2] - word[3] if i != (len(words)-1) else None
        if (word[3] - word[2]) < height/1.2 and next_space is not None and next_space < height/2:
            combine_words.append(word)
        else:
            if len(combine_words)> 0: 
                new_words.append((combine_words[0][0],combine_words[0][1],combine_words[0][2],word[3]))
                combine_words = []
            else:
                new_words.append(word)
                
  if len(combine_words) > 0:
      new_words.append((combine_words[0][0],combine_words[0][1],combine_words[0][2],word[3]))
                
  spaces = [(words[i+1][2] - words[i][3]) for i in range(len(words)-1)]
  space_columns = np.sum(spaces)
  space_count = len(spaces)
  if(space_count == 0):
    space_count = 1
  average_word_spacing = float(space_columns) / space_count
  relative_word_spacing = average_word_spacing / letter_size
  #print "Average word spacing: "+str(average_word_spacing)
  #print ("Average word spacing relative to average letter size: "+str(relative_word_spacing))
  return new_words, relative_word_spacing

In [None]:
''' function to determine the average slant of the handwriting '''
def extractSlant(img, wordCoordinates):
  
  '''
  0.01 radian = 0.5729578 degree :: I had to put this instead of 0.0 becuase there was a bug yeilding inacurate value which I could not figure out!
  5 degree = 0.0872665 radian :: Hardly noticeable or a very little slant
  15 degree = 0.261799 radian :: Easily noticeable or average slant
  30 degree = 0.523599 radian :: Above average slant
  45 degree = 0.785398 radian :: Extreme slant
  '''
  # We are checking for 9 different values of angle
  #theta = [-0.785398, -0.523599, -0.261799, -0.0872665, 0.01, 0.0872665, 0.261799, 0.523599, 0.785398]
  theta = [-0.785398, -0.523599, -0.436332, -0.349066, -0.261799, -0.174533, -0.0872665, 0, 0.0872665, 0.174533, 0.261799, 0.349066, 0.436332, 0.523599, 0.785398]
  angles = [45,30,15,5,0, -5,-15,-30,-45]
  s_function = np.zeros(9)
  img = threshold(img)

  SZ  = img.shape[0]
  # loop for each value of angle in theta
  for i, angle in enumerate(angles):
    s_temp = 0.0 # overall sum of the functions of all the columns of all the words!
    count = 0 # just counting the number of columns considered to contain a vertical stroke and thus contributing to s_tem
    
    for word in wordCoordinates:
       height = SZ
       word_image = img[:,word[2]:word[3]]
       width = word_image.shape[1]
       #plt.imshow(word_image, interpolation='nearest')
       #plt.show() 
      
        # the distance in pixel we will shift for affine transformation
        # it's divided by 2 because the uppermost point and the lowermost points are being equally shifted in opposite directions
       shift = (math.tan(angle) * height) / 2
      
       # the amount of extra space we need to add to the original image to preserve information
       # yes, this is adding more number of columns but the effect of this will be negligible
       pad_length = abs(int(shift))
      
       # create a new image that can perfectly hold the transformed and thus widened image
       new_image = np.zeros((height,width+pad_length*2), np.uint8)
       new_image[:, pad_length:width+pad_length] = word_image
      
       # points to consider for affine transformation
       (height, width) = new_image.shape[:2]
        
       rot = cv2.getRotationMatrix2D(((width)/2,(height)/2), angle, 1)
       # the approach from the paper, vs using the rotation matrix
       #M = np.float32([[1, skew, -0.5*SZ*skew], [0, 1, 0]])
       try:
           deslanted = cv2.warpAffine(new_image, rot, (width,height), borderMode=cv2.BORDER_CONSTANT, borderValue=(0))
       except Exception as ex:
           print(ex)
           continue
      
       # find the vertical projection on the transformed image
       vp = verticalProjection(deslanted)

       # loop for each value of vertical projection, which is for each column in the word image
       for k, sum in enumerate(vp):
         # the columns is empty
         if(sum == 0):
            continue
        
         # this is the number of foreground pixels in the column being considered
         num_fgpixel = sum

         # if number of foreground pixels is less than onethird of total pixels, it is not a vertical stroke so we can ignore
         if(num_fgpixel < int(height/3)):
            continue
        
         # the column itself is extracted, and flattened for easy operation
         column = deslanted[0:height, k:k+1]
         column = column.flatten()
        
         # now we are going to find the distance between topmost pixel and bottom-most pixel
         # l counts the number of empty pixels from top until and upto a foreground pixel is discovered
         for l, pixel in enumerate(column):
           if(pixel==0):
             continue
           break
         # m counts the number of empty pixels from bottom until and upto a foreground pixel is discovered
         for m, pixel in enumerate(column[::-1]):
           if(pixel==0):
             continue
           break
          
         # the distance is found as delta_y, I just followed the naming convention in the research paper I followed
         delta_y = height - (l+m)
      
         # please refer the research paper for more details of this function, anyway it's nothing tricky
         h_sq = (float(num_fgpixel)/delta_y)**2
        
         # I am multiplying by a factor of num_fgpixel/height to the above function to yeild better result
         # this will also somewhat negate the effect of adding more columns and different column counts in the transformed image of the same word
         h_wted = (h_sq * num_fgpixel) / height

         # add up the values from all the loops of ALL the columns of ALL the words in the image
         s_temp += h_wted
        
         count += 1
   
    s_function[i] = s_temp
  
  #print(s_function)
  # finding the largest value and corresponding index
  max_index = np.argmax(s_function)

  # We will add another value 9 manually to indicate irregular slant behaviour.
  # This will be seen as value 4 (no slant) but 2 corresponding angles of opposite sign will have very close values.
  if(max_index == 0):
    angle = 45
    result =  " : Extremely right slanted"
  elif(max_index == 1):
    angle = 30
    result = " : Above average right slanted"
  elif(max_index == 2):
    angle = 15
    result = " : Average right slanted"
  elif(max_index == 3):
    angle = 5
    result = " : A little right slanted"
  elif(max_index == 5):
    angle = -5
    result = " : A little left slanted"
  elif(max_index == 6):
    angle = -15
    result = " : Average left slanted"
  elif(max_index == 7):
    angle = -30
    result = " : Above average left slanted"
  elif(max_index == 8):
    angle = -45
    result = " : Extremely left slanted"
  elif(max_index == 4):
    try:
        p = s_function[4] / s_function[3]
        q = s_function[4] / s_function[5]
    except:
        p = 0
        q = 0
    #print 'p='+str(p)+' q='+str(q)
    # the constants here are abritrary but I think suits the best
    if((p <= 1.2 and q <= 1.2) or (p > 1.4 and q > 1.4)):
      angle = 0
      result = " : No slant"
    elif((p <= 1.2 and q-p > 0.4) or (q <= 1.2 and p-q > 0.4)):
      angle = 0
      result = " : No slant"
    else:
      max_index = 9
      angle = 180
      result =  " : Irregular slant behaviour"
  return angle

''' function to extract average pen pressure of the handwriting '''
def barometer(image):

  # it's extremely necessary to convert to grayscale first
  image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
  # inverting the image pixel by pixel individually. This costs the maximum time and processing in the entire process!
  h, w = image.shape[:]
  inverted = image
  for x in range(h):
    for y in range(w):
      inverted[x][y] = 255 - image[x][y]
  
  # binary thresholding. Here we use 'threshold to zero' which is crucial for what we want.
  # If src(x,y) is lower than threshold=100, the new pixel value will be set to 0, else it will be left untouched!
  ret, thresh = cv2.threshold(filtered, 100, 255, cv2.THRESH_TOZERO)
  
  # add up all the non-zero pixel values in the image and divide by the number of them to find the average pixel value in the whole image
  total_intensity = 0
  pixel_count = 0
  for x in range(h):
    for y in range(w):
      if(thresh[x][y] > 0):
        total_intensity += thresh[x][y]
        pixel_count += 1
        
  average_intensity = float(total_intensity) / pixel_count


  return average_intensity

In [None]:

''' main '''
def run_analysis(file_name, show=[]):

    file_basename = os.path.basename(file_name)
    # read image from disk
    
    image = cv2.imread(file_name)
        #image = cv2.imread('images/'+file_name)
        #cv2.imshow(file_name,image)

        # Extract pen pressure. It's such a cool function name!
    pen_pressure  = barometer(image)

    # apply contour operation to straighten the contours  
    # the returned image is straightened version of the original image without filtration and binarization
    straightened, baseline_angle = straighten(image)
    if 'straighten' in show:
       print(('straighten', baseline_angle))
       plt.imshow(img, interpolation='nearest')
       plt.show()
    
    lineIndices, top_margin, letter_size, squished = extractLineAverageLetterSize(image)
    if 'extractLineAverageLetterSize' in show:
       print(('extractLineAverageLetterSize', lineIndices, top_margin, letter_size))
       plt.imshow(squished, interpolation='nearest')
       plt.show()
    
        # extract words from each line using vertical projection
        # it returns a 4D list of the vertical starting and ending indices and horizontal starting and ending indices (in that order) of each word in the handwriting
    wordCoordinates, word_spacing = extractWords(straightened, lineIndices, letter_size)
    if 'extractWords' in show:
       print(('extractWords', wordCoordinates, word_spacing))
        
    deskwed_image, skew_angle = deskew(squished)
    if 'deskew' in show:
        print(('skew',skew_angle))
        plt.imshow(deskwed_image, interpolation='nearest')
        plt.show()

    slant_angle = extractSlant(straightened, wordCoordinates)
    if 'extractSlant' in show:
        print(('extractSlant',slant_angle))
    
    baseline_angle = round(baseline_angle, 2)
    skew_angle = round(skew_angle, 2)
    top_margin = round(top_margin, 2)
    letter_size = round(letter_size, 2)
    word_spacing = round(word_spacing, 2)
    pen_pressure = round(pen_pressure, 2)
    slant_angle = round(slant_angle, 2)

    return [baseline_angle, top_margin, letter_size, word_spacing, pen_pressure, slant_angle,skew_angle,file_basename]
    


In [None]:

def pen_pressure_score(im, debug = False):
    kernel = np.ones((5,5),np.uint8)
    # black is pen, white is paper
    #im = im.astype(np.uint8)
    im_i = 255 - im
    ret, th = cv2.threshold(im, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    th_i = 255 - th
    if debug:
        cv2.imwrite(fn.replace('.png', '.th.png'), th_i)
    th_m = cv2.morphologyEx(th_i, cv2.MORPH_CLOSE, kernel)
    return 1.0 - np.sum((th_m * im_i).astype(np.float) / 255.0) / np.sum(th_m / 255.0), \
           1.0 - np.sum((th_m*th_i).astype(np.float)/255.0)/np.sum(th_m/255.0)


In [None]:
# create a list of all writing images
files = glob.glob(path_data+ '/**/*.png',recursive=True)
len(files)

In [None]:
#Pen pressure scores
score_list = []
for file in files:
    im = cv2.imread(file, cv2.IMREAD_GRAYSCALE).astype(np.uint8)
    score_list.append((os.path.basename(file),) + pen_pressure_score(im))

In [None]:
my_list = {}

In [None]:
#writing attributes
for file in files:
    if os.path.basename(file) not in my_list:
        try:
            #show_file(file)
            my_list[os.path.basename(file)] = (run_analysis(file, show=['']))
        except Exception as ex:
            show_file(file)
            print(file)
            print (ex)
            raise(ex)

In [None]:
import xmltodict
writers_to_file_id = {}
path_to_files = os.path.join('iam_xml_data','*.xml')
for filename in sorted(glob.glob(path_to_files, recursive= True)):
    with open(filename) as fd:
        doc = xmltodict.parse(fd.read())
        writers_to_file_id[os.path.splitext(os.path.basename(filename))[0]] = doc['form']['@writer-id']

In [87]:
#Generate parameter file to be used for the difficulty measurement    
df0=pd.DataFrame(my_list.values(),columns =['BASELINE_ANGLE', 'TOP_MARGIN', 'LETTER_SIZE', 
                                            'WORD_SPACING', 'PEN_PRESSURE', 
                                            'SLANT_ANGLE','SKEW_ANGLE','file'])
df_pps=pd.DataFrame(score_list,columns=['file','pen_pressure_0','pen_pressure_1'])
df0['file'] = df0.file.astype(str)
df_pps['file'] = df_pps.file.astype(str)
df = df0.join(df_pps.set_index('file'), on='file')
df['writer']      = df['file'].apply(lambda x : writers_to_file_id['-'.join(x.split('-')[:-1])])
df['angle_norm'] = df['BASELINE_ANGLE']/np.max(df['BASELINE_ANGLE'])
df['letter_size_norm'] = (df['LETTER_SIZE']-np.min(df['LETTER_SIZE']))/(np.max(df['LETTER_SIZE'])-np.min(df['LETTER_SIZE']))
df['slant_angle_norm'] = (df['SLANT_ANGLE']-np.min(df['SLANT_ANGLE']))/(np.max(df['SLANT_ANGLE'])-np.min(df['SLANT_ANGLE']))
df['word_spacing_norm'] = (df['WORD_SPACING']-np.min(df['WORD_SPACING']))/(np.max(df['WORD_SPACING'])-np.min(df['WORD_SPACING']))
df['pen_pressure_norm'] = (df['PEN_PRESSURE']-np.min(df['PEN_PRESSURE']))/(np.max(df['PEN_PRESSURE'])-np.min(df['PEN_PRESSURE']))
df.to_csv(os.path.join(path_result,'iam_parameters.csv'),index=False)
df

Unnamed: 0,BASELINE_ANGLE,TOP_MARGIN,LETTER_SIZE,LINE_SPACING,WORD_SPACING,PEN_PRESSURE,SLANT_ANGLE,SKEW_ANGLE,file,pen_pressure_0,pen_pressure_1,writer,angle_norm,letter_size_norm,slant_angle_norm,word_spacing_norm,pen_pressure_norm
0,0.00,0.33,16.19,0,1.13,255.0,0,0.84,b06-012-00.png,0.557651,0.996304,126,0.000000,0.232323,0.200000,0.088627,1.0
1,-0.35,0.34,16.29,0,0.91,255.0,180,0.90,b06-012-01.png,0.540414,0.996357,126,-0.035105,0.234787,1.000000,0.071373,1.0
2,-0.13,0.33,17.17,0,0.96,255.0,180,0.88,b06-012-02.png,0.551377,0.996346,126,-0.013039,0.256467,1.000000,0.075294,1.0
3,-0.22,0.38,17.81,0,1.06,255.0,180,0.70,b06-012-03.png,0.567585,0.996387,126,-0.022066,0.272235,1.000000,0.083137,1.0
4,0.00,0.32,16.29,0,1.39,255.0,0,0.66,b06-012-04.png,0.529823,0.996324,126,0.000000,0.234787,0.200000,0.109020,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12896,0.00,0.22,20.68,0,0.74,255.0,-30,0.93,d06-100-03.png,0.528131,0.996251,207,0.000000,0.342942,0.066667,0.058039,1.0
12897,0.84,0.28,19.73,0,0.67,255.0,-15,1.15,d06-100-04.png,0.537016,0.996223,207,0.084253,0.319537,0.133333,0.052549,1.0
12898,0.00,0.36,18.95,0,0.70,255.0,-15,0.95,d06-100-05.png,0.542985,0.996262,207,0.000000,0.300320,0.133333,0.054902,1.0
12899,0.00,0.28,17.78,0,1.16,255.0,-30,0.95,d06-100-06.png,0.538419,0.996246,207,0.000000,0.271495,0.066667,0.090980,1.0


In [95]:
#Write out the writer profile
params = ['letter_size_norm',  'word_spacing_norm', 'pen_pressure_norm', 'slant_angle_norm','angle_norm']
writer_summary    = df.groupby('writer')[params].agg(['mean', 'std','min','max'])
writer_summary

Unnamed: 0_level_0,letter_size_norm,letter_size_norm,letter_size_norm,letter_size_norm,word_spacing_norm,word_spacing_norm,word_spacing_norm,word_spacing_norm,pen_pressure_norm,pen_pressure_norm,pen_pressure_norm,pen_pressure_norm,slant_angle_norm,slant_angle_norm,slant_angle_norm,slant_angle_norm,angle_norm,angle_norm,angle_norm,angle_norm
Unnamed: 0_level_1,mean,std,min,max,mean,std,min,max,mean,std,min,max,mean,std,min,max,mean,std,min,max
writer,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2
000,0.147719,0.046463,0.044839,0.329391,0.215860,0.061387,0.023529,0.525490,1.0,0.0,1.0,1.0,0.348263,0.421613,0.000000,1.000000,0.011884,0.073550,-0.227683,0.332999
001,0.221395,0.042316,0.164080,0.310668,0.199272,0.044968,0.113725,0.282353,1.0,0.0,1.0,1.0,0.293651,0.299943,0.133333,1.000000,0.020203,0.058966,-0.060181,0.162487
002,0.102869,0.024427,0.052722,0.151762,0.224742,0.072237,0.039216,0.312157,1.0,0.0,1.0,1.0,0.400000,0.000000,0.400000,0.400000,-0.063007,0.113424,-0.334002,0.017051
003,0.198745,0.019873,0.168268,0.243656,0.140161,0.027752,0.100392,0.200784,1.0,0.0,1.0,1.0,0.275817,0.029459,0.222222,0.333333,0.001180,0.107979,-0.233701,0.250752
004,0.184419,0.036419,0.131313,0.242424,0.216906,0.044316,0.144314,0.281569,1.0,0.0,1.0,1.0,0.288889,0.033333,0.266667,0.333333,-0.031093,0.054160,-0.161484,0.011033
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,0.247211,0.021995,0.217788,0.282582,0.116975,0.006411,0.109020,0.123922,1.0,0.0,1.0,1.0,0.571429,0.407080,0.200000,1.000000,0.009314,0.015972,0.000000,0.035105
668,0.149618,0.018945,0.119980,0.185760,0.223451,0.059673,0.138824,0.359216,1.0,0.0,1.0,1.0,0.286667,0.044997,0.266667,0.400000,0.017452,0.024059,0.000000,0.061184
669,0.294715,0.037435,0.263612,0.345898,0.157451,0.043576,0.118431,0.211765,1.0,0.0,1.0,1.0,0.566667,0.500370,0.133333,1.000000,0.039619,0.048064,0.000000,0.097292
670,0.240687,0.031117,0.141661,0.365607,0.182296,0.056372,0.011765,0.338824,1.0,0.0,1.0,1.0,0.276620,0.133445,0.200000,1.000000,0.028157,0.169453,-0.530592,0.870612


In [94]:
param_discrete = {}
for p in params:
    param_discrete[p] = np.histogram(writer_summary[p],bins=(3 if p != 'slant_angle_norm' else 9))[1]
    writer_summary[p + '_discrete'] = writer_summary[p]['mean'].apply(lambda x : np.argmax(x < param_discrete[p]))
  
writer_summary.to_csv(os.path.join(path_result,'writer_profile.csv'),index=False)

In [103]:
writer_summary

Unnamed: 0_level_0,letter_size_norm,letter_size_norm,letter_size_norm,letter_size_norm,word_spacing_norm,word_spacing_norm,word_spacing_norm,word_spacing_norm,pen_pressure_norm,pen_pressure_norm,pen_pressure_norm,pen_pressure_norm,slant_angle_norm,slant_angle_norm,slant_angle_norm,slant_angle_norm,angle_norm,angle_norm,angle_norm,angle_norm
Unnamed: 0_level_1,mean,std,min,max,mean,std,min,max,mean,std,min,max,mean,std,min,max,mean,std,min,max
writer,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2
000,0.147719,0.046463,0.044839,0.329391,0.215860,0.061387,0.023529,0.525490,1.0,0.0,1.0,1.0,0.348263,0.421613,0.000000,1.000000,0.011884,0.073550,-0.227683,0.332999
001,0.221395,0.042316,0.164080,0.310668,0.199272,0.044968,0.113725,0.282353,1.0,0.0,1.0,1.0,0.293651,0.299943,0.133333,1.000000,0.020203,0.058966,-0.060181,0.162487
002,0.102869,0.024427,0.052722,0.151762,0.224742,0.072237,0.039216,0.312157,1.0,0.0,1.0,1.0,0.400000,0.000000,0.400000,0.400000,-0.063007,0.113424,-0.334002,0.017051
003,0.198745,0.019873,0.168268,0.243656,0.140161,0.027752,0.100392,0.200784,1.0,0.0,1.0,1.0,0.275817,0.029459,0.222222,0.333333,0.001180,0.107979,-0.233701,0.250752
004,0.184419,0.036419,0.131313,0.242424,0.216906,0.044316,0.144314,0.281569,1.0,0.0,1.0,1.0,0.288889,0.033333,0.266667,0.333333,-0.031093,0.054160,-0.161484,0.011033
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,0.247211,0.021995,0.217788,0.282582,0.116975,0.006411,0.109020,0.123922,1.0,0.0,1.0,1.0,0.571429,0.407080,0.200000,1.000000,0.009314,0.015972,0.000000,0.035105
668,0.149618,0.018945,0.119980,0.185760,0.223451,0.059673,0.138824,0.359216,1.0,0.0,1.0,1.0,0.286667,0.044997,0.266667,0.400000,0.017452,0.024059,0.000000,0.061184
669,0.294715,0.037435,0.263612,0.345898,0.157451,0.043576,0.118431,0.211765,1.0,0.0,1.0,1.0,0.566667,0.500370,0.133333,1.000000,0.039619,0.048064,0.000000,0.097292
670,0.240687,0.031117,0.141661,0.365607,0.182296,0.056372,0.011765,0.338824,1.0,0.0,1.0,1.0,0.276620,0.133445,0.200000,1.000000,0.028157,0.169453,-0.530592,0.870612


In [None]:
#writer_summary=read_csv(os.path.join(path_result,'writer_profile.csv'))

import os
import csv
import networkx

from activityrec.ontology import Ontology
from activityrec.ontology import _sub_graph, _distance


In [72]:
ontology = Ontology('ontology_data')

In [69]:
for i in range (len(writer_summary)):
    row = writer_summary.iloc[i]
    writer = ontology.add_check_label(writer_summary.index[i], 'writer-id')
    for param in params:
        dtype = param + '_discrete'
        val = int(row[dtype])
        label = ontology.add_check_label(str(val), dtype)
        ontology.add_edge(label,writer)
ontology.save()

    1.0
Name: 000, dtype: float64
Add  LETTER_SIZE_discrete: 1
    2.0
Name: 000, dtype: float64
Add  LINE_SPACING_discrete: 2
    1.0
Name: 000, dtype: float64
    0.0
Name: 000, dtype: float64
Add  PEN_PRESSURE_discrete: 0
    2.0
Name: 000, dtype: float64
    5.0
Name: 000, dtype: float64
Add  SKEW_ANGLE_discrete: 5
    3.0
Name: 000, dtype: float64
Add  pen_pressure_0_discrete: 3
    3.0
Name: 000, dtype: float64
    1.0
Name: 001, dtype: float64
    2.0
Name: 001, dtype: float64
    1.0
Name: 001, dtype: float64
    0.0
Name: 001, dtype: float64
    1.0
Name: 001, dtype: float64
    5.0
Name: 001, dtype: float64
    2.0
Name: 001, dtype: float64
    3.0
Name: 001, dtype: float64
    1.0
Name: 002, dtype: float64
    2.0
Name: 002, dtype: float64
    1.0
Name: 002, dtype: float64
    0.0
Name: 002, dtype: float64
    2.0
Name: 002, dtype: float64
    5.0
Name: 002, dtype: float64
    3.0
Name: 002, dtype: float64
    3.0
Name: 002, dtype: float64
    1.0
Name: 003, dtype: float64
 

Name: 056, dtype: float64
    1.0
Name: 056, dtype: float64
    0.0
Name: 056, dtype: float64
    1.0
Name: 056, dtype: float64
    5.0
Name: 056, dtype: float64
    2.0
Name: 056, dtype: float64
    3.0
Name: 056, dtype: float64
    2.0
Name: 058, dtype: float64
    2.0
Name: 058, dtype: float64
    1.0
Name: 058, dtype: float64
    0.0
Name: 058, dtype: float64
    1.0
Name: 058, dtype: float64
    5.0
Name: 058, dtype: float64
    3.0
Name: 058, dtype: float64
    3.0
Name: 058, dtype: float64
    2.0
Name: 059, dtype: float64
    2.0
Name: 059, dtype: float64
    1.0
Name: 059, dtype: float64
    0.0
Name: 059, dtype: float64
    1.0
Name: 059, dtype: float64
    5.0
Name: 059, dtype: float64
    3.0
Name: 059, dtype: float64
    3.0
Name: 059, dtype: float64
    1.0
Name: 060, dtype: float64
    2.0
Name: 060, dtype: float64
    1.0
Name: 060, dtype: float64
    0.0
Name: 060, dtype: float64
    1.0
Name: 060, dtype: float64
    5.0
Name: 060, dtype: float64
    2.0
Name: 060, dty

    3.0
Name: 094, dtype: float64
    1.0
Name: 100, dtype: float64
    2.0
Name: 100, dtype: float64
    1.0
Name: 100, dtype: float64
    0.0
Name: 100, dtype: float64
    1.0
Name: 100, dtype: float64
    5.0
Name: 100, dtype: float64
    3.0
Name: 100, dtype: float64
    3.0
Name: 100, dtype: float64
    1.0
Name: 102, dtype: float64
    2.0
Name: 102, dtype: float64
    1.0
Name: 102, dtype: float64
    3.0
Name: 102, dtype: float64
    2.0
Name: 102, dtype: float64
    5.0
Name: 102, dtype: float64
    3.0
Name: 102, dtype: float64
    3.0
Name: 102, dtype: float64
    1.0
Name: 103, dtype: float64
    2.0
Name: 103, dtype: float64
    1.0
Name: 103, dtype: float64
    0.0
Name: 103, dtype: float64
    2.0
Name: 103, dtype: float64
    5.0
Name: 103, dtype: float64
    2.0
Name: 103, dtype: float64
    3.0
Name: 103, dtype: float64
    1.0
Name: 104, dtype: float64
    2.0
Name: 104, dtype: float64
    1.0
Name: 104, dtype: float64
    0.0
Name: 104, dtype: float64
    1.0
Name: 

Name: 144, dtype: float64
    5.0
Name: 144, dtype: float64
    3.0
Name: 144, dtype: float64
    3.0
Name: 144, dtype: float64
    2.0
Name: 145, dtype: float64
    2.0
Name: 145, dtype: float64
    1.0
Name: 145, dtype: float64
    0.0
Name: 145, dtype: float64
    2.0
Name: 145, dtype: float64
    5.0
Name: 145, dtype: float64
    2.0
Name: 145, dtype: float64
    3.0
Name: 145, dtype: float64
    1.0
Name: 146, dtype: float64
    2.0
Name: 146, dtype: float64
    1.0
Name: 146, dtype: float64
    0.0
Name: 146, dtype: float64
    1.0
Name: 146, dtype: float64
    5.0
Name: 146, dtype: float64
    2.0
Name: 146, dtype: float64
    3.0
Name: 146, dtype: float64
    1.0
Name: 147, dtype: float64
    2.0
Name: 147, dtype: float64
    1.0
Name: 147, dtype: float64
    0.0
Name: 147, dtype: float64
    2.0
Name: 147, dtype: float64
    5.0
Name: 147, dtype: float64
    2.0
Name: 147, dtype: float64
    3.0
Name: 147, dtype: float64
    1.0
Name: 148, dtype: float64
    2.0
Name: 148, dty

Name: 184, dtype: float64
    1.0
Name: 185, dtype: float64
    2.0
Name: 185, dtype: float64
    1.0
Name: 185, dtype: float64
    0.0
Name: 185, dtype: float64
    1.0
Name: 185, dtype: float64
    5.0
Name: 185, dtype: float64
    2.0
Name: 185, dtype: float64
    3.0
Name: 185, dtype: float64
    1.0
Name: 186, dtype: float64
    2.0
Name: 186, dtype: float64
    1.0
Name: 186, dtype: float64
    0.0
Name: 186, dtype: float64
    3.0
Name: 186, dtype: float64
    5.0
Name: 186, dtype: float64
    2.0
Name: 186, dtype: float64
    3.0
Name: 186, dtype: float64
    2.0
Name: 187, dtype: float64
    2.0
Name: 187, dtype: float64
    1.0
Name: 187, dtype: float64
    0.0
Name: 187, dtype: float64
    1.0
Name: 187, dtype: float64
    5.0
Name: 187, dtype: float64
    2.0
Name: 187, dtype: float64
    3.0
Name: 187, dtype: float64
    1.0
Name: 188, dtype: float64
    2.0
Name: 188, dtype: float64
    1.0
Name: 188, dtype: float64
    0.0
Name: 188, dtype: float64
    3.0
Name: 188, dty

Name: 225, dtype: float64
    0.0
Name: 225, dtype: float64
    1.0
Name: 225, dtype: float64
    5.0
Name: 225, dtype: float64
    2.0
Name: 225, dtype: float64
    3.0
Name: 225, dtype: float64
    1.0
Name: 226, dtype: float64
    2.0
Name: 226, dtype: float64
    1.0
Name: 226, dtype: float64
    3.0
Name: 226, dtype: float64
    2.0
Name: 226, dtype: float64
    5.0
Name: 226, dtype: float64
    3.0
Name: 226, dtype: float64
    3.0
Name: 226, dtype: float64
    1.0
Name: 227, dtype: float64
    2.0
Name: 227, dtype: float64
    1.0
Name: 227, dtype: float64
    0.0
Name: 227, dtype: float64
    1.0
Name: 227, dtype: float64
    5.0
Name: 227, dtype: float64
    3.0
Name: 227, dtype: float64
    3.0
Name: 227, dtype: float64
    1.0
Name: 228, dtype: float64
    2.0
Name: 228, dtype: float64
    1.0
Name: 228, dtype: float64
    0.0
Name: 228, dtype: float64
    1.0
Name: 228, dtype: float64
    5.0
Name: 228, dtype: float64
    2.0
Name: 228, dtype: float64
    3.0
Name: 228, dty

Name: 265, dtype: float64
    5.0
Name: 265, dtype: float64
    2.0
Name: 265, dtype: float64
    3.0
Name: 265, dtype: float64
    1.0
Name: 266, dtype: float64
    2.0
Name: 266, dtype: float64
    1.0
Name: 266, dtype: float64
    0.0
Name: 266, dtype: float64
    2.0
Name: 266, dtype: float64
    5.0
Name: 266, dtype: float64
    3.0
Name: 266, dtype: float64
    3.0
Name: 266, dtype: float64
    1.0
Name: 267, dtype: float64
    2.0
Name: 267, dtype: float64
    1.0
Name: 267, dtype: float64
    0.0
Name: 267, dtype: float64
    1.0
Name: 267, dtype: float64
    5.0
Name: 267, dtype: float64
    3.0
Name: 267, dtype: float64
    3.0
Name: 267, dtype: float64
    1.0
Name: 268, dtype: float64
    2.0
Name: 268, dtype: float64
    1.0
Name: 268, dtype: float64
    0.0
Name: 268, dtype: float64
    1.0
Name: 268, dtype: float64
    5.0
Name: 268, dtype: float64
    2.0
Name: 268, dtype: float64
    3.0
Name: 268, dtype: float64
    1.0
Name: 269, dtype: float64
    2.0
Name: 269, dty

Name: 308, dtype: float64
    2.0
Name: 309, dtype: float64
    2.0
Name: 309, dtype: float64
    1.0
Name: 309, dtype: float64
    0.0
Name: 309, dtype: float64
    2.0
Name: 309, dtype: float64
    5.0
Name: 309, dtype: float64
    3.0
Name: 309, dtype: float64
    3.0
Name: 309, dtype: float64
    3.0
Name: 310, dtype: float64
    2.0
Name: 310, dtype: float64
    1.0
Name: 310, dtype: float64
    3.0
Name: 310, dtype: float64
    1.0
Name: 310, dtype: float64
    5.0
Name: 310, dtype: float64
    3.0
Name: 310, dtype: float64
    3.0
Name: 310, dtype: float64
    1.0
Name: 312, dtype: float64
    2.0
Name: 312, dtype: float64
    1.0
Name: 312, dtype: float64
    0.0
Name: 312, dtype: float64
    2.0
Name: 312, dtype: float64
    6.0
Name: 312, dtype: float64
    2.0
Name: 312, dtype: float64
    3.0
Name: 312, dtype: float64
    2.0
Name: 313, dtype: float64
    2.0
Name: 313, dtype: float64
    1.0
Name: 313, dtype: float64
    0.0
Name: 313, dtype: float64
    1.0
Name: 313, dty

Name: 350, dtype: float64
    1.0
Name: 350, dtype: float64
    0.0
Name: 350, dtype: float64
    2.0
Name: 350, dtype: float64
    5.0
Name: 350, dtype: float64
    2.0
Name: 350, dtype: float64
    3.0
Name: 350, dtype: float64
    1.0
Name: 351, dtype: float64
    2.0
Name: 351, dtype: float64
    1.0
Name: 351, dtype: float64
    0.0
Name: 351, dtype: float64
    1.0
Name: 351, dtype: float64
    5.0
Name: 351, dtype: float64
    2.0
Name: 351, dtype: float64
    3.0
Name: 351, dtype: float64
    1.0
Name: 352, dtype: float64
    2.0
Name: 352, dtype: float64
    1.0
Name: 352, dtype: float64
    0.0
Name: 352, dtype: float64
    2.0
Name: 352, dtype: float64
    5.0
Name: 352, dtype: float64
    3.0
Name: 352, dtype: float64
    3.0
Name: 352, dtype: float64
    1.0
Name: 353, dtype: float64
    2.0
Name: 353, dtype: float64
    1.0
Name: 353, dtype: float64
    3.0
Name: 353, dtype: float64
    2.0
Name: 353, dtype: float64
    5.0
Name: 353, dtype: float64
    3.0
Name: 353, dty

Name: 394, dtype: float64
    1.0
Name: 394, dtype: float64
    0.0
Name: 394, dtype: float64
    1.0
Name: 394, dtype: float64
    5.0
Name: 394, dtype: float64
    3.0
Name: 394, dtype: float64
    3.0
Name: 394, dtype: float64
    1.0
Name: 395, dtype: float64
    2.0
Name: 395, dtype: float64
    1.0
Name: 395, dtype: float64
    0.0
Name: 395, dtype: float64
    2.0
Name: 395, dtype: float64
    5.0
Name: 395, dtype: float64
    2.0
Name: 395, dtype: float64
    3.0
Name: 395, dtype: float64
    1.0
Name: 396, dtype: float64
    2.0
Name: 396, dtype: float64
    1.0
Name: 396, dtype: float64
    0.0
Name: 396, dtype: float64
    2.0
Name: 396, dtype: float64
    5.0
Name: 396, dtype: float64
    2.0
Name: 396, dtype: float64
    3.0
Name: 396, dtype: float64
    1.0
Name: 397, dtype: float64
    2.0
Name: 397, dtype: float64
    1.0
Name: 397, dtype: float64
    3.0
Name: 397, dtype: float64
    1.0
Name: 397, dtype: float64
    5.0
Name: 397, dtype: float64
    3.0
Name: 397, dty

Name: 435, dtype: float64
    1.0
Name: 435, dtype: float64
    0.0
Name: 435, dtype: float64
    2.0
Name: 435, dtype: float64
    5.0
Name: 435, dtype: float64
    2.0
Name: 435, dtype: float64
    3.0
Name: 435, dtype: float64
    1.0
Name: 436, dtype: float64
    2.0
Name: 436, dtype: float64
    1.0
Name: 436, dtype: float64
    0.0
Name: 436, dtype: float64
    1.0
Name: 436, dtype: float64
    5.0
Name: 436, dtype: float64
    2.0
Name: 436, dtype: float64
    3.0
Name: 436, dtype: float64
    2.0
Name: 439, dtype: float64
    2.0
Name: 439, dtype: float64
    1.0
Name: 439, dtype: float64
    0.0
Name: 439, dtype: float64
    2.0
Name: 439, dtype: float64
    5.0
Name: 439, dtype: float64
    2.0
Name: 439, dtype: float64
    3.0
Name: 439, dtype: float64
    1.0
Name: 440, dtype: float64
    2.0
Name: 440, dtype: float64
    1.0
Name: 440, dtype: float64
    3.0
Name: 440, dtype: float64
    2.0
Name: 440, dtype: float64
    5.0
Name: 440, dtype: float64
    3.0
Name: 440, dty

Name: 477, dtype: float64
    1.0
Name: 477, dtype: float64
    5.0
Name: 477, dtype: float64
    2.0
Name: 477, dtype: float64
    3.0
Name: 477, dtype: float64
    1.0
Name: 478, dtype: float64
    2.0
Name: 478, dtype: float64
    1.0
Name: 478, dtype: float64
    0.0
Name: 478, dtype: float64
    2.0
Name: 478, dtype: float64
    5.0
Name: 478, dtype: float64
    2.0
Name: 478, dtype: float64
    3.0
Name: 478, dtype: float64
    1.0
Name: 479, dtype: float64
    2.0
Name: 479, dtype: float64
    1.0
Name: 479, dtype: float64
    0.0
Name: 479, dtype: float64
    3.0
Name: 479, dtype: float64
    6.0
Name: 479, dtype: float64
    2.0
Name: 479, dtype: float64
    3.0
Name: 479, dtype: float64
    2.0
Name: 480, dtype: float64
    2.0
Name: 480, dtype: float64
    1.0
Name: 480, dtype: float64
    0.0
Name: 480, dtype: float64
    1.0
Name: 480, dtype: float64
    5.0
Name: 480, dtype: float64
    2.0
Name: 480, dtype: float64
    3.0
Name: 480, dtype: float64
    1.0
Name: 481, dty

Name: 519, dtype: float64
    1.0
Name: 519, dtype: float64
    0.0
Name: 519, dtype: float64
    2.0
Name: 519, dtype: float64
    5.0
Name: 519, dtype: float64
    2.0
Name: 519, dtype: float64
    3.0
Name: 519, dtype: float64
    1.0
Name: 520, dtype: float64
    2.0
Name: 520, dtype: float64
    1.0
Name: 520, dtype: float64
    0.0
Name: 520, dtype: float64
    2.0
Name: 520, dtype: float64
    5.0
Name: 520, dtype: float64
    2.0
Name: 520, dtype: float64
    3.0
Name: 520, dtype: float64
    2.0
Name: 521, dtype: float64
    2.0
Name: 521, dtype: float64
    1.0
Name: 521, dtype: float64
    0.0
Name: 521, dtype: float64
    1.0
Name: 521, dtype: float64
    5.0
Name: 521, dtype: float64
    2.0
Name: 521, dtype: float64
    3.0
Name: 521, dtype: float64
    2.0
Name: 522, dtype: float64
    2.0
Name: 522, dtype: float64
    1.0
Name: 522, dtype: float64
    0.0
Name: 522, dtype: float64
    1.0
Name: 522, dtype: float64
    5.0
Name: 522, dtype: float64
    2.0
Name: 522, dty

Name: 559, dtype: float64
    1.0
Name: 560, dtype: float64
    2.0
Name: 560, dtype: float64
    1.0
Name: 560, dtype: float64
    0.0
Name: 560, dtype: float64
    2.0
Name: 560, dtype: float64
    5.0
Name: 560, dtype: float64
    3.0
Name: 560, dtype: float64
    3.0
Name: 560, dtype: float64
    2.0
Name: 561, dtype: float64
    2.0
Name: 561, dtype: float64
    1.0
Name: 561, dtype: float64
    0.0
Name: 561, dtype: float64
    1.0
Name: 561, dtype: float64
    5.0
Name: 561, dtype: float64
    2.0
Name: 561, dtype: float64
    3.0
Name: 561, dtype: float64
    1.0
Name: 562, dtype: float64
    2.0
Name: 562, dtype: float64
    1.0
Name: 562, dtype: float64
    0.0
Name: 562, dtype: float64
    1.0
Name: 562, dtype: float64
    5.0
Name: 562, dtype: float64
    3.0
Name: 562, dtype: float64
    3.0
Name: 562, dtype: float64
    1.0
Name: 563, dtype: float64
    2.0
Name: 563, dtype: float64
    1.0
Name: 563, dtype: float64
    0.0
Name: 563, dtype: float64
    2.0
Name: 563, dty

Name: 600, dtype: float64
    1.0
Name: 601, dtype: float64
    2.0
Name: 601, dtype: float64
    1.0
Name: 601, dtype: float64
    0.0
Name: 601, dtype: float64
    2.0
Name: 601, dtype: float64
    5.0
Name: 601, dtype: float64
    2.0
Name: 601, dtype: float64
    3.0
Name: 601, dtype: float64
    2.0
Name: 602, dtype: float64
    2.0
Name: 602, dtype: float64
    1.0
Name: 602, dtype: float64
    0.0
Name: 602, dtype: float64
    1.0
Name: 602, dtype: float64
    5.0
Name: 602, dtype: float64
    3.0
Name: 602, dtype: float64
    3.0
Name: 602, dtype: float64
    2.0
Name: 603, dtype: float64
    2.0
Name: 603, dtype: float64
    1.0
Name: 603, dtype: float64
    0.0
Name: 603, dtype: float64
    2.0
Name: 603, dtype: float64
    5.0
Name: 603, dtype: float64
    3.0
Name: 603, dtype: float64
    3.0
Name: 603, dtype: float64
    2.0
Name: 604, dtype: float64
    2.0
Name: 604, dtype: float64
    1.0
Name: 604, dtype: float64
    0.0
Name: 604, dtype: float64
    2.0
Name: 604, dty

Name: 641, dtype: float64
    3.0
Name: 641, dtype: float64
    3.0
Name: 641, dtype: float64
    1.0
Name: 642, dtype: float64
    2.0
Name: 642, dtype: float64
    1.0
Name: 642, dtype: float64
    0.0
Name: 642, dtype: float64
    2.0
Name: 642, dtype: float64
    5.0
Name: 642, dtype: float64
    2.0
Name: 642, dtype: float64
    3.0
Name: 642, dtype: float64
    2.0
Name: 643, dtype: float64
    2.0
Name: 643, dtype: float64
    1.0
Name: 643, dtype: float64
    0.0
Name: 643, dtype: float64
    2.0
Name: 643, dtype: float64
    5.0
Name: 643, dtype: float64
    2.0
Name: 643, dtype: float64
    3.0
Name: 643, dtype: float64
    2.0
Name: 644, dtype: float64
    2.0
Name: 644, dtype: float64
    1.0
Name: 644, dtype: float64
    0.0
Name: 644, dtype: float64
    2.0
Name: 644, dtype: float64
    6.0
Name: 644, dtype: float64
    2.0
Name: 644, dtype: float64
    3.0
Name: 644, dtype: float64
    1.0
Name: 645, dtype: float64
    2.0
Name: 645, dtype: float64
    1.0
Name: 645, dty

In [111]:
params = ['letter_size_norm',  'word_spacing_norm', 'pen_pressure_norm', 'slant_angle_norm','angle_norm']
with open('../data/transcripts/writer_onto_distance.csv','w') as fp:
  fp.write(f'writer-one,writer-two,distance\n')
  for i in range (len(writer_summary)-1):
    for j in range (i+1, len(writer_summary)):
      rowi = writer_summary.iloc[i]
      rowj = writer_summary.iloc[j]
      dists = 0
      for param in params:
         dtype = param
         dists += abs(rowi[dtype]['mean'] - rowj[dtype]['mean'])
         #print((i,j, rowi[dtype]['mean'],int(rowj[dtype]['mean'], dists))
      fp.write(f'{ontology.names[writer_summary.index[i]]},{ontology.names[writer_summary.index[j]]},{dists}\n')

In [112]:
distances=pd.read_csv('../data/transcripts/writer_onto_distance.csv')
distances

Unnamed: 0,to,from,distance
0,0,145,0.153196
1,0,173,0.180360
2,0,210,0.209875
3,0,284,0.140098
4,0,359,0.560066
...,...,...,...
196873,691,48,0.152975
196874,691,49,0.329091
196875,692,48,0.380381
196876,692,49,0.353645


In [119]:
distance_to_known = {}
for i in range(len(distances)):
    row = distances.iloc[i,:]
    if row[0] < 50:
        current_max = distance_to_known.get(row[1],(0,99999999))
        if current_max[1] > abs(row[2]):
            distance_to_known[int(row[1])] = [int(row[0]),abs(row[2])]
        
distance_to_known   

{145: [0, 0.15319555120035952],
 173: [0, 0.18036024122606584],
 210: [0, 0.20987485282801635],
 284: [0, 0.1400975385870535],
 359: [0, 0.5600664567622538],
 361: [0, 0.23797428061935255],
 89: [0, 0.11399948557277467],
 90: [0, 0.15222944064811633],
 91: [0, 0.12125358227655095],
 92: [0, 0.6405145185557255],
 94: [0, 0.22596569752649345],
 95: [0, 0.1476642281429846],
 96: [0, 1.3807200831688151],
 98: [0, 0.2411840537156131],
 99: [0, 0.13594951560716526],
 101: [0, 0.2386726726149624],
 102: [0, 0.3492417922457849],
 103: [0, 0.245673093000754],
 104: [0, 0.3394608815195924],
 105: [0, 0.1379473760823709],
 121: [0, 0.15004273695622475],
 124: [0, 0.16982538920418122],
 125: [0, 0.24891108038768184],
 126: [0, 0.3948958083016828],
 127: [0, 0.20322917267053112],
 128: [0, 0.08606600370190917],
 129: [0, 0.25548265306013274],
 130: [0, 0.1724265433252007],
 131: [0, 0.6409300205320283],
 132: [0, 0.39119069042895743],
 133: [0, 0.7851030082719681],
 134: [0, 0.3941230977218255],
 1

In [120]:
with open('../data/transcripts/writer_onto_distance_to_knowns.csv','w') as fp:
    fp.write(f'writer-id,known-writer-id,distance\n')
    for k,v in distance_to_known.items():
        fp.write(f'{int(k)},{int(v[0])},{v[1]}\n')